Skip to content

Commit e9877e0

Browse files
committed
lint
1 parent dea697d commit e9877e0

2 files changed

Lines changed: 28 additions & 29 deletions

File tree

definitions/declarations/httparchive.js

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,7 @@
4 4
schema: 'crawl_staging',
5 5
name: table
6 6
})
7-
)
8-
9-
// See https://github.com/HTTPArchive/dataform/issues/43
10-
assert('corrupted_technology_values')
11-
.tags(['crawl_complete'])
12-
.query(ctx => `
13-
SELECT
14-
date,
15-
client,
16-
tech,
17-
COUNT(DISTINCT page) AS cnt_pages,
18-
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
19-
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
20-
LEFT JOIN pages.technologies AS tech
21-
LEFT JOIN tech.categories AS category
22-
WHERE
23-
date = '${constants.currentMonth}' AND
24-
(
25-
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
26-
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
27-
OR ARRAY_LENGTH(tech.categories) = 0
28-
)
29-
GROUP BY
30-
date,
31-
client,
32-
tech
33-
HAVING cnt_pages > 20
34-
ORDER BY cnt_pages DESC
35-
`);
7+
);
36 8

37 9
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
38 10
['technologies', 'categories'].forEach(table =>

definitions/output/crawl/pages.js

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,30 @@
1+
// See https://github.com/HTTPArchive/dataform/issues/43
2+
assert('corrupted_technology_values')
3+
.tags(['crawl_complete'])
4+
.query(ctx => `
5+
SELECT
6+
date,
7+
client,
8+
tech,
9+
COUNT(DISTINCT page) AS cnt_pages,
10+
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
11+
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
12+
LEFT JOIN pages.technologies AS tech
13+
LEFT JOIN tech.categories AS category
14+
WHERE
15+
date = '${constants.currentMonth}' AND
16+
(
17+
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
18+
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
19+
OR ARRAY_LENGTH(tech.categories) = 0
20+
)
21+
GROUP BY
22+
date,
23+
client,
24+
tech
25+
ORDER BY cnt_pages DESC
26+
`)
27+
1 28
publish('pages', {
2 29
type: 'incremental',
3 30
protected: true,

0 commit comments

Comments
 (0)