Skip to content

Commit f96cdf1

Browse files
Merge branch 'main' into judicial-snake
2 parents e3ec99f + 8e01af9 commit f96cdf1

6 files changed

Lines changed: 54 additions & 49 deletions

File tree

definitions/declarations/httparchive.js

Lines changed: 1 addition & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -4,34 +4,7 @@
44
schema: 'crawl_staging',
55
name: table
66
})
7-
)
8-
9-
// See https://github.com/HTTPArchive/dataform/issues/43
10-
assert('corrupted_technology_values')
11-
.tags(['crawl_complete'])
12-
.query(ctx => `
13-
SELECT
14-
date,
15-
client,
16-
tech,
17-
COUNT(DISTINCT page) AS cnt_pages,
18-
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
19-
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
20-
LEFT JOIN pages.technologies AS tech
21-
LEFT JOIN tech.categories AS category
22-
WHERE
23-
date = '${constants.currentMonth}' AND
24-
(
25-
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
26-
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
27-
OR ARRAY_LENGTH(tech.categories) = 0
28-
)
29-
GROUP BY
30-
date,
31-
client,
32-
tech
33-
ORDER BY cnt_pages DESC
34-
`);
7+
);
358

369
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
3710
['technologies', 'categories'].forEach(table =>

definitions/output/crawl/pages.js

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,30 @@
1+
// See https://github.com/HTTPArchive/dataform/issues/43
2+
assert('corrupted_technology_values')
3+
.tags(['crawl_complete'])
4+
.query(ctx => `
5+
SELECT
6+
date,
7+
client,
8+
tech,
9+
COUNT(DISTINCT page) AS cnt_pages,
10+
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
11+
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
12+
LEFT JOIN pages.technologies AS tech
13+
LEFT JOIN tech.categories AS category
14+
WHERE
15+
date = '${constants.currentMonth}' AND
16+
(
17+
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
18+
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
19+
OR ARRAY_LENGTH(tech.categories) = 0
20+
)
21+
GROUP BY
22+
date,
23+
client,
24+
tech
25+
ORDER BY cnt_pages DESC
26+
`)
27+
128
publish('pages', {
229
type: 'incremental',
330
protected: true,
@@ -47,7 +74,8 @@ publish('pages', {
4774
technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
4875
metadata: 'Additional metadata about the test'
4976
},
50-
tags: ['crawl_complete']
77+
tags: ['crawl_complete'],
78+
dependOnDependencyAssertions: true
5179
}).preOps(ctx => `
5280
DELETE FROM ${ctx.self()}
5381
WHERE date = '${constants.currentMonth}' AND

definitions/output/crawl/requests.js

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -38,23 +38,27 @@ publish('requests', {
3838
},
3939
tags: ['crawl_complete']
4040
}).preOps(ctx => `
41-
FOR client_value IN (SELECT * FROM UNNEST(['desktop', 'mobile']) AS client) DO
42-
FOR is_root_page_value IN (SELECT * FROM UNNEST([TRUE, FALSE]) AS is_root_page) DO
41+
FOR client_var IN (SELECT * FROM UNNEST(['desktop', 'mobile']) AS value) DO
42+
FOR is_root_page_var IN (SELECT * FROM UNNEST([TRUE, FALSE]) AS value) DO
43+
FOR rank_lt_50M_var IN (SELECT * FROM UNNEST([TRUE, FALSE]) AS value) DO
4344
44-
-- Delete old entries
45-
DELETE FROM ${ctx.self()}
46-
WHERE date = '${constants.currentMonth}'
47-
AND client = client_value.client
48-
AND is_root_page = is_root_page_value.is_root_page;
45+
-- Delete old entries
46+
DELETE FROM ${ctx.self()}
47+
WHERE date = '${constants.currentMonth}' AND
48+
client = client_var.value AND
49+
is_root_page = is_root_page_var.value AND
50+
(rank < 50000000) = rank_lt_50M_var.value;
4951
50-
-- Insert new entries
51-
INSERT INTO ${ctx.self()}
52-
SELECT *
53-
FROM ${ctx.ref('crawl_staging', 'requests')}
54-
WHERE date = '${constants.currentMonth}' AND
55-
client = client_value.client AND
56-
is_root_page = is_root_page_value.is_root_page ${constants.devRankFilter};
52+
-- Insert new entries
53+
INSERT INTO ${ctx.self()}
54+
SELECT *
55+
FROM ${ctx.ref('crawl_staging', 'requests')}
56+
WHERE date = '${constants.currentMonth}' AND
57+
client = client_var.value AND
58+
is_root_page = is_root_page_var.value AND
59+
(rank < 50000000) = rank_lt_50M_var.value ${constants.devRankFilter};
5760
61+
END FOR;
5862
END FOR;
5963
END FOR;
6064
`).query(ctx => `

definitions/output/reports/cwv_tech_categories.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ technology_stats AS (
5050
SELECT
5151
technology,
5252
category_obj AS categories,
53-
SUM(origins) AS total_origins
53+
SUM(origins.desktop + origins.mobile) AS total_origins
5454
FROM ${ctx.ref('reports', 'cwv_tech_technologies')}
5555
GROUP BY
5656
technology,

package-lock.json

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
"name": "crawl-data",
33
"author": "@max-ostapenko",
44
"dependencies": {
5-
"@dataform/core": "3.0.15"
5+
"@dataform/core": "^3.0.19"
66
},
77
"scripts": {
88
"format": "npx standard --fix; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint . --fix; terraform -chdir=infra/tf fmt -recursive",

0 commit comments

Comments
 (0)