Skip to content

Commit f34e8d6

Browse files
Fail the corrupted_technology_values assertion only when more than 200 pages have invalid technology values (#76)
1 parent 8e01af9 commit f34e8d6

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

definitions/output/crawl/pages.js

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@ assert('corrupted_technology_values')
33
.tags(['crawl_complete'])
44
.query(ctx => `
55
SELECT
6-
date,
7-
client,
8-
tech,
9-
COUNT(DISTINCT page) AS cnt_pages,
10-
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
6+
/*
7+
date,
8+
client,
9+
tech,
10+
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages,
11+
*/
12+
COUNT(DISTINCT page) AS cnt_pages
1113
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
1214
LEFT JOIN pages.technologies AS tech
1315
LEFT JOIN tech.categories AS category
@@ -18,11 +20,14 @@ WHERE
1820
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
1921
OR ARRAY_LENGTH(tech.categories) = 0
2022
)
23+
/*
2124
GROUP BY
2225
date,
2326
client,
2427
tech
2528
ORDER BY cnt_pages DESC
29+
*/
30+
HAVING cnt_pages > 200
2631
`)
2732

2833
publish('pages', {

0 commit comments

Comments
 (0)