File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 44 schema : 'crawl_staging' ,
55 name : table
66 } )
7- )
8-
9- // See https://github.com/HTTPArchive/dataform/issues/43
10- assert ( 'corrupted_technology_values' )
11- . tags ( [ 'crawl_complete' ] )
12- . query ( ctx => `
13- SELECT
14- date,
15- client,
16- tech,
17- COUNT(DISTINCT page) AS cnt_pages,
18- ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
19- FROM ${ ctx . ref ( 'crawl_staging' , 'pages' ) } AS pages
20- LEFT JOIN pages.technologies AS tech
21- LEFT JOIN tech.categories AS category
22- WHERE
23- date = '${ constants . currentMonth } ' AND
24- (
25- tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
26- OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
27- OR ARRAY_LENGTH(tech.categories) = 0
28- )
29- GROUP BY
30- date,
31- client,
32- tech
33- HAVING cnt_pages > 20
34- ORDER BY cnt_pages DESC
35- ` ) ;
7+ ) ;
368
379// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
3810[ 'technologies' , 'categories' ] . forEach ( table =>
Original file line number Diff line number Diff line change 1+ // See https://github.com/HTTPArchive/dataform/issues/43
2+ assert ( 'corrupted_technology_values' )
3+ . tags ( [ 'crawl_complete' ] )
4+ . query ( ctx => `
5+ SELECT
6+ date,
7+ client,
8+ tech,
9+ COUNT(DISTINCT page) AS cnt_pages,
10+ ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
11+ FROM ${ ctx . ref ( 'crawl_staging' , 'pages' ) } AS pages
12+ LEFT JOIN pages.technologies AS tech
13+ LEFT JOIN tech.categories AS category
14+ WHERE
15+ date = '${ constants . currentMonth } ' AND
16+ (
17+ tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
18+ OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
19+ OR ARRAY_LENGTH(tech.categories) = 0
20+ )
21+ GROUP BY
22+ date,
23+ client,
24+ tech
25+ ORDER BY cnt_pages DESC
26+ ` )
27+
128publish ( 'pages' , {
229 type : 'incremental' ,
330 protected : true ,
You can’t perform that action at this time.
0 commit comments