Skip to content

Commit 40d09ae

Browse files
max-ostapenko authored and GCP Dataform committed
Merge branch 'judicial-snake' into judicial-snake
2 parents 57ca803 + 22df13b commit 40d09ae

34 files changed

Lines changed: 1213 additions & 744 deletions

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,9 @@ tf_plan:
1111

1212
tf_apply:
1313
terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve
14-
cd infra/bigquery-export/ && npm install && npm run buildpack
14+
15+
bigquery_export_deploy:
16+
cd infra/bigquery-export && npm install && npm run buildpack
17+
18+
bigquery_export_spark_deploy:
19+
cd infra/bigquery_export_spark && gcloud builds submit --region=global --tag us-docker.pkg.dev/httparchive/bigquery-spark-procedures/firestore_export:latest

README.md

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ Consumers:
4747

4848
### Triggering workflows
4949

50-
In order to unify the workflow triggering mechanism, we use [a Cloud Run function](./src/README.md) that can be invoked in a number of ways (e.g. listen to PubSub messages), do intermediate checks and trigger the particular Dataform workflow execution configuration.
50+
In order to unify the workflow triggering mechanism, we use [a Cloud Run function](./infra/README.md) that can be invoked in a number of ways (e.g. by listening to PubSub messages), performs intermediate checks, and triggers the particular Dataform workflow execution configuration.
5151

5252
## Contributing
5353

@@ -59,5 +59,38 @@ In order to unify the workflow triggering mechanism, we use [a Cloud Run functio
5959

6060
#### Workspace hints
6161

62-
1. In `workflow_settings.yaml` set `env_name: dev` to process sampled data.
63-
2. In `includes/constants.js` set `today` or other variables to a custome value.
62+
1. In `workflow_settings.yaml` set `environment: dev` to process sampled data.
63+
2. For development and testing, you can modify variables in `includes/constants.js`, but note that these are programmatically generated.
64+
65+
## Repository Structure
66+
67+
- `definitions/` - Contains the core Dataform SQL definitions and declarations
68+
- `output/` - Contains the main pipeline transformation logic
69+
- `declarations/` - Contains declarations of referenced tables/views and definitions of other resources
70+
- `includes/` - Contains shared JavaScript utilities and constants
71+
- `infra/` - Infrastructure code and deployment configurations
72+
- `dataform-trigger/` - Cloud Run function for workflow automation
73+
- `tf/` - Terraform configurations
74+
- `bigquery-export/` - BigQuery export configurations
75+
- `docs/` - Additional documentation
76+
77+
## Development Setup
78+
79+
1. Install dependencies:
80+
81+
```bash
82+
npm install
83+
```
84+
85+
2. Available Scripts:
86+
87+
- `npm run format` - Format code using Standard.js, fix Markdown issues, and format Terraform files
88+
- `npm run lint` - Run linting checks on JavaScript, Markdown files, and compile Dataform configs
89+
90+
## Code Quality
91+
92+
This repository uses:
93+
94+
- Standard.js for JavaScript code style
95+
- Markdownlint for Markdown file formatting
96+
- Dataform's built-in compiler for SQL validation

definitions/declarations/httparchive.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ ORDER BY cnt_pages DESC
4242
)
4343

4444
operate('create_reservation_assignment')
45-
.tags(['crawl_complete'])
45+
// .tags(['crawl_complete'])
4646
.queries(ctx => `
4747
CREATE ASSIGNMENT
4848
\`httparchive.region-us.retrospective-reprocessing.pipeline\`
@@ -52,8 +52,8 @@ OPTIONS (
5252
`)
5353

5454
operate('drop_reservation_assignment')
55-
.dependencies(['requests_10k'])
56-
.tags(['crawl_complete'])
55+
// .dependencies(['requests_10k'])
56+
// .tags(['crawl_complete'])
5757
.queries(ctx => `
5858
DROP ASSIGNMENT IF EXISTS
5959
\`httparchive.region-us.retrospective-reprocessing.pipeline\`

definitions/output/reports/reports_dynamic.js

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,21 @@ if (iterations.length === 1) {
2323
}).preOps(ctx => `
2424
--DELETE FROM ${ctx.self()}
2525
--WHERE date = '${params.date}';
26-
`).query(ctx => `
27-
/* {"dataform_trigger": "report_complete", "date": "${params.date}", "name": "${metric.id}", "type": "${sql.type}"} */` +
28-
sql.query(ctx, params))
26+
`).query(
27+
ctx => sql.query(ctx, params)
28+
).postOps(ctx => `
29+
SELECT
30+
reports.run_export_job(
31+
JSON '''{
32+
"destination": "cloud_storage",
33+
"config": {
34+
"bucket": "httparchive",
35+
"name": "reports/${constants.environment}/${metric.id}_${sql.type}_${params.date}.json"
36+
},
37+
"query": "SELECT FORMAT_DATE('%Y_%m_%d', date) AS date, * EXCEPT(date) FROM ${ctx.self()}"
38+
}'''
39+
);
40+
`)
2941
})
3042
})
3143
} else {
@@ -38,10 +50,23 @@ sql.query(ctx, params))
3850
DELETE FROM reports.${metric.id}_${sql.type}
3951
WHERE date = '${params.date}';
4052
41-
/* {"dataform_trigger": "report_complete", "date": "${params.date}", "name": "${metric.id}", "type": "${sql.type}"} */
42-
INSERT INTO reports.${metric.id}_${sql.type}` +
43-
sql.query(ctx, params))
53+
INSERT INTO reports.${metric.id}_${sql.type}` + sql.query(ctx, params)
54+
).postOps(ctx => `
55+
SELECT
56+
reports.run_export_job(
57+
JSON '''{
58+
"dataform_trigger": "report_complete",
59+
"date": "${params.date}",
60+
"name": "${metric.id}",
61+
"type": "${sql.type}",
62+
"environment": "${constants.environment}"
63+
}'''
64+
);
65+
`)
4466
})
4567
})
4668
})
4769
}
70+
71+
// --"query": "SELECT * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${params.date}'"
72+
// `${this.storagePath}${date.replaceAll('-', '_')}/${metric}.json`

definitions/output/reports/tech_report_adoption.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ publish('tech_report_adoption', {
1313
DELETE FROM ${ctx.self()}
1414
WHERE date = '${pastMonth}';
1515
`).query(ctx => `
16-
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */
1716
SELECT
1817
date,
1918
geo,
@@ -32,4 +31,15 @@ GROUP BY
3231
rank,
3332
technology,
3433
version
34+
`).postOps(ctx => `
35+
SELECT
36+
reports.run_export_job(
37+
JSON '''{
38+
"dataform_trigger": "tech_report_complete",
39+
"date": "${pastMonth}",
40+
"name": "adoption",
41+
"type": "report",
42+
"environment": "${constants.environment}"
43+
}'''
44+
);
3545
`)

definitions/output/reports/tech_report_audits.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ return Object.keys(auditMap).map(function(key) {
6767
DELETE FROM ${ctx.self()}
6868
WHERE date = '${pastMonth}';
6969
`).query(ctx => `
70-
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "audits", "type": "report"} */
7170
SELECT
7271
date,
7372
geo,
@@ -86,4 +85,15 @@ GROUP BY
8685
rank,
8786
technology,
8887
version
88+
`).postOps(ctx => `
89+
SELECT
90+
reports.run_export_job(
91+
JSON '''{
92+
"dataform_trigger": "tech_report_complete",
93+
"date": "${pastMonth}",
94+
"name": "audits",
95+
"type": "report",
96+
"environment": "${constants.environment}"
97+
}'''
98+
);
8999
`)

definitions/output/reports/tech_report_categories.js

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ publish('tech_report_categories', {
55
type: 'table',
66
tags: ['tech_report']
77
}).query(ctx => `
8-
/* {"dataform_trigger": "tech_report_complete", "name": "categories", "type": "dict"} */
98
WITH pages AS (
109
SELECT DISTINCT
1110
client,
@@ -91,4 +90,14 @@ FROM (
9190
FROM pages
9291
GROUP BY client
9392
)
93+
`).postOps(ctx => `
94+
SELECT
95+
reports.run_export_job(
96+
JSON '''{
97+
"dataform_trigger": "tech_report_complete",
98+
"name": "categories",
99+
"type": "dict",
100+
"environment": "${constants.environment}"
101+
}'''
102+
);
94103
`)

definitions/output/reports/tech_report_core_web_vitals.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ return Object.values(vitals)
6868
DELETE FROM ${ctx.self()}
6969
WHERE date = '${pastMonth}';
7070
`).query(ctx => `
71-
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "core_web_vitals", "type": "report"} */
7271
SELECT
7372
date,
7473
geo,
@@ -100,4 +99,15 @@ GROUP BY
10099
rank,
101100
technology,
102101
version
102+
`).postOps(ctx => `
103+
SELECT
104+
reports.run_export_job(
105+
JSON '''{
106+
"dataform_trigger": "tech_report_complete",
107+
"date": "${pastMonth}",
108+
"name": "core_web_vitals",
109+
"type": "report",
110+
"environment": "${constants.environment}"
111+
}'''
112+
);
103113
`)

definitions/output/reports/tech_report_lighthouse.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ return Object.values(lighthouse)
5454
DELETE FROM ${ctx.self()}
5555
WHERE date = '${pastMonth}';
5656
`).query(ctx => `
57-
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "lighthouse", "type": "report"} */
5857
SELECT
5958
date,
6059
geo,
@@ -76,4 +75,15 @@ GROUP BY
7675
rank,
7776
technology,
7877
version
78+
`).postOps(ctx => `
79+
SELECT
80+
reports.run_export_job(
81+
JSON '''{
82+
"dataform_trigger": "tech_report_complete",
83+
"date": "${pastMonth}",
84+
"name": "lighthouse",
85+
"type": "report",
86+
"environment": "${constants.environment}"
87+
}'''
88+
);
7989
`)

definitions/output/reports/tech_report_page_weight.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ return Object.values(pageWeight)
4848
DELETE FROM ${ctx.self()}
4949
WHERE date = '${pastMonth}';
5050
`).query(ctx => `
51-
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "page_weight", "type": "report"} */
5251
SELECT
5352
date,
5453
geo,
@@ -69,4 +68,15 @@ GROUP BY
6968
rank,
7069
technology,
7170
version
71+
`).postOps(ctx => `
72+
SELECT
73+
reports.run_export_job(
74+
JSON '''{
75+
"dataform_trigger": "tech_report_complete",
76+
"date": "${pastMonth}",
77+
"name": "page_weight",
78+
"type": "report",
79+
"environment": "${constants.environment}"
80+
}'''
81+
);
7282
`)

0 commit comments

Comments
 (0)