Skip to content

Commit f349462

Browse files
Add alarms
1 parent a1c77ae commit f349462

18 files changed

Lines changed: 713 additions & 0 deletions

File tree

infrastructure/terraform/components/api/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,11 @@ No requirements.
4242

4343
| Name | Source | Version |
4444
|------|--------|---------|
45+
| <a name="module_apigw_alarms"></a> [apigw\_alarms](#module\_apigw\_alarms) | ../../modules/alarms-apigw | n/a |
4546
| <a name="module_authorizer_lambda"></a> [authorizer\_lambda](#module\_authorizer\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
47+
| <a name="module_ddb_alarms_letters"></a> [ddb\_alarms\_letters](#module\_ddb\_alarms\_letters) | ../../modules/alarms-ddb | n/a |
48+
| <a name="module_ddb_alarms_mi"></a> [ddb\_alarms\_mi](#module\_ddb\_alarms\_mi) | ../../modules/alarms-ddb | n/a |
49+
| <a name="module_ddb_alarms_suppliers"></a> [ddb\_alarms\_suppliers](#module\_ddb\_alarms\_suppliers) | ../../modules/alarms-ddb | n/a |
4650
| <a name="module_domain_truststore"></a> [domain\_truststore](#module\_domain\_truststore) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a |
4751
| <a name="module_eventpub"></a> [eventpub](#module\_eventpub) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-eventpub.zip | n/a |
4852
| <a name="module_eventsub"></a> [eventsub](#module\_eventsub) | ../../modules/eventsub | n/a |
@@ -51,6 +55,7 @@ No requirements.
5155
| <a name="module_get_letters"></a> [get\_letters](#module\_get\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
5256
| <a name="module_get_status"></a> [get\_status](#module\_get\_status) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
5357
| <a name="module_kms"></a> [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-kms.zip | n/a |
58+
| <a name="module_lambda_alarms"></a> [lambda\_alarms](#module\_lambda\_alarms) | ../../modules/alarms-lambda | n/a |
5459
| <a name="module_letter_status_update"></a> [letter\_status\_update](#module\_letter\_status\_update) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
5560
| <a name="module_letter_status_updates_queue"></a> [letter\_status\_updates\_queue](#module\_letter\_status\_updates\_queue) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.24/terraform-sqs.zip | n/a |
5661
| <a name="module_letter_updates_transformer"></a> [letter\_updates\_transformer](#module\_letter\_updates\_transformer) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
@@ -60,6 +65,7 @@ No requirements.
6065
| <a name="module_post_letters"></a> [post\_letters](#module\_post\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
6166
| <a name="module_post_mi"></a> [post\_mi](#module\_post\_mi) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
6267
| <a name="module_s3bucket_test_letters"></a> [s3bucket\_test\_letters](#module\_s3bucket\_test\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a |
68+
| <a name="module_sqs_alarms"></a> [sqs\_alarms](#module\_sqs\_alarms) | ../../modules/alarms-sqs | n/a |
6369
| <a name="module_sqs_letter_updates"></a> [sqs\_letter\_updates](#module\_sqs\_letter\_updates) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-sqs.zip | n/a |
6470
| <a name="module_supplier_ssl"></a> [supplier\_ssl](#module\_supplier\_ssl) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-ssl.zip | n/a |
6571
| <a name="module_upsert_letter"></a> [upsert\_letter](#module\_upsert\_letter) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a |
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Lookup tables that drive the for_each alarm modules declared in this file.
locals {
  # Logical key => Lambda function name; consumed by module.lambda_alarms.
  lambda_alarm_targets = {
    authorizer_lambda          = module.authorizer_lambda.function_name
    get_letter                 = module.get_letter.function_name
    get_letter_data            = module.get_letter_data.function_name
    get_letters                = module.get_letters.function_name
    get_status                 = module.get_status.function_name
    letter_status_update       = module.letter_status_update.function_name
    letter_updates_transformer = module.letter_updates_transformer.function_name
    mi_updates_transformer     = module.mi_updates_transformer.function_name
    patch_letter               = module.patch_letter.function_name
    post_letters               = module.post_letters.function_name
    post_mi                    = module.post_mi.function_name
    upsert_letter              = module.upsert_letter.function_name
  }

  # Logical key => SQS queue name; consumed by module.sqs_alarms.
  sqs_queue_names = {
    letter_status_updates = module.letter_status_updates_queue.sqs_queue_name
    letter_updates        = module.sqs_letter_updates.sqs_queue_name
  }
}
22+
23+
# One alarms-lambda instance per entry in local.lambda_alarm_targets.
module "lambda_alarms" {
  source   = "../../modules/alarms-lambda"
  for_each = local.lambda_alarm_targets

  # The log group name is built from the function name under /aws/lambda/.
  function_name  = each.value
  log_group_name = "/aws/lambda/${each.value}"

  alarm_prefix = local.csi
  tags         = local.default_tags
}
32+
33+
# DynamoDB alarms for the "letters" table.
module "ddb_alarms_letters" {
  source = "../../modules/alarms-ddb"

  table_name   = aws_dynamodb_table.letters.name
  alarm_prefix = local.csi
  tags         = local.default_tags
}
39+
40+
# DynamoDB alarms for the "mi" table.
module "ddb_alarms_mi" {
  source = "../../modules/alarms-ddb"

  table_name   = aws_dynamodb_table.mi.name
  alarm_prefix = local.csi
  tags         = local.default_tags
}
46+
47+
# DynamoDB alarms for the "suppliers" table.
module "ddb_alarms_suppliers" {
  source = "../../modules/alarms-ddb"

  table_name   = aws_dynamodb_table.suppliers.name
  alarm_prefix = local.csi
  tags         = local.default_tags
}
53+
54+
# One alarms-sqs instance per entry in local.sqs_queue_names.
module "sqs_alarms" {
  source   = "../../modules/alarms-sqs"
  for_each = local.sqs_queue_names

  queue_name = each.value

  # The DLQ name is derived by substituting "-queue" with "-dlq" in the main
  # queue name. replace() returns its input unchanged when the substring is
  # absent, in which case this would equal queue_name.
  # NOTE(review): presumably the queue modules name their DLQs this way -
  # confirm, or pass the DLQ name from a module output if one exists.
  dlq_queue_name = replace(each.value, "-queue", "-dlq")

  alarm_prefix = local.csi
  tags         = local.default_tags
}
63+
64+
# API Gateway alarms for the "main" REST API and its "main" stage.
module "apigw_alarms" {
  source = "../../modules/alarms-apigw"

  api_name   = aws_api_gateway_rest_api.main.name
  stage_name = aws_api_gateway_stage.main.stage_name

  alarm_prefix = local.csi
  tags         = local.default_tags
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<!-- BEGIN_TF_DOCS -->
2+
<!-- markdownlint-disable -->
3+
<!-- vale off -->
4+
5+
## Requirements
6+
7+
| Name | Version |
8+
|------|---------|
9+
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.9.0 |
10+
## Inputs
11+
12+
| Name | Description | Type | Default | Required |
13+
|------|-------------|------|---------|:--------:|
14+
| <a name="input_alarm_prefix"></a> [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes |
15+
| <a name="input_api_name"></a> [api\_name](#input\_api\_name) | n/a | `string` | n/a | yes |
16+
| <a name="input_error_5xx_evaluation_periods"></a> [error\_5xx\_evaluation\_periods](#input\_error\_5xx\_evaluation\_periods) | n/a | `number` | `1` | no |
17+
| <a name="input_error_5xx_period_seconds"></a> [error\_5xx\_period\_seconds](#input\_error\_5xx\_period\_seconds) | n/a | `number` | `60` | no |
18+
| <a name="input_error_5xx_threshold"></a> [error\_5xx\_threshold](#input\_error\_5xx\_threshold) | n/a | `number` | `0` | no |
19+
| <a name="input_latency_anomaly_sensitivity"></a> [latency\_anomaly\_sensitivity](#input\_latency\_anomaly\_sensitivity) | n/a | `number` | `2` | no |
20+
| <a name="input_latency_datapoints_to_alarm"></a> [latency\_datapoints\_to\_alarm](#input\_latency\_datapoints\_to\_alarm) | n/a | `number` | `3` | no |
21+
| <a name="input_latency_evaluation_periods"></a> [latency\_evaluation\_periods](#input\_latency\_evaluation\_periods) | n/a | `number` | `5` | no |
22+
| <a name="input_latency_period_seconds"></a> [latency\_period\_seconds](#input\_latency\_period\_seconds) | n/a | `number` | `60` | no |
23+
| <a name="input_latency_threshold_ms"></a> [latency\_threshold\_ms](#input\_latency\_threshold\_ms) | n/a | `number` | `29000` | no |
24+
| <a name="input_stage_name"></a> [stage\_name](#input\_stage\_name) | n/a | `string` | n/a | yes |
25+
| <a name="input_tags"></a> [tags](#input\_tags) | n/a | `map(string)` | `{}` | no |
26+
## Modules
27+
28+
No modules.
29+
## Outputs
30+
31+
No outputs.
32+
<!-- vale on -->
33+
<!-- markdownlint-enable -->
34+
<!-- END_TF_DOCS -->
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
locals {
  # CloudWatch dimensions shared by every alarm in this module, so that all of
  # them select metrics for the same API name and stage.
  api_dimensions = {
    Stage   = var.stage_name
    ApiName = var.api_name
  }
}
7+
8+
# Alarm on the per-period Sum of the API Gateway 5XXError metric for the
# API/stage selected by local.api_dimensions.
resource "aws_cloudwatch_metric_alarm" "five_xx" {
  alarm_name        = "${var.alarm_prefix}-apigw-5xx"
  alarm_description = "RELIABILITY: API Gateway 5xx responses"
  tags              = var.tags

  # Metric under evaluation.
  namespace   = "AWS/ApiGateway"
  metric_name = "5XXError"
  dimensions  = local.api_dimensions
  statistic   = "Sum"
  period      = var.error_5xx_period_seconds

  # Breach condition; periods without data are treated as not breaching.
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.error_5xx_threshold
  evaluation_periods  = var.error_5xx_evaluation_periods
  treat_missing_data  = "notBreaching"

  # Actions are disabled and no action targets are configured.
  actions_enabled           = false
  alarm_actions             = []
  ok_actions                = []
  insufficient_data_actions = []
}
30+
31+
# Alarm on the per-period Average of the API Gateway Latency metric against a
# fixed threshold, for the API/stage selected by local.api_dimensions.
resource "aws_cloudwatch_metric_alarm" "latency_threshold" {
  alarm_name        = "${var.alarm_prefix}-apigw-latency-threshold"
  alarm_description = "RELIABILITY: API Gateway latency above threshold"
  tags              = var.tags

  # Metric under evaluation.
  namespace   = "AWS/ApiGateway"
  metric_name = "Latency"
  dimensions  = local.api_dimensions
  statistic   = "Average"
  period      = var.latency_period_seconds

  # Breach condition; periods without data are treated as not breaching.
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.latency_threshold_ms
  evaluation_periods  = var.latency_evaluation_periods
  treat_missing_data  = "notBreaching"

  # Actions are disabled and no action targets are configured.
  actions_enabled           = false
  alarm_actions             = []
  ok_actions                = []
  insufficient_data_actions = []
}
53+
54+
# Anomaly-detection alarm on API Gateway Latency: breaches when the observed
# Average (query "m1") rises above the upper edge of the expected band
# computed by ANOMALY_DETECTION_BAND (query "ad1").
resource "aws_cloudwatch_metric_alarm" "latency_anomaly" {
  alarm_name          = "${var.alarm_prefix}-apigw-latency-anomaly"
  alarm_description   = "RELIABILITY: API Gateway latency anomaly"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = var.latency_evaluation_periods
  datapoints_to_alarm = var.latency_datapoints_to_alarm
  threshold_metric_id = "ad1"
  treat_missing_data  = "notBreaching"

  # Actions are disabled and no action targets are configured.
  actions_enabled           = false
  alarm_actions             = []
  ok_actions                = []
  insufficient_data_actions = []
  tags                      = var.tags

  # Observed latency for the API/stage selected by local.api_dimensions.
  metric_query {
    id = "m1"
    metric {
      metric_name = "Latency"
      namespace   = "AWS/ApiGateway"
      stat        = "Average"
      period      = var.latency_period_seconds
      dimensions  = local.api_dimensions
    }
    return_data = true
  }

  # Expected-latency band referenced by threshold_metric_id.
  # An anomaly-detection alarm needs BOTH the raw metric and the band
  # expression to return data (see the provider's anomaly-detection example);
  # with return_data = false here CloudWatch rejects the alarm definition.
  metric_query {
    id          = "ad1"
    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.latency_anomaly_sensitivity})"
    label       = "Latency (expected)"
    return_data = true
  }
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Inputs for the API Gateway alarms module. Every variable carries a
# description so that the generated terraform-docs table is populated.

variable "alarm_prefix" {
  description = "Prefix prepended to the name of every alarm created by this module."
  type        = string
}

variable "api_name" {
  description = "Value of the ApiName dimension used to select the API Gateway metrics."
  type        = string
}

variable "stage_name" {
  description = "Value of the Stage dimension used to select the API Gateway metrics."
  type        = string
}

variable "tags" {
  description = "Tags applied to every alarm created by this module."
  type        = map(string)
  default     = {}
}

variable "error_5xx_threshold" {
  description = "The 5xx alarm breaches when the Sum of 5XXError in a period is strictly greater than this value."
  type        = number
  default     = 0
}

variable "error_5xx_period_seconds" {
  description = "Length, in seconds, of each evaluation period for the 5xx alarm."
  type        = number
  default     = 60
}

variable "error_5xx_evaluation_periods" {
  description = "Number of periods evaluated by the 5xx alarm."
  type        = number
  default     = 1
}

variable "latency_threshold_ms" {
  description = "The latency-threshold alarm breaches when the Average Latency in a period is strictly greater than this value (milliseconds)."
  type        = number
  default     = 29000
}

variable "latency_period_seconds" {
  description = "Length, in seconds, of each evaluation period for both latency alarms (threshold and anomaly)."
  type        = number
  default     = 60
}

variable "latency_evaluation_periods" {
  description = "Number of periods evaluated by both latency alarms (threshold and anomaly)."
  type        = number
  default     = 5
}

variable "latency_datapoints_to_alarm" {
  description = "Number of breaching datapoints, within the evaluated periods, required to trigger the latency anomaly alarm."
  type        = number
  default     = 3
}

variable "latency_anomaly_sensitivity" {
  description = "Second argument passed to ANOMALY_DETECTION_BAND for the latency anomaly alarm; controls the width of the expected band."
  type        = number
  default     = 2
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
# Terraform and provider requirements for this module.
terraform {
  required_version = ">= 1.9.0"

  # Only the provider source is declared here; no version constraint is set.
  required_providers {
    aws = {
      source = "hashicorp/aws"
    }
  }
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<!-- BEGIN_TF_DOCS -->
2+
<!-- markdownlint-disable -->
3+
<!-- vale off -->
4+
5+
## Requirements
6+
7+
| Name | Version |
8+
|------|---------|
9+
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.9.0 |
10+
## Inputs
11+
12+
| Name | Description | Type | Default | Required |
13+
|------|-------------|------|---------|:--------:|
14+
| <a name="input_alarm_prefix"></a> [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes |
15+
| <a name="input_evaluation_periods"></a> [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no |
16+
| <a name="input_period_seconds"></a> [period\_seconds](#input\_period\_seconds) | n/a | `number` | `60` | no |
17+
| <a name="input_read_throttle_threshold"></a> [read\_throttle\_threshold](#input\_read\_throttle\_threshold) | n/a | `number` | `0` | no |
18+
| <a name="input_table_name"></a> [table\_name](#input\_table\_name) | n/a | `string` | n/a | yes |
19+
| <a name="input_tags"></a> [tags](#input\_tags) | n/a | `map(string)` | `{}` | no |
20+
| <a name="input_write_throttle_threshold"></a> [write\_throttle\_threshold](#input\_write\_throttle\_threshold) | n/a | `number` | `0` | no |
21+
## Modules
22+
23+
No modules.
24+
## Outputs
25+
26+
No outputs.
27+
<!-- vale on -->
28+
<!-- markdownlint-enable -->
29+
<!-- END_TF_DOCS -->
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Alarm on the per-period Sum of ReadThrottleEvents for the given table.
resource "aws_cloudwatch_metric_alarm" "read_throttle" {
  alarm_name        = "${var.alarm_prefix}-ddb-${var.table_name}-read-throttle"
  alarm_description = "RELIABILITY: DynamoDB read throttling"
  tags              = var.tags

  # Metric under evaluation.
  namespace   = "AWS/DynamoDB"
  metric_name = "ReadThrottleEvents"
  dimensions  = { TableName = var.table_name }
  statistic   = "Sum"
  period      = var.period_seconds

  # Breach condition; periods without data are treated as not breaching.
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.read_throttle_threshold
  evaluation_periods  = var.evaluation_periods
  treat_missing_data  = "notBreaching"

  # Actions are disabled and no action targets are configured.
  actions_enabled           = false
  alarm_actions             = []
  ok_actions                = []
  insufficient_data_actions = []
}
23+
24+
# Alarm on the per-period Sum of WriteThrottleEvents for the given table.
resource "aws_cloudwatch_metric_alarm" "write_throttle" {
  alarm_name        = "${var.alarm_prefix}-ddb-${var.table_name}-write-throttle"
  alarm_description = "RELIABILITY: DynamoDB write throttling"
  tags              = var.tags

  # Metric under evaluation.
  namespace   = "AWS/DynamoDB"
  metric_name = "WriteThrottleEvents"
  dimensions  = { TableName = var.table_name }
  statistic   = "Sum"
  period      = var.period_seconds

  # Breach condition; periods without data are treated as not breaching.
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.write_throttle_threshold
  evaluation_periods  = var.evaluation_periods
  treat_missing_data  = "notBreaching"

  # Actions are disabled and no action targets are configured.
  actions_enabled           = false
  alarm_actions             = []
  ok_actions                = []
  insufficient_data_actions = []
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Inputs for the DynamoDB alarms module. Every variable carries a description
# so that the generated terraform-docs table is populated.

variable "alarm_prefix" {
  description = "Prefix prepended to the name of every alarm created by this module."
  type        = string
}

variable "table_name" {
  description = "DynamoDB table name; used as the TableName metric dimension and embedded in each alarm name."
  type        = string
}

variable "tags" {
  description = "Tags applied to every alarm created by this module."
  type        = map(string)
  default     = {}
}

variable "period_seconds" {
  description = "Length, in seconds, of each evaluation period for both throttle alarms."
  type        = number
  default     = 60
}

variable "evaluation_periods" {
  description = "Number of periods evaluated by both throttle alarms."
  type        = number
  default     = 1
}

variable "read_throttle_threshold" {
  description = "The read-throttle alarm breaches when the Sum of ReadThrottleEvents in a period is strictly greater than this value."
  type        = number
  default     = 0
}

variable "write_throttle_threshold" {
  description = "The write-throttle alarm breaches when the Sum of WriteThrottleEvents in a period is strictly greater than this value."
  type        = number
  default     = 0
}

0 commit comments

Comments
 (0)