msp: add monitoring stack (#58816)

Adds GCP Monitoring Alert Policies for Cloud Run Services, Cloud Run Jobs, and, if enabled, Cloud Redis.
James Cotter 2023-12-13 19:40:57 +00:00 committed by GitHub
parent b344c534e9
commit 8c9e114549
15 changed files with 936 additions and 9 deletions


@ -12,6 +12,7 @@ go_library(
"//dev/managedservicesplatform/internal/stack",
"//dev/managedservicesplatform/internal/stack/cloudrun",
"//dev/managedservicesplatform/internal/stack/iam",
"//dev/managedservicesplatform/internal/stack/monitoring",
"//dev/managedservicesplatform/internal/stack/options/terraformversion",
"//dev/managedservicesplatform/internal/stack/options/tfcbackend",
"//dev/managedservicesplatform/internal/stack/project",


@ -0,0 +1,26 @@
load("//dev:go_defs.bzl", "go_test")
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "monitoringalertpolicy",
srcs = ["monitoringalertpolicy.go"],
importpath = "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy",
visibility = ["//dev/managedservicesplatform:__subpackages__"],
deps = [
"//dev/managedservicesplatform/internal/resourceid",
"//lib/errors",
"//lib/pointers",
"@com_github_aws_constructs_go_constructs_v10//:constructs",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringalertpolicy",
],
)
go_test(
name = "monitoringalertpolicy_test",
srcs = ["monitoringalertpolicy_test.go"],
embed = [":monitoringalertpolicy"],
deps = [
"//lib/pointers",
"@com_github_hexops_autogold_v2//:autogold",
],
)


@ -0,0 +1,314 @@
package monitoringalertpolicy
import (
"fmt"
"sort"
"strconv"
"strings"
"github.com/sourcegraph/sourcegraph/lib/errors"
"github.com/aws/constructs-go/constructs/v10"
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringalertpolicy"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
type Aligner string
const (
MonitoringAlignNone Aligner = "ALIGN_NONE"
MonitoringAlignDelta Aligner = "ALIGN_DELTA"
MonitoringAlignRate Aligner = "ALIGN_RATE"
MonitoringAlignInterpolate Aligner = "ALIGN_INTERPOLATE"
MonitoringAlignNextOrder Aligner = "ALIGN_NEXT_ORDER"
MonitoringAlignMin Aligner = "ALIGN_MIN"
MonitoringAlignMax Aligner = "ALIGN_MAX"
MonitoringAlignMean Aligner = "ALIGN_MEAN"
MonitoringAlignCount Aligner = "ALIGN_COUNT"
MonitoringAlignSum Aligner = "ALIGN_SUM"
MonitoringAlignStddev Aligner = "ALIGN_STDDEV"
MonitoringAlignCountTrue Aligner = "ALIGN_COUNT_TRUE"
MonitoringAlignCountFalse Aligner = "ALIGN_COUNT_FALSE"
MonitoringAlignFractionTrue Aligner = "ALIGN_FRACTION_TRUE"
MonitoringAlignPercentile99 Aligner = "ALIGN_PERCENTILE_99"
MonitoringAlignPercentile95 Aligner = "ALIGN_PERCENTILE_95"
MonitoringAlignPercentile50 Aligner = "ALIGN_PERCENTILE_50"
MonitoringAlignPercentile05 Aligner = "ALIGN_PERCENTILE_05"
MonitoringAlignPercentChange Aligner = "ALIGN_PERCENT_CHANGE"
)
type Reducer string
const (
MonitoringReduceNone Reducer = "REDUCE_NONE"
MonitoringReduceMean Reducer = "REDUCE_MEAN"
MonitoringReduceMin Reducer = "REDUCE_MIN"
MonitoringReduceMax Reducer = "REDUCE_MAX"
MonitoringReduceSum Reducer = "REDUCE_SUM"
MonitoringReduceStddev Reducer = "REDUCE_STDDEV"
MonitoringReduceCount Reducer = "REDUCE_COUNT"
MonitoringReduceCountTrue Reducer = "REDUCE_COUNT_TRUE"
MonitoringReduceCountFalse Reducer = "REDUCE_COUNT_FALSE"
MonitoringReduceFractionTrue Reducer = "REDUCE_FRACTION_TRUE"
MonitoringReducePercentile99 Reducer = "REDUCE_PERCENTILE_99"
MonitoringReducePercentile95 Reducer = "REDUCE_PERCENTILE_95"
MonitoringReducePercentile50 Reducer = "REDUCE_PERCENTILE_50"
MonitoringReducePercentile05 Reducer = "REDUCE_PERCENTILE_05"
)
type Comparison string
const (
ComparisonGT Comparison = "COMPARISON_GT"
ComparisonLT Comparison = "COMPARISON_LT"
)
// ThresholdAggregation for alerting when a metric exceeds a defined threshold
//
// Must specify a `metric.type` filter. Additional filters are optional.
// All filters are joined with ` AND `
//
// GroupByFields is an optional field specifying time series labels to aggregate:
// - For services it defaults to `["resource.label.revision_name"]`; additional fields are appended
// - For jobs there is no default
type ThresholdAggregation struct {
Filters map[string]string
GroupByFields []string
Comparison Comparison
Aligner Aligner
Reducer Reducer
Period string
Threshold float64
Duration string
}
// ResponseCodeMetric for alerting when the ratio of responses with a certain code (or code class) exceeds a threshold
//
// Must specify either `Code` (e.g. 404) or `CodeClass` (e.g. 4xx)
//
// `ExcludeCodes` allows filtering out specific response codes from the `CodeClass`
type ResponseCodeMetric struct {
Code *int
CodeClass *string
ExcludeCodes []string
Ratio float64
Duration *string
}
type CloudService int
const (
CloudRunService CloudService = iota
CloudRunJob
CloudRedis
)
// Config for a Monitoring Alert Policy
// Must define either `ThresholdAggregation` or `ResponseCodeMetric`
type Config struct {
// A unique identifier
ID string
Name string
Description *string
ProjectID string
// Name of the service/job/redis to filter the alert on
ServiceName string
// Type of the service/job/redis
ServiceKind CloudService
ThresholdAggregation *ThresholdAggregation
ResponseCodeMetric *ResponseCodeMetric
}
type Output struct {
}
func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
if config.ThresholdAggregation == nil && config.ResponseCodeMetric == nil {
return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config")
}
if config.ThresholdAggregation != nil && config.ResponseCodeMetric != nil {
return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config, not both")
}
if config.ThresholdAggregation != nil {
if len(config.ThresholdAggregation.Filters) == 0 {
return nil, errors.New("must specify at least one filter for threshold aggregation")
}
if _, ok := config.ThresholdAggregation.Filters["metric.type"]; !ok {
return nil, errors.New("must specify filter for `metric.type`")
}
return thresholdAggregation(scope, id, config)
}
return responseCodeMetric(scope, id, config)
}
// thresholdAggregation defines a monitoring alert policy based on a single metric threshold
func thresholdAggregation(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
// Set some defaults
switch config.ServiceKind {
case CloudRunService:
config.ThresholdAggregation.GroupByFields = append([]string{"resource.label.revision_name"}, config.ThresholdAggregation.GroupByFields...)
case CloudRunJob:
// No defaults
case CloudRedis:
// No defaults
default:
return nil, errors.Newf("invalid service kind %q", config.ServiceKind)
}
if config.ThresholdAggregation.Comparison == "" {
config.ThresholdAggregation.Comparison = ComparisonGT
}
if config.ThresholdAggregation.Duration == "" {
config.ThresholdAggregation.Duration = "0s"
}
_ = monitoringalertpolicy.NewMonitoringAlertPolicy(scope,
id.TerraformID(config.ID), &monitoringalertpolicy.MonitoringAlertPolicyConfig{
Project: pointers.Ptr(config.ProjectID),
DisplayName: pointers.Ptr(config.Name),
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
Content: config.Description,
MimeType: pointers.Ptr("text/markdown"),
},
Combiner: pointers.Ptr("OR"),
Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{
{
DisplayName: pointers.Ptr(config.Name),
ConditionThreshold: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThreshold{
Aggregations: []monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdAggregations{
{
AlignmentPeriod: pointers.Ptr(config.ThresholdAggregation.Period),
PerSeriesAligner: pointers.Ptr(string(config.ThresholdAggregation.Aligner)),
CrossSeriesReducer: pointers.Ptr(string(config.ThresholdAggregation.Reducer)),
GroupByFields: pointers.Ptr(pointers.Slice(config.ThresholdAggregation.GroupByFields)),
},
},
Comparison: pointers.Ptr(string(config.ThresholdAggregation.Comparison)),
Duration: pointers.Ptr(config.ThresholdAggregation.Duration),
Filter: pointers.Ptr(buildFilter(config)),
ThresholdValue: pointers.Float64(config.ThresholdAggregation.Threshold),
Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
Count: pointers.Float64(1),
},
},
},
},
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
AutoClose: pointers.Ptr("604800s"),
},
})
return &Output{}, nil
}
// buildFilter creates the Filter string for a ThresholdAggregation monitoring alert policy
func buildFilter(config *Config) string {
filters := make([]string, 0)
for key, val := range config.ThresholdAggregation.Filters {
filters = append(filters, fmt.Sprintf(`%s = "%s"`, key, val))
}
// Sort to ensure stable output for testing, because
// config.ThresholdAggregation.Filters is a map.
sort.Strings(filters)
switch config.ServiceKind {
case CloudRunService:
filters = append(filters,
`resource.type = "cloud_run_revision"`,
fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ServiceName),
)
case CloudRunJob:
filters = append(filters,
`resource.type = "cloud_run_job"`,
fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ServiceName),
)
case CloudRedis:
filters = append(filters,
`resource.type = "redis_instance"`,
fmt.Sprintf(`resource.labels.redis_instance_id = "%s"`, config.ServiceName),
)
}
return strings.Join(filters, " AND ")
}
// responseCodeMetric defines the MonitoringAlertPolicy for response code metrics.
// Supports a single Code (e.g. 404) or an entire CodeClass (e.g. 4xx).
// Optionally, when using a CodeClass, specific codes can be excluded.
func responseCodeMetric(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
query := responseCodeBuilder(config)
if config.ResponseCodeMetric.Duration == nil {
config.ResponseCodeMetric.Duration = pointers.Ptr("60s")
}
_ = monitoringalertpolicy.NewMonitoringAlertPolicy(scope,
id.TerraformID(config.ID), &monitoringalertpolicy.MonitoringAlertPolicyConfig{
Project: pointers.Ptr(config.ProjectID),
DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)),
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
Content: config.Description,
MimeType: pointers.Ptr("text/markdown"),
},
Combiner: pointers.Ptr("OR"),
Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{
{
DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)),
ConditionMonitoringQueryLanguage: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionMonitoringQueryLanguage{
Query: pointers.Ptr(query),
Duration: config.ResponseCodeMetric.Duration,
Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionMonitoringQueryLanguageTrigger{
Count: pointers.Float64(1),
},
},
},
},
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
AutoClose: pointers.Ptr("604800s"),
},
})
return &Output{}, nil
}
// responseCodeBuilder builds the MQL for a response code metric alert
func responseCodeBuilder(config *Config) string {
var builder strings.Builder
builder.WriteString(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
`)
if config.ResponseCodeMetric.CodeClass != nil {
builder.WriteString(" group_by [metric.response_code, metric.response_code_class],\n")
} else {
builder.WriteString(" group_by [metric.response_code],\n")
}
builder.WriteString(" [response_code_count_aggregate: aggregate(value_request_count_aggregate)]\n")
if config.ResponseCodeMetric.Code != nil {
builder.WriteString(fmt.Sprintf(" | filter (metric.response_code = '%d')\n", *config.ResponseCodeMetric.Code))
} else {
builder.WriteString(fmt.Sprintf(" | filter (metric.response_code_class = '%s')\n", *config.ResponseCodeMetric.CodeClass))
}
if len(config.ResponseCodeMetric.ExcludeCodes) > 0 {
for _, code := range config.ResponseCodeMetric.ExcludeCodes {
builder.WriteString(fmt.Sprintf(" | filter (metric.response_code != '%s')\n", code))
}
}
builder.WriteString(`; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
`)
builder.WriteString(fmt.Sprintf("| condition gt(val(), %s)\n", strconv.FormatFloat(config.ResponseCodeMetric.Ratio, 'f', -1, 64)))
return builder.String()
}
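
For illustration, a minimal sketch of how a caller inside the MSP module might declare a threshold alert with this resource. The helper name, project ID, and service name below are hypothetical; the alert values mirror the CPU alert the monitoring stack configures later in this commit.

package example

import (
	"github.com/hashicorp/terraform-cdk-go/cdktf"

	"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy"
	"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
)

// addCPUAlert is a hypothetical helper: it declares a policy that fires when
// the p99 of container CPU utilization over a 5-minute alignment window
// exceeds 0.8.
func addCPUAlert(stack cdktf.TerraformStack, projectID, serviceName string) error {
	_, err := monitoringalertpolicy.New(stack, resourceid.New("monitoring"), &monitoringalertpolicy.Config{
		ID:          "cpu",
		Name:        "High Container CPU Utilization",
		ProjectID:   projectID,
		ServiceName: serviceName,
		ServiceKind: monitoringalertpolicy.CloudRunService,
		ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
			// A metric.type filter is required; all filters are joined with AND.
			Filters:   map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"},
			Aligner:   monitoringalertpolicy.MonitoringAlignPercentile99,
			Reducer:   monitoringalertpolicy.MonitoringReduceMax,
			Period:    "300s",
			Threshold: 0.8,
			// Comparison defaults to ComparisonGT and Duration to "0s" when unset.
		},
	})
	return err
}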


@ -0,0 +1,136 @@
package monitoringalertpolicy
import (
"testing"
"github.com/hexops/autogold/v2"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
func TestBuildFilter(t *testing.T) {
for _, tc := range []struct {
name string
config Config
want autogold.Value
}{
{
name: "Service Metric",
config: Config{
ServiceName: "my-service-name",
ServiceKind: CloudRunService,
ThresholdAggregation: &ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/container/startup_latencies",
},
},
},
want: autogold.Expect(`metric.type = "run.googleapis.com/container/startup_latencies" AND resource.type = "cloud_run_revision" AND resource.labels.service_name = "my-service-name"`),
},
{
name: "Job Metric",
config: Config{
ServiceName: "my-job-name",
ServiceKind: CloudRunJob,
ThresholdAggregation: &ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
"metric.labels.result": "failed",
},
},
},
want: autogold.Expect(`metric.labels.result = "failed" AND metric.type = "run.googleapis.com/job/completed_task_attempt_count" AND resource.type = "cloud_run_job" AND resource.labels.job_name = "my-job-name"`),
},
} {
t.Run(tc.name, func(t *testing.T) {
got := buildFilter(&tc.config)
tc.want.Equal(t, got)
})
}
}
func TestResponseCodeBuilder(t *testing.T) {
for _, tc := range []struct {
name string
ResponseCodeMetric
want autogold.Value
}{
{
name: "Single Response Code",
ResponseCodeMetric: ResponseCodeMetric{
Code: pointers.Ptr(404),
Ratio: 0.1,
},
want: autogold.Expect(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
group_by [metric.response_code],
[response_code_count_aggregate: aggregate(value_request_count_aggregate)]
| filter (metric.response_code = '404')
; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
| condition gt(val(), 0.1)
`),
},
{
name: "Response Code Class",
ResponseCodeMetric: ResponseCodeMetric{
CodeClass: pointers.Ptr("4xx"),
Ratio: 0.4,
},
want: autogold.Expect(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
group_by [metric.response_code, metric.response_code_class],
[response_code_count_aggregate: aggregate(value_request_count_aggregate)]
| filter (metric.response_code_class = '4xx')
; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
| condition gt(val(), 0.4)
`),
},
{
name: "Response Code Class + Exclude",
ResponseCodeMetric: ResponseCodeMetric{
CodeClass: pointers.Ptr("4xx"),
ExcludeCodes: []string{"404", "429"},
Ratio: 0.8,
},
want: autogold.Expect(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
group_by [metric.response_code, metric.response_code_class],
[response_code_count_aggregate: aggregate(value_request_count_aggregate)]
| filter (metric.response_code_class = '4xx')
| filter (metric.response_code != '404')
| filter (metric.response_code != '429')
; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
| condition gt(val(), 0.8)
`),
},
} {
t.Run(tc.name, func(t *testing.T) {
got := responseCodeBuilder(&Config{
ServiceName: "test-service",
ResponseCodeMetric: &tc.ResponseCodeMetric,
})
tc.want.Equal(t, got)
})
}
}
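
For reference, a sketch of a Redis case analogous to the entries above; it could sit alongside TestBuildFilter in this file. The instance name is hypothetical, and the expected string follows buildFilter's CloudRedis branch.

func TestBuildFilterRedis(t *testing.T) {
	got := buildFilter(&Config{
		ServiceName: "my-redis-instance",
		ServiceKind: CloudRedis,
		ThresholdAggregation: &ThresholdAggregation{
			Filters: map[string]string{
				"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio",
			},
		},
	})
	want := `metric.type = "redis.googleapis.com/stats/memory/system_memory_usage_ratio" AND resource.type = "redis_instance" AND resource.labels.redis_instance_id = "my-redis-instance"`
	if got != want {
		t.Errorf("unexpected filter:\ngot  %s\nwant %s", got, want)
	}
}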


@ -15,6 +15,7 @@ import (
)
type Output struct {
ID *string
Endpoint string
Certificate gsmsecret.Output
}
@ -64,5 +65,6 @@ func New(scope constructs.Construct, id resourceid.ID, config Config) (*Output,
Endpoint: fmt.Sprintf("rediss://:%s@%s:%v",
*redis.AuthString(), *redis.Host(), *redis.Port()),
Certificate: *redisCACert,
ID: redis.Id(),
}, nil
}


@ -36,7 +36,9 @@ import (
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
type CrossStackOutput struct{}
type CrossStackOutput struct {
RedisInstanceID *string
}
type Variables struct {
ProjectID string
@ -143,6 +145,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
// redisInstance is only created and non-nil if Redis is configured for the
// environment.
// If Redis is configured, populate cross-stack output with Redis ID.
var redisInstanceID *string
if vars.Environment.Resources != nil && vars.Environment.Resources.Redis != nil {
redisInstance, err := redis.New(stack,
resourceid.New("redis"),
@ -156,6 +160,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
return nil, errors.Wrap(err, "failed to render Redis instance")
}
redisInstanceID = redisInstance.ID
// Configure endpoint string.
cloudRunBuilder.AddEnv("REDIS_ENDPOINT", redisInstance.Endpoint)
@ -265,7 +271,9 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
"Cloud Run resource location")
locals.Add("image_tag", imageTag.StringValue,
"Resolved tag of service image to deploy")
return &CrossStackOutput{}, nil
return &CrossStackOutput{
RedisInstanceID: redisInstanceID,
}, nil
}
type envVariablesData struct {


@ -0,0 +1,18 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "monitoring",
srcs = ["monitoring.go"],
importpath = "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/monitoring",
visibility = ["//dev/managedservicesplatform:__subpackages__"],
deps = [
"//dev/managedservicesplatform/internal/resource/monitoringalertpolicy",
"//dev/managedservicesplatform/internal/resourceid",
"//dev/managedservicesplatform/internal/stack",
"//dev/managedservicesplatform/internal/stack/options/googleprovider",
"//dev/managedservicesplatform/spec",
"//lib/errors",
"//lib/pointers",
"@com_github_hashicorp_terraform_cdk_go_cdktf//:cdktf",
],
)


@ -0,0 +1,288 @@
package monitoring
import (
"github.com/hashicorp/terraform-cdk-go/cdktf"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/googleprovider"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec"
"github.com/sourcegraph/sourcegraph/lib/errors"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
// Common
// - Container (8)
// - run.googleapis.com/container/billable_instance_time
// - run.googleapis.com/container/cpu/allocation_time
// * run.googleapis.com/container/cpu/utilizations
// - run.googleapis.com/container/memory/allocation_time
// * run.googleapis.com/container/memory/utilizations
// * run.googleapis.com/container/startup_latencies
// - run.googleapis.com/container/network/received_bytes_count
// - run.googleapis.com/container/network/sent_bytes_count
// - Log-based metrics (2)
// - logging.googleapis.com/byte_count
// - logging.googleapis.com/log_entry_count
// Cloud Run Job
// - Job (4)
// - run.googleapis.com/job/completed_execution_count
// * run.googleapis.com/job/completed_task_attempt_count
// - run.googleapis.com/job/running_executions
// - run.googleapis.com/job/running_task_attempts
// Cloud Run Service
// - Container (9)
// - run.googleapis.com/container/completed_probe_attempt_count
// - run.googleapis.com/container/completed_probe_count
// - run.googleapis.com/container/probe_attempt_latencies
// - run.googleapis.com/container/probe_latencies
// * run.googleapis.com/container/instance_count
// - run.googleapis.com/container/max_request_concurrencies
// - run.googleapis.com/container/cpu/usage
// - run.googleapis.com/container/containers
// - run.googleapis.com/container/memory/usage
// - Request_count (1)
// - run.googleapis.com/request_count
// - Request_latencies (1)
// * run.googleapis.com/request_latencies
// - Pending_queue (1)
// - run.googleapis.com/pending_queue/pending_requests
type CrossStackOutput struct{}
type Variables struct {
ProjectID string
Service spec.ServiceSpec
Monitoring spec.MonitoringSpec
MaxCount *int
// If Redis is enabled we configure alerts for it
RedisInstanceID *string
}
const StackName = "monitoring"
func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) {
stack, _, err := stacks.New(StackName, googleprovider.With(vars.ProjectID))
if err != nil {
return nil, err
}
id := resourceid.New("monitoring")
err = commonAlerts(stack, id.Group("common"), vars)
if err != nil {
return nil, errors.Wrap(err, "failed to create common alerts")
}
switch pointers.Deref(vars.Service.Kind, spec.ServiceKindService) {
case spec.ServiceKindService:
if err = serviceAlerts(stack, id.Group("service"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create service alerts")
}
if vars.Monitoring.Alerts.ResponseCodeRatios != nil {
if err = responseCodeMetrics(stack, id.Group("response-code"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create response code metrics")
}
}
case spec.ServiceKindJob:
if err = jobAlerts(stack, id.Group("job"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create job alerts")
}
default:
return nil, errors.New("unknown service kind")
}
if vars.RedisInstanceID != nil {
if err = redisAlerts(stack, id.Group("redis"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create redis alerts")
}
}
return &CrossStackOutput{}, nil
}
func commonAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
// Map the spec.ServiceKind onto the corresponding monitoringalertpolicy.CloudService
serviceKind := monitoringalertpolicy.CloudRunService
kind := pointers.Deref(vars.Service.Kind, spec.ServiceKindService)
if kind == spec.ServiceKindJob {
serviceKind = monitoringalertpolicy.CloudRunJob
}
for _, config := range []monitoringalertpolicy.Config{
{
ID: "cpu",
Name: "High Container CPU Utilization",
Description: pointers.Ptr("High CPU Usage - it may be neccessaru to reduce load or increase CPU allocation"),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"},
Aligner: monitoringalertpolicy.MonitoringAlignPercentile99,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "300s",
Threshold: 0.8,
},
},
{
ID: "memory",
Name: "High Container Memory Utilization",
Description: pointers.Ptr("High Memory Usage - it may be neccessary to reduce load or increase memory allocation"),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/memory/utilizations"},
Aligner: monitoringalertpolicy.MonitoringAlignPercentile99,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "300s",
Threshold: 0.8,
},
},
{
ID: "startup",
Name: "Container Startup Latency",
Description: pointers.Ptr("Instance is taking a long time to start up - something may be blocking startup"),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/startup_latencies"},
Aligner: monitoringalertpolicy.MonitoringAlignPercentile99,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "60s",
Threshold: 10000,
},
},
} {
config.ProjectID = vars.ProjectID
config.ServiceName = vars.Service.ID
config.ServiceKind = serviceKind
if _, err := monitoringalertpolicy.New(stack, id, &config); err != nil {
return err
}
}
return nil
}
func serviceAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
// Only provision if MaxCount is specified and greater than 5
if pointers.Deref(vars.MaxCount, 0) > 5 {
if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{
ID: "instance_count",
Name: "Container Instance Count",
Description: pointers.Ptr("There are a lot of Cloud Run instances running - we may need to increase per-instance requests make make sure we won't hit the configured max instance count"),
ProjectID: vars.ProjectID,
ServiceName: vars.Service.ID,
ServiceKind: monitoringalertpolicy.CloudRunService,
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/instance_count"},
Aligner: monitoringalertpolicy.MonitoringAlignMax,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "60s",
},
}); err != nil {
return err
}
}
return nil
}
func jobAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
// Alert whenever a Cloud Run Job fails
if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{
ID: "job_failures",
Name: "Cloud Run Job Failures",
Description: pointers.Ptr("Failed executions of Cloud Run Job"),
ProjectID: vars.ProjectID,
ServiceName: vars.Service.ID,
ServiceKind: monitoringalertpolicy.CloudRunJob,
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
"metric.labels.result": "failed",
},
GroupByFields: []string{"metric.label.result"},
Aligner: monitoringalertpolicy.MonitoringAlignCount,
Reducer: monitoringalertpolicy.MonitoringReduceSum,
Period: "60s",
Threshold: 0,
},
}); err != nil {
return err
}
return nil
}
func responseCodeMetrics(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
for _, config := range vars.Monitoring.Alerts.ResponseCodeRatios {
if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{
ID: config.ID,
ProjectID: vars.ProjectID,
Name: config.Name,
ServiceName: vars.Service.ID,
ServiceKind: monitoringalertpolicy.CloudRunService,
ResponseCodeMetric: &monitoringalertpolicy.ResponseCodeMetric{
Code: config.Code,
CodeClass: config.CodeClass,
ExcludeCodes: config.ExcludeCodes,
Ratio: config.Ratio,
Duration: config.Duration,
},
}); err != nil {
return err
}
}
return nil
}
func redisAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
for _, config := range []monitoringalertpolicy.Config{
{
ID: "memory",
Name: "Cloud Redis - System Memory Utilization",
Description: pointers.Ptr("This alert fires if the system memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1."),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio"},
Aligner: monitoringalertpolicy.MonitoringAlignMean,
Reducer: monitoringalertpolicy.MonitoringReduceNone,
Period: "300s",
Threshold: 0.8,
},
},
{
ID: "cpu",
Name: "Cloud Redis - System CPU Utilization",
Description: pointers.Ptr("This alert fires if the Redis Engine CPU Utilization goes above the set threshold. The utilization is measured on a scale of 0 to 1."),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"},
GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"},
Aligner: monitoringalertpolicy.MonitoringAlignRate,
Reducer: monitoringalertpolicy.MonitoringReduceSum,
Period: "300s",
Threshold: 0.9,
},
},
{
ID: "failover",
Name: "Cloud Redis - Standard Instance Failover",
Description: pointers.Ptr("This alert fires if failover occurs for a standard tier instance."),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"},
GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"},
Aligner: monitoringalertpolicy.MonitoringAlignStddev,
Reducer: monitoringalertpolicy.MonitoringReduceNone,
Period: "300s",
Threshold: 0,
},
},
} {
config.ProjectID = vars.ProjectID
config.ServiceName = *vars.RedisInstanceID
config.ServiceKind = monitoringalertpolicy.CloudRedis
if _, err := monitoringalertpolicy.New(stack, id, &config); err != nil {
return err
}
}
return nil
}


@ -11,6 +11,7 @@ import (
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/cloudrun"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/iam"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/monitoring"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/terraformversion"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/tfcbackend"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/project"
@ -53,6 +54,7 @@ func (r *Renderer) RenderEnvironment(
svc spec.ServiceSpec,
build spec.BuildSpec,
env spec.EnvironmentSpec,
monitoringSpec spec.MonitoringSpec,
) (*CDKTF, error) {
terraformVersion := terraform.Version
stackSetOptions := []stack.NewStackOption{
@ -107,7 +109,7 @@ func (r *Renderer) RenderEnvironment(
if err != nil {
return nil, errors.Wrap(err, "failed to create IAM stack")
}
if _, err := cloudrun.NewStack(stacks, cloudrun.Variables{
cloudrunOutput, err := cloudrun.NewStack(stacks, cloudrun.Variables{
ProjectID: *projectOutput.Project.ProjectId(),
CloudRunWorkloadServiceAccount: iamOutput.CloudRunWorkloadServiceAccount,
@ -116,10 +118,26 @@ func (r *Renderer) RenderEnvironment(
Environment: env,
StableGenerate: r.StableGenerate,
}); err != nil {
})
if err != nil {
return nil, errors.Wrap(err, "failed to create cloudrun stack")
}
if _, err := monitoring.NewStack(stacks, monitoring.Variables{
ProjectID: *projectOutput.Project.ProjectId(),
Service: svc,
Monitoring: monitoringSpec,
MaxCount: func() *int {
if env.Instances.Scaling != nil {
return env.Instances.Scaling.MaxCount
}
return nil
}(),
RedisInstanceID: cloudrunOutput.RedisInstanceID,
}); err != nil {
return nil, errors.Wrap(err, "failed to create monitoring stack")
}
// Return CDKTF representation for caller to synthesize
return &CDKTF{
app: stack.ExtractApp(stacks),


@ -6,6 +6,7 @@ go_library(
srcs = [
"build.go",
"environment.go",
"monitoring.go",
"service.go",
"spec.go",
],


@ -0,0 +1,95 @@
package spec
import (
"time"
"github.com/grafana/regexp"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
var codeClassPattern = regexp.MustCompile(`^\dxx$`)
type MonitoringSpec struct {
// Alerts is a list of alert configurations for the deployment
Alerts MonitoringAlertsSpec `json:"alerts"`
}
func (s *MonitoringSpec) Validate() []error {
var errs []error
errs = append(errs, s.Alerts.Validate()...)
return errs
}
type MonitoringAlertsSpec struct {
ResponseCodeRatios []ResponseCodeRatioSpec `json:"responseCodeRatios"`
}
type ResponseCodeRatioSpec struct {
ID string `json:"id"`
Name string `json:"name"`
Description *string `json:"description,omitempty"`
Code *int `json:"code,omitempty"`
CodeClass *string `json:"codeClass,omitempty"`
ExcludeCodes []string `json:"excludeCodes,omitempty"`
Duration *string `json:"duration,omitempty"`
Ratio float64 `json:"ratio"`
}
func (s *MonitoringAlertsSpec) Validate() []error {
var errs []error
// Use map to contain seen IDs to ensure uniqueness
ids := make(map[string]struct{})
for _, r := range s.ResponseCodeRatios {
if r.ID == "" {
errs = append(errs, errors.New("responseCodeRatios[].id is required and cannot be empty"))
}
if _, ok := ids[r.ID]; ok {
errs = append(errs, errors.Newf("response code alert IDs must be unique, found duplicate ID: %s", r.ID))
}
ids[r.ID] = struct{}{}
errs = append(errs, r.Validate()...)
}
return errs
}
func (r *ResponseCodeRatioSpec) Validate() []error {
var errs []error
if r.ID == "" {
errs = append(errs, errors.New("responseCodeRatios[].id is required"))
}
if r.Name == "" {
errs = append(errs, errors.New("responseCodeRatios[].name is required"))
}
if r.Ratio < 0 || r.Ratio > 1 {
errs = append(errs, errors.New("responseCodeRatios[].ratio must be between 0 and 1"))
}
if r.CodeClass != nil && r.Code != nil {
errs = append(errs, errors.New("only one of responseCodeRatios[].code or responseCodeRatios[].codeClass should be specified"))
}
if r.Code != nil && *r.Code <= 0 {
errs = append(errs, errors.New("responseCodeRatios[].code must be positive"))
}
if r.CodeClass != nil {
if !codeClassPattern.MatchString(*r.CodeClass) {
errs = append(errs, errors.New("responseCodeRatios[].codeClass must match the format Nxx (e.g. 4xx, 5xx)"))
}
}
if r.Duration != nil {
duration, err := time.ParseDuration(*r.Duration)
if err != nil {
errs = append(errs, errors.Wrap(err, "responseCodeRatios[].duration must be a valid duration string (e.g. 60s)"))
} else if duration%time.Minute != 0 {
errs = append(errs, errors.New("responseCodeRatios[].duration must be a multiple of 60s"))
}
}
return errs
}
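
A sketch of what the new monitoring spec allows, using hypothetical values; the field names follow the JSON tags above (monitoring.alerts.responseCodeRatios in a service spec file).

package example

import (
	"fmt"

	"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec"
	"github.com/sourcegraph/sourcegraph/lib/pointers"
)

// exampleMonitoringSpec builds a hypothetical monitoring spec and runs the
// validation rules defined above.
func exampleMonitoringSpec() {
	m := spec.MonitoringSpec{
		Alerts: spec.MonitoringAlertsSpec{
			ResponseCodeRatios: []spec.ResponseCodeRatioSpec{{
				ID:           "5xx-ratio",
				Name:         "5xx Responses",
				CodeClass:    pointers.Ptr("5xx"), // either code or codeClass, never both
				ExcludeCodes: []string{"501"},
				Duration:     pointers.Ptr("120s"), // must parse and be a multiple of 60s
				Ratio:        0.05,                 // must be within [0, 1]
			}},
		},
	}
	for _, err := range m.Validate() {
		fmt.Println(err)
	}
}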


@ -25,6 +25,7 @@ type Spec struct {
Service ServiceSpec `json:"service"`
Build BuildSpec `json:"build"`
Environments []EnvironmentSpec `json:"environments"`
Monitoring MonitoringSpec `json:"monitoring"`
}
// Open a specification file, validate it, unmarshal the data as a MSP spec,
@ -83,6 +84,7 @@ func (s Spec) Validate() []error {
for _, env := range s.Environments {
errs = append(errs, env.Validate()...)
}
errs = append(errs, s.Monitoring.Validate()...)
return errs
}


@ -274,7 +274,7 @@ Supports completions on services and environments.`,
return errors.Newf("environment %q not found in service spec", targetEnv)
}
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, *env); err != nil {
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, *env, service.Monitoring); err != nil {
return errors.Wrapf(err, "sync env %q", env.ID)
}
} else {
@ -282,7 +282,7 @@ Supports completions on services and environments.`,
return errors.New("second argument environment ID is required without the '-all' flag")
}
for _, env := range service.Environments {
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, env); err != nil {
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, env, service.Monitoring); err != nil {
return errors.Wrapf(err, "sync env %q", env.ID)
}
}
@ -323,7 +323,7 @@ Supports completions on services and environments.`,
}
}
func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, service spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec) error {
func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, service spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec, monitoring spec.MonitoringSpec) error {
if os.TempDir() == "" {
return errors.New("no temp dir available")
}
@ -341,7 +341,7 @@ func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, servi
renderPending := std.Out.Pending(output.Styledf(output.StylePending,
"[%s] Rendering required Terraform Cloud workspaces for environment %q",
service.ID, env.ID))
cdktf, err := renderer.RenderEnvironment(service, build, env)
cdktf, err := renderer.RenderEnvironment(service, build, env, monitoring)
if err != nil {
return err
}
@ -452,7 +452,7 @@ func generateTerraform(serviceID string, opts generateTerraformOptions) error {
}
// Render environment
cdktf, err := renderer.RenderEnvironment(service.Service, service.Build, env)
cdktf, err := renderer.RenderEnvironment(service.Service, service.Build, env, service.Monitoring)
if err != nil {
return err
}


@ -53,3 +53,13 @@ func Float64[T numberType](v T) *float64 {
func Stringf(format string, a ...any) *string {
return Ptr(fmt.Sprintf(format, a...))
}
// Slice takes a slice of values and turns it into a slice of pointers.
func Slice[S []V, V any](s S) []*V {
slice := make([]*V, len(s))
for i, v := range s {
v := v // copy
slice[i] = &v
}
return slice
}
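
As a usage note, the generated CDKTF bindings take pointer slices (for example, GroupByFields above is set via pointers.Ptr(pointers.Slice(...))), which is what motivates this helper. A minimal sketch, with a hypothetical function name:

package example

import "github.com/sourcegraph/sourcegraph/lib/pointers"

// groupByFields converts a plain []string into the *[]*string shape expected
// by the generated CDKTF bindings.
func groupByFields(fields []string) *[]*string {
	return pointers.Ptr(pointers.Slice(fields))
}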


@ -203,3 +203,11 @@ func TestDeref(t *testing.T) {
runDerefTest(t, tc)
}
}
func TestSlice(t *testing.T) {
values := []string{"1", "2", "3"}
pointified := Slice(values)
for i, p := range pointified {
assert.Equal(t, values[i], *p)
}
}