diff --git a/dev/managedservicesplatform/BUILD.bazel b/dev/managedservicesplatform/BUILD.bazel index df527bba223..cded1cfb824 100644 --- a/dev/managedservicesplatform/BUILD.bazel +++ b/dev/managedservicesplatform/BUILD.bazel @@ -12,6 +12,7 @@ go_library( "//dev/managedservicesplatform/internal/stack", "//dev/managedservicesplatform/internal/stack/cloudrun", "//dev/managedservicesplatform/internal/stack/iam", + "//dev/managedservicesplatform/internal/stack/monitoring", "//dev/managedservicesplatform/internal/stack/options/terraformversion", "//dev/managedservicesplatform/internal/stack/options/tfcbackend", "//dev/managedservicesplatform/internal/stack/project", diff --git a/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/BUILD.bazel b/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/BUILD.bazel new file mode 100644 index 00000000000..ed106bd9d40 --- /dev/null +++ b/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/BUILD.bazel @@ -0,0 +1,26 @@ +load("//dev:go_defs.bzl", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "monitoringalertpolicy", + srcs = ["monitoringalertpolicy.go"], + importpath = "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy", + visibility = ["//dev/managedservicesplatform:__subpackages__"], + deps = [ + "//dev/managedservicesplatform/internal/resourceid", + "//lib/errors", + "//lib/pointers", + "@com_github_aws_constructs_go_constructs_v10//:constructs", + "@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringalertpolicy", + ], +) + +go_test( + name = "monitoringalertpolicy_test", + srcs = ["monitoringalertpolicy_test.go"], + embed = [":monitoringalertpolicy"], + deps = [ + "//lib/pointers", + "@com_github_hexops_autogold_v2//:autogold", + ], +) diff --git a/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/monitoringalertpolicy.go 
b/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/monitoringalertpolicy.go new file mode 100644 index 00000000000..9468c6fab73 --- /dev/null +++ b/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/monitoringalertpolicy.go @@ -0,0 +1,314 @@ +package monitoringalertpolicy + +import ( + "fmt" + "sort" + "strconv" + "strings" + + "github.com/sourcegraph/sourcegraph/lib/errors" + + "github.com/aws/constructs-go/constructs/v10" + "github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringalertpolicy" + + "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid" + "github.com/sourcegraph/sourcegraph/lib/pointers" +) + +type Aligner string + +const ( + MonitoringAlignNone Aligner = "ALIGN_NONE" + MonitoringAlignDelta Aligner = "ALIGN_DELTA" + MonitoringAlignRate Aligner = "ALIGN_RATE" + MonitoringAlignInterpolate Aligner = "ALIGN_INTERPOLATE" + MonitoringAlignNextOrder Aligner = "ALIGN_NEXT_ORDER" + MonitoringAlignMin Aligner = "ALIGN_MIN" + MonitoringAlignMax Aligner = "ALIGN_MAX" + MonitoringAlignMean Aligner = "ALIGN_MEAN" + MonitoringAlignCount Aligner = "ALIGN_COUNT" + MonitoringAlignSum Aligner = "ALIGN_SUM" + MonitoringAlignStddev Aligner = "ALIGN_STDDEV" + MonitoringAlignCountTrue Aligner = "ALIGN_COUNT_TRUE" + MonitoringAlignCountFalse Aligner = "ALIGN_COUNT_FALSE" + MonitoringAlignFractionTrue Aligner = "ALIGN_FRACTION_TRUE" + MonitoringAlignPercentile99 Aligner = "ALIGN_PERCENTILE_99" + MonitoringAlignPercentile95 Aligner = "ALIGN_PERCENTILE_95" + MonitoringAlignPercentile50 Aligner = "ALIGN_PERCENTILE_50" + MonitoringAlignPercentile05 Aligner = "ALIGN_PERCENTILE_05" + MonitoringAlignPercentChange Aligner = "ALIGN_PERCENT_CHANGE" +) + +type Reducer string + +const ( + MonitoringReduceNone Reducer = "REDUCE_NONE" + MonitoringReduceMean Reducer = "REDUCE_MEAN" + MonitoringReduceMin Reducer = "REDUCE_MIN" + MonitoringReduceMax Reducer = "REDUCE_MAX" + MonitoringReduceSum Reducer = 
"REDUCE_SUM" + MonitoringReduceStddev Reducer = "REDUCE_STDDEV" + MonitoringReduceCount Reducer = "REDUCE_COUNT" + MonitoringReduceCountTrue Reducer = "REDUCE_COUNT_TRUE" + MonitoringReduceCountFalse Reducer = "REDUCE_COUNT_FALSE" + MonitoringReduceFractionTrue Reducer = "REDUCE_FRACTION_TRUE" + MonitoringReducePercentile99 Reducer = "REDUCE_PERCENTILE_99" + MonitoringReducePercentile95 Reducer = "REDUCE_PERCENTILE_95" + MonitoringReducePercentile50 Reducer = "REDUCE_PERCENTILE_50" + MonitoringReducePercentile05 Reducer = "REDUCE_PERCENTILE_05" +) + +type Comparison string + +const ( + ComparisonGT Comparison = "COMPARISON_GT" + ComparisonLT Comparison = "COMPARISON_LT" +) + +// ThresholdAggregation for alerting when a metric exceeds a defined threshold +// +// Must specify a `metric.type` filter. Additional filters are optional. +// All filters are joined with ` AND ` +// +// GroupByFields is an optional field specifying time series labels to aggregate: +// - For services it defaults to `["resource.label.revision_name"]`; additional fields are appended +// - For jobs there is no default +type ThresholdAggregation struct { + Filters map[string]string + GroupByFields []string + Comparison Comparison + Aligner Aligner + Reducer Reducer + Period string + Threshold float64 + Duration string +} + +// ResponseCodeMetric for alerting when the number of a certain response code exceeds a threshold +// +// Must specify either `Code` (e.g. 404) or `CodeClass` (e.g. 
4xx) +// +// `ExcludeCodes` allows filtering out specific response codes from the `CodeClass` +type ResponseCodeMetric struct { + Code *int + CodeClass *string + ExcludeCodes []string + Ratio float64 + Duration *string +} + +type CloudService int + +const ( + CloudRunService CloudService = iota + CloudRunJob + CloudRedis +) + +// Config for a Monitoring Alert Policy +// Must define either `ThresholdAggregation` or `ResponseCodeMetric` +type Config struct { + // A unique identifier + ID string + Name string + Description *string + ProjectID string + // Name of the service/job/redis to filter the alert on + ServiceName string + // Type of the service/job/redis + ServiceKind CloudService + + ThresholdAggregation *ThresholdAggregation + ResponseCodeMetric *ResponseCodeMetric +} + +type Output struct { +} + +func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) { + if config.ThresholdAggregation == nil && config.ResponseCodeMetric == nil { + return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config") + } + + if config.ThresholdAggregation != nil && config.ResponseCodeMetric != nil { + return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config, not both") + } + + if config.ThresholdAggregation != nil { + if len(config.ThresholdAggregation.Filters) == 0 { + return nil, errors.New("must specify at least one filter for threshold aggregation") + } + + if _, ok := config.ThresholdAggregation.Filters["metric.type"]; !ok { + return nil, errors.New("must specify filter for `metric.type`") + } + return thresholdAggregation(scope, id, config) + } + return responseCodeMetric(scope, id, config) +} + +// threshholdAggregation defines a monitoring alert policy based on a single metric threshold +func thresholdAggregation(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) { + // Set some defaults + switch config.ServiceKind { + case CloudRunService: + 
config.ThresholdAggregation.GroupByFields = append([]string{"resource.label.revision_name"}, config.ThresholdAggregation.GroupByFields...) + case CloudRunJob: + // No defaults + case CloudRedis: + // No defaults + default: + return nil, errors.Newf("invalid service kind %q", config.ServiceKind) + } + + if config.ThresholdAggregation.Comparison == "" { + config.ThresholdAggregation.Comparison = ComparisonGT + } + + if config.ThresholdAggregation.Duration == "" { + config.ThresholdAggregation.Duration = "0s" + } + + _ = monitoringalertpolicy.NewMonitoringAlertPolicy(scope, + id.TerraformID(config.ID), &monitoringalertpolicy.MonitoringAlertPolicyConfig{ + Project: pointers.Ptr(config.ProjectID), + DisplayName: pointers.Ptr(config.Name), + Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{ + Content: config.Description, + MimeType: pointers.Ptr("text/markdown"), + }, + Combiner: pointers.Ptr("OR"), + Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{ + { + DisplayName: pointers.Ptr(config.Name), + ConditionThreshold: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThreshold{ + Aggregations: []monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdAggregations{ + { + AlignmentPeriod: pointers.Ptr(config.ThresholdAggregation.Period), + PerSeriesAligner: pointers.Ptr(string(config.ThresholdAggregation.Aligner)), + CrossSeriesReducer: pointers.Ptr(string(config.ThresholdAggregation.Reducer)), + GroupByFields: pointers.Ptr(pointers.Slice(config.ThresholdAggregation.GroupByFields)), + }, + }, + Comparison: pointers.Ptr(string(config.ThresholdAggregation.Comparison)), + Duration: pointers.Ptr(config.ThresholdAggregation.Duration), + Filter: pointers.Ptr(buildFilter(config)), + ThresholdValue: pointers.Float64(config.ThresholdAggregation.Threshold), + Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{ + Count: pointers.Float64(1), + }, + }, + }, + }, + AlertStrategy: 
&monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{ + AutoClose: pointers.Ptr("604800s"), + }, + }) + return &Output{}, nil +} + +// buildFilter creates the Filter string for a ThresholdAggregation monitoring alert policy +func buildFilter(config *Config) string { + filters := make([]string, 0) + for key, val := range config.ThresholdAggregation.Filters { + filters = append(filters, fmt.Sprintf(`%s = "%s"`, key, val)) + } + + // Sort to ensure stable output for testing, because + // config.ThresholdAggregation.Filters is a map. + sort.Strings(filters) + + switch config.ServiceKind { + case CloudRunService: + filters = append(filters, + `resource.type = "cloud_run_revision"`, + fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ServiceName), + ) + case CloudRunJob: + filters = append(filters, + `resource.type = "cloud_run_job"`, + fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ServiceName), + ) + case CloudRedis: + filters = append(filters, + `resource.type = "redis_instance"`, + fmt.Sprintf(`resource.labels.redis_instance_id = "%s"`, config.ServiceName), + ) + } + + return strings.Join(filters, " AND ") +} + +// responseCodeMetric defines the MonitoringAlertPolicy for response code metrics +// Supports a single Code e.g. 404 or an entire Code Class e.g. 
4xx +// Optionally when using a Code Class, codes to exclude can be defined +func responseCodeMetric(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) { + query := responseCodeBuilder(config) + + if config.ResponseCodeMetric.Duration == nil { + config.ResponseCodeMetric.Duration = pointers.Ptr("60s") + } + + _ = monitoringalertpolicy.NewMonitoringAlertPolicy(scope, + id.TerraformID(config.ID), &monitoringalertpolicy.MonitoringAlertPolicyConfig{ + Project: pointers.Ptr(config.ProjectID), + DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)), + Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{ + Content: config.Description, + MimeType: pointers.Ptr("text/markdown"), + }, + Combiner: pointers.Ptr("OR"), + Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{ + { + DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)), + ConditionMonitoringQueryLanguage: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionMonitoringQueryLanguage{ + Query: pointers.Ptr(query), + Duration: config.ResponseCodeMetric.Duration, + Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionMonitoringQueryLanguageTrigger{ + Count: pointers.Float64(1), + }, + }, + }, + }, + AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{ + AutoClose: pointers.Ptr("604800s"), + }, + }) + return &Output{}, nil +} + +// responseCodeBuilder builds the MQL for a response code metric alert +func responseCodeBuilder(config *Config) string { + var builder strings.Builder + + builder.WriteString(`fetch cloud_run_revision +| metric 'run.googleapis.com/request_count' +| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)] +| every 15s +| { +`) + if config.ResponseCodeMetric.CodeClass != nil { + builder.WriteString(" group_by [metric.response_code, metric.response_code_class],\n") + } else { + builder.WriteString(" group_by 
[metric.response_code],\n") + } + builder.WriteString(" [response_code_count_aggregate: aggregate(value_request_count_aggregate)]\n") + if config.ResponseCodeMetric.Code != nil { + builder.WriteString(fmt.Sprintf(" | filter (metric.response_code = '%d')\n", *config.ResponseCodeMetric.Code)) + } else { + builder.WriteString(fmt.Sprintf(" | filter (metric.response_code_class = '%s')\n", *config.ResponseCodeMetric.CodeClass)) + } + if config.ResponseCodeMetric.ExcludeCodes != nil && len(config.ResponseCodeMetric.ExcludeCodes) > 0 { + for _, code := range config.ResponseCodeMetric.ExcludeCodes { + builder.WriteString(fmt.Sprintf(" | filter (metric.response_code != '%s')\n", code)) + } + } + builder.WriteString(`; group_by [], + [value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)] +} +| join +| value [response_code_ratio: val(0) / val(1)] +`) + builder.WriteString(fmt.Sprintf("| condition gt(val(), %s)\n", strconv.FormatFloat(config.ResponseCodeMetric.Ratio, 'f', -1, 64))) + return builder.String() +} diff --git a/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/monitoringalertpolicy_test.go b/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/monitoringalertpolicy_test.go new file mode 100644 index 00000000000..82c6d825798 --- /dev/null +++ b/dev/managedservicesplatform/internal/resource/monitoringalertpolicy/monitoringalertpolicy_test.go @@ -0,0 +1,136 @@ +package monitoringalertpolicy + +import ( + "testing" + + "github.com/hexops/autogold/v2" + + "github.com/sourcegraph/sourcegraph/lib/pointers" +) + +func TestBuildFilter(t *testing.T) { + for _, tc := range []struct { + name string + config Config + want autogold.Value + }{ + { + name: "Service Metric", + config: Config{ + ServiceName: "my-service-name", + ServiceKind: CloudRunService, + ThresholdAggregation: &ThresholdAggregation{ + Filters: map[string]string{ + "metric.type": "run.googleapis.com/container/startup_latencies", + }, + }, + }, + want: 
autogold.Expect(`metric.type = "run.googleapis.com/container/startup_latencies" AND resource.type = "cloud_run_revision" AND resource.labels.service_name = "my-service-name"`), + }, + { + name: "Job Metric", + config: Config{ + ServiceName: "my-job-name", + ServiceKind: CloudRunJob, + ThresholdAggregation: &ThresholdAggregation{ + Filters: map[string]string{ + "metric.type": "run.googleapis.com/job/completed_task_attempt_count", + "metric.labels.result": "failed", + }, + }, + }, + want: autogold.Expect(`metric.labels.result = "failed" AND metric.type = "run.googleapis.com/job/completed_task_attempt_count" AND resource.type = "cloud_run_job" AND resource.labels.job_name = "my-job-name"`), + }, + } { + t.Run(tc.name, func(t *testing.T) { + got := buildFilter(&tc.config) + tc.want.Equal(t, got) + }) + } +} + +func TestResponseCodeBuilder(t *testing.T) { + for _, tc := range []struct { + name string + ResponseCodeMetric + want autogold.Value + }{ + { + name: "Single Response Code", + ResponseCodeMetric: ResponseCodeMetric{ + Code: pointers.Ptr(404), + Ratio: 0.1, + }, + want: autogold.Expect(`fetch cloud_run_revision +| metric 'run.googleapis.com/request_count' +| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)] +| every 15s +| { + group_by [metric.response_code], + [response_code_count_aggregate: aggregate(value_request_count_aggregate)] + | filter (metric.response_code = '404') +; group_by [], + [value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)] +} +| join +| value [response_code_ratio: val(0) / val(1)] +| condition gt(val(), 0.1) +`), + }, + { + name: "Response Code Class", + ResponseCodeMetric: ResponseCodeMetric{ + CodeClass: pointers.Ptr("4xx"), + Ratio: 0.4, + }, + want: autogold.Expect(`fetch cloud_run_revision +| metric 'run.googleapis.com/request_count' +| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)] +| every 15s +| { + group_by [metric.response_code, 
metric.response_code_class], + [response_code_count_aggregate: aggregate(value_request_count_aggregate)] + | filter (metric.response_code_class = '4xx') +; group_by [], + [value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)] +} +| join +| value [response_code_ratio: val(0) / val(1)] +| condition gt(val(), 0.4) +`), + }, + { + name: "Response Code Class + Exclude", + ResponseCodeMetric: ResponseCodeMetric{ + CodeClass: pointers.Ptr("4xx"), + ExcludeCodes: []string{"404", "429"}, + Ratio: 0.8, + }, + want: autogold.Expect(`fetch cloud_run_revision +| metric 'run.googleapis.com/request_count' +| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)] +| every 15s +| { + group_by [metric.response_code, metric.response_code_class], + [response_code_count_aggregate: aggregate(value_request_count_aggregate)] + | filter (metric.response_code_class = '4xx') + | filter (metric.response_code != '404') + | filter (metric.response_code != '429') +; group_by [], + [value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)] +} +| join +| value [response_code_ratio: val(0) / val(1)] +| condition gt(val(), 0.8) +`), + }, + } { + t.Run(tc.name, func(t *testing.T) { + got := responseCodeBuilder(&Config{ + ServiceName: "test-service", + ResponseCodeMetric: &tc.ResponseCodeMetric, + }) + tc.want.Equal(t, got) + }) + } +} diff --git a/dev/managedservicesplatform/internal/resource/redis/redis.go b/dev/managedservicesplatform/internal/resource/redis/redis.go index 753b6e70179..76e46a2f563 100644 --- a/dev/managedservicesplatform/internal/resource/redis/redis.go +++ b/dev/managedservicesplatform/internal/resource/redis/redis.go @@ -15,6 +15,7 @@ import ( ) type Output struct { + ID *string Endpoint string Certificate gsmsecret.Output } @@ -64,5 +65,6 @@ func New(scope constructs.Construct, id resourceid.ID, config Config) (*Output, Endpoint: fmt.Sprintf("rediss://:%s@%s:%v", *redis.AuthString(), *redis.Host(), 
*redis.Port()), Certificate: *redisCACert, + ID: redis.Id(), }, nil } diff --git a/dev/managedservicesplatform/internal/stack/cloudrun/cloudrun.go b/dev/managedservicesplatform/internal/stack/cloudrun/cloudrun.go index 5fa4eb8e60e..59b52eb44a6 100644 --- a/dev/managedservicesplatform/internal/stack/cloudrun/cloudrun.go +++ b/dev/managedservicesplatform/internal/stack/cloudrun/cloudrun.go @@ -36,7 +36,9 @@ import ( "github.com/sourcegraph/sourcegraph/lib/pointers" ) -type CrossStackOutput struct{} +type CrossStackOutput struct { + RedisInstanceID *string +} type Variables struct { ProjectID string @@ -143,6 +145,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu // redisInstance is only created and non-nil if Redis is configured for the // environment. + // If Redis is configured, populate cross-stack output with Redis ID. + var redisInstanceID *string if vars.Environment.Resources != nil && vars.Environment.Resources.Redis != nil { redisInstance, err := redis.New(stack, resourceid.New("redis"), @@ -156,6 +160,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu return nil, errors.Wrap(err, "failed to render Redis instance") } + redisInstanceID = redisInstance.ID + // Configure endpoint string. 
cloudRunBuilder.AddEnv("REDIS_ENDPOINT", redisInstance.Endpoint) @@ -265,7 +271,9 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu "Cloud Run resource location") locals.Add("image_tag", imageTag.StringValue, "Resolved tag of service image to deploy") - return &CrossStackOutput{}, nil + return &CrossStackOutput{ + RedisInstanceID: redisInstanceID, + }, nil } type envVariablesData struct { diff --git a/dev/managedservicesplatform/internal/stack/monitoring/BUILD.bazel b/dev/managedservicesplatform/internal/stack/monitoring/BUILD.bazel new file mode 100644 index 00000000000..596cc0fc270 --- /dev/null +++ b/dev/managedservicesplatform/internal/stack/monitoring/BUILD.bazel @@ -0,0 +1,18 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "monitoring", + srcs = ["monitoring.go"], + importpath = "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/monitoring", + visibility = ["//dev/managedservicesplatform:__subpackages__"], + deps = [ + "//dev/managedservicesplatform/internal/resource/monitoringalertpolicy", + "//dev/managedservicesplatform/internal/resourceid", + "//dev/managedservicesplatform/internal/stack", + "//dev/managedservicesplatform/internal/stack/options/googleprovider", + "//dev/managedservicesplatform/spec", + "//lib/errors", + "//lib/pointers", + "@com_github_hashicorp_terraform_cdk_go_cdktf//:cdktf", + ], +) diff --git a/dev/managedservicesplatform/internal/stack/monitoring/monitoring.go b/dev/managedservicesplatform/internal/stack/monitoring/monitoring.go new file mode 100644 index 00000000000..279a5ec5e25 --- /dev/null +++ b/dev/managedservicesplatform/internal/stack/monitoring/monitoring.go @@ -0,0 +1,288 @@ +package monitoring + +import ( + "github.com/hashicorp/terraform-cdk-go/cdktf" + + "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy" + 
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid" + "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack" + "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/googleprovider" + "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec" + "github.com/sourcegraph/sourcegraph/lib/errors" + "github.com/sourcegraph/sourcegraph/lib/pointers" +) + +// Common +// - Container (8) +// - run.googleapis.com/container/billable_instance_time +// - run.googleapis.com/container/cpu/allocation_time +// * run.googleapis.com/container/cpu/utilizations +// - run.googleapis.com/container/memory/allocation_time +// * run.googleapis.com/container/memory/utilizations +// * run.googleapis.com/container/startup_latencies +// - run.googleapis.com/container/network/received_bytes_count +// - run.googleapis.com/container/network/sent_bytes_count +// - Log-based metrics (2) +// - logging.googleapis.com/byte_count +// - logging.googleapis.com/log_entry_count +// Cloud Run Job +// - Job (4) +// - run.googleapis.com/job/completed_execution_count +// * run.googleapis.com/job/completed_task_attempt_count +// - run.googleapis.com/job/running_executions +// - run.googleapis.com/job/running_task_attempts +// Cloud Run Service +// - Container (9) +// - run.googleapis.com/container/completed_probe_attempt_count +// - run.googleapis.com/container/completed_probe_count +// - run.googleapis.com/container/probe_attempt_latencies +// - run.googleapis.com/container/probe_latencies +// * run.googleapis.com/container/instance_count +// - run.googleapis.com/container/max_request_concurrencies +// - run.googleapis.com/container/cpu/usage +// - run.googleapis.com/container/containers +// - run.googleapis.com/container/memory/usage +// - Request_count (1) +// - run.googleapis.com/request_count +// - Request_latencies (1) +// * run.googleapis.com/request_latencies +// - Pending_queue (1) +// - 
run.googleapis.com/pending_queue/pending_requests + +type CrossStackOutput struct{} + +type Variables struct { + ProjectID string + Service spec.ServiceSpec + Monitoring spec.MonitoringSpec + MaxCount *int + + // If Redis is enabled we configure alerts for it + RedisInstanceID *string +} + +const StackName = "monitoring" + +func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) { + stack, _, err := stacks.New(StackName, googleprovider.With(vars.ProjectID)) + if err != nil { + return nil, err + } + + id := resourceid.New("monitoring") + err = commonAlerts(stack, id.Group("common"), vars) + if err != nil { + return nil, errors.Wrap(err, "failed to create common alerts") + } + + switch pointers.Deref(vars.Service.Kind, spec.ServiceKindService) { + case spec.ServiceKindService: + if err = serviceAlerts(stack, id.Group("service"), vars); err != nil { + return nil, errors.Wrap(err, "failed to create service alerts") + } + + if vars.Monitoring.Alerts.ResponseCodeRatios != nil { + if err = responseCodeMetrics(stack, id.Group("response-code"), vars); err != nil { + return nil, errors.Wrap(err, "failed to create response code metrics") + } + } + case spec.ServiceKindJob: + if err = jobAlerts(stack, id.Group("job"), vars); err != nil { + return nil, errors.Wrap(err, "failed to create job alerts") + } + default: + return nil, errors.New("unknown service kind") + } + + if vars.RedisInstanceID != nil { + if err = redisAlerts(stack, id.Group("redis"), vars); err != nil { + return nil, errors.Wrap(err, "failed to create redis alerts") + } + } + + return &CrossStackOutput{}, nil +} + +func commonAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error { + // Convert a spec.ServiceKind into a monitoringalertpolicy.ServiceKind + serviceKind := monitoringalertpolicy.CloudRunService + kind := pointers.Deref(vars.Service.Kind, "service") + if kind == spec.ServiceKindJob { + serviceKind = monitoringalertpolicy.CloudRunJob + } + + for _, config := 
range []monitoringalertpolicy.Config{ + { + ID: "cpu", + Name: "High Container CPU Utilization", + Description: pointers.Ptr("High CPU Usage - it may be necessary to reduce load or increase CPU allocation"), + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"}, + Aligner: monitoringalertpolicy.MonitoringAlignPercentile99, + Reducer: monitoringalertpolicy.MonitoringReduceMax, + Period: "300s", + Threshold: 0.8, + }, + }, + { + ID: "memory", + Name: "High Container Memory Utilization", + Description: pointers.Ptr("High Memory Usage - it may be necessary to reduce load or increase memory allocation"), + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "run.googleapis.com/container/memory/utilizations"}, + Aligner: monitoringalertpolicy.MonitoringAlignPercentile99, + Reducer: monitoringalertpolicy.MonitoringReduceMax, + Period: "300s", + Threshold: 0.8, + }, + }, + { + ID: "startup", + Name: "Container Startup Latency", + Description: pointers.Ptr("Instance is taking a long time to start up - something may be blocking startup"), + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "run.googleapis.com/container/startup_latencies"}, + Aligner: monitoringalertpolicy.MonitoringAlignPercentile99, + Reducer: monitoringalertpolicy.MonitoringReduceMax, + Period: "60s", + Threshold: 10000, + }, + }, + } { + + config.ProjectID = vars.ProjectID + config.ServiceName = vars.Service.ID + config.ServiceKind = serviceKind + if _, err := monitoringalertpolicy.New(stack, id, &config); err != nil { + return err + } + } + + return nil +} + +func serviceAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error { + // Only provision if MaxCount is specified above 5 + if pointers.Deref(vars.MaxCount, 0) > 5 { + if _, err := 
monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{ + ID: "instance_count", + Name: "Container Instance Count", + Description: pointers.Ptr("There are a lot of Cloud Run instances running - we may need to increase per-instance requests to make sure we won't hit the configured max instance count"), + ProjectID: vars.ProjectID, + ServiceName: vars.Service.ID, + ServiceKind: monitoringalertpolicy.CloudRunService, + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "run.googleapis.com/container/instance_count"}, + Aligner: monitoringalertpolicy.MonitoringAlignMax, + Reducer: monitoringalertpolicy.MonitoringReduceMax, + Period: "60s", + }, + }); err != nil { + return err + } + } + return nil +} + +func jobAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error { + // Alert whenever a Cloud Run Job fails + if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{ + ID: "job_failures", + Name: "Cloud Run Job Failures", + Description: pointers.Ptr("Failed executions of Cloud Run Job"), + ProjectID: vars.ProjectID, + ServiceName: vars.Service.ID, + ServiceKind: monitoringalertpolicy.CloudRunJob, + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{ + "metric.type": "run.googleapis.com/job/completed_task_attempt_count", + "metric.labels.result": "failed", + }, + GroupByFields: []string{"metric.label.result"}, + Aligner: monitoringalertpolicy.MonitoringAlignCount, + Reducer: monitoringalertpolicy.MonitoringReduceSum, + Period: "60s", + Threshold: 0, + }, + }); err != nil { + return err + } + + return nil +} + +func responseCodeMetrics(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error { + for _, config := range vars.Monitoring.Alerts.ResponseCodeRatios { + + if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{ + ID: config.ID, + ProjectID: vars.ProjectID, + Name: 
config.Name, + ServiceName: vars.Service.ID, + ServiceKind: monitoringalertpolicy.CloudRunService, + ResponseCodeMetric: &monitoringalertpolicy.ResponseCodeMetric{ + Code: config.Code, + CodeClass: config.CodeClass, + ExcludeCodes: config.ExcludeCodes, + Ratio: config.Ratio, + Duration: config.Duration, + }, + }); err != nil { + return err + } + } + + return nil +} + +func redisAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error { + for _, config := range []monitoringalertpolicy.Config{ + { + ID: "memory", + Name: "Cloud Redis - System Memory Utilization", + Description: pointers.Ptr("This alert fires if the system memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1."), + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio"}, + Aligner: monitoringalertpolicy.MonitoringAlignMean, + Reducer: monitoringalertpolicy.MonitoringReduceNone, + Period: "300s", + Threshold: 0.8, + }, + }, + { + ID: "cpu", + Name: "Cloud Redis - System CPU Utilization", + Description: pointers.Ptr("This alert fires if the Redis Engine CPU Utilization goes above the set threshold. 
The utilization is measured on a scale of 0 to 1."), + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"}, + GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"}, + Aligner: monitoringalertpolicy.MonitoringAlignRate, + Reducer: monitoringalertpolicy.MonitoringReduceSum, + Period: "300s", + Threshold: 0.9, + }, + }, + { + ID: "failover", + Name: "Cloud Redis - Standard Instance Failover", + Description: pointers.Ptr("This alert fires if failover occurs for a standard tier instance."), + ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{ + Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"}, + GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"}, + Aligner: monitoringalertpolicy.MonitoringAlignStddev, + Reducer: monitoringalertpolicy.MonitoringReduceNone, + Period: "300s", + Threshold: 0, + }, + }, + } { + config.ProjectID = vars.ProjectID + config.ServiceName = *vars.RedisInstanceID + config.ServiceKind = monitoringalertpolicy.CloudRedis + if _, err := monitoringalertpolicy.New(stack, id, &config); err != nil { + return err + } + } + + return nil +} diff --git a/dev/managedservicesplatform/managedservicesplatform.go b/dev/managedservicesplatform/managedservicesplatform.go index 5428fb679b8..40eec3ec703 100644 --- a/dev/managedservicesplatform/managedservicesplatform.go +++ b/dev/managedservicesplatform/managedservicesplatform.go @@ -11,6 +11,7 @@ import ( "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack" "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/cloudrun" "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/iam" + "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/monitoring" 
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/terraformversion" "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/tfcbackend" "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/project" @@ -53,6 +54,7 @@ func (r *Renderer) RenderEnvironment( svc spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec, + monitoringSpec spec.MonitoringSpec, ) (*CDKTF, error) { terraformVersion := terraform.Version stackSetOptions := []stack.NewStackOption{ @@ -107,7 +109,7 @@ func (r *Renderer) RenderEnvironment( if err != nil { return nil, errors.Wrap(err, "failed to create IAM stack") } - if _, err := cloudrun.NewStack(stacks, cloudrun.Variables{ + cloudrunOutput, err := cloudrun.NewStack(stacks, cloudrun.Variables{ ProjectID: *projectOutput.Project.ProjectId(), CloudRunWorkloadServiceAccount: iamOutput.CloudRunWorkloadServiceAccount, @@ -116,10 +118,26 @@ func (r *Renderer) RenderEnvironment( Environment: env, StableGenerate: r.StableGenerate, - }); err != nil { + }) + if err != nil { return nil, errors.Wrap(err, "failed to create cloudrun stack") } + if _, err := monitoring.NewStack(stacks, monitoring.Variables{ + ProjectID: *projectOutput.Project.ProjectId(), + Service: svc, + Monitoring: monitoringSpec, + MaxCount: func() *int { + if env.Instances.Scaling != nil { + return env.Instances.Scaling.MaxCount + } + return nil + }(), + RedisInstanceID: cloudrunOutput.RedisInstanceID, + }); err != nil { + return nil, errors.Wrap(err, "failed to create monitoring stack") + } + // Return CDKTF representation for caller to synthesize return &CDKTF{ app: stack.ExtractApp(stacks), diff --git a/dev/managedservicesplatform/spec/BUILD.bazel b/dev/managedservicesplatform/spec/BUILD.bazel index d7aea1694d4..f59e6e32122 100644 --- a/dev/managedservicesplatform/spec/BUILD.bazel +++ b/dev/managedservicesplatform/spec/BUILD.bazel @@ -6,6 +6,7 @@ go_library( srcs = [ "build.go", 
"environment.go", + "monitoring.go", "service.go", "spec.go", ], diff --git a/dev/managedservicesplatform/spec/monitoring.go b/dev/managedservicesplatform/spec/monitoring.go new file mode 100644 index 00000000000..c31b9b7acc2 --- /dev/null +++ b/dev/managedservicesplatform/spec/monitoring.go @@ -0,0 +1,95 @@ +package spec + +import ( + "time" + + "github.com/grafana/regexp" + + "github.com/sourcegraph/sourcegraph/lib/errors" +) + +var codeClassPattern = regexp.MustCompile(`\dx+`) + +type MonitoringSpec struct { + // Alerts is a list of alert configurations for the deployment + Alerts MonitoringAlertsSpec `json:"alerts"` +} + +func (s *MonitoringSpec) Validate() []error { + var errs []error + errs = append(errs, s.Alerts.Validate()...) + return errs +} + +type MonitoringAlertsSpec struct { + ResponseCodeRatios []ResponseCodeRatioSpec `json:"responseCodeRatios"` +} + +type ResponseCodeRatioSpec struct { + ID string `json:"id"` + Name string `json:"name"` + Description *string `json:"description,omitempty"` + Code *int `json:"code,omitempty"` + CodeClass *string `json:"codeClass,omitempty"` + ExcludeCodes []string `json:"excludeCodes,omitempty"` + Duration *string `json:"duration,omitempty"` + Ratio float64 `json:"ratio"` +} + +func (s *MonitoringAlertsSpec) Validate() []error { + var errs []error + // Use map to contain seen IDs to ensure uniqueness + ids := make(map[string]struct{}) + for _, r := range s.ResponseCodeRatios { + if r.ID == "" { + errs = append(errs, errors.New("responseCodeRatios[].id is required and cannot be empty")) + } + if _, ok := ids[r.ID]; ok { + errs = append(errs, errors.Newf("response code alert IDs must be unique, found duplicate ID: %s", r.ID)) + } + ids[r.ID] = struct{}{} + errs = append(errs, r.Validate()...) 
+ } + return errs +} + +func (r *ResponseCodeRatioSpec) Validate() []error { + var errs []error + + if r.ID == "" { + errs = append(errs, errors.New("responseCodeRatios[].id is required")) + } + + if r.Name == "" { + errs = append(errs, errors.New("responseCodeRatios[].name is required")) + } + + if r.Ratio < 0 || r.Ratio > 1 { + errs = append(errs, errors.New("responseCodeRatios[].ratio must be between 0 and 1")) + } + + if r.CodeClass != nil && r.Code != nil { + errs = append(errs, errors.New("only one of responseCodeRatios[].code or responseCodeRatios[].codeClass should be specified")) + } + + if r.Code != nil && *r.Code <= 0 { + errs = append(errs, errors.New("responseCodeRatios[].code must be positive")) + } + + if r.CodeClass != nil { + if !codeClassPattern.MatchString(*r.CodeClass) { + errs = append(errs, errors.New("responseCodeRatios[].codeClass must match the format Nxx (e.g. 4xx, 5xx)")) + } + } + + if r.Duration != nil { + duration, err := time.ParseDuration(*r.Duration) + if err != nil { + errs = append(errs, errors.Wrap(err, "responseCodeRatios[].duration must be in the format of XXs")) + } else if duration%time.Minute != 0 { + errs = append(errs, errors.New("responseCodeRatios[].duration must be a multiple of 60s")) + } + } + + return errs +} diff --git a/dev/managedservicesplatform/spec/spec.go b/dev/managedservicesplatform/spec/spec.go index a42e76e003e..b503728091d 100644 --- a/dev/managedservicesplatform/spec/spec.go +++ b/dev/managedservicesplatform/spec/spec.go @@ -25,6 +25,7 @@ type Spec struct { Service ServiceSpec `json:"service"` Build BuildSpec `json:"build"` Environments []EnvironmentSpec `json:"environments"` + Monitoring MonitoringSpec `json:"monitoring"` } // Open a specification file, validate it, unmarshal the data as a MSP spec, @@ -83,6 +84,7 @@ func (s Spec) Validate() []error { for _, env := range s.Environments { errs = append(errs, env.Validate()...) } + errs = append(errs, s.Monitoring.Validate()...) 
return errs } diff --git a/dev/sg/msp/sg_msp.go b/dev/sg/msp/sg_msp.go index e25dc0b9349..c47236a5efc 100644 --- a/dev/sg/msp/sg_msp.go +++ b/dev/sg/msp/sg_msp.go @@ -274,7 +274,7 @@ Supports completions on services and environments.`, return errors.Newf("environment %q not found in service spec", targetEnv) } - if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, *env); err != nil { + if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, *env, service.Monitoring); err != nil { return errors.Wrapf(err, "sync env %q", env.ID) } } else { @@ -282,7 +282,7 @@ Supports completions on services and environments.`, return errors.New("second argument environment ID is required without the '-all' flag") } for _, env := range service.Environments { - if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, env); err != nil { + if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, env, service.Monitoring); err != nil { return errors.Wrapf(err, "sync env %q", env.ID) } } @@ -323,7 +323,7 @@ Supports completions on services and environments.`, } } -func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, service spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec) error { +func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, service spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec, monitoring spec.MonitoringSpec) error { if os.TempDir() == "" { return errors.New("no temp dir available") } @@ -341,7 +341,7 @@ func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, servi renderPending := std.Out.Pending(output.Styledf(output.StylePending, "[%s] Rendering required Terraform Cloud workspaces for environment %q", service.ID, env.ID)) - cdktf, err := renderer.RenderEnvironment(service, build, env) + cdktf, err := renderer.RenderEnvironment(service, build, env, monitoring) if err != nil { 
return err } @@ -452,7 +452,7 @@ func generateTerraform(serviceID string, opts generateTerraformOptions) error { } // Render environment - cdktf, err := renderer.RenderEnvironment(service.Service, service.Build, env) + cdktf, err := renderer.RenderEnvironment(service.Service, service.Build, env, service.Monitoring) if err != nil { return err } diff --git a/lib/pointers/ptr.go b/lib/pointers/ptr.go index 642cf7eb382..d104434b515 100644 --- a/lib/pointers/ptr.go +++ b/lib/pointers/ptr.go @@ -53,3 +53,13 @@ func Float64[T numberType](v T) *float64 { func Stringf(format string, a ...any) *string { return Ptr(fmt.Sprintf(format, a...)) } + +// Slice takes a slice of values and turns it into a slice of pointers. +func Slice[S []V, V any](s S) []*V { + slice := make([]*V, len(s)) + for i, v := range s { + v := v // copy + slice[i] = &v + } + return slice +} diff --git a/lib/pointers/ptr_test.go b/lib/pointers/ptr_test.go index 41a02ba5f75..c535be6eb60 100644 --- a/lib/pointers/ptr_test.go +++ b/lib/pointers/ptr_test.go @@ -203,3 +203,11 @@ func TestDeref(t *testing.T) { runDerefTest(t, tc) } } + +func TestSlice(t *testing.T) { + values := []string{"1", "2", "3"} + pointified := Slice(values) + for i, p := range pointified { + assert.Equal(t, values[i], *p) + } +}