msp: add monitoring stack (#58816)

Adds GCP Monitoring Alert Policies for Cloud Run Services, Cloud Run Jobs, and, if enabled, Cloud Redis.
James Cotter 2023-12-13 19:40:57 +00:00 committed by GitHub
parent b344c534e9
commit 8c9e114549
15 changed files with 936 additions and 9 deletions


@ -12,6 +12,7 @@ go_library(
"//dev/managedservicesplatform/internal/stack",
"//dev/managedservicesplatform/internal/stack/cloudrun",
"//dev/managedservicesplatform/internal/stack/iam",
"//dev/managedservicesplatform/internal/stack/monitoring",
"//dev/managedservicesplatform/internal/stack/options/terraformversion",
"//dev/managedservicesplatform/internal/stack/options/tfcbackend",
"//dev/managedservicesplatform/internal/stack/project",


@ -0,0 +1,26 @@
load("//dev:go_defs.bzl", "go_test")
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "monitoringalertpolicy",
srcs = ["monitoringalertpolicy.go"],
importpath = "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy",
visibility = ["//dev/managedservicesplatform:__subpackages__"],
deps = [
"//dev/managedservicesplatform/internal/resourceid",
"//lib/errors",
"//lib/pointers",
"@com_github_aws_constructs_go_constructs_v10//:constructs",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringalertpolicy",
],
)
go_test(
name = "monitoringalertpolicy_test",
srcs = ["monitoringalertpolicy_test.go"],
embed = [":monitoringalertpolicy"],
deps = [
"//lib/pointers",
"@com_github_hexops_autogold_v2//:autogold",
],
)


@ -0,0 +1,314 @@
package monitoringalertpolicy
import (
"fmt"
"sort"
"strconv"
"strings"
"github.com/sourcegraph/sourcegraph/lib/errors"
"github.com/aws/constructs-go/constructs/v10"
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringalertpolicy"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
type Aligner string
const (
MonitoringAlignNone Aligner = "ALIGN_NONE"
MonitoringAlignDelta Aligner = "ALIGN_DELTA"
MonitoringAlignRate Aligner = "ALIGN_RATE"
MonitoringAlignInterpolate Aligner = "ALIGN_INTERPOLATE"
MonitoringAlignNextOrder Aligner = "ALIGN_NEXT_ORDER"
MonitoringAlignMin Aligner = "ALIGN_MIN"
MonitoringAlignMax Aligner = "ALIGN_MAX"
MonitoringAlignMean Aligner = "ALIGN_MEAN"
MonitoringAlignCount Aligner = "ALIGN_COUNT"
MonitoringAlignSum Aligner = "ALIGN_SUM"
MonitoringAlignStddev Aligner = "ALIGN_STDDEV"
MonitoringAlignCountTrue Aligner = "ALIGN_COUNT_TRUE"
MonitoringAlignCountFalse Aligner = "ALIGN_COUNT_FALSE"
MonitoringAlignFractionTrue Aligner = "ALIGN_FRACTION_TRUE"
MonitoringAlignPercentile99 Aligner = "ALIGN_PERCENTILE_99"
MonitoringAlignPercentile95 Aligner = "ALIGN_PERCENTILE_95"
MonitoringAlignPercentile50 Aligner = "ALIGN_PERCENTILE_50"
MonitoringAlignPercentile05 Aligner = "ALIGN_PERCENTILE_05"
MonitoringAlignPercentChange Aligner = "ALIGN_PERCENT_CHANGE"
)
type Reducer string
const (
MonitoringReduceNone Reducer = "REDUCE_NONE"
MonitoringReduceMean Reducer = "REDUCE_MEAN"
MonitoringReduceMin Reducer = "REDUCE_MIN"
MonitoringReduceMax Reducer = "REDUCE_MAX"
MonitoringReduceSum Reducer = "REDUCE_SUM"
MonitoringReduceStddev Reducer = "REDUCE_STDDEV"
MonitoringReduceCount Reducer = "REDUCE_COUNT"
MonitoringReduceCountTrue Reducer = "REDUCE_COUNT_TRUE"
MonitoringReduceCountFalse Reducer = "REDUCE_COUNT_FALSE"
MonitoringReduceFractionTrue Reducer = "REDUCE_FRACTION_TRUE"
MonitoringReducePercentile99 Reducer = "REDUCE_PERCENTILE_99"
MonitoringReducePercentile95 Reducer = "REDUCE_PERCENTILE_95"
MonitoringReducePercentile50 Reducer = "REDUCE_PERCENTILE_50"
MonitoringReducePercentile05 Reducer = "REDUCE_PERCENTILE_05"
)
type Comparison string
const (
ComparisonGT Comparison = "COMPARISON_GT"
ComparisonLT Comparison = "COMPARISON_LT"
)
// ThresholdAggregation for alerting when a metric exceeds a defined threshold
//
// Must specify a `metric.type` filter. Additional filters are optional.
// All filters are joined with ` AND `
//
// GroupByFields is an optional field specifying time series labels to aggregate:
// - For services it defaults to `["resource.label.revision_name"]`; additional fields are appended
// - For jobs there is no default
type ThresholdAggregation struct {
Filters map[string]string
GroupByFields []string
Comparison Comparison
Aligner Aligner
Reducer Reducer
Period string
Threshold float64
Duration string
}
// ResponseCodeMetric for alerting when the ratio of responses with a certain code (or code class) exceeds a threshold
//
// Must specify either `Code` (e.g. 404) or `CodeClass` (e.g. 4xx)
//
// `ExcludeCodes` allows filtering out specific response codes from the `CodeClass`
type ResponseCodeMetric struct {
Code *int
CodeClass *string
ExcludeCodes []string
Ratio float64
Duration *string
}
type CloudService int
const (
CloudRunService CloudService = iota
CloudRunJob
CloudRedis
)
// Config for a Monitoring Alert Policy
// Must define either `ThresholdAggregation` or `ResponseCodeMetric`
type Config struct {
// A unique identifier
ID string
Name string
Description *string
ProjectID string
// Name of the service/job/redis to filter the alert on
ServiceName string
// Type of the service/job/redis
ServiceKind CloudService
ThresholdAggregation *ThresholdAggregation
ResponseCodeMetric *ResponseCodeMetric
}
type Output struct {
}
func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
if config.ThresholdAggregation == nil && config.ResponseCodeMetric == nil {
return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config")
}
if config.ThresholdAggregation != nil && config.ResponseCodeMetric != nil {
return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config, not both")
}
if config.ThresholdAggregation != nil {
if len(config.ThresholdAggregation.Filters) == 0 {
return nil, errors.New("must specify at least one filter for threshold aggregation")
}
if _, ok := config.ThresholdAggregation.Filters["metric.type"]; !ok {
return nil, errors.New("must specify filter for `metric.type`")
}
return thresholdAggregation(scope, id, config)
}
return responseCodeMetric(scope, id, config)
}
// thresholdAggregation defines a monitoring alert policy based on a single metric threshold
func thresholdAggregation(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
// Set some defaults
switch config.ServiceKind {
case CloudRunService:
config.ThresholdAggregation.GroupByFields = append([]string{"resource.label.revision_name"}, config.ThresholdAggregation.GroupByFields...)
case CloudRunJob:
// No defaults
case CloudRedis:
// No defaults
default:
return nil, errors.Newf("invalid service kind %q", config.ServiceKind)
}
if config.ThresholdAggregation.Comparison == "" {
config.ThresholdAggregation.Comparison = ComparisonGT
}
if config.ThresholdAggregation.Duration == "" {
config.ThresholdAggregation.Duration = "0s"
}
_ = monitoringalertpolicy.NewMonitoringAlertPolicy(scope,
id.TerraformID(config.ID), &monitoringalertpolicy.MonitoringAlertPolicyConfig{
Project: pointers.Ptr(config.ProjectID),
DisplayName: pointers.Ptr(config.Name),
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
Content: config.Description,
MimeType: pointers.Ptr("text/markdown"),
},
Combiner: pointers.Ptr("OR"),
Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{
{
DisplayName: pointers.Ptr(config.Name),
ConditionThreshold: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThreshold{
Aggregations: []monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdAggregations{
{
AlignmentPeriod: pointers.Ptr(config.ThresholdAggregation.Period),
PerSeriesAligner: pointers.Ptr(string(config.ThresholdAggregation.Aligner)),
CrossSeriesReducer: pointers.Ptr(string(config.ThresholdAggregation.Reducer)),
GroupByFields: pointers.Ptr(pointers.Slice(config.ThresholdAggregation.GroupByFields)),
},
},
Comparison: pointers.Ptr(string(config.ThresholdAggregation.Comparison)),
Duration: pointers.Ptr(config.ThresholdAggregation.Duration),
Filter: pointers.Ptr(buildFilter(config)),
ThresholdValue: pointers.Float64(config.ThresholdAggregation.Threshold),
Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
Count: pointers.Float64(1),
},
},
},
},
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
AutoClose: pointers.Ptr("604800s"),
},
})
return &Output{}, nil
}
// buildFilter creates the Filter string for a ThresholdAggregation monitoring alert policy
func buildFilter(config *Config) string {
filters := make([]string, 0)
for key, val := range config.ThresholdAggregation.Filters {
filters = append(filters, fmt.Sprintf(`%s = "%s"`, key, val))
}
// Sort to ensure stable output for testing, because
// config.ThresholdAggregation.Filters is a map.
sort.Strings(filters)
switch config.ServiceKind {
case CloudRunService:
filters = append(filters,
`resource.type = "cloud_run_revision"`,
fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ServiceName),
)
case CloudRunJob:
filters = append(filters,
`resource.type = "cloud_run_job"`,
fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ServiceName),
)
case CloudRedis:
filters = append(filters,
`resource.type = "redis_instance"`,
fmt.Sprintf(`resource.labels.redis_instance_id = "%s"`, config.ServiceName),
)
}
return strings.Join(filters, " AND ")
}
// responseCodeMetric defines the MonitoringAlertPolicy for response code metrics.
// Supports a single Code (e.g. 404) or an entire CodeClass (e.g. 4xx).
// Optionally, when using a CodeClass, specific codes can be excluded.
func responseCodeMetric(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
query := responseCodeBuilder(config)
if config.ResponseCodeMetric.Duration == nil {
config.ResponseCodeMetric.Duration = pointers.Ptr("60s")
}
_ = monitoringalertpolicy.NewMonitoringAlertPolicy(scope,
id.TerraformID(config.ID), &monitoringalertpolicy.MonitoringAlertPolicyConfig{
Project: pointers.Ptr(config.ProjectID),
DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)),
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
Content: config.Description,
MimeType: pointers.Ptr("text/markdown"),
},
Combiner: pointers.Ptr("OR"),
Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{
{
DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)),
ConditionMonitoringQueryLanguage: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionMonitoringQueryLanguage{
Query: pointers.Ptr(query),
Duration: config.ResponseCodeMetric.Duration,
Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionMonitoringQueryLanguageTrigger{
Count: pointers.Float64(1),
},
},
},
},
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
AutoClose: pointers.Ptr("604800s"),
},
})
return &Output{}, nil
}
// responseCodeBuilder builds the MQL for a response code metric alert
func responseCodeBuilder(config *Config) string {
var builder strings.Builder
builder.WriteString(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
`)
if config.ResponseCodeMetric.CodeClass != nil {
builder.WriteString(" group_by [metric.response_code, metric.response_code_class],\n")
} else {
builder.WriteString(" group_by [metric.response_code],\n")
}
builder.WriteString(" [response_code_count_aggregate: aggregate(value_request_count_aggregate)]\n")
if config.ResponseCodeMetric.Code != nil {
builder.WriteString(fmt.Sprintf(" | filter (metric.response_code = '%d')\n", *config.ResponseCodeMetric.Code))
} else {
builder.WriteString(fmt.Sprintf(" | filter (metric.response_code_class = '%s')\n", *config.ResponseCodeMetric.CodeClass))
}
if len(config.ResponseCodeMetric.ExcludeCodes) > 0 {
for _, code := range config.ResponseCodeMetric.ExcludeCodes {
builder.WriteString(fmt.Sprintf(" | filter (metric.response_code != '%s')\n", code))
}
}
builder.WriteString(`; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
`)
builder.WriteString(fmt.Sprintf("| condition gt(val(), %s)\n", strconv.FormatFloat(config.ResponseCodeMetric.Ratio, 'f', -1, 64)))
return builder.String()
}
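
For illustration, a minimal sketch of how a caller inside the MSP module might declare a threshold alert with this resource. The helper name, project ID, and service name below are hypothetical; the alert values mirror the CPU alert the monitoring stack configures later in this commit.

package example

import (
	"github.com/hashicorp/terraform-cdk-go/cdktf"

	"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy"
	"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
)

// addCPUAlert is a hypothetical helper: it declares a policy that fires when
// the p99 of container CPU utilization over a 5-minute alignment window
// exceeds 0.8.
func addCPUAlert(stack cdktf.TerraformStack, projectID, serviceName string) error {
	_, err := monitoringalertpolicy.New(stack, resourceid.New("monitoring"), &monitoringalertpolicy.Config{
		ID:          "cpu",
		Name:        "High Container CPU Utilization",
		ProjectID:   projectID,
		ServiceName: serviceName,
		ServiceKind: monitoringalertpolicy.CloudRunService,
		ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
			// A metric.type filter is required; all filters are joined with AND.
			Filters:   map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"},
			Aligner:   monitoringalertpolicy.MonitoringAlignPercentile99,
			Reducer:   monitoringalertpolicy.MonitoringReduceMax,
			Period:    "300s",
			Threshold: 0.8,
			// Comparison defaults to ComparisonGT and Duration to "0s" when unset.
		},
	})
	return err
}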


@ -0,0 +1,136 @@
package monitoringalertpolicy
import (
"testing"
"github.com/hexops/autogold/v2"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
func TestBuildFilter(t *testing.T) {
for _, tc := range []struct {
name string
config Config
want autogold.Value
}{
{
name: "Service Metric",
config: Config{
ServiceName: "my-service-name",
ServiceKind: CloudRunService,
ThresholdAggregation: &ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/container/startup_latencies",
},
},
},
want: autogold.Expect(`metric.type = "run.googleapis.com/container/startup_latencies" AND resource.type = "cloud_run_revision" AND resource.labels.service_name = "my-service-name"`),
},
{
name: "Job Metric",
config: Config{
ServiceName: "my-job-name",
ServiceKind: CloudRunJob,
ThresholdAggregation: &ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
"metric.labels.result": "failed",
},
},
},
want: autogold.Expect(`metric.labels.result = "failed" AND metric.type = "run.googleapis.com/job/completed_task_attempt_count" AND resource.type = "cloud_run_job" AND resource.labels.job_name = "my-job-name"`),
},
} {
t.Run(tc.name, func(t *testing.T) {
got := buildFilter(&tc.config)
tc.want.Equal(t, got)
})
}
}
func TestResponseCodeBuilder(t *testing.T) {
for _, tc := range []struct {
name string
ResponseCodeMetric
want autogold.Value
}{
{
name: "Single Response Code",
ResponseCodeMetric: ResponseCodeMetric{
Code: pointers.Ptr(404),
Ratio: 0.1,
},
want: autogold.Expect(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
group_by [metric.response_code],
[response_code_count_aggregate: aggregate(value_request_count_aggregate)]
| filter (metric.response_code = '404')
; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
| condition gt(val(), 0.1)
`),
},
{
name: "Response Code Class",
ResponseCodeMetric: ResponseCodeMetric{
CodeClass: pointers.Ptr("4xx"),
Ratio: 0.4,
},
want: autogold.Expect(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
group_by [metric.response_code, metric.response_code_class],
[response_code_count_aggregate: aggregate(value_request_count_aggregate)]
| filter (metric.response_code_class = '4xx')
; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
| condition gt(val(), 0.4)
`),
},
{
name: "Response Code Class + Exclude",
ResponseCodeMetric: ResponseCodeMetric{
CodeClass: pointers.Ptr("4xx"),
ExcludeCodes: []string{"404", "429"},
Ratio: 0.8,
},
want: autogold.Expect(`fetch cloud_run_revision
| metric 'run.googleapis.com/request_count'
| group_by 15s, [value_request_count_aggregate: aggregate(value.request_count)]
| every 15s
| {
group_by [metric.response_code, metric.response_code_class],
[response_code_count_aggregate: aggregate(value_request_count_aggregate)]
| filter (metric.response_code_class = '4xx')
| filter (metric.response_code != '404')
| filter (metric.response_code != '429')
; group_by [],
[value_request_count_aggregate_aggregate: aggregate(value_request_count_aggregate)]
}
| join
| value [response_code_ratio: val(0) / val(1)]
| condition gt(val(), 0.8)
`),
},
} {
t.Run(tc.name, func(t *testing.T) {
got := responseCodeBuilder(&Config{
ServiceName: "test-service",
ResponseCodeMetric: &tc.ResponseCodeMetric,
})
tc.want.Equal(t, got)
})
}
}
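
For reference, a sketch of a Redis case analogous to the entries above; it could sit alongside TestBuildFilter in this file. The instance name is hypothetical, and the expected string follows buildFilter's CloudRedis branch.

func TestBuildFilterRedis(t *testing.T) {
	got := buildFilter(&Config{
		ServiceName: "my-redis-instance",
		ServiceKind: CloudRedis,
		ThresholdAggregation: &ThresholdAggregation{
			Filters: map[string]string{
				"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio",
			},
		},
	})
	want := `metric.type = "redis.googleapis.com/stats/memory/system_memory_usage_ratio" AND resource.type = "redis_instance" AND resource.labels.redis_instance_id = "my-redis-instance"`
	if got != want {
		t.Errorf("unexpected filter:\ngot  %s\nwant %s", got, want)
	}
}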


@ -15,6 +15,7 @@ import (
)
type Output struct {
ID *string
Endpoint string
Certificate gsmsecret.Output
}
@ -64,5 +65,6 @@ func New(scope constructs.Construct, id resourceid.ID, config Config) (*Output,
Endpoint: fmt.Sprintf("rediss://:%s@%s:%v",
*redis.AuthString(), *redis.Host(), *redis.Port()),
Certificate: *redisCACert,
ID: redis.Id(),
}, nil
}


@ -36,7 +36,9 @@ import (
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
type CrossStackOutput struct{}
type CrossStackOutput struct {
RedisInstanceID *string
}
type Variables struct {
ProjectID string
@ -143,6 +145,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
// redisInstance is only created and non-nil if Redis is configured for the
// environment.
// If Redis is configured, populate cross-stack output with Redis ID.
var redisInstanceID *string
if vars.Environment.Resources != nil && vars.Environment.Resources.Redis != nil {
redisInstance, err := redis.New(stack,
resourceid.New("redis"),
@ -156,6 +160,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
return nil, errors.Wrap(err, "failed to render Redis instance")
}
redisInstanceID = redisInstance.ID
// Configure endpoint string.
cloudRunBuilder.AddEnv("REDIS_ENDPOINT", redisInstance.Endpoint)
@ -265,7 +271,9 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
"Cloud Run resource location")
locals.Add("image_tag", imageTag.StringValue,
"Resolved tag of service image to deploy")
return &CrossStackOutput{}, nil
return &CrossStackOutput{
RedisInstanceID: redisInstanceID,
}, nil
}
type envVariablesData struct {


@ -0,0 +1,18 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "monitoring",
srcs = ["monitoring.go"],
importpath = "github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/monitoring",
visibility = ["//dev/managedservicesplatform:__subpackages__"],
deps = [
"//dev/managedservicesplatform/internal/resource/monitoringalertpolicy",
"//dev/managedservicesplatform/internal/resourceid",
"//dev/managedservicesplatform/internal/stack",
"//dev/managedservicesplatform/internal/stack/options/googleprovider",
"//dev/managedservicesplatform/spec",
"//lib/errors",
"//lib/pointers",
"@com_github_hashicorp_terraform_cdk_go_cdktf//:cdktf",
],
)


@ -0,0 +1,288 @@
package monitoring
import (
"github.com/hashicorp/terraform-cdk-go/cdktf"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/monitoringalertpolicy"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/googleprovider"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec"
"github.com/sourcegraph/sourcegraph/lib/errors"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
// Common
// - Container (8)
// - run.googleapis.com/container/billable_instance_time
// - run.googleapis.com/container/cpu/allocation_time
// * run.googleapis.com/container/cpu/utilizations
// - run.googleapis.com/container/memory/allocation_time
// * run.googleapis.com/container/memory/utilizations
// * run.googleapis.com/container/startup_latencies
// - run.googleapis.com/container/network/received_bytes_count
// - run.googleapis.com/container/network/sent_bytes_count
// - Log-based metrics (2)
// - logging.googleapis.com/byte_count
// - logging.googleapis.com/log_entry_count
// Cloud Run Job
// - Job (4)
// - run.googleapis.com/job/completed_execution_count
// * run.googleapis.com/job/completed_task_attempt_count
// - run.googleapis.com/job/running_executions
// - run.googleapis.com/job/running_task_attempts
// Cloud Run Service
// - Container (9)
// - run.googleapis.com/container/completed_probe_attempt_count
// - run.googleapis.com/container/completed_probe_count
// - run.googleapis.com/container/probe_attempt_latencies
// - run.googleapis.com/container/probe_latencies
// * run.googleapis.com/container/instance_count
// - run.googleapis.com/container/max_request_concurrencies
// - run.googleapis.com/container/cpu/usage
// - run.googleapis.com/container/containers
// - run.googleapis.com/container/memory/usage
// - Request_count (1)
// - run.googleapis.com/request_count
// - Request_latencies (1)
// * run.googleapis.com/request_latencies
// - Pending_queue (1)
// - run.googleapis.com/pending_queue/pending_requests
type CrossStackOutput struct{}
type Variables struct {
ProjectID string
Service spec.ServiceSpec
Monitoring spec.MonitoringSpec
MaxCount *int
// If Redis is enabled we configure alerts for it
RedisInstanceID *string
}
const StackName = "monitoring"
func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) {
stack, _, err := stacks.New(StackName, googleprovider.With(vars.ProjectID))
if err != nil {
return nil, err
}
id := resourceid.New("monitoring")
err = commonAlerts(stack, id.Group("common"), vars)
if err != nil {
return nil, errors.Wrap(err, "failed to create common alerts")
}
switch pointers.Deref(vars.Service.Kind, spec.ServiceKindService) {
case spec.ServiceKindService:
if err = serviceAlerts(stack, id.Group("service"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create service alerts")
}
if vars.Monitoring.Alerts.ResponseCodeRatios != nil {
if err = responseCodeMetrics(stack, id.Group("response-code"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create response code metrics")
}
}
case spec.ServiceKindJob:
if err = jobAlerts(stack, id.Group("job"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create job alerts")
}
default:
return nil, errors.New("unknown service kind")
}
if vars.RedisInstanceID != nil {
if err = redisAlerts(stack, id.Group("redis"), vars); err != nil {
return nil, errors.Wrap(err, "failed to create redis alerts")
}
}
return &CrossStackOutput{}, nil
}
func commonAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
// Map the spec.ServiceKind onto the corresponding monitoringalertpolicy.CloudService
serviceKind := monitoringalertpolicy.CloudRunService
kind := pointers.Deref(vars.Service.Kind, spec.ServiceKindService)
if kind == spec.ServiceKindJob {
serviceKind = monitoringalertpolicy.CloudRunJob
}
for _, config := range []monitoringalertpolicy.Config{
{
ID: "cpu",
Name: "High Container CPU Utilization",
Description: pointers.Ptr("High CPU Usage - it may be neccessaru to reduce load or increase CPU allocation"),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"},
Aligner: monitoringalertpolicy.MonitoringAlignPercentile99,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "300s",
Threshold: 0.8,
},
},
{
ID: "memory",
Name: "High Container Memory Utilization",
Description: pointers.Ptr("High Memory Usage - it may be neccessary to reduce load or increase memory allocation"),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/memory/utilizations"},
Aligner: monitoringalertpolicy.MonitoringAlignPercentile99,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "300s",
Threshold: 0.8,
},
},
{
ID: "startup",
Name: "Container Startup Latency",
Description: pointers.Ptr("Instance is taking a long time to start up - something may be blocking startup"),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/startup_latencies"},
Aligner: monitoringalertpolicy.MonitoringAlignPercentile99,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "60s",
Threshold: 10000,
},
},
} {
config.ProjectID = vars.ProjectID
config.ServiceName = vars.Service.ID
config.ServiceKind = serviceKind
if _, err := monitoringalertpolicy.New(stack, id, &config); err != nil {
return err
}
}
return nil
}
func serviceAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
// Only provision if MaxCount is specified and greater than 5
if pointers.Deref(vars.MaxCount, 0) > 5 {
if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{
ID: "instance_count",
Name: "Container Instance Count",
Description: pointers.Ptr("There are a lot of Cloud Run instances running - we may need to increase per-instance requests make make sure we won't hit the configured max instance count"),
ProjectID: vars.ProjectID,
ServiceName: vars.Service.ID,
ServiceKind: monitoringalertpolicy.CloudRunService,
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/instance_count"},
Aligner: monitoringalertpolicy.MonitoringAlignMax,
Reducer: monitoringalertpolicy.MonitoringReduceMax,
Period: "60s",
},
}); err != nil {
return err
}
}
return nil
}
func jobAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
// Alert whenever a Cloud Run Job fails
if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{
ID: "job_failures",
Name: "Cloud Run Job Failures",
Description: pointers.Ptr("Failed executions of Cloud Run Job"),
ProjectID: vars.ProjectID,
ServiceName: vars.Service.ID,
ServiceKind: monitoringalertpolicy.CloudRunJob,
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
"metric.labels.result": "failed",
},
GroupByFields: []string{"metric.label.result"},
Aligner: monitoringalertpolicy.MonitoringAlignCount,
Reducer: monitoringalertpolicy.MonitoringReduceSum,
Period: "60s",
Threshold: 0,
},
}); err != nil {
return err
}
return nil
}
func responseCodeMetrics(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
for _, config := range vars.Monitoring.Alerts.ResponseCodeRatios {
if _, err := monitoringalertpolicy.New(stack, id, &monitoringalertpolicy.Config{
ID: config.ID,
ProjectID: vars.ProjectID,
Name: config.Name,
ServiceName: vars.Service.ID,
ServiceKind: monitoringalertpolicy.CloudRunService,
ResponseCodeMetric: &monitoringalertpolicy.ResponseCodeMetric{
Code: config.Code,
CodeClass: config.CodeClass,
ExcludeCodes: config.ExcludeCodes,
Ratio: config.Ratio,
Duration: config.Duration,
},
}); err != nil {
return err
}
}
return nil
}
func redisAlerts(stack cdktf.TerraformStack, id resourceid.ID, vars Variables) error {
for _, config := range []monitoringalertpolicy.Config{
{
ID: "memory",
Name: "Cloud Redis - System Memory Utilization",
Description: pointers.Ptr("This alert fires if the system memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1."),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio"},
Aligner: monitoringalertpolicy.MonitoringAlignMean,
Reducer: monitoringalertpolicy.MonitoringReduceNone,
Period: "300s",
Threshold: 0.8,
},
},
{
ID: "cpu",
Name: "Cloud Redis - System CPU Utilization",
Description: pointers.Ptr("This alert fires if the Redis Engine CPU Utilization goes above the set threshold. The utilization is measured on a scale of 0 to 1."),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"},
GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"},
Aligner: monitoringalertpolicy.MonitoringAlignRate,
Reducer: monitoringalertpolicy.MonitoringReduceSum,
Period: "300s",
Threshold: 0.9,
},
},
{
ID: "failover",
Name: "Cloud Redis - Standard Instance Failover",
Description: pointers.Ptr("This alert fires if failover occurs for a standard tier instance."),
ThresholdAggregation: &monitoringalertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"},
GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"},
Aligner: monitoringalertpolicy.MonitoringAlignStddev,
Reducer: monitoringalertpolicy.MonitoringReduceNone,
Period: "300s",
Threshold: 0,
},
},
} {
config.ProjectID = vars.ProjectID
config.ServiceName = *vars.RedisInstanceID
config.ServiceKind = monitoringalertpolicy.CloudRedis
if _, err := monitoringalertpolicy.New(stack, id, &config); err != nil {
return err
}
}
return nil
}


@ -11,6 +11,7 @@ import (
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/cloudrun"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/iam"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/monitoring"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/terraformversion"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/tfcbackend"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/project"
@ -53,6 +54,7 @@ func (r *Renderer) RenderEnvironment(
svc spec.ServiceSpec,
build spec.BuildSpec,
env spec.EnvironmentSpec,
monitoringSpec spec.MonitoringSpec,
) (*CDKTF, error) {
terraformVersion := terraform.Version
stackSetOptions := []stack.NewStackOption{
@ -107,7 +109,7 @@ func (r *Renderer) RenderEnvironment(
if err != nil {
return nil, errors.Wrap(err, "failed to create IAM stack")
}
if _, err := cloudrun.NewStack(stacks, cloudrun.Variables{
cloudrunOutput, err := cloudrun.NewStack(stacks, cloudrun.Variables{
ProjectID: *projectOutput.Project.ProjectId(),
CloudRunWorkloadServiceAccount: iamOutput.CloudRunWorkloadServiceAccount,
@ -116,10 +118,26 @@ func (r *Renderer) RenderEnvironment(
Environment: env,
StableGenerate: r.StableGenerate,
}); err != nil {
})
if err != nil {
return nil, errors.Wrap(err, "failed to create cloudrun stack")
}
if _, err := monitoring.NewStack(stacks, monitoring.Variables{
ProjectID: *projectOutput.Project.ProjectId(),
Service: svc,
Monitoring: monitoringSpec,
MaxCount: func() *int {
if env.Instances.Scaling != nil {
return env.Instances.Scaling.MaxCount
}
return nil
}(),
RedisInstanceID: cloudrunOutput.RedisInstanceID,
}); err != nil {
return nil, errors.Wrap(err, "failed to create monitoring stack")
}
// Return CDKTF representation for caller to synthesize
return &CDKTF{
app: stack.ExtractApp(stacks),


@ -6,6 +6,7 @@ go_library(
srcs = [
"build.go",
"environment.go",
"monitoring.go",
"service.go",
"spec.go",
],


@ -0,0 +1,95 @@
package spec
import (
"time"
"github.com/grafana/regexp"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
var codeClassPattern = regexp.MustCompile(`^\dxx$`)
type MonitoringSpec struct {
// Alerts is a list of alert configurations for the deployment
Alerts MonitoringAlertsSpec `json:"alerts"`
}
func (s *MonitoringSpec) Validate() []error {
var errs []error
errs = append(errs, s.Alerts.Validate()...)
return errs
}
type MonitoringAlertsSpec struct {
ResponseCodeRatios []ResponseCodeRatioSpec `json:"responseCodeRatios"`
}
type ResponseCodeRatioSpec struct {
ID string `json:"id"`
Name string `json:"name"`
Description *string `json:"description,omitempty"`
Code *int `json:"code,omitempty"`
CodeClass *string `json:"codeClass,omitempty"`
ExcludeCodes []string `json:"excludeCodes,omitempty"`
Duration *string `json:"duration,omitempty"`
Ratio float64 `json:"ratio"`
}
func (s *MonitoringAlertsSpec) Validate() []error {
var errs []error
// Use map to contain seen IDs to ensure uniqueness
ids := make(map[string]struct{})
for _, r := range s.ResponseCodeRatios {
if r.ID == "" {
errs = append(errs, errors.New("responseCodeRatios[].id is required and cannot be empty"))
}
if _, ok := ids[r.ID]; ok {
errs = append(errs, errors.Newf("response code alert IDs must be unique, found duplicate ID: %s", r.ID))
}
ids[r.ID] = struct{}{}
errs = append(errs, r.Validate()...)
}
return errs
}
func (r *ResponseCodeRatioSpec) Validate() []error {
var errs []error
if r.ID == "" {
errs = append(errs, errors.New("responseCodeRatios[].id is required"))
}
if r.Name == "" {
errs = append(errs, errors.New("responseCodeRatios[].name is required"))
}
if r.Ratio < 0 || r.Ratio > 1 {
errs = append(errs, errors.New("responseCodeRatios[].ratio must be between 0 and 1"))
}
if r.CodeClass != nil && r.Code != nil {
errs = append(errs, errors.New("only one of responseCodeRatios[].code or responseCodeRatios[].codeClass should be specified"))
}
if r.Code != nil && *r.Code <= 0 {
errs = append(errs, errors.New("responseCodeRatios[].code must be positive"))
}
if r.CodeClass != nil {
if !codeClassPattern.MatchString(*r.CodeClass) {
errs = append(errs, errors.New("responseCodeRatios[].codeClass must match the format Nxx (e.g. 4xx, 5xx)"))
}
}
if r.Duration != nil {
duration, err := time.ParseDuration(*r.Duration)
if err != nil {
errs = append(errs, errors.Wrap(err, "responseCodeRatios[].duration must be a valid duration string (e.g. 60s)"))
} else if duration%time.Minute != 0 {
errs = append(errs, errors.New("responseCodeRatios[].duration must be a multiple of 60s"))
}
}
return errs
}
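
A sketch of what the new monitoring spec allows, using hypothetical values; the field names follow the JSON tags above (monitoring.alerts.responseCodeRatios in a service spec file).

package example

import (
	"fmt"

	"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec"
	"github.com/sourcegraph/sourcegraph/lib/pointers"
)

// exampleMonitoringSpec builds a hypothetical monitoring spec and runs the
// validation rules defined above.
func exampleMonitoringSpec() {
	m := spec.MonitoringSpec{
		Alerts: spec.MonitoringAlertsSpec{
			ResponseCodeRatios: []spec.ResponseCodeRatioSpec{{
				ID:           "5xx-ratio",
				Name:         "5xx Responses",
				CodeClass:    pointers.Ptr("5xx"), // either code or codeClass, never both
				ExcludeCodes: []string{"501"},
				Duration:     pointers.Ptr("120s"), // must parse and be a multiple of 60s
				Ratio:        0.05,                 // must be within [0, 1]
			}},
		},
	}
	for _, err := range m.Validate() {
		fmt.Println(err)
	}
}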


@ -25,6 +25,7 @@ type Spec struct {
Service ServiceSpec `json:"service"`
Build BuildSpec `json:"build"`
Environments []EnvironmentSpec `json:"environments"`
Monitoring MonitoringSpec `json:"monitoring"`
}
// Open a specification file, validate it, unmarshal the data as a MSP spec,
@ -83,6 +84,7 @@ func (s Spec) Validate() []error {
for _, env := range s.Environments {
errs = append(errs, env.Validate()...)
}
errs = append(errs, s.Monitoring.Validate()...)
return errs
}


@ -274,7 +274,7 @@ Supports completions on services and environments.`,
return errors.Newf("environment %q not found in service spec", targetEnv)
}
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, *env); err != nil {
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, *env, service.Monitoring); err != nil {
return errors.Wrapf(err, "sync env %q", env.ID)
}
} else {
@ -282,7 +282,7 @@ Supports completions on services and environments.`,
return errors.New("second argument environment ID is required without the '-all' flag")
}
for _, env := range service.Environments {
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, env); err != nil {
if err := syncEnvironmentWorkspaces(c, tfcClient, service.Service, service.Build, env, service.Monitoring); err != nil {
return errors.Wrapf(err, "sync env %q", env.ID)
}
}
@ -323,7 +323,7 @@ Supports completions on services and environments.`,
}
}
func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, service spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec) error {
func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, service spec.ServiceSpec, build spec.BuildSpec, env spec.EnvironmentSpec, monitoring spec.MonitoringSpec) error {
if os.TempDir() == "" {
return errors.New("no temp dir available")
}
@ -341,7 +341,7 @@ func syncEnvironmentWorkspaces(c *cli.Context, tfc *terraformcloud.Client, servi
renderPending := std.Out.Pending(output.Styledf(output.StylePending,
"[%s] Rendering required Terraform Cloud workspaces for environment %q",
service.ID, env.ID))
cdktf, err := renderer.RenderEnvironment(service, build, env)
cdktf, err := renderer.RenderEnvironment(service, build, env, monitoring)
if err != nil {
return err
}
@ -452,7 +452,7 @@ func generateTerraform(serviceID string, opts generateTerraformOptions) error {
}
// Render environment
cdktf, err := renderer.RenderEnvironment(service.Service, service.Build, env)
cdktf, err := renderer.RenderEnvironment(service.Service, service.Build, env, service.Monitoring)
if err != nil {
return err
}


@ -53,3 +53,13 @@ func Float64[T numberType](v T) *float64 {
func Stringf(format string, a ...any) *string {
return Ptr(fmt.Sprintf(format, a...))
}
// Slice takes a slice of values and turns it into a slice of pointers.
func Slice[S []V, V any](s S) []*V {
slice := make([]*V, len(s))
for i, v := range s {
v := v // copy
slice[i] = &v
}
return slice
}
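
As a usage note, the generated CDKTF bindings take pointer slices (for example, GroupByFields above is set via pointers.Ptr(pointers.Slice(...))), which is what motivates this helper. A minimal sketch, with a hypothetical function name:

package example

import "github.com/sourcegraph/sourcegraph/lib/pointers"

// groupByFields converts a plain []string into the *[]*string shape expected
// by the generated CDKTF bindings.
func groupByFields(fields []string) *[]*string {
	return pointers.Ptr(pointers.Slice(fields))
}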


@ -203,3 +203,11 @@ func TestDeref(t *testing.T) {
runDerefTest(t, tc)
}
}
func TestSlice(t *testing.T) {
values := []string{"1", "2", "3"}
pointified := Slice(values)
for i, p := range pointified {
assert.Equal(t, values[i], *p)
}
}