msp/monitoring: add external uptime check and alert, rework health probes configuration (#59461)

The new configuration is mostly based on Cody Gateway's setup: if a service has an external domain, we create an uptime check and an alert on failures.
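
For example, a sketch of an environment spec that now gets an external uptime check (the `domain` shape is from the spec in this change; the values are illustrative):

```yaml
environments:
  - id: prod
    # An external domain means MSP now provisions an uptime check
    # against it, plus an alert policy on check failures.
    domain:
      type: cloudflare # one of 'none' (default) or 'cloudflare'
```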

The uptime check uses MSP standards where possible: which endpoint it probes depends on whether service health probes are configured. Since we now use the health probes configuration in several places, I've also reworked it to make it easier to reason about:

1. `healthProbes` now configures all healthchecks; `startupProbe` and `livenessProbe` have been removed.
2. `disabled` is now `healthzProbes` - this configures whether the MSP-standard healthchecks should be used instead of the default `/` ones.
3. By default, if no config is provided, MSP healthchecks are not used.
4. If config is provided, MSP healthchecks are used unless explicitly disabled with `healthzProbes: false` (see the sketch below).
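
A minimal sketch of the reworked configuration (field names and defaults per the spec changes in this diff; the values themselves are illustrative):

```yaml
healthProbes:
  # Whether to use the MSP-standard '/-/healthz' probes with a generated
  # diagnostics secret. Defaults to true when this block is present; set
  # to false to fall back to a generous default probe on '/'.
  healthzProbes: true
  # Probe timeout in seconds. Defaults to 3.
  timeout: 3
  # Startup probe interval in seconds. Defaults to timeout.
  startupInterval: 3
  # Liveness probe interval in seconds. Defaults to timeout * 10.
  livenessInterval: 30
```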

Closes https://github.com/sourcegraph/managed-services/issues/350

This is required for our upcoming vendor evaluations as well.

This PR also includes a variety of internal improvements to alert policies.
Robert Lin 2024-01-15 16:50:41 -08:00 committed by GitHub
parent f407f96455
commit 0060df720e
15 changed files with 452 additions and 188 deletions

View File

@ -8,6 +8,7 @@ go_library(
visibility = ["//dev/managedservicesplatform:__subpackages__"],
deps = [
"//dev/managedservicesplatform/internal/resourceid",
"//dev/managedservicesplatform/spec",
"//lib/errors",
"//lib/pointers",
"@com_github_aws_constructs_go_constructs_v10//:constructs",

View File

@ -13,6 +13,7 @@ import (
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringnotificationchannel"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
@ -83,6 +84,10 @@ type ThresholdAggregation struct {
Period string
Threshold float64
Duration string
// Trigger is the strategy for determining if an alert should fire based
// on the thresholds.
Trigger TriggerKind
}
// ResponseCodeMetric for alerting when the number of a certain response code exceeds a threshold
@ -98,30 +103,47 @@ type ResponseCodeMetric struct {
Duration *string
}
type CloudService int
type ResourceKind string
const (
CloudRunService CloudService = iota
CloudRunJob
CloudRedis
CloudRunService ResourceKind = "cloud-run-service"
CloudRunJob ResourceKind = "cloud-run-job"
CloudRedis ResourceKind = "cloud-redis"
URLUptime ResourceKind = "url-uptime"
)
type TriggerKind int
const (
// TriggerKindAnyViolation is trigger { count: 1 } - any violation will
// cause an alert to fire. This is the default.
TriggerKindAnyViolation TriggerKind = iota
// TriggerKindAllInViolation is trigger { percent: 100 } - all time series
// must be in violation for alert to fire.
TriggerKindAllInViolation
)
// Config for a Monitoring Alert Policy
// Must define either `ThresholdAggregation` or `ResponseCodeMetric`
type Config struct {
// ServiceEnvironmentSlug is $SERVICE_ID#$ENV_ID, and is used for generating
// docs links in alert descriptions.
ServiceEnvironmentSlug string
Service spec.ServiceSpec
EnvironmentID string
ProjectID string
// ID is unique identifier of the alert policy
ID string
// Name is a human-readable name for the alert policy
Name string
// Description is a Markdown-format description for the alert policy. Some
// unified context will be included as well, including links to the service
// handbook page and so on.
Description string
// ResourceKind identifies what is being monitored.
ResourceKind ResourceKind
// ResourceName is the identifier for the monitored resource of ResourceKind.
ResourceName string
// ID is unique identifier
ID string
Name string
Description *string
ProjectID string
// Name of the service/job/redis to filter the alert on
ServiceName string
// Type of the service/job/redis
ServiceKind CloudService
// NotificationChannels to subscribe on this alert
NotificationChannels []monitoringnotificationchannel.MonitoringNotificationChannel
@ -129,9 +151,12 @@ type Config struct {
ResponseCodeMetric *ResponseCodeMetric
}
type Output struct {
func (c Config) getDocsSlug() string {
return fmt.Sprintf("%s#%s", c.Service.ID, c.EnvironmentID)
}
type Output struct{}
func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
if config.ThresholdAggregation == nil && config.ResponseCodeMetric == nil {
return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config")
@ -142,18 +167,18 @@ func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output,
}
// Universal alert description addendum
if config.ServiceEnvironmentSlug == "" {
return nil, errors.New("ServiceEnvironmentSlug is required")
if config.Service.ID == "" {
return nil, errors.New("Service is required")
}
if pointers.DerefZero(config.Description) == "" {
if config.Description == "" {
return nil, errors.New("Description is required")
} else {
config.Description = pointers.Stringf(`%s
config.Description = fmt.Sprintf(`%s
See https://handbook.sourcegraph.com/departments/engineering/managed-services/%s for service and infrastructure access details.
If you need additional assistance, reach out to #discuss-core-services.`,
*config.Description,
config.ServiceEnvironmentSlug)
config.Description,
config.getDocsSlug())
}
if config.ThresholdAggregation != nil {
@ -172,15 +197,15 @@ If you need additional assistance, reach out to #discuss-core-services.`,
// newThresholdAggregationAlert defines a monitoring alert policy based on a single metric threshold
func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
// Set some defaults
switch config.ServiceKind {
switch config.ResourceKind {
case CloudRunService:
config.ThresholdAggregation.GroupByFields = append([]string{"resource.label.revision_name"}, config.ThresholdAggregation.GroupByFields...)
case CloudRunJob:
// No defaults
case CloudRedis:
config.ThresholdAggregation.GroupByFields = append(
[]string{"resource.label.revision_name"},
config.ThresholdAggregation.GroupByFields...)
case CloudRunJob, CloudRedis, URLUptime:
// No defaults
default:
return nil, errors.Newf("invalid service kind %q", config.ServiceKind)
return nil, errors.Newf("invalid service kind %q", config.ResourceKind)
}
if config.ThresholdAggregation.Comparison == "" {
@ -196,14 +221,33 @@ func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID,
Project: pointers.Ptr(config.ProjectID),
DisplayName: pointers.Ptr(config.Name),
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
Content: config.Description,
Subject: pointers.Stringf("%s (%s): %s",
config.Service.GetName(), config.EnvironmentID, config.Name),
Content: pointers.Ptr(config.Description),
MimeType: pointers.Ptr("text/markdown"),
},
UserLabels: &map[string]*string{
"source": pointers.Ptr("managed-services-platform"),
"msp_alert_id": pointers.Ptr(config.ID),
"msp_gcp_project": pointers.Ptr(config.ProjectID),
"source": pointers.Ptr("managed-services-platform"),
"resource_kind": pointers.Ptr(string(config.ResourceKind)),
"msp_alert_id": pointers.Ptr(config.ID),
"msp_service_id": pointers.Ptr(config.Service.ID),
"msp_environment_id": pointers.Ptr(config.EnvironmentID),
},
// Notification strategy
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
AutoClose: pointers.Ptr("86400s"), // 24 hours
},
NotificationChannels: notificationChannelIDs(config.NotificationChannels),
// For now, set all MSP alerts as WARNING. In the future, we should
// have different severity levels.
// https://github.com/sourcegraph/managed-services/issues/385
// Possible values: ["CRITICAL", "ERROR", "WARNING"]
Severity: pointers.Ptr("WARNING"),
// Conditions
Combiner: pointers.Ptr("OR"),
Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{
{
@ -212,8 +256,8 @@ func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID,
Aggregations: []monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdAggregations{
{
AlignmentPeriod: pointers.Ptr(config.ThresholdAggregation.Period),
PerSeriesAligner: pointers.Ptr(string(config.ThresholdAggregation.Aligner)),
CrossSeriesReducer: pointers.Ptr(string(config.ThresholdAggregation.Reducer)),
PerSeriesAligner: pointers.NonZeroPtr(string(config.ThresholdAggregation.Aligner)),
CrossSeriesReducer: pointers.NonZeroPtr(string(config.ThresholdAggregation.Reducer)),
GroupByFields: pointers.Ptr(pointers.Slice(config.ThresholdAggregation.GroupByFields)),
},
},
@ -221,16 +265,24 @@ func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID,
Duration: pointers.Ptr(config.ThresholdAggregation.Duration),
Filter: pointers.Ptr(buildFilter(config)),
ThresholdValue: pointers.Float64(config.ThresholdAggregation.Threshold),
Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
Count: pointers.Float64(1),
},
Trigger: func() *monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger {
switch config.ThresholdAggregation.Trigger {
case TriggerKindAllInViolation:
return &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
Percent: pointers.Float64(100),
}
case TriggerKindAnyViolation:
fallthrough
default:
return &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
Count: pointers.Float64(1),
}
}
}(),
},
},
},
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
AutoClose: pointers.Ptr("604800s"),
},
NotificationChannels: notificationChannelIDs(config.NotificationChannels),
})
return &Output{}, nil
}
@ -246,21 +298,26 @@ func buildFilter(config *Config) string {
// config.ThresholdAggregation.Filters is a map.
sort.Strings(filters)
switch config.ServiceKind {
switch config.ResourceKind {
case CloudRunService:
filters = append(filters,
`resource.type = "cloud_run_revision"`,
fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ServiceName),
fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ResourceName),
)
case CloudRunJob:
filters = append(filters,
`resource.type = "cloud_run_job"`,
fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ServiceName),
fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ResourceName),
)
case CloudRedis:
filters = append(filters,
`resource.type = "redis_instance"`,
fmt.Sprintf(`resource.labels.instance_id = "%s"`, config.ServiceName),
fmt.Sprintf(`resource.labels.instance_id = "%s"`, config.ResourceName),
)
case URLUptime:
filters = append(filters,
`resource.type = "uptime_url"`,
fmt.Sprintf(`metric.labels.check_id = "%s"`, config.ResourceName),
)
}
@ -282,7 +339,7 @@ func newResponseCodeMetricAlert(scope constructs.Construct, id resourceid.ID, co
Project: pointers.Ptr(config.ProjectID),
DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)),
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
Content: config.Description,
Content: pointers.Ptr(config.Description),
MimeType: pointers.Ptr("text/markdown"),
},
Combiner: pointers.Ptr("OR"),

View File

@ -17,8 +17,8 @@ func TestBuildFilter(t *testing.T) {
{
name: "Service Metric",
config: Config{
ServiceName: "my-service-name",
ServiceKind: CloudRunService,
ResourceName: "my-service-name",
ResourceKind: CloudRunService,
ThresholdAggregation: &ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/container/startup_latencies",
@ -30,8 +30,8 @@ func TestBuildFilter(t *testing.T) {
{
name: "Job Metric",
config: Config{
ServiceName: "my-job-name",
ServiceKind: CloudRunJob,
ResourceName: "my-job-name",
ResourceKind: CloudRunJob,
ThresholdAggregation: &ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
@ -127,7 +127,7 @@ func TestResponseCodeBuilder(t *testing.T) {
} {
t.Run(tc.name, func(t *testing.T) {
got := responseCodeBuilder(&Config{
ServiceName: "test-service",
ResourceName: "test-service",
ResponseCodeMetric: &tc.ResponseCodeMetric,
})
tc.want.Equal(t, got)

View File

@ -127,8 +127,11 @@ func (r *Renderer) RenderEnvironment(
return nil, errors.Wrap(err, "failed to create cloudrun stack")
}
if _, err := monitoring.NewStack(stacks, monitoring.Variables{
ProjectID: *projectOutput.Project.ProjectId(),
Service: svc,
ProjectID: *projectOutput.Project.ProjectId(),
Service: svc,
EnvironmentCategory: env.Category,
EnvironmentID: env.ID,
Monitoring: monitoringSpec,
MaxInstanceCount: func() *int {
if env.Instances.Scaling != nil {
@ -136,13 +139,11 @@ func (r *Renderer) RenderEnvironment(
}
return nil
}(),
RedisInstanceID: cloudrunOutput.RedisInstanceID,
ServiceStartupProbe: pointers.DerefZero(env.EnvironmentServiceSpec).StatupProbe,
// Notification configuration
EnvironmentCategory: env.Category,
EnvironmentID: env.ID,
Owners: svc.Owners,
ExternalDomain: pointers.DerefZero(env.EnvironmentServiceSpec).Domain,
ServiceAuthentication: pointers.DerefZero(env.EnvironmentServiceSpec).Authentication,
DiagnosticsSecret: cloudrunOutput.DiagnosticsSecret,
RedisInstanceID: cloudrunOutput.RedisInstanceID,
ServiceHealthProbes: pointers.DerefZero(env.EnvironmentServiceSpec).HealthProbes,
}); err != nil {
return nil, errors.Wrap(err, "failed to create monitoring stack")
}

View File

@ -94,6 +94,7 @@ type EnvironmentSpec struct {
func (s EnvironmentSpec) Validate() []error {
var errs []error
// Validate basic configuration
if s.ID == "" {
errs = append(errs, errors.New("id is required"))
}
@ -111,9 +112,14 @@ func (s EnvironmentSpec) Validate() []error {
return append(errs, errors.Wrap(err, "category"))
}
// Validate other shared sub-specs
errs = append(errs, s.Deploy.Validate()...)
errs = append(errs, s.Resources.Validate()...)
errs = append(errs, s.Instances.Validate()...)
// Validate service-specific specs
errs = append(errs, s.EnvironmentServiceSpec.Validate()...)
return errs
}
@ -215,17 +221,12 @@ type EnvironmentServiceSpec struct {
//
// Only supported for services of 'kind: service'.
Domain *EnvironmentServiceDomainSpec `yaml:"domain,omitempty"`
// StatupProbe is provisioned by default. It can be disabled with the
// 'disabled' field. Probes are made to the MSP-standard '/-/healthz'
// endpoint.
// HealthProbes configures both startup and continuous liveness probes.
// If nil or explicitly disabled, no MSP-standard '/-/healthz' probes will
// be configured.
//
// Only supported for services of 'kind: service'.
StatupProbe *EnvironmentServiceStartupProbeSpec `yaml:"startupProbe,omitempty"`
// LivenessProbe is only provisioned if this field is set. Probes are made
// to the MSP-standard '/-/healthz' endpoint.
//
// Only supported for services of 'kind: service'.
LivenessProbe *EnvironmentServiceLivenessProbeSpec `yaml:"livenessProbe,omitempty"`
HealthProbes *EnvironmentServiceHealthProbesSpec `yaml:"healthProbes,omitempty"`
// Authentication configures access to the service. By default, the service
// is publically available, and the service should handle any required
// authentication by itself. Set this field to an empty value to not
@ -242,6 +243,15 @@ type EnvironmentServiceSpec struct {
Authentication *EnvironmentServiceAuthenticationSpec `yaml:"authentication,omitempty"`
}
func (s *EnvironmentServiceSpec) Validate() []error {
if s == nil {
return nil
}
var errs []error
errs = append(errs, s.HealthProbes.Validate()...)
return errs
}
type EnvironmentServiceDomainSpec struct {
// Type is one of 'none' or 'cloudflare'. If empty, defaults to 'none'.
Type EnvironmentDomainType `yaml:"type"`
@ -389,61 +399,119 @@ type EnvironmentInstancesScalingSpec struct {
MaxCount *int `yaml:"maxCount,omitempty"`
}
type EnvironmentServiceLivenessProbeSpec struct {
// Timeout configures the period of time after which the probe times out,
// in seconds.
//
// Defaults to 1 second.
Timeout *int `yaml:"timeout,omitempty"`
// Interval configures the interval, in seconds, at which to
// probe the deployed service.
//
// Defaults to 1 second.
Interval *int `yaml:"interval,omitempty"`
}
type EnvironmentServiceAuthenticationSpec struct {
// Sourcegraph enables access to everyone in the sourcegraph.com GSuite
// domain.
Sourcegraph *bool `yaml:"sourcegraph,omitempty"`
}
type EnvironmentServiceStartupProbeSpec struct {
// Disabled configures whether the MSP startup probe should be disabled.
// We recommend disabling it when creating a service, and re-enabling it
// once the service is healthy.
type EnvironmentServiceHealthProbesSpec struct {
// HealthzProbes configures whether the MSP-standard '/-/healthz' service
// probes should be used. We recommend disabling them when creating a
// service, and re-enabling them once the service is confirmed to be
// deployed and healthy. To disable, you should explicitly set
// 'healthzProbes: false'.
//
// - When disabled, the default probe is a very generous one that waits 240s
// for your service to respond with anything at all on '/'
// - When enabled, the MSP-standard '/-/healthz' diagnostic check is used
// with a generated diagnostics secret.
// with a generated diagnostics secret, enforcing the configured timeout and intervals.
//
// This prevents the first Terraform apply from failing if your healthcheck
// is comprehensive.
Disabled *bool `yaml:"disabled,omitempty"`
// Disabling the probe on first startup prevents the first Terraform apply
// from failing if your healthcheck is comprehensive, or if you haven't
// implemented '/-/healthz' yet.
HealthzProbes *bool `yaml:"healthzProbes,omitempty"`
// Timeout configures the period of time after which the probe times out,
// in seconds.
// Timeout configures the period of time after which a health probe times
// out, in seconds.
//
// Defaults to 1 second.
// Defaults to 3 seconds.
Timeout *int `yaml:"timeout,omitempty"`
// Interval configures the frequency, in seconds, at which to
// probe the deployed service. Must be greater than or equal to timeout.
// StartupInterval configures the frequency, in seconds, at which to
// probe the deployed service on startup. Must be greater than or equal to
// timeout.
//
// Defaults to timeout.
Interval *int `yaml:"interval,omitempty"`
StartupInterval *int `yaml:"startupInterval,omitempty"`
// LivenessInterval configures the frequency, in seconds, at which to
// probe the deployed service after startup to continuously check its
// health. Must be greater than or equal to timeout.
//
// Defaults to timeout * 10.
LivenessInterval *int `yaml:"livenessInterval,omitempty"`
}
func (s *EnvironmentServiceStartupProbeSpec) MaximumLatencySeconds() int {
func (s *EnvironmentServiceHealthProbesSpec) Validate() []error {
if s == nil {
s = &EnvironmentServiceStartupProbeSpec{}
return nil
}
if pointers.DerefZero(s.Disabled) {
var errs []error
if !s.UseHealthzProbes() {
if s.Timeout != nil || s.StartupInterval != nil || s.LivenessInterval != nil {
errs = append(errs,
errors.New("timeout, startupInterval and livenessInterval can only be configured when healthzProbes is enabled"))
}
// Nothing else to check
return errs
}
if s.GetTimeoutSeconds() > s.GetStartupIntervalSeconds() {
errs = append(errs, errors.New("startupInterval must be greater than or equal to timeout"))
}
if s.GetTimeoutSeconds() > s.GetLivenessIntervalSeconds() {
errs = append(errs, errors.New("livenessInterval must be greater than or equal to timeout"))
}
return errs
}
// UseHealthzProbes indicates whether the MSP-standard '/-/healthz' probes
// with diagnostics secrets should be used.
func (s *EnvironmentServiceHealthProbesSpec) UseHealthzProbes() bool {
// No config == disabled
if s == nil {
return false
}
// If config is provided, must be explicitly disabled with 'healthzProbes: false'
return pointers.Deref(s.HealthzProbes, true)
}
// MaximumStartupLatencySeconds infers the overall maximum latency for a
// healthcheck to return healthy when the service is starting up.
func (s *EnvironmentServiceHealthProbesSpec) MaximumStartupLatencySeconds() int {
if !s.UseHealthzProbes() {
return 240 // maximum Cloud Run timeout
}
// Maximum startup latency is retries x interval.
const maxRetries = 3
return maxRetries * pointers.Deref(s.Interval, 1)
return maxRetries * s.GetStartupIntervalSeconds()
}
// GetStartupIntervalSeconds returns the configured value, the default, or 0 if the spec is nil.
func (s *EnvironmentServiceHealthProbesSpec) GetStartupIntervalSeconds() int {
if s == nil {
return 0
}
return pointers.Deref(s.StartupInterval, s.GetTimeoutSeconds())
}
// GetLivenessIntervalSeconds returns the configured value, the default, or 0 if the spec is nil.
func (s *EnvironmentServiceHealthProbesSpec) GetLivenessIntervalSeconds() int {
if s == nil {
return 0
}
return pointers.Deref(s.LivenessInterval, s.GetTimeoutSeconds()*10) // 10x timeout default
}
// GetTimeoutSeconds returns the configured value, the default, or 0 if the spec is nil.
func (s *EnvironmentServiceHealthProbesSpec) GetTimeoutSeconds() int {
if s == nil {
return 0
}
return pointers.Deref(s.Timeout, 3)
}
type EnvironmentJobSpec struct {

View File

@ -37,6 +37,11 @@ func (s ServiceSpec) GetName() string {
return pointers.Deref(s.Name, s.ID)
}
// GetKind returns Kind if configured, otherwise the default (ServiceKindService).
func (s ServiceSpec) GetKind() ServiceKind {
return pointers.Deref(s.Kind, ServiceKindService)
}
func (s ServiceSpec) Validate() []error {
var errs []error

View File

@ -37,7 +37,8 @@ import (
)
type CrossStackOutput struct {
RedisInstanceID *string
DiagnosticsSecret *random.Output
RedisInstanceID *string
}
type Variables struct {
@ -291,7 +292,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
locals.Add("image_tag", *imageTag.StringValue,
"Resolved tag of service image to deploy")
return &CrossStackOutput{
RedisInstanceID: redisInstanceID,
DiagnosticsSecret: diagnosticsSecret,
RedisInstanceID: redisInstanceID,
}, nil
}

View File

@ -161,17 +161,10 @@ func (b *serviceBuilder) Build(stack cdktf.TerraformStack, vars builder.Variable
// Do healthchecks with authorization based on MSP convention.
StartupProbe: func() *cloudrunv2service.CloudRunV2ServiceTemplateContainersStartupProbe {
// Default: only enabled if healthz probes are configured
if vars.Environment.StatupProbe != nil &&
pointers.Deref(vars.Environment.StatupProbe.Disabled, false) {
if !vars.Environment.HealthProbes.UseHealthzProbes() {
return nil
}
// Set zero value for ease of reference
if vars.Environment.StatupProbe == nil {
vars.Environment.StatupProbe = &spec.EnvironmentServiceStartupProbeSpec{}
}
return &cloudrunv2service.CloudRunV2ServiceTemplateContainersStartupProbe{
HttpGet: &cloudrunv2service.CloudRunV2ServiceTemplateContainersStartupProbeHttpGet{
Path: pointers.Ptr(builder.HealthCheckEndpoint),
@ -181,16 +174,16 @@ func (b *serviceBuilder) Build(stack cdktf.TerraformStack, vars builder.Variable
}},
},
InitialDelaySeconds: pointers.Float64(0),
TimeoutSeconds: pointers.Float64(pointers.Deref(vars.Environment.StatupProbe.Timeout, 1)),
PeriodSeconds: pointers.Float64(pointers.Deref(vars.Environment.StatupProbe.Interval, 1)),
TimeoutSeconds: pointers.Float64(vars.Environment.HealthProbes.GetTimeoutSeconds()),
PeriodSeconds: pointers.Float64(vars.Environment.HealthProbes.GetStartupIntervalSeconds()),
FailureThreshold: pointers.Float64(3),
}
}(),
LivenessProbe: func() *cloudrunv2service.CloudRunV2ServiceTemplateContainersLivenessProbe {
// Default: disabled
if vars.Environment.LivenessProbe == nil {
if !vars.Environment.HealthProbes.UseHealthzProbes() {
return nil
}
return &cloudrunv2service.CloudRunV2ServiceTemplateContainersLivenessProbe{
HttpGet: &cloudrunv2service.CloudRunV2ServiceTemplateContainersLivenessProbeHttpGet{
Path: pointers.Ptr(builder.HealthCheckEndpoint),
@ -199,9 +192,9 @@ func (b *serviceBuilder) Build(stack cdktf.TerraformStack, vars builder.Variable
Value: pointers.Ptr(fmt.Sprintf("Bearer %s", vars.DiagnosticsSecret.HexValue)),
}},
},
TimeoutSeconds: pointers.Float64(pointers.Deref(vars.Environment.LivenessProbe.Timeout, 1)),
PeriodSeconds: pointers.Float64(pointers.Deref(vars.Environment.LivenessProbe.Interval, 1)),
FailureThreshold: pointers.Float64(2),
TimeoutSeconds: pointers.Float64(vars.Environment.HealthProbes.GetTimeoutSeconds()),
PeriodSeconds: pointers.Float64(vars.Environment.HealthProbes.GetLivenessIntervalSeconds()),
FailureThreshold: pointers.Float64(3),
}
}(),

View File

@ -12,6 +12,7 @@ go_library(
"//dev/managedservicesplatform/googlesecretsmanager",
"//dev/managedservicesplatform/internal/resource/alertpolicy",
"//dev/managedservicesplatform/internal/resource/gsmsecret",
"//dev/managedservicesplatform/internal/resource/random",
"//dev/managedservicesplatform/internal/resourceid",
"//dev/managedservicesplatform/internal/stack",
"//dev/managedservicesplatform/internal/stack/options/googleprovider",
@ -22,6 +23,7 @@ go_library(
"//lib/pointers",
"@com_github_hashicorp_terraform_cdk_go_cdktf//:cdktf",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringnotificationchannel",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringuptimecheckconfig",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_opsgenie//apiintegration",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_opsgenie//dataopsgenieteam",
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_slack//conversation",

View File

@ -7,6 +7,7 @@ import (
"github.com/hashicorp/terraform-cdk-go/cdktf"
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringnotificationchannel"
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringuptimecheckconfig"
opsgenieintegration "github.com/sourcegraph/managed-services-platform-cdktf/gen/opsgenie/apiintegration"
"github.com/sourcegraph/managed-services-platform-cdktf/gen/opsgenie/dataopsgenieteam"
slackconversation "github.com/sourcegraph/managed-services-platform-cdktf/gen/slack/conversation"
@ -14,6 +15,7 @@ import (
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/googlesecretsmanager"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/alertpolicy"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/gsmsecret"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/random"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack"
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/googleprovider"
@ -64,18 +66,8 @@ import (
type CrossStackOutput struct{}
type Variables struct {
ProjectID string
Service spec.ServiceSpec
Monitoring spec.MonitoringSpec
// MaxInstanceCount informs service scaling alerts.
MaxInstanceCount *int
// If Redis is enabled we configure alerts for it
RedisInstanceID *string
// ServiceStartupProbe is used to determine the threshold for service
// startup latency alerts.
ServiceStartupProbe *spec.EnvironmentServiceStartupProbeSpec
ProjectID string
Service spec.ServiceSpec
// EnvironmentCategory dictates what kind of notifications are set up:
//
// 1. 'test' services only generate Slack notifications.
@ -90,9 +82,22 @@ type Variables struct {
EnvironmentCategory spec.EnvironmentCategory
// EnvironmentID is the name of the service environment.
EnvironmentID string
// Owners is a list of team names. Each owner MUST correspond to the name
// of a team in Opsgenie.
Owners []string
Monitoring spec.MonitoringSpec
// MaxInstanceCount informs service scaling alerts.
MaxInstanceCount *int
// ExternalDomain informs external health checks on the service domain.
ExternalDomain *spec.EnvironmentServiceDomainSpec
// ServiceAuthentication informs external health checks on the service
// domain. Currently, any configuration will disable external health checks.
ServiceAuthentication *spec.EnvironmentServiceAuthenticationSpec
// DiagnosticsSecret is used to configure external health checks.
DiagnosticsSecret *random.Output
// If Redis is enabled we configure alerts for it
RedisInstanceID *string
// ServiceHealthProbes is used to determine the threshold for service
// startup latency alerts and to configure external uptime checks.
ServiceHealthProbes *spec.EnvironmentServiceHealthProbesSpec
}
const StackName = "monitoring"
@ -127,7 +132,7 @@ func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) {
// case spec.EnvironmentCategoryInternal, spec.EnvironmentCategoryExternal:
// opsgenieAlerts = true
// }
for i, owner := range vars.Owners {
for i, owner := range vars.Service.Owners {
// Use index because Opsgenie team names have lax character requirements
id := id.Group("opsgenie_owner_%d", i)
// Opsgenie team corresponding to owner must exist
@ -251,7 +256,7 @@ func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) {
return nil, errors.Wrap(err, "failed to create common alerts")
}
switch pointers.Deref(vars.Service.Kind, spec.ServiceKindService) {
switch vars.Service.GetKind() {
case spec.ServiceKindService:
if err = createServiceAlerts(stack, id.Group("service"), vars, channels); err != nil {
return nil, errors.Wrap(err, "failed to create service alerts")
@ -287,16 +292,23 @@ func createCommonAlerts(
) error {
// Convert a spec.ServiceKind into an alertpolicy.ResourceKind
serviceKind := alertpolicy.CloudRunService
kind := pointers.Deref(vars.Service.Kind, spec.ServiceKindService)
kind := vars.Service.GetKind()
if kind == spec.ServiceKindJob {
serviceKind = alertpolicy.CloudRunJob
}
for _, config := range []alertpolicy.Config{
// Iterate over a list of common alert configurations. The custom struct
// defines the fields we expect to vary between each.
for _, config := range []struct {
ID string
Name string
Description string
ThresholdAggregation *alertpolicy.ThresholdAggregation
}{
{
ID: "cpu",
Name: "High Container CPU Utilization",
Description: pointers.Ptr("High CPU Usage - it may be neccessary to reduce load or increase CPU allocation"),
Description: "High CPU Usage - it may be neccessary to reduce load or increase CPU allocation",
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"},
Aligner: alertpolicy.MonitoringAlignPercentile99,
@ -308,7 +320,7 @@ func createCommonAlerts(
{
ID: "memory",
Name: "High Container Memory Utilization",
Description: pointers.Ptr("High Memory Usage - it may be neccessary to reduce load or increase memory allocation"),
Description: "High Memory Usage - it may be neccessary to reduce load or increase memory allocation",
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/memory/utilizations"},
Aligner: alertpolicy.MonitoringAlignPercentile99,
@ -320,7 +332,7 @@ func createCommonAlerts(
{
ID: "startup",
Name: "Container Startup Latency",
Description: pointers.Ptr("Service containers are taking too long to start up - something may be blocking startup"),
Description: "Service containers are taking longer than configured timeouts to start up.",
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/startup_latencies"},
Aligner: alertpolicy.MonitoringAlignPercentile99,
@ -334,17 +346,28 @@ func createCommonAlerts(
}
// otherwise, use the startup probe configuration to
// determine the threshold for how long we should be waiting
return float64(vars.ServiceStartupProbe.MaximumLatencySeconds()) * 1000 // ms
return float64(vars.ServiceHealthProbes.MaximumStartupLatencySeconds()) * 1000 // ms
}(),
},
},
} {
config.ServiceEnvironmentSlug = fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID)
config.ProjectID = vars.ProjectID
config.ServiceName = vars.Service.ID
config.ServiceKind = serviceKind
config.NotificationChannels = channels
if _, err := alertpolicy.New(stack, id, &config); err != nil {
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
// Resource we are targeting in this helper
ResourceKind: serviceKind,
ResourceName: vars.Service.ID,
// Alert policy
ID: config.ID,
Name: config.Name,
Description: config.Description,
ThresholdAggregation: config.ThresholdAggregation,
// Shared configuration
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,
ProjectID: vars.ProjectID,
NotificationChannels: channels,
}); err != nil {
return err
}
}
@ -361,14 +384,15 @@ func createServiceAlerts(
// Only provision if MaxCount is specified above 5
if pointers.Deref(vars.MaxInstanceCount, 0) > 5 {
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
ServiceEnvironmentSlug: fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID),
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,
ID: "instance_count",
Name: "Container Instance Count",
Description: pointers.Ptr("There are a lot of Cloud Run instances running - we may need to increase per-instance requests make make sure we won't hit the configured max instance count"),
ProjectID: vars.ProjectID,
ServiceName: vars.Service.ID,
ServiceKind: alertpolicy.CloudRunService,
ID: "instance_count",
Name: "Container Instance Count",
Description: "There are a lot of Cloud Run instances running - we may need to increase per-instance requests make make sure we won't hit the configured max instance count",
ProjectID: vars.ProjectID,
ResourceName: vars.Service.ID,
ResourceKind: alertpolicy.CloudRunService,
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "run.googleapis.com/container/instance_count"},
Aligner: alertpolicy.MonitoringAlignMax,
@ -377,9 +401,100 @@ func createServiceAlerts(
},
NotificationChannels: channels,
}); err != nil {
return err
return errors.Wrap(err, "instance_count")
}
}
// If an external DNS name is provisioned, use it to check service availability
// from outside Cloud Run. The service must not use IAM auth.
if vars.ServiceAuthentication == nil && vars.ExternalDomain.GetDNSName() != "" {
if err := createExternalHealthcheckAlert(stack, id, vars, channels); err != nil {
return errors.Wrap(err, "external_healthcheck")
}
}
return nil
}
func createExternalHealthcheckAlert(
stack cdktf.TerraformStack,
id resourceid.ID,
vars Variables,
channels []monitoringnotificationchannel.MonitoringNotificationChannel,
) error {
var (
healthcheckPath = "/"
healthcheckHeaders = map[string]*string{}
)
// Only use MSP runtime standards if we know the service supports it.
if vars.ServiceHealthProbes.UseHealthzProbes() {
healthcheckPath = "/-/healthz"
healthcheckHeaders = map[string]*string{
"Authorization": pointers.Stringf("Bearer %s", vars.DiagnosticsSecret.HexValue),
}
}
externalDNS := vars.ExternalDomain.GetDNSName()
uptimeCheck := monitoringuptimecheckconfig.NewMonitoringUptimeCheckConfig(stack, id.TerraformID("external_uptime_check"), &monitoringuptimecheckconfig.MonitoringUptimeCheckConfigConfig{
Project: &vars.ProjectID,
DisplayName: pointers.Stringf("External Uptime Check for %s", externalDNS),
// https://cloud.google.com/monitoring/api/resources#tag_uptime_url
MonitoredResource: &monitoringuptimecheckconfig.MonitoringUptimeCheckConfigMonitoredResource{
Type: pointers.Ptr("uptime_url"),
Labels: &map[string]*string{
"project_id": &vars.ProjectID,
"host": &externalDNS,
},
},
// 1 to 60 seconds.
Timeout: pointers.Stringf("%ds", vars.ServiceHealthProbes.GetTimeoutSeconds()),
// Only supported values are 60s (1 minute), 300s (5 minutes),
// 600s (10 minutes), and 900s (15 minutes)
Period: pointers.Ptr("60s"),
HttpCheck: &monitoringuptimecheckconfig.MonitoringUptimeCheckConfigHttpCheck{
Port: pointers.Float64(443),
UseSsl: pointers.Ptr(true),
ValidateSsl: pointers.Ptr(true),
Path: &healthcheckPath,
Headers: &healthcheckHeaders,
AcceptedResponseStatusCodes: &[]*monitoringuptimecheckconfig.MonitoringUptimeCheckConfigHttpCheckAcceptedResponseStatusCodes{
{
StatusClass: pointers.Ptr("STATUS_CLASS_2XX"),
},
},
},
})
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,
ID: "external_health_check",
Name: "External Uptime Check",
Description: fmt.Sprintf("Service is failing to repond on https://%s - this may be expected if the service was recently provisioned or if its external domain has changed.", externalDNS),
ProjectID: vars.ProjectID,
ResourceKind: alertpolicy.URLUptime,
ResourceName: *uptimeCheck.UptimeCheckId(),
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{
"metric.type": "monitoring.googleapis.com/uptime_check/check_passed",
},
Aligner: alertpolicy.MonitoringAlignFractionTrue,
// Checks occur every 60s; with a 300s alignment window, alert if fewer than 2 of 5 checks pass
Period: "300s",
Duration: "0s",
Comparison: alertpolicy.ComparisonLT,
Threshold: 0.4,
// Alert when all locations go down
Trigger: alertpolicy.TriggerKindAllInViolation,
},
NotificationChannels: channels,
}); err != nil {
return err
}
return nil
}
@ -391,14 +506,15 @@ func createJobAlerts(
) error {
// Alert whenever a Cloud Run Job fails
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
ServiceEnvironmentSlug: fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID),
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,
ID: "job_failures",
Name: "Cloud Run Job Failures",
Description: pointers.Ptr("Failed executions of Cloud Run Job"),
ProjectID: vars.ProjectID,
ServiceName: vars.Service.ID,
ServiceKind: alertpolicy.CloudRunJob,
ID: "job_failures",
Name: "Cloud Run Job Failures",
Description: "Cloud Run Job executions failed",
ProjectID: vars.ProjectID,
ResourceName: vars.Service.ID,
ResourceKind: alertpolicy.CloudRunJob,
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
@ -426,13 +542,14 @@ func createResponseCodeMetrics(
) error {
for _, config := range vars.Monitoring.Alerts.ResponseCodeRatios {
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
ServiceEnvironmentSlug: fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID),
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,
ID: config.ID,
ProjectID: vars.ProjectID,
Name: config.Name,
ServiceName: vars.Service.ID,
ServiceKind: alertpolicy.CloudRunService,
ID: config.ID,
ProjectID: vars.ProjectID,
Name: config.Name,
ResourceName: vars.Service.ID,
ResourceKind: alertpolicy.CloudRunService,
ResponseCodeMetric: &alertpolicy.ResponseCodeMetric{
Code: config.Code,
CodeClass: config.CodeClass,
@ -455,11 +572,18 @@ func createRedisAlerts(
vars Variables,
channels []monitoringnotificationchannel.MonitoringNotificationChannel,
) error {
for _, config := range []alertpolicy.Config{
// Iterate over a list of Redis alert configurations. The custom struct
// defines the fields we expect to vary between each.
for _, config := range []struct {
ID string
Name string
Description string
ThresholdAggregation *alertpolicy.ThresholdAggregation
}{
{
ID: "memory",
Name: "Cloud Redis - System Memory Utilization",
Description: pointers.Ptr("This alert fires if the system memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1."),
Description: "Redis System memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1.",
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio"},
Aligner: alertpolicy.MonitoringAlignMean,
@ -471,7 +595,7 @@ func createRedisAlerts(
{
ID: "cpu",
Name: "Cloud Redis - System CPU Utilization",
Description: pointers.Ptr("This alert fires if the Redis Engine CPU Utilization goes above the set threshold. The utilization is measured on a scale of 0 to 1."),
Description: "Redis Engine CPU Utilization goes above the set threshold. The utilization is measured on a scale of 0 to 1.",
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"},
GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"},
@ -484,7 +608,7 @@ func createRedisAlerts(
{
ID: "failover",
Name: "Cloud Redis - Standard Instance Failover",
Description: pointers.Ptr("This alert fires if failover occurs for a standard tier instance."),
Description: "Instance failover occured for a standard tier Redis instance.",
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
Filters: map[string]string{"metric.type": "redis.googleapis.com/replication/role"},
Aligner: alertpolicy.MonitoringAlignStddev,
@ -493,12 +617,23 @@ func createRedisAlerts(
},
},
} {
config.ServiceEnvironmentSlug = fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID)
config.ProjectID = vars.ProjectID
config.ServiceName = *vars.RedisInstanceID
config.ServiceKind = alertpolicy.CloudRedis
config.NotificationChannels = channels
if _, err := alertpolicy.New(stack, id, &config); err != nil {
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
// Resource we are targeting in this helper
ResourceKind: alertpolicy.CloudRedis,
ResourceName: *vars.RedisInstanceID,
// Alert policy
ID: config.ID,
Name: config.Name,
Description: config.Description,
ThresholdAggregation: config.ThresholdAggregation,
// Shared configuration
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,
ProjectID: vars.ProjectID,
NotificationChannels: channels,
}); err != nil {
return err
}
}

View File

@ -43,6 +43,6 @@ environments:
scaling:
maxCount: 3
minCount: 1
startupProbe:
healthProbes:
# Only enable if your service implements MSP /-/healthz conventions.
disabled: true
healthzProbes: false

View File

@ -41,9 +41,9 @@ environments:
scaling:
maxCount: 3
minCount: 1
startupProbe:
healthProbes:
# Only enable if your service implements MSP /-/healthz conventions.
disabled: true
healthzProbes: false
- id: second
projectID: msp-example-second-xxxx
# TODO: We initially provision in 'test' to make it easy to access the project

View File

@ -43,6 +43,6 @@ environments:
scaling:
maxCount: 3
minCount: 1
startupProbe:
healthProbes:
# Only enable if your service implements MSP /-/healthz conventions.
disabled: true
healthzProbes: false

View File

@ -41,9 +41,9 @@ environments:
scaling:
maxCount: 3
minCount: 1
startupProbe:
healthProbes:
# Only enable if your service implements MSP /-/healthz conventions.
disabled: true
healthzProbes: false
- id: second
projectID: msp-example-second-xxxx
# TODO: We initially provision in 'test' to make it easy to access the project

View File

@ -43,6 +43,6 @@ environments:
scaling:
maxCount: 3
minCount: 1
startupProbe:
healthProbes:
# Only enable if your service implements MSP /-/healthz conventions.
disabled: true
healthzProbes: false