mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:51:59 +00:00
msp/monitoring: add external uptime check and alert, rework health probes configuration (#59461)
The new configuration is mostly based on Cody Gateway - if a service has an external domain, we create an uptime check and alert on failures. The uptime check uses MSP standards, which depend on whether or not service health probes are configured. Since we use this in several places now, I've also reworked the health probes configuration to make it easier to reason about: 1. `healthProbes` now configures all healthchecks. `startupProbe` and `livenessProbe` have been removed. 2. `disabled` is now `healthzProbes` - this configures whether MSP healthchecks should be used, instead of the default `/` ones. 3. By default, if no config is provided, MSP healthchecks are not used. 4. If config is provided, MSP healthchecks must be explicitly disabled. Closes https://github.com/sourcegraph/managed-services/issues/350 This is required for our upcoming vendor evaluations as well. This PR also includes a variety of internal improvements to alert policies.
This commit is contained in:
parent
f407f96455
commit
0060df720e
@ -8,6 +8,7 @@ go_library(
|
||||
visibility = ["//dev/managedservicesplatform:__subpackages__"],
|
||||
deps = [
|
||||
"//dev/managedservicesplatform/internal/resourceid",
|
||||
"//dev/managedservicesplatform/spec",
|
||||
"//lib/errors",
|
||||
"//lib/pointers",
|
||||
"@com_github_aws_constructs_go_constructs_v10//:constructs",
|
||||
|
||||
@ -13,6 +13,7 @@ import (
|
||||
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringnotificationchannel"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/spec"
|
||||
"github.com/sourcegraph/sourcegraph/lib/pointers"
|
||||
)
|
||||
|
||||
@ -83,6 +84,10 @@ type ThresholdAggregation struct {
|
||||
Period string
|
||||
Threshold float64
|
||||
Duration string
|
||||
|
||||
// Trigger is the strategy for determining if an alert should fire based
|
||||
// on the thresholds.
|
||||
Trigger TriggerKind
|
||||
}
|
||||
|
||||
// ResponseCodeMetric for alerting when the number of a certain response code exceeds a threshold
|
||||
@ -98,30 +103,47 @@ type ResponseCodeMetric struct {
|
||||
Duration *string
|
||||
}
|
||||
|
||||
type CloudService int
|
||||
type ResourceKind string
|
||||
|
||||
const (
|
||||
CloudRunService CloudService = iota
|
||||
CloudRunJob
|
||||
CloudRedis
|
||||
CloudRunService ResourceKind = "cloud-run-service"
|
||||
CloudRunJob ResourceKind = "cloud-run-job"
|
||||
CloudRedis ResourceKind = "cloud-redis"
|
||||
URLUptime ResourceKind = "url-uptime"
|
||||
)
|
||||
|
||||
type TriggerKind int
|
||||
|
||||
const (
|
||||
// TriggerKindAnyViolation is trigger { count: 1 } - any violation will
|
||||
// cause an alert to fire. This is the default.
|
||||
TriggerKindAnyViolation TriggerKind = iota
|
||||
// TriggerKindAllInViolation is trigger { percent: 100 } - all time series
|
||||
// must be in violation for alert to fire.
|
||||
TriggerKindAllInViolation
|
||||
)
|
||||
|
||||
// Config for a Monitoring Alert Policy
|
||||
// Must define either `ThresholdAggregation` or `ResponseCodeMetric`
|
||||
type Config struct {
|
||||
// ServiceEnvironmentSlug is $SERVICE_ID#$ENV_ID, and is used for generating
|
||||
// docs links in alert descriptions.
|
||||
ServiceEnvironmentSlug string
|
||||
Service spec.ServiceSpec
|
||||
EnvironmentID string
|
||||
ProjectID string
|
||||
|
||||
// ID is unique identifier of the alert policy
|
||||
ID string
|
||||
// Name is a human-readable name for the alert policy
|
||||
Name string
|
||||
// Description is a Markdown-format description for the alert policy. Some
|
||||
// unified context will be included as well, including links to the service
|
||||
// handbook page and so on.
|
||||
Description string
|
||||
|
||||
// ResourceKind identifies what is being monitored.
|
||||
ResourceKind ResourceKind
|
||||
// ResourceName is the identifier for the monitored resource of ResourceKind.
|
||||
ResourceName string
|
||||
|
||||
// ID is unique identifier
|
||||
ID string
|
||||
Name string
|
||||
Description *string
|
||||
ProjectID string
|
||||
// Name of the service/job/redis to filter the alert on
|
||||
ServiceName string
|
||||
// Type of the service/job/redis
|
||||
ServiceKind CloudService
|
||||
// NotificationChannels to subscribe on this alert
|
||||
NotificationChannels []monitoringnotificationchannel.MonitoringNotificationChannel
|
||||
|
||||
@ -129,9 +151,12 @@ type Config struct {
|
||||
ResponseCodeMetric *ResponseCodeMetric
|
||||
}
|
||||
|
||||
type Output struct {
|
||||
func (c Config) getDocsSlug() string {
|
||||
return fmt.Sprintf("%s#%s", c.Service.ID, c.EnvironmentID)
|
||||
}
|
||||
|
||||
type Output struct{}
|
||||
|
||||
func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
|
||||
if config.ThresholdAggregation == nil && config.ResponseCodeMetric == nil {
|
||||
return nil, errors.New("Must provide either SingleMetric or ResponseCodeMetric config")
|
||||
@ -142,18 +167,18 @@ func New(scope constructs.Construct, id resourceid.ID, config *Config) (*Output,
|
||||
}
|
||||
|
||||
// Universal alert description addendum
|
||||
if config.ServiceEnvironmentSlug == "" {
|
||||
return nil, errors.New("ServiceEnvironmentSlug is required")
|
||||
if config.Service.ID == "" {
|
||||
return nil, errors.New("Service is required")
|
||||
}
|
||||
if pointers.DerefZero(config.Description) == "" {
|
||||
if config.Description == "" {
|
||||
return nil, errors.New("Description is required")
|
||||
} else {
|
||||
config.Description = pointers.Stringf(`%s
|
||||
config.Description = fmt.Sprintf(`%s
|
||||
|
||||
See https://handbook.sourcegraph.com/departments/engineering/managed-services/%s for service and infrastructure access details.
|
||||
If you need additional assistance, reach out to #discuss-core-services.`,
|
||||
*config.Description,
|
||||
config.ServiceEnvironmentSlug)
|
||||
config.Description,
|
||||
config.getDocsSlug())
|
||||
}
|
||||
|
||||
if config.ThresholdAggregation != nil {
|
||||
@ -172,15 +197,15 @@ If you need additional assistance, reach out to #discuss-core-services.`,
|
||||
// newThresholdAggregationAlert defines a monitoring alert policy based on a single metric threshold
|
||||
func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID, config *Config) (*Output, error) {
|
||||
// Set some defaults
|
||||
switch config.ServiceKind {
|
||||
switch config.ResourceKind {
|
||||
case CloudRunService:
|
||||
config.ThresholdAggregation.GroupByFields = append([]string{"resource.label.revision_name"}, config.ThresholdAggregation.GroupByFields...)
|
||||
case CloudRunJob:
|
||||
// No defaults
|
||||
case CloudRedis:
|
||||
config.ThresholdAggregation.GroupByFields = append(
|
||||
[]string{"resource.label.revision_name"},
|
||||
config.ThresholdAggregation.GroupByFields...)
|
||||
case CloudRunJob, CloudRedis, URLUptime:
|
||||
// No defaults
|
||||
default:
|
||||
return nil, errors.Newf("invalid service kind %q", config.ServiceKind)
|
||||
return nil, errors.Newf("invalid service kind %q", config.ResourceKind)
|
||||
}
|
||||
|
||||
if config.ThresholdAggregation.Comparison == "" {
|
||||
@ -196,14 +221,33 @@ func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID,
|
||||
Project: pointers.Ptr(config.ProjectID),
|
||||
DisplayName: pointers.Ptr(config.Name),
|
||||
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
|
||||
Content: config.Description,
|
||||
Subject: pointers.Stringf("%s (%s): %s",
|
||||
config.Service.GetName(), config.EnvironmentID, config.Name),
|
||||
|
||||
Content: pointers.Ptr(config.Description),
|
||||
MimeType: pointers.Ptr("text/markdown"),
|
||||
},
|
||||
UserLabels: &map[string]*string{
|
||||
"source": pointers.Ptr("managed-services-platform"),
|
||||
"msp_alert_id": pointers.Ptr(config.ID),
|
||||
"msp_gcp_project": pointers.Ptr(config.ProjectID),
|
||||
"source": pointers.Ptr("managed-services-platform"),
|
||||
"resource_kind": pointers.Ptr(string(config.ResourceKind)),
|
||||
|
||||
"msp_alert_id": pointers.Ptr(config.ID),
|
||||
"msp_service_id": pointers.Ptr(config.Service.ID),
|
||||
"msp_environment_id": pointers.Ptr(config.EnvironmentID),
|
||||
},
|
||||
|
||||
// Notification strategy
|
||||
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
|
||||
AutoClose: pointers.Ptr("86400s"), // 24 hours
|
||||
},
|
||||
NotificationChannels: notificationChannelIDs(config.NotificationChannels),
|
||||
// For now, set all MSP alerts as WARNING. In the future, we should
|
||||
// have different severity levels.
|
||||
// https://github.com/sourcegraph/managed-services/issues/385
|
||||
// Possible values: ["CRITICAL", "ERROR", "WARNING"]
|
||||
Severity: pointers.Ptr("WARNING"),
|
||||
|
||||
// Conditions
|
||||
Combiner: pointers.Ptr("OR"),
|
||||
Conditions: []monitoringalertpolicy.MonitoringAlertPolicyConditions{
|
||||
{
|
||||
@ -212,8 +256,8 @@ func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID,
|
||||
Aggregations: []monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdAggregations{
|
||||
{
|
||||
AlignmentPeriod: pointers.Ptr(config.ThresholdAggregation.Period),
|
||||
PerSeriesAligner: pointers.Ptr(string(config.ThresholdAggregation.Aligner)),
|
||||
CrossSeriesReducer: pointers.Ptr(string(config.ThresholdAggregation.Reducer)),
|
||||
PerSeriesAligner: pointers.NonZeroPtr(string(config.ThresholdAggregation.Aligner)),
|
||||
CrossSeriesReducer: pointers.NonZeroPtr(string(config.ThresholdAggregation.Reducer)),
|
||||
GroupByFields: pointers.Ptr(pointers.Slice(config.ThresholdAggregation.GroupByFields)),
|
||||
},
|
||||
},
|
||||
@ -221,16 +265,24 @@ func newThresholdAggregationAlert(scope constructs.Construct, id resourceid.ID,
|
||||
Duration: pointers.Ptr(config.ThresholdAggregation.Duration),
|
||||
Filter: pointers.Ptr(buildFilter(config)),
|
||||
ThresholdValue: pointers.Float64(config.ThresholdAggregation.Threshold),
|
||||
Trigger: &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
|
||||
Count: pointers.Float64(1),
|
||||
},
|
||||
Trigger: func() *monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger {
|
||||
switch config.ThresholdAggregation.Trigger {
|
||||
case TriggerKindAllInViolation:
|
||||
return &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
|
||||
Percent: pointers.Float64(100),
|
||||
}
|
||||
|
||||
case TriggerKindAnyViolation:
|
||||
fallthrough
|
||||
default:
|
||||
return &monitoringalertpolicy.MonitoringAlertPolicyConditionsConditionThresholdTrigger{
|
||||
Count: pointers.Float64(1),
|
||||
}
|
||||
}
|
||||
}(),
|
||||
},
|
||||
},
|
||||
},
|
||||
AlertStrategy: &monitoringalertpolicy.MonitoringAlertPolicyAlertStrategy{
|
||||
AutoClose: pointers.Ptr("604800s"),
|
||||
},
|
||||
NotificationChannels: notificationChannelIDs(config.NotificationChannels),
|
||||
})
|
||||
return &Output{}, nil
|
||||
}
|
||||
@ -246,21 +298,26 @@ func buildFilter(config *Config) string {
|
||||
// config.ThresholdAggregation.Filters is a map.
|
||||
sort.Strings(filters)
|
||||
|
||||
switch config.ServiceKind {
|
||||
switch config.ResourceKind {
|
||||
case CloudRunService:
|
||||
filters = append(filters,
|
||||
`resource.type = "cloud_run_revision"`,
|
||||
fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ServiceName),
|
||||
fmt.Sprintf(`resource.labels.service_name = "%s"`, config.ResourceName),
|
||||
)
|
||||
case CloudRunJob:
|
||||
filters = append(filters,
|
||||
`resource.type = "cloud_run_job"`,
|
||||
fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ServiceName),
|
||||
fmt.Sprintf(`resource.labels.job_name = "%s"`, config.ResourceName),
|
||||
)
|
||||
case CloudRedis:
|
||||
filters = append(filters,
|
||||
`resource.type = "redis_instance"`,
|
||||
fmt.Sprintf(`resource.labels.instance_id = "%s"`, config.ServiceName),
|
||||
fmt.Sprintf(`resource.labels.instance_id = "%s"`, config.ResourceName),
|
||||
)
|
||||
case URLUptime:
|
||||
filters = append(filters,
|
||||
`resource.type = "uptime_url"`,
|
||||
fmt.Sprintf(`metric.labels.check_id = "%s"`, config.ResourceName),
|
||||
)
|
||||
}
|
||||
|
||||
@ -282,7 +339,7 @@ func newResponseCodeMetricAlert(scope constructs.Construct, id resourceid.ID, co
|
||||
Project: pointers.Ptr(config.ProjectID),
|
||||
DisplayName: pointers.Ptr(fmt.Sprintf("High Ratio of %s Responses", config.Name)),
|
||||
Documentation: &monitoringalertpolicy.MonitoringAlertPolicyDocumentation{
|
||||
Content: config.Description,
|
||||
Content: pointers.Ptr(config.Description),
|
||||
MimeType: pointers.Ptr("text/markdown"),
|
||||
},
|
||||
Combiner: pointers.Ptr("OR"),
|
||||
|
||||
@ -17,8 +17,8 @@ func TestBuildFilter(t *testing.T) {
|
||||
{
|
||||
name: "Service Metric",
|
||||
config: Config{
|
||||
ServiceName: "my-service-name",
|
||||
ServiceKind: CloudRunService,
|
||||
ResourceName: "my-service-name",
|
||||
ResourceKind: CloudRunService,
|
||||
ThresholdAggregation: &ThresholdAggregation{
|
||||
Filters: map[string]string{
|
||||
"metric.type": "run.googleapis.com/container/startup_latencies",
|
||||
@ -30,8 +30,8 @@ func TestBuildFilter(t *testing.T) {
|
||||
{
|
||||
name: "Job Metric",
|
||||
config: Config{
|
||||
ServiceName: "my-job-name",
|
||||
ServiceKind: CloudRunJob,
|
||||
ResourceName: "my-job-name",
|
||||
ResourceKind: CloudRunJob,
|
||||
ThresholdAggregation: &ThresholdAggregation{
|
||||
Filters: map[string]string{
|
||||
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
|
||||
@ -127,7 +127,7 @@ func TestResponseCodeBuilder(t *testing.T) {
|
||||
} {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := responseCodeBuilder(&Config{
|
||||
ServiceName: "test-service",
|
||||
ResourceName: "test-service",
|
||||
ResponseCodeMetric: &tc.ResponseCodeMetric,
|
||||
})
|
||||
tc.want.Equal(t, got)
|
||||
|
||||
@ -127,8 +127,11 @@ func (r *Renderer) RenderEnvironment(
|
||||
return nil, errors.Wrap(err, "failed to create cloudrun stack")
|
||||
}
|
||||
if _, err := monitoring.NewStack(stacks, monitoring.Variables{
|
||||
ProjectID: *projectOutput.Project.ProjectId(),
|
||||
Service: svc,
|
||||
ProjectID: *projectOutput.Project.ProjectId(),
|
||||
Service: svc,
|
||||
EnvironmentCategory: env.Category,
|
||||
EnvironmentID: env.ID,
|
||||
|
||||
Monitoring: monitoringSpec,
|
||||
MaxInstanceCount: func() *int {
|
||||
if env.Instances.Scaling != nil {
|
||||
@ -136,13 +139,11 @@ func (r *Renderer) RenderEnvironment(
|
||||
}
|
||||
return nil
|
||||
}(),
|
||||
RedisInstanceID: cloudrunOutput.RedisInstanceID,
|
||||
ServiceStartupProbe: pointers.DerefZero(env.EnvironmentServiceSpec).StatupProbe,
|
||||
|
||||
// Notification configuration
|
||||
EnvironmentCategory: env.Category,
|
||||
EnvironmentID: env.ID,
|
||||
Owners: svc.Owners,
|
||||
ExternalDomain: pointers.DerefZero(env.EnvironmentServiceSpec).Domain,
|
||||
ServiceAuthentication: pointers.DerefZero(env.EnvironmentServiceSpec).Authentication,
|
||||
DiagnosticsSecret: cloudrunOutput.DiagnosticsSecret,
|
||||
RedisInstanceID: cloudrunOutput.RedisInstanceID,
|
||||
ServiceHealthProbes: pointers.DerefZero(env.EnvironmentServiceSpec).HealthProbes,
|
||||
}); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to create monitoring stack")
|
||||
}
|
||||
|
||||
@ -94,6 +94,7 @@ type EnvironmentSpec struct {
|
||||
func (s EnvironmentSpec) Validate() []error {
|
||||
var errs []error
|
||||
|
||||
// Validate basic configuration
|
||||
if s.ID == "" {
|
||||
errs = append(errs, errors.New("id is required"))
|
||||
}
|
||||
@ -111,9 +112,14 @@ func (s EnvironmentSpec) Validate() []error {
|
||||
return append(errs, errors.Wrap(err, "category"))
|
||||
}
|
||||
|
||||
// Validate other shared sub-specs
|
||||
errs = append(errs, s.Deploy.Validate()...)
|
||||
errs = append(errs, s.Resources.Validate()...)
|
||||
errs = append(errs, s.Instances.Validate()...)
|
||||
|
||||
// Validate service-specific specs
|
||||
errs = append(errs, s.EnvironmentServiceSpec.Validate()...)
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
@ -215,17 +221,12 @@ type EnvironmentServiceSpec struct {
|
||||
//
|
||||
// Only supported for services of 'kind: service'.
|
||||
Domain *EnvironmentServiceDomainSpec `yaml:"domain,omitempty"`
|
||||
// StatupProbe is provisioned by default. It can be disabled with the
|
||||
// 'disabled' field. Probes are made to the MSP-standard '/-/healthz'
|
||||
// endpoint.
|
||||
// HealthProbes configures both startup and continuous liveness probes.
|
||||
// If nil or explicitly disabled, no MSP-standard '/-/healthz' probes will
|
||||
// be configured.
|
||||
//
|
||||
// Only supported for services of 'kind: service'.
|
||||
StatupProbe *EnvironmentServiceStartupProbeSpec `yaml:"startupProbe,omitempty"`
|
||||
// LivenessProbe is only provisioned if this field is set. Probes are made
|
||||
// to the MSP-standard '/-/healthz' endpoint.
|
||||
//
|
||||
// Only supported for services of 'kind: service'.
|
||||
LivenessProbe *EnvironmentServiceLivenessProbeSpec `yaml:"livenessProbe,omitempty"`
|
||||
HealthProbes *EnvironmentServiceHealthProbesSpec `yaml:"healthProbes,omitempty"`
|
||||
// Authentication configures access to the service. By default, the service
|
||||
// is publicly available, and the service should handle any required
|
||||
// authentication by itself. Set this field to an empty value to not
|
||||
@ -242,6 +243,15 @@ type EnvironmentServiceSpec struct {
|
||||
Authentication *EnvironmentServiceAuthenticationSpec `yaml:"authentication,omitempty"`
|
||||
}
|
||||
|
||||
func (s *EnvironmentServiceSpec) Validate() []error {
|
||||
if s == nil {
|
||||
return nil
|
||||
}
|
||||
var errs []error
|
||||
errs = append(errs, s.HealthProbes.Validate()...)
|
||||
return errs
|
||||
}
|
||||
|
||||
type EnvironmentServiceDomainSpec struct {
|
||||
// Type is one of 'none' or 'cloudflare'. If empty, defaults to 'none'.
|
||||
Type EnvironmentDomainType `yaml:"type"`
|
||||
@ -389,61 +399,119 @@ type EnvironmentInstancesScalingSpec struct {
|
||||
MaxCount *int `yaml:"maxCount,omitempty"`
|
||||
}
|
||||
|
||||
type EnvironmentServiceLivenessProbeSpec struct {
|
||||
// Timeout configures the period of time after which the probe times out,
|
||||
// in seconds.
|
||||
//
|
||||
// Defaults to 1 second.
|
||||
Timeout *int `yaml:"timeout,omitempty"`
|
||||
// Interval configures the interval, in seconds, at which to
|
||||
// probe the deployed service.
|
||||
//
|
||||
// Defaults to 1 second.
|
||||
Interval *int `yaml:"interval,omitempty"`
|
||||
}
|
||||
|
||||
type EnvironmentServiceAuthenticationSpec struct {
|
||||
// Sourcegraph enables access to everyone in the sourcegraph.com GSuite
|
||||
// domain.
|
||||
Sourcegraph *bool `yaml:"sourcegraph,omitempty"`
|
||||
}
|
||||
|
||||
type EnvironmentServiceStartupProbeSpec struct {
|
||||
// Disabled configures whether the MSP startup probe should be disabled.
|
||||
// We recommend disabling it when creating a service, and re-enabling it
|
||||
// once the service is healthy.
|
||||
type EnvironmentServiceHealthProbesSpec struct {
|
||||
// HealthzProbes configures whether the MSP-standard '/-/healthz' service
|
||||
// probes should be enabled. We recommend disabling it when creating a
|
||||
// service, and re-enabling it once the service is confirmed to be deployed
|
||||
// and healthy. When disabling, you should explicitly set
|
||||
// 'healthzProbes: false'.
|
||||
//
|
||||
// - When disabled, the default probe is a very generous one that waits 240s
|
||||
// for your service to respond with anything at all on '/'
|
||||
// - When enabled, the MSP-standard '/-/healthz' diagnostic check is used
|
||||
// with a generated diagnostics secret.
|
||||
// with a generated diagnostics secret enforcing Timeout and Interval.
|
||||
//
|
||||
// This prevents the first Terraform apply from failing if your healthcheck
|
||||
// is comprehensive.
|
||||
Disabled *bool `yaml:"disabled,omitempty"`
|
||||
// Disabling the probe on first startup prevents the first Terraform apply
|
||||
// from failing if your healthcheck is comprehensive, or if you haven't
|
||||
// implemented '/-/healthz' yet.
|
||||
HealthzProbes *bool `yaml:"healthzProbes,omitempty"`
|
||||
|
||||
// Timeout configures the period of time after which the probe times out,
|
||||
// in seconds.
|
||||
// Timeout configures the period of time after which a health probe times
|
||||
// out, in seconds.
|
||||
//
|
||||
// Defaults to 1 second.
|
||||
// Defaults to 3 seconds.
|
||||
Timeout *int `yaml:"timeout,omitempty"`
|
||||
// Interval configures the frequency, in seconds, at which to
|
||||
// probe the deployed service. Must be greater than or equal to timeout.
|
||||
|
||||
// StartupInterval configures the frequency, in seconds, at which to
|
||||
// probe the deployed service on startup. Must be greater than or equal to
|
||||
// timeout.
|
||||
//
|
||||
// Defaults to timeout.
|
||||
Interval *int `yaml:"interval,omitempty"`
|
||||
StartupInterval *int `yaml:"startupInterval,omitempty"`
|
||||
|
||||
// LivenessInterval configures the frequency, in seconds, at which to
|
||||
// probe the deployed service after startup to continuously check its health.
|
||||
// Must be greater than or equal to timeout.
|
||||
//
|
||||
// Defaults to timeout * 10.
|
||||
LivenessInterval *int `yaml:"livenessInterval,omitempty"`
|
||||
}
|
||||
|
||||
func (s *EnvironmentServiceStartupProbeSpec) MaximumLatencySeconds() int {
|
||||
func (s *EnvironmentServiceHealthProbesSpec) Validate() []error {
|
||||
if s == nil {
|
||||
s = &EnvironmentServiceStartupProbeSpec{}
|
||||
return nil
|
||||
}
|
||||
if pointers.DerefZero(s.Disabled) {
|
||||
var errs []error
|
||||
if !s.UseHealthzProbes() {
|
||||
if s.Timeout != nil || s.StartupInterval != nil || s.LivenessInterval != nil {
|
||||
errs = append(errs,
|
||||
errors.New("timeout, startupInterval and livenessInterval can only be configured when healthzProbes is enabled"))
|
||||
}
|
||||
|
||||
// Nothing else to check
|
||||
return errs
|
||||
}
|
||||
|
||||
if s.GetTimeoutSeconds() > s.GetStartupIntervalSeconds() {
|
||||
errs = append(errs, errors.New("startupInterval must be greater than or equal to timeout"))
|
||||
}
|
||||
if s.GetTimeoutSeconds() > s.GetLivenessIntervalSeconds() {
|
||||
errs = append(errs, errors.New("livenessInterval must be greater than or equal to timeout"))
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
// UseHealthzProbes indicates whether the MSP-standard '/-/healthz' probes
|
||||
// with diagnostics secrets should be used.
|
||||
func (s *EnvironmentServiceHealthProbesSpec) UseHealthzProbes() bool {
|
||||
// No config == disabled
|
||||
if s == nil {
|
||||
return false
|
||||
}
|
||||
// If config is provided, must be explicitly disabled with 'healthzProbes: false'
|
||||
return pointers.Deref(s.HealthzProbes, true)
|
||||
}
|
||||
|
||||
// MaximumStartupLatencySeconds infers the overall maximum latency for a
|
||||
// healthcheck to return healthy when the service is starting up.
|
||||
func (s *EnvironmentServiceHealthProbesSpec) MaximumStartupLatencySeconds() int {
|
||||
if !s.UseHealthzProbes() {
|
||||
return 240 // maximum Cloud Run timeout
|
||||
}
|
||||
// Maximum startup latency is retries x interval.
|
||||
const maxRetries = 3
|
||||
return maxRetries * pointers.Deref(s.Interval, 1)
|
||||
return maxRetries * s.GetStartupIntervalSeconds()
|
||||
}
|
||||
|
||||
// GetStartupIntervalSeconds returns the configured value, the default, or 0 if the spec is nil.
|
||||
func (s *EnvironmentServiceHealthProbesSpec) GetStartupIntervalSeconds() int {
|
||||
if s == nil {
|
||||
return 0
|
||||
}
|
||||
return pointers.Deref(s.StartupInterval, s.GetTimeoutSeconds())
|
||||
}
|
||||
|
||||
// GetLivenessIntervalSeconds returns the configured value, the default, or 0 if the spec is nil.
|
||||
func (s *EnvironmentServiceHealthProbesSpec) GetLivenessIntervalSeconds() int {
|
||||
if s == nil {
|
||||
return 0
|
||||
}
|
||||
return pointers.Deref(s.LivenessInterval, s.GetTimeoutSeconds()*10) // 10x timeout default
|
||||
}
|
||||
|
||||
// GetTimeoutSeconds returns the configured value, the default, or 0 if the spec is nil.
|
||||
func (s *EnvironmentServiceHealthProbesSpec) GetTimeoutSeconds() int {
|
||||
if s == nil {
|
||||
return 0
|
||||
}
|
||||
return pointers.Deref(s.Timeout, 3)
|
||||
}
|
||||
|
||||
type EnvironmentJobSpec struct {
|
||||
|
||||
@ -37,6 +37,11 @@ func (s ServiceSpec) GetName() string {
|
||||
return pointers.Deref(s.Name, s.ID)
|
||||
}
|
||||
|
||||
// GetKind returns Kind if configured, otherwise the default (ServiceKindService).
|
||||
func (s ServiceSpec) GetKind() ServiceKind {
|
||||
return pointers.Deref(s.Kind, ServiceKindService)
|
||||
}
|
||||
|
||||
func (s ServiceSpec) Validate() []error {
|
||||
var errs []error
|
||||
|
||||
|
||||
@ -37,7 +37,8 @@ import (
|
||||
)
|
||||
|
||||
type CrossStackOutput struct {
|
||||
RedisInstanceID *string
|
||||
DiagnosticsSecret *random.Output
|
||||
RedisInstanceID *string
|
||||
}
|
||||
|
||||
type Variables struct {
|
||||
@ -291,7 +292,8 @@ func NewStack(stacks *stack.Set, vars Variables) (crossStackOutput *CrossStackOu
|
||||
locals.Add("image_tag", *imageTag.StringValue,
|
||||
"Resolved tag of service image to deploy")
|
||||
return &CrossStackOutput{
|
||||
RedisInstanceID: redisInstanceID,
|
||||
DiagnosticsSecret: diagnosticsSecret,
|
||||
RedisInstanceID: redisInstanceID,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
||||
@ -161,17 +161,10 @@ func (b *serviceBuilder) Build(stack cdktf.TerraformStack, vars builder.Variable
|
||||
|
||||
// Do healthchecks with authorization based on MSP convention.
|
||||
StartupProbe: func() *cloudrunv2service.CloudRunV2ServiceTemplateContainersStartupProbe {
|
||||
// Default: enabled
|
||||
if vars.Environment.StatupProbe != nil &&
|
||||
pointers.Deref(vars.Environment.StatupProbe.Disabled, false) {
|
||||
if !vars.Environment.HealthProbes.UseHealthzProbes() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set zero value for ease of reference
|
||||
if vars.Environment.StatupProbe == nil {
|
||||
vars.Environment.StatupProbe = &spec.EnvironmentServiceStartupProbeSpec{}
|
||||
}
|
||||
|
||||
return &cloudrunv2service.CloudRunV2ServiceTemplateContainersStartupProbe{
|
||||
HttpGet: &cloudrunv2service.CloudRunV2ServiceTemplateContainersStartupProbeHttpGet{
|
||||
Path: pointers.Ptr(builder.HealthCheckEndpoint),
|
||||
@ -181,16 +174,16 @@ func (b *serviceBuilder) Build(stack cdktf.TerraformStack, vars builder.Variable
|
||||
}},
|
||||
},
|
||||
InitialDelaySeconds: pointers.Float64(0),
|
||||
TimeoutSeconds: pointers.Float64(pointers.Deref(vars.Environment.StatupProbe.Timeout, 1)),
|
||||
PeriodSeconds: pointers.Float64(pointers.Deref(vars.Environment.StatupProbe.Interval, 1)),
|
||||
TimeoutSeconds: pointers.Float64(vars.Environment.HealthProbes.GetTimeoutSeconds()),
|
||||
PeriodSeconds: pointers.Float64(vars.Environment.HealthProbes.GetStartupIntervalSeconds()),
|
||||
FailureThreshold: pointers.Float64(3),
|
||||
}
|
||||
}(),
|
||||
LivenessProbe: func() *cloudrunv2service.CloudRunV2ServiceTemplateContainersLivenessProbe {
|
||||
// Default: disabled
|
||||
if vars.Environment.LivenessProbe == nil {
|
||||
if !vars.Environment.HealthProbes.UseHealthzProbes() {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &cloudrunv2service.CloudRunV2ServiceTemplateContainersLivenessProbe{
|
||||
HttpGet: &cloudrunv2service.CloudRunV2ServiceTemplateContainersLivenessProbeHttpGet{
|
||||
Path: pointers.Ptr(builder.HealthCheckEndpoint),
|
||||
@ -199,9 +192,9 @@ func (b *serviceBuilder) Build(stack cdktf.TerraformStack, vars builder.Variable
|
||||
Value: pointers.Ptr(fmt.Sprintf("Bearer %s", vars.DiagnosticsSecret.HexValue)),
|
||||
}},
|
||||
},
|
||||
TimeoutSeconds: pointers.Float64(pointers.Deref(vars.Environment.LivenessProbe.Timeout, 1)),
|
||||
PeriodSeconds: pointers.Float64(pointers.Deref(vars.Environment.LivenessProbe.Interval, 1)),
|
||||
FailureThreshold: pointers.Float64(2),
|
||||
TimeoutSeconds: pointers.Float64(vars.Environment.HealthProbes.GetTimeoutSeconds()),
|
||||
PeriodSeconds: pointers.Float64(vars.Environment.HealthProbes.GetLivenessIntervalSeconds()),
|
||||
FailureThreshold: pointers.Float64(3),
|
||||
}
|
||||
}(),
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ go_library(
|
||||
"//dev/managedservicesplatform/googlesecretsmanager",
|
||||
"//dev/managedservicesplatform/internal/resource/alertpolicy",
|
||||
"//dev/managedservicesplatform/internal/resource/gsmsecret",
|
||||
"//dev/managedservicesplatform/internal/resource/random",
|
||||
"//dev/managedservicesplatform/internal/resourceid",
|
||||
"//dev/managedservicesplatform/internal/stack",
|
||||
"//dev/managedservicesplatform/internal/stack/options/googleprovider",
|
||||
@ -22,6 +23,7 @@ go_library(
|
||||
"//lib/pointers",
|
||||
"@com_github_hashicorp_terraform_cdk_go_cdktf//:cdktf",
|
||||
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringnotificationchannel",
|
||||
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_google//monitoringuptimecheckconfig",
|
||||
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_opsgenie//apiintegration",
|
||||
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_opsgenie//dataopsgenieteam",
|
||||
"@com_github_sourcegraph_managed_services_platform_cdktf_gen_slack//conversation",
|
||||
|
||||
@ -7,6 +7,7 @@ import (
|
||||
"github.com/hashicorp/terraform-cdk-go/cdktf"
|
||||
|
||||
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringnotificationchannel"
|
||||
"github.com/sourcegraph/managed-services-platform-cdktf/gen/google/monitoringuptimecheckconfig"
|
||||
opsgenieintegration "github.com/sourcegraph/managed-services-platform-cdktf/gen/opsgenie/apiintegration"
|
||||
"github.com/sourcegraph/managed-services-platform-cdktf/gen/opsgenie/dataopsgenieteam"
|
||||
slackconversation "github.com/sourcegraph/managed-services-platform-cdktf/gen/slack/conversation"
|
||||
@ -14,6 +15,7 @@ import (
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/googlesecretsmanager"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/alertpolicy"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/gsmsecret"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resource/random"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/resourceid"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack"
|
||||
"github.com/sourcegraph/sourcegraph/dev/managedservicesplatform/internal/stack/options/googleprovider"
|
||||
@ -64,18 +66,8 @@ import (
|
||||
type CrossStackOutput struct{}
|
||||
|
||||
type Variables struct {
|
||||
ProjectID string
|
||||
Service spec.ServiceSpec
|
||||
Monitoring spec.MonitoringSpec
|
||||
|
||||
// MaxInstanceCount informs service scaling alerts.
|
||||
MaxInstanceCount *int
|
||||
// If Redis is enabled we configure alerts for it
|
||||
RedisInstanceID *string
|
||||
// ServiceStartupProbe is used to determine the threshold for service
|
||||
// startup latency alerts.
|
||||
ServiceStartupProbe *spec.EnvironmentServiceStartupProbeSpec
|
||||
|
||||
ProjectID string
|
||||
Service spec.ServiceSpec
|
||||
// EnvironmentCategory dictates what kind of notifications are set up:
|
||||
//
|
||||
// 1. 'test' services only generate Slack notifications.
|
||||
@ -90,9 +82,22 @@ type Variables struct {
|
||||
EnvironmentCategory spec.EnvironmentCategory
|
||||
// EnvironmentID is the name of the service environment.
|
||||
EnvironmentID string
|
||||
// Owners is a list of team names. Each owner MUST correspond to the name
|
||||
// of a team in Opsgenie.
|
||||
Owners []string
|
||||
|
||||
Monitoring spec.MonitoringSpec
|
||||
// MaxInstanceCount informs service scaling alerts.
|
||||
MaxInstanceCount *int
|
||||
// ExternalDomain informs external health checks on the service domain.
|
||||
ExternalDomain *spec.EnvironmentServiceDomainSpec
|
||||
// ServiceAuthentication informs external health checks on the service
|
||||
// domain. Currently, any configuration will disable external health checks.
|
||||
ServiceAuthentication *spec.EnvironmentServiceAuthenticationSpec
|
||||
// DiagnosticsSecret is used to configure external health checks.
|
||||
DiagnosticsSecret *random.Output
|
||||
// If Redis is enabled we configure alerts for it
|
||||
RedisInstanceID *string
|
||||
// ServiceHealthProbes is used to determine the threshold for service
|
||||
// startup latency alerts.
|
||||
ServiceHealthProbes *spec.EnvironmentServiceHealthProbesSpec
|
||||
}
|
||||
|
||||
const StackName = "monitoring"
|
||||
@ -127,7 +132,7 @@ func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) {
|
||||
// case spec.EnvironmentCategoryInternal, spec.EnvironmentCategoryExternal:
|
||||
// opsgenieAlerts = true
|
||||
// }
|
||||
for i, owner := range vars.Owners {
|
||||
for i, owner := range vars.Service.Owners {
|
||||
// Use index because Opsgenie team names have lax character requirements
|
||||
id := id.Group("opsgenie_owner_%d", i)
|
||||
// Opsgenie team corresponding to owner must exist
|
||||
@ -251,7 +256,7 @@ func NewStack(stacks *stack.Set, vars Variables) (*CrossStackOutput, error) {
|
||||
return nil, errors.Wrap(err, "failed to create common alerts")
|
||||
}
|
||||
|
||||
switch pointers.Deref(vars.Service.Kind, spec.ServiceKindService) {
|
||||
switch vars.Service.GetKind() {
|
||||
case spec.ServiceKindService:
|
||||
if err = createServiceAlerts(stack, id.Group("service"), vars, channels); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to create service alerts")
|
||||
@ -287,16 +292,23 @@ func createCommonAlerts(
|
||||
) error {
|
||||
// Convert a spec.ServiceKind into a alertpolicy.ServiceKind
|
||||
serviceKind := alertpolicy.CloudRunService
|
||||
kind := pointers.Deref(vars.Service.Kind, spec.ServiceKindService)
|
||||
kind := vars.Service.GetKind()
|
||||
if kind == spec.ServiceKindJob {
|
||||
serviceKind = alertpolicy.CloudRunJob
|
||||
}
|
||||
|
||||
for _, config := range []alertpolicy.Config{
|
||||
// Iterate over a list of common alert configurations. Custom struct defines
|
||||
// the fields we expect to vary between each.
|
||||
for _, config := range []struct {
|
||||
ID string
|
||||
Name string
|
||||
Description string
|
||||
ThresholdAggregation *alertpolicy.ThresholdAggregation
|
||||
}{
|
||||
{
|
||||
ID: "cpu",
|
||||
Name: "High Container CPU Utilization",
|
||||
Description: pointers.Ptr("High CPU Usage - it may be neccessary to reduce load or increase CPU allocation"),
|
||||
Description: "High CPU Usage - it may be neccessary to reduce load or increase CPU allocation",
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "run.googleapis.com/container/cpu/utilizations"},
|
||||
Aligner: alertpolicy.MonitoringAlignPercentile99,
|
||||
@ -308,7 +320,7 @@ func createCommonAlerts(
|
||||
{
|
||||
ID: "memory",
|
||||
Name: "High Container Memory Utilization",
|
||||
Description: pointers.Ptr("High Memory Usage - it may be neccessary to reduce load or increase memory allocation"),
|
||||
Description: "High Memory Usage - it may be neccessary to reduce load or increase memory allocation",
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "run.googleapis.com/container/memory/utilizations"},
|
||||
Aligner: alertpolicy.MonitoringAlignPercentile99,
|
||||
@ -320,7 +332,7 @@ func createCommonAlerts(
|
||||
{
|
||||
ID: "startup",
|
||||
Name: "Container Startup Latency",
|
||||
Description: pointers.Ptr("Service containers are taking too long to start up - something may be blocking startup"),
|
||||
Description: "Service containers are taking longer than configured timeouts to start up.",
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "run.googleapis.com/container/startup_latencies"},
|
||||
Aligner: alertpolicy.MonitoringAlignPercentile99,
|
||||
@ -334,17 +346,28 @@ func createCommonAlerts(
|
||||
}
|
||||
// otherwise, use the startup probe configuration to
|
||||
// determine the threshold for how long we should be waiting
|
||||
return float64(vars.ServiceStartupProbe.MaximumLatencySeconds()) * 1000 // ms
|
||||
return float64(vars.ServiceHealthProbes.MaximumStartupLatencySeconds()) * 1000 // ms
|
||||
}(),
|
||||
},
|
||||
},
|
||||
} {
|
||||
config.ServiceEnvironmentSlug = fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID)
|
||||
config.ProjectID = vars.ProjectID
|
||||
config.ServiceName = vars.Service.ID
|
||||
config.ServiceKind = serviceKind
|
||||
config.NotificationChannels = channels
|
||||
if _, err := alertpolicy.New(stack, id, &config); err != nil {
|
||||
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
|
||||
// Resource we are targeting in this helper
|
||||
ResourceKind: serviceKind,
|
||||
ResourceName: vars.Service.ID,
|
||||
|
||||
// Alert policy
|
||||
ID: config.ID,
|
||||
Name: config.Name,
|
||||
Description: config.Description,
|
||||
ThresholdAggregation: config.ThresholdAggregation,
|
||||
|
||||
// Shared configuration
|
||||
Service: vars.Service,
|
||||
EnvironmentID: vars.EnvironmentID,
|
||||
ProjectID: vars.ProjectID,
|
||||
NotificationChannels: channels,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
@ -361,14 +384,15 @@ func createServiceAlerts(
|
||||
// Only provision if MaxCount is specified above 5
|
||||
if pointers.Deref(vars.MaxInstanceCount, 0) > 5 {
|
||||
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
|
||||
ServiceEnvironmentSlug: fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID),
|
||||
Service: vars.Service,
|
||||
EnvironmentID: vars.EnvironmentID,
|
||||
|
||||
ID: "instance_count",
|
||||
Name: "Container Instance Count",
|
||||
Description: pointers.Ptr("There are a lot of Cloud Run instances running - we may need to increase per-instance requests make make sure we won't hit the configured max instance count"),
|
||||
ProjectID: vars.ProjectID,
|
||||
ServiceName: vars.Service.ID,
|
||||
ServiceKind: alertpolicy.CloudRunService,
|
||||
ID: "instance_count",
|
||||
Name: "Container Instance Count",
|
||||
Description: "There are a lot of Cloud Run instances running - we may need to increase per-instance requests make make sure we won't hit the configured max instance count",
|
||||
ProjectID: vars.ProjectID,
|
||||
ResourceName: vars.Service.ID,
|
||||
ResourceKind: alertpolicy.CloudRunService,
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "run.googleapis.com/container/instance_count"},
|
||||
Aligner: alertpolicy.MonitoringAlignMax,
|
||||
@ -377,9 +401,100 @@ func createServiceAlerts(
|
||||
},
|
||||
NotificationChannels: channels,
|
||||
}); err != nil {
|
||||
return err
|
||||
return errors.Wrap(err, "instance_count")
|
||||
}
|
||||
}
|
||||
|
||||
// If an external DNS name is provisioned, use it to check service availability
|
||||
// from outside Cloud Run. The service must not use IAM auth.
|
||||
if vars.ServiceAuthentication == nil && vars.ExternalDomain.GetDNSName() != "" {
|
||||
if err := createExternalHealthcheckAlert(stack, id, vars, channels); err != nil {
|
||||
return errors.Wrap(err, "external_healthcheck")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func createExternalHealthcheckAlert(
|
||||
stack cdktf.TerraformStack,
|
||||
id resourceid.ID,
|
||||
vars Variables,
|
||||
channels []monitoringnotificationchannel.MonitoringNotificationChannel,
|
||||
) error {
|
||||
var (
|
||||
healthcheckPath = "/"
|
||||
healthcheckHeaders = map[string]*string{}
|
||||
)
|
||||
// Only use MSP runtime standards if we know the service supports it.
|
||||
if vars.ServiceHealthProbes.UseHealthzProbes() {
|
||||
healthcheckPath = "/-/healthz"
|
||||
healthcheckHeaders = map[string]*string{
|
||||
"Authorization": pointers.Stringf("Bearer %s", vars.DiagnosticsSecret.HexValue),
|
||||
}
|
||||
}
|
||||
|
||||
externalDNS := vars.ExternalDomain.GetDNSName()
|
||||
uptimeCheck := monitoringuptimecheckconfig.NewMonitoringUptimeCheckConfig(stack, id.TerraformID("external_uptime_check"), &monitoringuptimecheckconfig.MonitoringUptimeCheckConfigConfig{
|
||||
Project: &vars.ProjectID,
|
||||
DisplayName: pointers.Stringf("External Uptime Check for %s", externalDNS),
|
||||
|
||||
// https://cloud.google.com/monitoring/api/resources#tag_uptime_url
|
||||
MonitoredResource: &monitoringuptimecheckconfig.MonitoringUptimeCheckConfigMonitoredResource{
|
||||
Type: pointers.Ptr("uptime_url"),
|
||||
Labels: &map[string]*string{
|
||||
"project_id": &vars.ProjectID,
|
||||
"host": &externalDNS,
|
||||
},
|
||||
},
|
||||
|
||||
// 1 to 60 seconds.
|
||||
Timeout: pointers.Stringf("%ds", vars.ServiceHealthProbes.GetTimeoutSeconds()),
|
||||
// Only supported values are 60s (1 minute), 300s (5 minutes),
|
||||
// 600s (10 minutes), and 900s (15 minutes)
|
||||
Period: pointers.Ptr("60s"),
|
||||
HttpCheck: &monitoringuptimecheckconfig.MonitoringUptimeCheckConfigHttpCheck{
|
||||
Port: pointers.Float64(443),
|
||||
UseSsl: pointers.Ptr(true),
|
||||
ValidateSsl: pointers.Ptr(true),
|
||||
Path: &healthcheckPath,
|
||||
Headers: &healthcheckHeaders,
|
||||
AcceptedResponseStatusCodes: &[]*monitoringuptimecheckconfig.MonitoringUptimeCheckConfigHttpCheckAcceptedResponseStatusCodes{
|
||||
{
|
||||
StatusClass: pointers.Ptr("STATUS_CLASS_2XX"),
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
|
||||
Service: vars.Service,
|
||||
EnvironmentID: vars.EnvironmentID,
|
||||
|
||||
ID: "external_health_check",
|
||||
Name: "External Uptime Check",
|
||||
Description: fmt.Sprintf("Service is failing to repond on https://%s - this may be expected if the service was recently provisioned or if its external domain has changed.", externalDNS),
|
||||
ProjectID: vars.ProjectID,
|
||||
|
||||
ResourceKind: alertpolicy.URLUptime,
|
||||
ResourceName: *uptimeCheck.UptimeCheckId(),
|
||||
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{
|
||||
"metric.type": "monitoring.googleapis.com/uptime_check/check_passed",
|
||||
},
|
||||
Aligner: alertpolicy.MonitoringAlignFractionTrue,
|
||||
// Checks occur every 60s, in a 300s window if 2/5 fail we are in trouble
|
||||
Period: "300s",
|
||||
Duration: "0s",
|
||||
Comparison: alertpolicy.ComparisonLT,
|
||||
Threshold: 0.4,
|
||||
// Alert when all locations go down
|
||||
Trigger: alertpolicy.TriggerKindAllInViolation,
|
||||
},
|
||||
NotificationChannels: channels,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -391,14 +506,15 @@ func createJobAlerts(
|
||||
) error {
|
||||
// Alert whenever a Cloud Run Job fails
|
||||
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
|
||||
ServiceEnvironmentSlug: fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID),
|
||||
Service: vars.Service,
|
||||
EnvironmentID: vars.EnvironmentID,
|
||||
|
||||
ID: "job_failures",
|
||||
Name: "Cloud Run Job Failures",
|
||||
Description: pointers.Ptr("Failed executions of Cloud Run Job"),
|
||||
ProjectID: vars.ProjectID,
|
||||
ServiceName: vars.Service.ID,
|
||||
ServiceKind: alertpolicy.CloudRunJob,
|
||||
ID: "job_failures",
|
||||
Name: "Cloud Run Job Failures",
|
||||
Description: "Cloud Run Job executions failed",
|
||||
ProjectID: vars.ProjectID,
|
||||
ResourceName: vars.Service.ID,
|
||||
ResourceKind: alertpolicy.CloudRunJob,
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{
|
||||
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
|
||||
@ -426,13 +542,14 @@ func createResponseCodeMetrics(
|
||||
) error {
|
||||
for _, config := range vars.Monitoring.Alerts.ResponseCodeRatios {
|
||||
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
|
||||
ServiceEnvironmentSlug: fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID),
|
||||
Service: vars.Service,
|
||||
EnvironmentID: vars.EnvironmentID,
|
||||
|
||||
ID: config.ID,
|
||||
ProjectID: vars.ProjectID,
|
||||
Name: config.Name,
|
||||
ServiceName: vars.Service.ID,
|
||||
ServiceKind: alertpolicy.CloudRunService,
|
||||
ID: config.ID,
|
||||
ProjectID: vars.ProjectID,
|
||||
Name: config.Name,
|
||||
ResourceName: vars.Service.ID,
|
||||
ResourceKind: alertpolicy.CloudRunService,
|
||||
ResponseCodeMetric: &alertpolicy.ResponseCodeMetric{
|
||||
Code: config.Code,
|
||||
CodeClass: config.CodeClass,
|
||||
@ -455,11 +572,18 @@ func createRedisAlerts(
|
||||
vars Variables,
|
||||
channels []monitoringnotificationchannel.MonitoringNotificationChannel,
|
||||
) error {
|
||||
for _, config := range []alertpolicy.Config{
|
||||
// Iterate over a list of Redis alert configurations. Custom struct defines
|
||||
// the fields we expect to vary between each.
|
||||
for _, config := range []struct {
|
||||
ID string
|
||||
Name string
|
||||
Description string
|
||||
ThresholdAggregation *alertpolicy.ThresholdAggregation
|
||||
}{
|
||||
{
|
||||
ID: "memory",
|
||||
Name: "Cloud Redis - System Memory Utilization",
|
||||
Description: pointers.Ptr("This alert fires if the system memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1."),
|
||||
Description: "Redis System memory utilization is above the set threshold. The utilization is measured on a scale of 0 to 1.",
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/memory/system_memory_usage_ratio"},
|
||||
Aligner: alertpolicy.MonitoringAlignMean,
|
||||
@ -471,7 +595,7 @@ func createRedisAlerts(
|
||||
{
|
||||
ID: "cpu",
|
||||
Name: "Cloud Redis - System CPU Utilization",
|
||||
Description: pointers.Ptr("This alert fires if the Redis Engine CPU Utilization goes above the set threshold. The utilization is measured on a scale of 0 to 1."),
|
||||
Description: "Redis Engine CPU Utilization goes above the set threshold. The utilization is measured on a scale of 0 to 1.",
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "redis.googleapis.com/stats/cpu_utilization_main_thread"},
|
||||
GroupByFields: []string{"resource.label.instance_id", "resource.label.node_id"},
|
||||
@ -484,7 +608,7 @@ func createRedisAlerts(
|
||||
{
|
||||
ID: "failover",
|
||||
Name: "Cloud Redis - Standard Instance Failover",
|
||||
Description: pointers.Ptr("This alert fires if failover occurs for a standard tier instance."),
|
||||
Description: "Instance failover occured for a standard tier Redis instance.",
|
||||
ThresholdAggregation: &alertpolicy.ThresholdAggregation{
|
||||
Filters: map[string]string{"metric.type": "redis.googleapis.com/replication/role"},
|
||||
Aligner: alertpolicy.MonitoringAlignStddev,
|
||||
@ -493,12 +617,23 @@ func createRedisAlerts(
|
||||
},
|
||||
},
|
||||
} {
|
||||
config.ServiceEnvironmentSlug = fmt.Sprintf("%s#%s", vars.Service.ID, vars.EnvironmentID)
|
||||
config.ProjectID = vars.ProjectID
|
||||
config.ServiceName = *vars.RedisInstanceID
|
||||
config.ServiceKind = alertpolicy.CloudRedis
|
||||
config.NotificationChannels = channels
|
||||
if _, err := alertpolicy.New(stack, id, &config); err != nil {
|
||||
if _, err := alertpolicy.New(stack, id, &alertpolicy.Config{
|
||||
// Resource we are targeting in this helper
|
||||
ResourceKind: alertpolicy.CloudRedis,
|
||||
ResourceName: *vars.RedisInstanceID,
|
||||
|
||||
// Alert policy
|
||||
ID: config.ID,
|
||||
Name: config.Name,
|
||||
Description: config.Description,
|
||||
ThresholdAggregation: config.ThresholdAggregation,
|
||||
|
||||
// Shared configuration
|
||||
Service: vars.Service,
|
||||
EnvironmentID: vars.EnvironmentID,
|
||||
ProjectID: vars.ProjectID,
|
||||
NotificationChannels: channels,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
@ -43,6 +43,6 @@ environments:
|
||||
scaling:
|
||||
maxCount: 3
|
||||
minCount: 1
|
||||
startupProbe:
|
||||
healthProbes:
|
||||
# Only enable if your service implements MSP /-/healthz conventions.
|
||||
disabled: true
|
||||
healthzProbes: false
|
||||
|
||||
@ -41,9 +41,9 @@ environments:
|
||||
scaling:
|
||||
maxCount: 3
|
||||
minCount: 1
|
||||
startupProbe:
|
||||
healthProbes:
|
||||
# Only enable if your service implements MSP /-/healthz conventions.
|
||||
disabled: true
|
||||
healthzProbes: false
|
||||
- id: second
|
||||
projectID: msp-example-second-xxxx
|
||||
# TODO: We initially provision in 'test' to make it easy to access the project
|
||||
|
||||
@ -43,6 +43,6 @@ environments:
|
||||
scaling:
|
||||
maxCount: 3
|
||||
minCount: 1
|
||||
startupProbe:
|
||||
healthProbes:
|
||||
# Only enable if your service implements MSP /-/healthz conventions.
|
||||
disabled: true
|
||||
healthzProbes: false
|
||||
|
||||
@ -41,9 +41,9 @@ environments:
|
||||
scaling:
|
||||
maxCount: 3
|
||||
minCount: 1
|
||||
startupProbe:
|
||||
healthProbes:
|
||||
# Only enable if your service implements MSP /-/healthz conventions.
|
||||
disabled: true
|
||||
healthzProbes: false
|
||||
- id: second
|
||||
projectID: msp-example-second-xxxx
|
||||
# TODO: We initially provision in 'test' to make it easy to access the project
|
||||
|
||||
@ -43,6 +43,6 @@ environments:
|
||||
scaling:
|
||||
maxCount: 3
|
||||
minCount: 1
|
||||
startupProbe:
|
||||
healthProbes:
|
||||
# Only enable if your service implements MSP /-/healthz conventions.
|
||||
disabled: true
|
||||
healthzProbes: false
|
||||
|
||||
Loading…
Reference in New Issue
Block a user