diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 409617672a4..4f6ee32248b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -179,10 +179,12 @@ Dockerfile @sourcegraph/distribution /enterprise/docs/deployment.md @sourcegraph/distribution **/build.sh @sourcegraph/distribution /cmd/frontend/envvar @sourcegraph/distribution +/cmd/frontend/graphqlbackend/site_monitoring* @sourcegraph/distribution /cmd/server @sourcegraph/distribution /internal/conf @slimsag /internal/db/confdb @slimsag /internal/db/globalstatedb @slimsag +/internal/prometheusutil @sourcegraph/distribution /enterprise/docs @sourcegraph/distribution /.buildkite @sourcegraph/distribution @ggilmore ## Regression testing diff --git a/cmd/frontend/graphqlbackend/schema.go b/cmd/frontend/graphqlbackend/schema.go index 99b940e974f..91ef92f2725 100644 --- a/cmd/frontend/graphqlbackend/schema.go +++ b/cmd/frontend/graphqlbackend/schema.go @@ -3330,6 +3330,19 @@ type Site implements SettingsSubject { # Months of history (based on current UTC time). months: Int ): CodeIntelUsageStatistics! + # Monitoring overview for this site. + # + # Note: This is primarily used for displaying recently-fired alerts in the web app. If your intent + # is to monitor Sourcegraph, it is better to configure alerting or query Prometheus directly in + # order to ensure that if the frontend goes down you still receive alerts: + # + # Configure alerting: https://docs.sourcegraph.com/admin/observability/alerting + # Query Prometheus directly: https://docs.sourcegraph.com/admin/observability/alerting_custom_consumption + # + monitoringStatistics( + # Days of history (based on current UTC time). + days: Int + ): MonitoringStatistics! } # The configuration for a site. @@ -3630,6 +3643,26 @@ type DeploymentConfiguration { siteID: String } +# Monitoring overview. +type MonitoringStatistics { + # Alerts fired in this time span. + alerts: [MonitoringAlert!]! +} + +# A high-level monitoring alert, for details see https://docs.sourcegraph.com/admin/observability/metrics_guide#high-level-alerting-metrics +type MonitoringAlert { + # End time of this event, which describes the past 12h of recorded data. + timestamp: DateTime! + # Name of alert that the service fired. + name: String! + # Name of the service that fired the alert. + serviceName: String! + # Average percentage of time (between [0, 1]) that the event was firing over the 12h of recorded data. e.g. + # 1.0 if it was firing 100% of the time on average during that 12h window, 0.5 if it was firing 50% of the + # time on average, etc. + average: Float! +} + # A list of survey responses type SurveyResponseConnection { # A list of survey responses. diff --git a/cmd/frontend/graphqlbackend/schema.graphql b/cmd/frontend/graphqlbackend/schema.graphql index d5f360ab61b..34ca1218c08 100755 --- a/cmd/frontend/graphqlbackend/schema.graphql +++ b/cmd/frontend/graphqlbackend/schema.graphql @@ -3337,6 +3337,19 @@ type Site implements SettingsSubject { # Months of history (based on current UTC time). months: Int ): CodeIntelUsageStatistics! + # Monitoring overview for this site. + # + # Note: This is primarily used for displaying recently-fired alerts in the web app.
If your intent + # is to monitor Sourcegraph, it is better to configure alerting or query Prometheus directly in + # order to ensure that if the frontend goes down you still receive alerts: + # + # Configure alerting: https://docs.sourcegraph.com/admin/observability/alerting + # Query Prometheus directly: https://docs.sourcegraph.com/admin/observability/alerting_custom_consumption + # + monitoringStatistics( + # Days of history (based on current UTC time). + days: Int + ): MonitoringStatistics! } # The configuration for a site. @@ -3637,6 +3650,26 @@ type DeploymentConfiguration { siteID: String } +# Monitoring overview. +type MonitoringStatistics { + # Alerts fired in this time span. + alerts: [MonitoringAlert!]! +} + +# A high-level monitoring alert, for details see https://docs.sourcegraph.com/admin/observability/metrics_guide#high-level-alerting-metrics +type MonitoringAlert { + # End time of this event, which describes the past 12h of recorded data. + timestamp: DateTime! + # Name of alert that the service fired. + name: String! + # Name of the service that fired the alert. + serviceName: String! + # Average percentage of time (between [0, 1]) that the event was firing over the 12h of recorded data. e.g. + # 1.0 if it was firing 100% of the time on average during that 12h window, 0.5 if it was firing 50% of the + # time on average, etc. + average: Float! +} + # A list of survey responses type SurveyResponseConnection { # A list of survey responses. diff --git a/cmd/frontend/graphqlbackend/site_monitoring.go b/cmd/frontend/graphqlbackend/site_monitoring.go new file mode 100644 index 00000000000..6157fad380a --- /dev/null +++ b/cmd/frontend/graphqlbackend/site_monitoring.go @@ -0,0 +1,133 @@ +package graphqlbackend + +import ( + "context" + "fmt" + "sort" + "time" + + "github.com/pkg/errors" + + "github.com/inconshreveable/log15" + "github.com/opentracing/opentracing-go/ext" + prometheus "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "github.com/sourcegraph/sourcegraph/internal/prometheusutil" + "github.com/sourcegraph/sourcegraph/internal/trace/ot" +) + +// MonitoringAlert implements the GraphQL type MonitoringAlert.
+type MonitoringAlert struct { + TimestampValue DateTime + NameValue string + ServiceNameValue string + AverageValue float64 +} + +func (r *MonitoringAlert) Timestamp() DateTime { return r.TimestampValue } +func (r *MonitoringAlert) Name() string { return r.NameValue } +func (r *MonitoringAlert) ServiceName() string { return r.ServiceNameValue } +func (r *MonitoringAlert) Average() float64 { return r.AverageValue } + +type MonitoringAlerts []*MonitoringAlert + +// Less orders alerts by timestamp, then service name, then alert name. +func (a MonitoringAlerts) Less(i, j int) bool { + if a[i].Timestamp().Equal(a[j].Timestamp().Time) { + if a[i].ServiceName() == a[j].ServiceName() { + return a[i].Name() < a[j].Name() + } + return a[i].ServiceName() < a[j].ServiceName() + } + return a[i].Timestamp().Before(a[j].Timestamp().Time) +} +func (a MonitoringAlerts) Swap(i, j int) { + tmp := a[i] + a[i] = a[j] + a[j] = tmp +} +func (a MonitoringAlerts) Len() int { return len(a) } + +func (r *siteResolver) MonitoringStatistics(ctx context.Context, args *struct { + Days *int32 +}) (*siteMonitoringStatisticsResolver, error) { + prom, err := prometheusutil.NewPrometheusQuerier() + if err != nil { + return nil, err + } + // days is optional in the schema; fall back to the web app's 7-day window rather than dereferencing a nil pointer. + days := int32(7) + if args.Days != nil { + days = *args.Days + } + return &siteMonitoringStatisticsResolver{ + prom: prom, + timespan: time.Duration(days) * 24 * time.Hour, + }, nil +} + +type siteMonitoringStatisticsResolver struct { + prom prometheusutil.PrometheusQuerier + timespan time.Duration +} + +func (r *siteMonitoringStatisticsResolver) Alerts(ctx context.Context) ([]*MonitoringAlert, error) { + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + span, ctx := ot.StartSpanFromContext(ctx, "site.MonitoringStatistics.alerts") + + var err error + defer func() { + if err != nil { + ext.Error.Set(span, true) + span.SetTag("err", err.Error()) + } + cancel() + span.Finish() + }() + + results, warn, err := r.prom.QueryRange(ctx, `max by (level,name,service_name)(avg_over_time(alert_count{name!=""}[12h]))`, + prometheus.Range{ + Start: time.Now().Add(-r.timespan), + End: time.Now(), + Step: 12 * time.Hour, + }) + if err != nil { + return nil, errors.Wrap(err, "prometheus query failed") + } + if len(warn) > 0 { + log15.Warn("site.monitoring.alerts: warnings encountered on prometheus query", + "timespan", r.timespan.String(), + "warnings", warn) + } + if results.Type() != model.ValMatrix { + return nil, fmt.Errorf("received unexpected result type %q from prometheus", results.Type()) + } + + data := results.(model.Matrix) + var alerts MonitoringAlerts + for _, sample := range data { + var ( + name = string(sample.Metric["name"]) + serviceName = string(sample.Metric["service_name"]) + level = string(sample.Metric["level"]) + prevVal *model.SampleValue + ) + for _, p := range sample.Values { + // Check for nil so that we don't ignore the first occurrence of an alert - even if the + // alert is never >0, we want to be aware that it is at least configured correctly and + // being tracked. Otherwise, if the value in this window is the same as in the previous + // window just discard it.
+ if prevVal != nil && p.Value == *prevVal { + continue + } + // copy value for comparison later + v := p.Value + prevVal = &v + // record alert in results + alerts = append(alerts, &MonitoringAlert{ + NameValue: fmt.Sprintf("%s: %s", level, name), + ServiceNameValue: serviceName, + TimestampValue: DateTime{p.Timestamp.Time().UTC().Truncate(time.Hour)}, + AverageValue: float64(p.Value), + }) + } + } + + sort.Sort(alerts) + return alerts, err +} diff --git a/cmd/frontend/graphqlbackend/site_monitoring_test.go b/cmd/frontend/graphqlbackend/site_monitoring_test.go new file mode 100644 index 00000000000..e6b7aaf3bc8 --- /dev/null +++ b/cmd/frontend/graphqlbackend/site_monitoring_test.go @@ -0,0 +1,152 @@ +package graphqlbackend + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/pkg/errors" + prometheus "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "github.com/sourcegraph/sourcegraph/internal/prometheusutil" +) + +func Test_siteMonitoringStatisticsResolver_Alerts(t *testing.T) { + mock := prometheusutil.NewMockPrometheusQuerier() + sampleT := model.Time(time.Now().UTC().Unix()) + type fields struct { + queryValue model.Value + queryWarnings prometheus.Warnings + queryErr error + } + tests := []struct { + name string + fields fields + want []*MonitoringAlert + wantErr error + }{ + { + name: "includes alerts with no occurrences", + fields: fields{ + queryValue: model.Matrix{ + &model.SampleStream{ + Metric: model.Metric{"name": "hello", "service_name": "world", "level": "warn"}, + Values: []model.SamplePair{{Timestamp: sampleT, Value: model.SampleValue(0)}}}, + }, + }, + want: []*MonitoringAlert{{ + TimestampValue: DateTime{sampleT.Time().Truncate(time.Hour)}, + NameValue: "warn: hello", + ServiceNameValue: "world", + AverageValue: 0, + }}, + wantErr: nil, + }, { + name: "includes alerts with occurrences", + fields: fields{ + queryValue: model.Matrix{ + &model.SampleStream{ + Metric: model.Metric{"name": "hello", "service_name": "world", "level": "warn"}, + Values: []model.SamplePair{{Timestamp: sampleT, Value: model.SampleValue(1)}}}, + }, + }, + want: []*MonitoringAlert{{ + TimestampValue: DateTime{sampleT.Time().Truncate(time.Hour)}, + NameValue: "warn: hello", + ServiceNameValue: "world", + AverageValue: 1, + }}, + wantErr: nil, + }, { + name: "discards repeated values", + fields: fields{ + queryValue: model.Matrix{ + &model.SampleStream{ + Metric: model.Metric{"name": "hello", "service_name": "world", "level": "warn"}, + Values: []model.SamplePair{ + {Timestamp: sampleT, Value: model.SampleValue(1)}, + {Timestamp: sampleT.Add(time.Hour), Value: model.SampleValue(1)}, + {Timestamp: sampleT.Add(2 * time.Hour), Value: model.SampleValue(1)}, + }}, + }, + }, + want: []*MonitoringAlert{{ + TimestampValue: DateTime{sampleT.Time().Truncate(time.Hour)}, + NameValue: "warn: hello", + ServiceNameValue: "world", + AverageValue: 1, + }}, + wantErr: nil, + }, { + name: "elements are sorted", + fields: fields{ + queryValue: model.Matrix{ + &model.SampleStream{ + Metric: model.Metric{"name": "b", "service_name": "b", "level": "warn"}, + Values: []model.SamplePair{ + {Timestamp: sampleT, Value: model.SampleValue(1)}, + }, + }, + &model.SampleStream{ + Metric: model.Metric{"name": "a", "service_name": "b", "level": "warn"}, + Values: []model.SamplePair{ + {Timestamp: sampleT, Value: model.SampleValue(1)}, + {Timestamp: sampleT.Add(time.Hour), Value: model.SampleValue(2)}, + }, + }, + &model.SampleStream{ + Metric: 
model.Metric{"name": "a", "service_name": "a", "level": "warn"}, + Values: []model.SamplePair{ + {Timestamp: sampleT, Value: model.SampleValue(1)}, + }, + }, + }, + }, + want: []*MonitoringAlert{{ + TimestampValue: DateTime{sampleT.Time().Truncate(time.Hour)}, + NameValue: "warn: a", + ServiceNameValue: "a", + AverageValue: 1, + }, { + TimestampValue: DateTime{sampleT.Time().Truncate(time.Hour)}, + NameValue: "warn: a", + ServiceNameValue: "b", + AverageValue: 1, + }, { + TimestampValue: DateTime{sampleT.Time().Truncate(time.Hour)}, + NameValue: "warn: b", + ServiceNameValue: "b", + AverageValue: 1, + }, { + TimestampValue: DateTime{sampleT.Time().Add(time.Hour).Truncate(time.Hour)}, + NameValue: "warn: a", + ServiceNameValue: "b", + AverageValue: 2, + }}, + wantErr: nil, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mock.QueryRangeFunc.SetDefaultHook(func(ctx context.Context, query string, r prometheus.Range) (model.Value, prometheus.Warnings, error) { + return tt.fields.queryValue, tt.fields.queryWarnings, tt.fields.queryErr + }) + r := &siteMonitoringStatisticsResolver{ + prom: mock, + timespan: 24 * time.Hour, + } + alerts, err := r.Alerts(context.Background()) + if err != nil { + if tt.wantErr == nil { + t.Errorf("expected no error, got %v", err) + } else if !errors.Is(err, tt.wantErr) { + t.Errorf("expected error %v, got %v", tt.wantErr, err) + } + } + if diff := cmp.Diff(tt.want, alerts); diff != "" { + t.Errorf("alerts: %s", diff) + } + }) + } +} diff --git a/dev/start.sh b/dev/start.sh index e5cf5b95a06..79254a6b872 100755 --- a/dev/start.sh +++ b/dev/start.sh @@ -64,6 +64,7 @@ export CTAGS_COMMAND="${CTAGS_COMMAND:=cmd/symbols/universal-ctags-dev}" export ZOEKT_HOST=localhost:3070 export USE_ENHANCED_LANGUAGE_DETECTION=${USE_ENHANCED_LANGUAGE_DETECTION:-1} export GRAFANA_SERVER_URL=http://localhost:3370 +export PROMETHEUS_URL="${PROMETHEUS_URL:-"http://localhost:9090"}" # Caddy / HTTPS configuration export SOURCEGRAPH_HTTPS_DOMAIN="${SOURCEGRAPH_HTTPS_DOMAIN:-"sourcegraph.test"}" diff --git a/go.mod b/go.mod index abcf174ceb7..4987d13553b 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,6 @@ module github.com/sourcegraph/sourcegraph go 1.14 require ( - cloud.google.com/go v0.56.0 // indirect cloud.google.com/go/pubsub v1.3.1 github.com/Masterminds/semver v1.5.0 github.com/NYTimes/gziphandler v1.1.1 @@ -95,6 +94,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/pquerna/cachecontrol v0.0.0-20180517163645-1555304b9b35 // indirect github.com/prometheus/client_golang v1.5.1 + github.com/prometheus/common v0.9.1 github.com/prometheus/procfs v0.0.11 // indirect github.com/rainycape/unidecode v0.0.0-20150907023854-cb7f23ec59be github.com/russellhaering/gosaml2 v0.4.0 @@ -143,7 +143,7 @@ require ( golang.org/x/sys v0.0.0-20200331124033-c3d80250170d golang.org/x/time v0.0.0-20191024005414-555d28b269f0 golang.org/x/tools v0.0.0-20200420001825-978e26b7c37c - google.golang.org/api v0.21.0 // indirect + google.golang.org/api v0.24.0 // indirect google.golang.org/genproto v0.0.0-20200403120447-c50568487044 // indirect gopkg.in/jpoehls/gophermail.v0 v0.0.0-20160410235621-62941eab772c gopkg.in/square/go-jose.v2 v2.4.1 // indirect diff --git a/go.sum b/go.sum index 2714e725194..da0f05f1833 100644 --- a/go.sum +++ b/go.sum @@ -1300,8 +1300,8 @@ google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/ google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.20.0 
h1:jz2KixHX7EcCPiQrySzPdnYT7DbINAypCqKZ1Z7GM40= google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.21.0 h1:zS+Q/CJJnVlXpXQVIz+lH0ZT2lBuT2ac7XD8Y/3w6hY= -google.golang.org/api v0.21.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.24.0 h1:cG03eaksBzhfSIk7JRGctfp3lanklcOM/mTGvow7BbQ= +google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= diff --git a/internal/prometheusutil/gen.go b/internal/prometheusutil/gen.go new file mode 100644 index 00000000000..6e686e4cecc --- /dev/null +++ b/internal/prometheusutil/gen.go @@ -0,0 +1,4 @@ +package prometheusutil + +//go:generate env GOBIN=$PWD/.bin GO111MODULE=on go install github.com/efritz/go-mockgen +//go:generate $PWD/.bin/go-mockgen -f github.com/sourcegraph/sourcegraph/internal/prometheusutil -i PrometheusQuerier -o prometheus_mock.go diff --git a/internal/prometheusutil/prometheus.go b/internal/prometheusutil/prometheus.go new file mode 100644 index 00000000000..06df50287ab --- /dev/null +++ b/internal/prometheusutil/prometheus.go @@ -0,0 +1,39 @@ +package prometheusutil + +import ( + "context" + + "github.com/pkg/errors" + prometheusAPI "github.com/prometheus/client_golang/api" + prometheus "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "github.com/sourcegraph/sourcegraph/internal/env" +) + +var PrometheusURL = env.Get("PROMETHEUS_URL", "", "prometheus server URL") + +// PrometheusQuerier provides a shim around prometheus.API +type PrometheusQuerier interface { + // QueryRange performs a query for the given range. + QueryRange(ctx context.Context, query string, r prometheus.Range) (model.Value, prometheus.Warnings, error) +} + +// ErrPrometheusUnavailable is raised specifically when prometheusURL is unset or when +// prometheus API access times out, both of which indicate that the server API has likely +// been configured to explicitly disallow access to prometheus, or that prometheus is not +// deployed at all. The website checks for this error in `fetchMonitoringStats`, for example. +var ErrPrometheusUnavailable = errors.New("prometheus API is unavailable") + +func NewPrometheusQuerier() (PrometheusQuerier, error) { + if PrometheusURL == "" { + return nil, ErrPrometheusUnavailable + } + c, err := prometheusAPI.NewClient(prometheusAPI.Config{ + Address: PrometheusURL, + RoundTripper: &roundTripper{}, + }) + if err != nil { + return nil, errors.Wrap(err, "prometheus configuration malformed") + } + return prometheus.NewAPI(c), nil +} diff --git a/internal/prometheusutil/prometheus_mock.go b/internal/prometheusutil/prometheus_mock.go new file mode 100644 index 00000000000..3a87ac162a5 --- /dev/null +++ b/internal/prometheusutil/prometheus_mock.go @@ -0,0 +1,160 @@ +// Code generated by github.com/efritz/go-mockgen 0.1.0; DO NOT EDIT. + +package prometheusutil + +import ( + "context" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + model "github.com/prometheus/common/model" + "sync" +) + +// MockPrometheusQuerier is a mock impelementation of the PrometheusQuerier +// interface (from the package +// github.com/sourcegraph/sourcegraph/internal/prometheusutil) used for unit +// testing. 
+type MockPrometheusQuerier struct { + // QueryRangeFunc is an instance of a mock function object controlling + // the behavior of the method QueryRange. + QueryRangeFunc *PrometheusQuerierQueryRangeFunc +} + +// NewMockPrometheusQuerier creates a new mock of the PrometheusQuerier +// interface. All methods return zero values for all results, unless +// overwritten. +func NewMockPrometheusQuerier() *MockPrometheusQuerier { + return &MockPrometheusQuerier{ + QueryRangeFunc: &PrometheusQuerierQueryRangeFunc{ + defaultHook: func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error) { + return nil, nil, nil + }, + }, + } +} + +// NewMockPrometheusQuerierFrom creates a new mock of the +// MockPrometheusQuerier interface. All methods delegate to the given +// implementation, unless overwritten. +func NewMockPrometheusQuerierFrom(i PrometheusQuerier) *MockPrometheusQuerier { + return &MockPrometheusQuerier{ + QueryRangeFunc: &PrometheusQuerierQueryRangeFunc{ + defaultHook: i.QueryRange, + }, + } +} + +// PrometheusQuerierQueryRangeFunc describes the behavior when the +// QueryRange method of the parent MockPrometheusQuerier instance is +// invoked. +type PrometheusQuerierQueryRangeFunc struct { + defaultHook func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error) + hooks []func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error) + history []PrometheusQuerierQueryRangeFuncCall + mutex sync.Mutex +} + +// QueryRange delegates to the next hook function in the queue and stores +// the parameter and result values of this invocation. +func (m *MockPrometheusQuerier) QueryRange(v0 context.Context, v1 string, v2 v1.Range) (model.Value, v1.Warnings, error) { + r0, r1, r2 := m.QueryRangeFunc.nextHook()(v0, v1, v2) + m.QueryRangeFunc.appendCall(PrometheusQuerierQueryRangeFuncCall{v0, v1, v2, r0, r1, r2}) + return r0, r1, r2 +} + +// SetDefaultHook sets function that is called when the QueryRange method of +// the parent MockPrometheusQuerier instance is invoked and the hook queue +// is empty. +func (f *PrometheusQuerierQueryRangeFunc) SetDefaultHook(hook func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error)) { + f.defaultHook = hook +} + +// PushHook adds a function to the end of hook queue. Each invocation of the +// QueryRange method of the parent MockPrometheusQuerier instance inovkes +// the hook at the front of the queue and discards it. After the queue is +// empty, the default hook function is invoked for any future action. +func (f *PrometheusQuerierQueryRangeFunc) PushHook(hook func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error)) { + f.mutex.Lock() + f.hooks = append(f.hooks, hook) + f.mutex.Unlock() +} + +// SetDefaultReturn calls SetDefaultDefaultHook with a function that returns +// the given values. +func (f *PrometheusQuerierQueryRangeFunc) SetDefaultReturn(r0 model.Value, r1 v1.Warnings, r2 error) { + f.SetDefaultHook(func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error) { + return r0, r1, r2 + }) +} + +// PushReturn calls PushDefaultHook with a function that returns the given +// values. 
+func (f *PrometheusQuerierQueryRangeFunc) PushReturn(r0 model.Value, r1 v1.Warnings, r2 error) { + f.PushHook(func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error) { + return r0, r1, r2 + }) +} + +func (f *PrometheusQuerierQueryRangeFunc) nextHook() func(context.Context, string, v1.Range) (model.Value, v1.Warnings, error) { + f.mutex.Lock() + defer f.mutex.Unlock() + + if len(f.hooks) == 0 { + return f.defaultHook + } + + hook := f.hooks[0] + f.hooks = f.hooks[1:] + return hook +} + +func (f *PrometheusQuerierQueryRangeFunc) appendCall(r0 PrometheusQuerierQueryRangeFuncCall) { + f.mutex.Lock() + f.history = append(f.history, r0) + f.mutex.Unlock() +} + +// History returns a sequence of PrometheusQuerierQueryRangeFuncCall objects +// describing the invocations of this function. +func (f *PrometheusQuerierQueryRangeFunc) History() []PrometheusQuerierQueryRangeFuncCall { + f.mutex.Lock() + history := make([]PrometheusQuerierQueryRangeFuncCall, len(f.history)) + copy(history, f.history) + f.mutex.Unlock() + + return history +} + +// PrometheusQuerierQueryRangeFuncCall is an object that describes an +// invocation of method QueryRange on an instance of MockPrometheusQuerier. +type PrometheusQuerierQueryRangeFuncCall struct { + // Arg0 is the value of the 1st argument passed to this method + // invocation. + Arg0 context.Context + // Arg1 is the value of the 2nd argument passed to this method + // invocation. + Arg1 string + // Arg2 is the value of the 3rd argument passed to this method + // invocation. + Arg2 v1.Range + // Result0 is the value of the 1st result returned from this method + // invocation. + Result0 model.Value + // Result1 is the value of the 2nd result returned from this method + // invocation. + Result1 v1.Warnings + // Result2 is the value of the 3rd result returned from this method + // invocation. + Result2 error +} + +// Args returns an interface slice containing the arguments of this +// invocation. +func (c PrometheusQuerierQueryRangeFuncCall) Args() []interface{} { + return []interface{}{c.Arg0, c.Arg1, c.Arg2} +} + +// Results returns an interface slice containing the results of this +// invocation. +func (c PrometheusQuerierQueryRangeFuncCall) Results() []interface{} { + return []interface{}{c.Result0, c.Result1, c.Result2} +} diff --git a/internal/prometheusutil/roundtripper.go b/internal/prometheusutil/roundtripper.go new file mode 100644 index 00000000000..ca47044ad3d --- /dev/null +++ b/internal/prometheusutil/roundtripper.go @@ -0,0 +1,32 @@ +package prometheusutil + +import ( + "context" + "errors" + "net/http" + "os" + "syscall" + + prometheusAPI "github.com/prometheus/client_golang/api" +) + +// wrap the default prometheus API with some custom handling +type roundTripper struct{} + +func (r *roundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + resp, err := prometheusAPI.DefaultRoundTripper.RoundTrip(req) + + // there isn't a great way to check for conn refused, sadly https://github.com/golang/go/issues/9424 + // so check for specific syscall errors to detect if the provided prometheus server is + // not accessible in this deployment. we also treat deadline exceeds as an indicator. 
+ var syscallErr *os.SyscallError + if errors.As(err, &syscallErr) { + if syscallErr.Err == syscall.ECONNREFUSED || syscallErr.Err == syscall.EHOSTUNREACH { + err = ErrPrometheusUnavailable + } + } else if errors.Is(err, context.DeadlineExceeded) { + err = ErrPrometheusUnavailable + } + + return resp, err +} diff --git a/internal/prometheusutil/roundtripper_test.go b/internal/prometheusutil/roundtripper_test.go new file mode 100644 index 00000000000..6afb4f70d86 --- /dev/null +++ b/internal/prometheusutil/roundtripper_test.go @@ -0,0 +1,20 @@ +package prometheusutil + +import ( + "errors" + "net/http" + "testing" +) + +// test detection of "prometheus unavailable" +func Test_roundTripper_PrometheusUnavailable(t *testing.T) { + rt := &roundTripper{} + req, err := http.NewRequest("GET", "http://localhost:1234", nil) + if err != nil { + t.Errorf("failed to set up mock request: %+v", err) + } + _, err = rt.RoundTrip(req) + if !errors.Is(err, ErrPrometheusUnavailable) { + t.Errorf("expected ErrPrometheusUnavailable, got %+v", err) + } +} diff --git a/web/src/site-admin/SiteAdminReportBugPage.tsx b/web/src/site-admin/SiteAdminReportBugPage.tsx index 5c7381ec13d..d1738ccc176 100644 --- a/web/src/site-admin/SiteAdminReportBugPage.tsx +++ b/web/src/site-admin/SiteAdminReportBugPage.tsx @@ -1,5 +1,5 @@ import { RouteComponentProps } from 'react-router' -import { fetchAllConfigAndSettings } from './backend' +import { fetchAllConfigAndSettings, fetchMonitoringStats } from './backend' import React, { useMemo } from 'react' import { DynamicallyImportedMonacoSettingsEditor } from '../settings/DynamicallyImportedMonacoSettingsEditor' import awsCodeCommitJSON from '../../../schema/aws_codecommit.schema.json' @@ -71,6 +71,12 @@ const allConfigSchema = { final: settingsSchemaJSON, }, }, + alerts: { + type: 'array', + items: { + type: 'object', + }, + }, }, definitions: values(externalServices) .map(schema => schema.definitions) @@ -83,22 +89,27 @@ interface Props extends RouteComponentProps { } export const SiteAdminReportBugPage: React.FunctionComponent = ({ isLightTheme, history }) => { + const monitoringDaysBack = 7 + const monitoringStats = useObservable(useMemo(() => fetchMonitoringStats(monitoringDaysBack), [])) const allConfig = useObservable(useMemo(fetchAllConfigAndSettings, [])) return (

Report a bug

- Create an issue on the{' '} - - public issue tracker + + Create an issue on the public issue tracker , and include a description of the bug along with the info below (with secrets redacted). If the report contains sensitive information that should not be public, email the report to{' '} support@sourcegraph.com - - , instead. + {' '} + instead.

@@ -107,7 +118,15 @@ export const SiteAdminReportBugPage: React.FunctionComponent = ({ isLight
data.site) ) } + +/** + * Resolves to `false` if prometheus API is unavailable (due to being disabled or not configured in this deployment) + * + * @param days number of days of data to fetch + */ +export function fetchMonitoringStats(days: number): Observable { + // more details in /internal/prometheusutil.ErrPrometheusUnavailable + const errPrometheusUnavailable = 'prometheus API is unavailable' + return queryGraphQL( + gql` + query SiteMonitoringStatistics($days: Int!) { + site { + monitoringStatistics(days: $days) { + alerts { + serviceName + name + timestamp + average + } + } + } + } + `, + { days } + ).pipe( + map(result => { + if (isErrorGraphQLResult(result)) { + if (result.errors.find(e => e.message.includes(errPrometheusUnavailable))) { + return false + } + throw createAggregateError(result.errors) + } + return result.data + }), + map(data => { + if (data) { + return data.site.monitoringStatistics + } + return data + }) + ) +}
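
For reference, a minimal sketch (not part of this change) of how the new internal/prometheusutil package can be consumed outside the GraphQL resolver, e.g. from a one-off admin tool. It assumes PROMETHEUS_URL points at a reachable Prometheus instance and mirrors the range query that siteMonitoringStatisticsResolver.Alerts issues; the standalone main package framing is purely illustrative.

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	prometheus "github.com/prometheus/client_golang/api/prometheus/v1"

	"github.com/sourcegraph/sourcegraph/internal/prometheusutil"
)

func main() {
	// NewPrometheusQuerier returns ErrPrometheusUnavailable when PROMETHEUS_URL is unset.
	prom, err := prometheusutil.NewPrometheusQuerier()
	if err != nil {
		log.Fatal(err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Same query the site resolver issues: per-alert average of alert_count over 12h windows,
	// here over the last 7 days of data.
	results, warnings, err := prom.QueryRange(ctx,
		`max by (level,name,service_name)(avg_over_time(alert_count{name!=""}[12h]))`,
		prometheus.Range{
			Start: time.Now().Add(-7 * 24 * time.Hour),
			End:   time.Now(),
			Step:  12 * time.Hour,
		})
	if err != nil {
		// ErrPrometheusUnavailable is also surfaced here when the server cannot be reached.
		log.Fatal(err)
	}
	if len(warnings) > 0 {
		log.Printf("prometheus warnings: %v", warnings)
	}
	fmt.Println(results)
}

The web client reaches the same data through the new monitoringStatistics field on Site (see fetchMonitoringStats above), which is the path to prefer when the frontend is available.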