diff --git a/doc/admin/observability/alerts.md b/doc/admin/observability/alerts.md
index ebef15e71c3..d425e48ec19 100644
--- a/doc/admin/observability/alerts.md
+++ b/doc/admin/observability/alerts.md
@@ -7480,6 +7480,100 @@ Generated query for warning alert: `max((rate(src_telemetry_job_total{op="SendEv
+## telemetry: telemetrygatewayexporter_exporter_errors_total
+
+events exporter operation errors every 30m
+
+**Descriptions**
+
+- warning telemetry: 0+ events exporter operation errors every 30m
+
+**Next steps**
+
+- See worker logs in the `worker.telemetrygateway-exporter` log scope for more details.
+If logs only indicate that exports failed, reach out to Sourcegraph with relevant log entries, as this may be an issue in Sourcegraph's Telemetry Gateway service.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#telemetry-telemetrygatewayexporter-exporter-errors-total).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_telemetry_telemetrygatewayexporter_exporter_errors_total"
+]
+```
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+## telemetry: telemetrygatewayexporter_queue_cleanup_errors_total
+
+export queue cleanup operation errors every 30m
+
+**Descriptions**
+
+- warning telemetry: 0+ export queue cleanup operation errors every 30m
+
+**Next steps**
+
+- See worker logs in the `worker.telemetrygateway-exporter` log scope for more details.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#telemetry-telemetrygatewayexporter-queue-cleanup-errors-total).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_telemetry_telemetrygatewayexporter_queue_cleanup_errors_total"
+]
+```
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+## telemetry: telemetrygatewayexporter_queue_metrics_reporter_errors_total
+
+export backlog metrics reporting operation errors every 30m
+
+**Descriptions**
+
+- warning telemetry: 0+ export backlog metrics reporting operation errors every 30m
+
+**Next steps**
+
+- See worker logs in the `worker.telemetrygateway-exporter` log scope for more details.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#telemetry-telemetrygatewayexporter-queue-metrics-reporter-errors-total).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_telemetry_telemetrygatewayexporter_queue_metrics_reporter_errors_total"
+]
+```
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
 spans refused per receiver
diff --git a/doc/admin/observability/dashboards.md b/doc/admin/observability/dashboards.md
index b89a0bc1ab0..54a48268368 100644
--- a/doc/admin/observability/dashboards.md
+++ b/doc/admin/observability/dashboards.md
@@ -30806,6 +30806,305 @@ Query: `rate(src_telemetry_job_total{op="SendEvents"}[1h]) / on() group_right()
+Telemetry event payloads pending export
+
+The number of events queued to be exported.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100300` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Events exported from queue per hour
+
+The number of events being exported.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100301` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Number of events exported per batch over 30m
+
+The number of events exported in each batch.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100302` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Events exporter operations every 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100400` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Aggregate successful events exporter operation duration distribution over 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100401` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Events exporter operation errors every 30m
+
+Refer to the [alerts reference](./alerts.md#telemetry-telemetrygatewayexporter-exporter-errors-total) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100402` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Events exporter operation error rate over 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100403` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Export queue cleanup operations every 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100500` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Aggregate successful export queue cleanup operation duration distribution over 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100501` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Export queue cleanup operation errors every 30m
+
+Refer to the [alerts reference](./alerts.md#telemetry-telemetrygatewayexporter-queue-cleanup-errors-total) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100502` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Export queue cleanup operation error rate over 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100503` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Export backlog metrics reporting operations every 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100600` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Aggregate successful export backlog metrics reporting operation duration distribution over 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100601` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Export backlog metrics reporting operation errors every 30m
+
+Refer to the [alerts reference](./alerts.md#telemetry-telemetrygatewayexporter-queue-metrics-reporter-errors-total) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100602` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
+Export backlog metrics reporting operation error rate over 30m
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100603` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Data & Analytics team](https://handbook.sourcegraph.com/departments/engineering/teams/data-analytics).*
+
 The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.
diff --git a/enterprise/cmd/worker/internal/telemetrygatewayexporter/BUILD.bazel b/enterprise/cmd/worker/internal/telemetrygatewayexporter/BUILD.bazel
index 3e2546f5fd1..78676907d92 100644
--- a/enterprise/cmd/worker/internal/telemetrygatewayexporter/BUILD.bazel
+++ b/enterprise/cmd/worker/internal/telemetrygatewayexporter/BUILD.bazel
@@ -3,9 +3,9 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 go_library(
     name = "telemetrygatewayexporter",
     srcs = [
-        "backlog_metrics.go",
         "exporter.go",
         "queue_cleanup.go",
+        "queue_metrics.go",
         "telemetrygatewayexporter.go",
     ],
     importpath = "github.com/sourcegraph/sourcegraph/enterprise/cmd/worker/internal/telemetrygatewayexporter",
diff --git a/enterprise/cmd/worker/internal/telemetrygatewayexporter/exporter.go b/enterprise/cmd/worker/internal/telemetrygatewayexporter/exporter.go
index c274ba9295c..2f8ece93684 100644
--- a/enterprise/cmd/worker/internal/telemetrygatewayexporter/exporter.go
+++ b/enterprise/cmd/worker/internal/telemetrygatewayexporter/exporter.go
@@ -43,13 +43,14 @@ func newExporterJob(
         batchSizeHistogram: promauto.NewHistogram(prometheus.HistogramOpts{
             Namespace: "src",
-            Subsystem: "telemetrygatewayexport",
+            Subsystem: "telemetrygatewayexporter",
             Name: "batch_size",
             Help: "Size of event batches exported from the queue.",
+            Buckets: prometheus.ExponentialBucketsRange(1, float64(cfg.MaxExportBatchSize), 10),
         }),
         exportedEventsCounter: promauto.NewCounter(prometheus.CounterOpts{
             Namespace: "src",
-            Subsystem: "telemetrygatewayexport",
+            Subsystem: "telemetrygatewayexporter",
             Name: "exported_events",
             Help: "Number of events exported from the queue.",
         }),
@@ -61,7 +62,7 @@ func newExporterJob(
         goroutine.WithDescription("telemetrygatewayexporter events export job"),
         goroutine.WithInterval(cfg.ExportInterval),
         goroutine.WithOperation(obctx.Operation(observation.Op{
-            Name: "TelemetryGateway.Export",
+            Name: "TelemetryGatewayExporter.Export",
             Metrics: metrics.NewREDMetrics(prometheus.DefaultRegisterer, "telemetrygatewayexporter_exporter"),
         })),
     )
diff --git a/enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_cleanup.go b/enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_cleanup.go
index 38908b18008..66708a256b0 100644
--- a/enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_cleanup.go
+++ b/enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_cleanup.go
@@ -9,6 +9,8 @@ import (
     "github.com/sourcegraph/sourcegraph/internal/database"
     "github.com/sourcegraph/sourcegraph/internal/goroutine"
+    "github.com/sourcegraph/sourcegraph/internal/metrics"
+    "github.com/sourcegraph/sourcegraph/internal/observation"
     "github.com/sourcegraph/sourcegraph/lib/errors"
 )
@@ -17,17 +19,17 @@ type queueCleanupJob struct {
     retentionWindow time.Duration
 
-    prunedHistogram prometheus.Histogram
+    prunedCounter prometheus.Counter
 }
 
-func newQueueCleanupJob(store database.TelemetryEventsExportQueueStore, cfg config) goroutine.BackgroundRoutine {
+func newQueueCleanupJob(obctx *observation.Context, store database.TelemetryEventsExportQueueStore, cfg config) goroutine.BackgroundRoutine {
     job := &queueCleanupJob{
         store: store,
-        prunedHistogram: promauto.NewHistogram(prometheus.HistogramOpts{
+        prunedCounter: promauto.NewCounter(prometheus.CounterOpts{
             Namespace: "src",
-            Subsystem: "telemetrygatewayexport",
-            Name: "pruned",
-            Help: "Size of exported events pruned from the queue table.",
+            Subsystem: "telemetrygatewayexporter",
+            Name: "events_pruned",
+            Help: "Events pruned from the queue table.",
         }),
     }
     return goroutine.NewPeriodicGoroutine(
@@ -36,6 +38,10 @@ func newQueueCleanupJob(store database.TelemetryEventsExportQueueStore, cfg conf
         goroutine.WithName("telemetrygatewayexporter.queue_cleanup"),
         goroutine.WithDescription("telemetrygatewayexporter queue cleanup"),
         goroutine.WithInterval(cfg.QueueCleanupInterval),
+        goroutine.WithOperation(obctx.Operation(observation.Op{
+            Name: "TelemetryGatewayExporter.QueueCleanup",
+            Metrics: metrics.NewREDMetrics(prometheus.DefaultRegisterer, "telemetrygatewayexporter_queue_cleanup"),
+        })),
     )
 }
@@ -44,7 +50,7 @@ func (j *queueCleanupJob) Handle(ctx context.Context) error {
     if err != nil {
         return errors.Wrap(err, "store.DeletedExported")
     }
-    j.prunedHistogram.Observe(float64(count))
+    j.prunedCounter.Add(float64(count))
     return nil
 }
diff --git a/enterprise/cmd/worker/internal/telemetrygatewayexporter/backlog_metrics.go b/enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_metrics.go
similarity index 54%
rename from enterprise/cmd/worker/internal/telemetrygatewayexporter/backlog_metrics.go
rename to enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_metrics.go
index fc29f7b97f4..27c064aee3e 100644
--- a/enterprise/cmd/worker/internal/telemetrygatewayexporter/backlog_metrics.go
+++ b/enterprise/cmd/worker/internal/telemetrygatewayexporter/queue_metrics.go
@@ -9,35 +9,41 @@ import (
     "github.com/sourcegraph/sourcegraph/internal/database"
     "github.com/sourcegraph/sourcegraph/internal/goroutine"
+    "github.com/sourcegraph/sourcegraph/internal/metrics"
+    "github.com/sourcegraph/sourcegraph/internal/observation"
     "github.com/sourcegraph/sourcegraph/lib/errors"
 )
 
-type backlogMetricsJob struct {
+type queueMetricsJob struct {
     store database.TelemetryEventsExportQueueStore
 
     sizeGauge prometheus.Gauge
 }
 
-func newBacklogMetricsJob(store database.TelemetryEventsExportQueueStore) goroutine.BackgroundRoutine {
-    job := &backlogMetricsJob{
+func newQueueMetricsJob(obctx *observation.Context, store database.TelemetryEventsExportQueueStore) goroutine.BackgroundRoutine {
+    job := &queueMetricsJob{
         store: store,
         sizeGauge: promauto.NewGauge(prometheus.GaugeOpts{
             Namespace: "src",
-            Subsystem: "telemetrygatewayexport",
-            Name: "backlog_size",
+            Subsystem: "telemetrygatewayexporter",
+            Name: "queue_size",
             Help: "Current number of events waiting to be exported.",
         }),
     }
     return goroutine.NewPeriodicGoroutine(
         context.Background(),
         job,
-        goroutine.WithName("telemetrygatewayexporter.backlog_metrics"),
-        goroutine.WithDescription("telemetrygatewayexporter backlog metrics"),
+        goroutine.WithName("telemetrygatewayexporter.queue_metrics_reporter"),
+        goroutine.WithDescription("telemetrygatewayexporter backlog metrics reporting"),
         goroutine.WithInterval(time.Minute*5),
+        goroutine.WithOperation(obctx.Operation(observation.Op{
+            Name: "TelemetryGatewayExporter.ReportQueueMetrics",
+            Metrics: metrics.NewREDMetrics(prometheus.DefaultRegisterer, "telemetrygatewayexporter_queue_metrics_reporter"),
+        })),
     )
 }
 
-func (j *backlogMetricsJob) Handle(ctx context.Context) error {
+func (j *queueMetricsJob) Handle(ctx context.Context) error {
     count, err := j.store.CountUnexported(ctx)
     if err != nil {
         return errors.Wrap(err, "store.CountUnexported")
diff --git a/enterprise/cmd/worker/internal/telemetrygatewayexporter/telemetrygatewayexporter.go b/enterprise/cmd/worker/internal/telemetrygatewayexporter/telemetrygatewayexporter.go
index 78c88075531..a4f2cbe4caf 100644
--- a/enterprise/cmd/worker/internal/telemetrygatewayexporter/telemetrygatewayexporter.go
+++ b/enterprise/cmd/worker/internal/telemetrygatewayexporter/telemetrygatewayexporter.go
@@ -38,14 +38,21 @@ func (c *config) Load() {
     c.ExportInterval = env.MustGetDuration("TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL",
         10*time.Minute, "Interval at which to export telemetry")
+    if c.ExportInterval > 1*time.Hour {
+        c.AddError(errors.New("TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL cannot be more than 1 hour"))
+    }
+
     c.MaxExportBatchSize = env.MustGetInt("TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE",
         5000, "Maximum number of events to export in each batch")
+    if c.MaxExportBatchSize < 100 {
+        c.AddError(errors.New("TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE must be no less than 100"))
+    }
 
     c.ExportedEventsRetentionWindow = env.MustGetDuration("TELEMETRY_GATEWAY_EXPORTER_EXPORTED_EVENTS_RETENTION",
         2*24*time.Hour, "Duration to retain already-exported telemetry events before deleting")
 
     c.QueueCleanupInterval = env.MustGetDuration("TELEMETRY_GATEWAY_EXPORTER_QUEUE_CLEANUP_INTERVAL",
-        1*time.Hour, "Interval at which to clean up telemetry export queue")
+        30*time.Minute, "Interval at which to clean up telemetry export queue")
 }
 
 type telemetryGatewayExporter struct{}
@@ -95,7 +102,7 @@ func (t *telemetryGatewayExporter) Routines(initCtx context.Context, observation
             exporter,
             *ConfigInst,
         ),
-        newQueueCleanupJob(db.TelemetryEventsExportQueue(), *ConfigInst),
-        newBacklogMetricsJob(db.TelemetryEventsExportQueue()),
+        newQueueCleanupJob(observationCtx, db.TelemetryEventsExportQueue(), *ConfigInst),
+        newQueueMetricsJob(observationCtx, db.TelemetryEventsExportQueue()),
     }, nil
 }
diff --git a/monitoring/definitions/BUILD.bazel b/monitoring/definitions/BUILD.bazel
index 00d4333dfc0..43692f7805e 100644
--- a/monitoring/definitions/BUILD.bazel
+++ b/monitoring/definitions/BUILD.bazel
@@ -31,8 +31,10 @@ go_library(
     importpath = "github.com/sourcegraph/sourcegraph/monitoring/definitions",
     visibility = ["//visibility:public"],
     deps = [
+        "//lib/pointers",
         "//monitoring/definitions/shared",
        "//monitoring/monitoring",
         "@com_github_grafana_tools_sdk//:sdk",
+        "@com_github_prometheus_common//model",
     ],
 )
diff --git a/monitoring/definitions/shared/usage_data_pipeline.go b/monitoring/definitions/shared/usage_data_pipeline.go
index 63fa496cf71..c0fcd06c1c7 100644
--- a/monitoring/definitions/shared/usage_data_pipeline.go
+++ b/monitoring/definitions/shared/usage_data_pipeline.go
@@ -48,7 +48,7 @@ func (dataAnalytics) NewTelemetryJobOperationsGroup(containerName string) monito
             },
             Namespace: usageDataExporterNamespace,
             DescriptionRoot: "Job operations",
-            Hidden: false,
+            Hidden: true,
         },
         SharedObservationGroupOptions: SharedObservationGroupOptions{
             Total: NoAlertsOption("none"),
@@ -68,7 +68,7 @@ func (dataAnalytics) TelemetryJobThroughputGroup(containerName string) monitorin
     return monitoring.Group{
         Title: "Usage data exporter: Utilization",
-        Hidden: false,
+        Hidden: true,
         Rows: []monitoring.Row{
             {
                 {
diff --git a/monitoring/definitions/telemetry.go b/monitoring/definitions/telemetry.go
index 22decb90b7d..929b7a00a20 100644
--- a/monitoring/definitions/telemetry.go
+++ b/monitoring/definitions/telemetry.go
@@ -1,6 +1,12 @@
 package definitions
 
 import (
+    "time"
+
+    "github.com/grafana-tools/sdk"
+    "github.com/prometheus/common/model"
+
+    "github.com/sourcegraph/sourcegraph/lib/pointers"
     "github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
     "github.com/sourcegraph/sourcegraph/monitoring/monitoring"
 )
@@ -12,9 +18,113 @@ func Telemetry() *monitoring.Dashboard {
         Title: "Telemetry",
         Description: "Monitoring telemetry services in Sourcegraph.",
         Groups: []monitoring.Group{
+            // Legacy dashboards - TODO(@bobheadxi): remove after 5.2.2
             shared.DataAnalytics.NewTelemetryJobOperationsGroup(containerName),
             shared.DataAnalytics.NewTelemetryJobQueueGroup(containerName),
             shared.DataAnalytics.TelemetryJobThroughputGroup(containerName),
+
+            // The new stuff - https://docs.sourcegraph.com/dev/background-information/telemetry
+            {
+                Title: "Telemetry Gateway Exporter: Export and queue metrics",
+                Hidden: true, // TODO(@bobheadxi): not yet enabled by default, un-hide in 5.2.1
+                Rows: []monitoring.Row{
+                    {
+                        {
+                            Name: "telemetry_gateway_exporter_queue_size",
+                            Description: "telemetry event payloads pending export",
+                            Owner: monitoring.ObservableOwnerDataAnalytics,
+                            Query: `sum(src_telemetrygatewayexporter_queue_size)`,
+                            Panel: monitoring.Panel().Min(0).LegendFormat("events"),
+                            NoAlert: true,
+                            Interpretation: "The number of events queued to be exported.",
+                        },
+                        {
+                            Name: "src_telemetrygatewayexporter_exported_events",
+                            Description: "events exported from queue per hour",
+                            Owner: monitoring.ObservableOwnerDataAnalytics,
+                            Query: `max(increase(src_telemetrygatewayexporter_exported_events[1h]))`,
+                            Panel: monitoring.Panel().Min(0).LegendFormat("events"),
+                            NoAlert: true,
+                            Interpretation: "The number of events being exported.",
+                        },
+                        {
+                            Name: "telemetry_gateway_exporter_batch_size",
+                            Description: "number of events exported per batch over 30m",
+                            Owner: monitoring.ObservableOwnerDataAnalytics,
+                            Query: "sum by (le) (rate(src_telemetrygatewayexporter_batch_size_bucket[30m]))",
+                            Panel: monitoring.PanelHeatmap().
+                                With(func(o monitoring.Observable, p *sdk.Panel) {
+                                    p.HeatmapPanel.YAxis.Format = "short"
+                                    p.HeatmapPanel.YAxis.Decimals = pointers.Ptr(0)
+                                    p.HeatmapPanel.DataFormat = "tsbuckets"
+                                    p.HeatmapPanel.Targets[0].Format = "heatmap"
+                                    p.HeatmapPanel.Targets[0].LegendFormat = "{{le}}"
+                                }),
+                            NoAlert: true,
+                            Interpretation: "The number of events exported in each batch.",
+                        },
+                    },
+                },
+            },
+            shared.Observation.NewGroup(containerName, monitoring.ObservableOwnerDataAnalytics, shared.ObservationGroupOptions{
+                GroupConstructorOptions: shared.GroupConstructorOptions{
+                    ObservableConstructorOptions: shared.ObservableConstructorOptions{
+                        MetricNameRoot: "telemetrygatewayexporter_exporter",
+                        MetricDescriptionRoot: "events exporter",
+                        RangeWindow: model.Duration(30 * time.Minute),
+                    },
+                    Namespace: "Telemetry Gateway Exporter",
+                    DescriptionRoot: "Export job operations",
+                    Hidden: true, // TODO(@bobheadxi): not yet enabled by default, un-hide in 5.2.1
+                },
+                SharedObservationGroupOptions: shared.SharedObservationGroupOptions{
+                    Total: shared.NoAlertsOption("none"),
+                    Duration: shared.NoAlertsOption("none"),
+                    ErrorRate: shared.NoAlertsOption("none"),
+                    Errors: shared.WarningOption(monitoring.Alert().Greater(0), `
+                        See worker logs in the 'worker.telemetrygateway-exporter' log scope for more details.
+                        If logs only indicate that exports failed, reach out to Sourcegraph with relevant log entries, as this may be an issue in Sourcegraph's Telemetry Gateway service.
+                    `),
+                },
+            }),
+            shared.Observation.NewGroup(containerName, monitoring.ObservableOwnerDataAnalytics, shared.ObservationGroupOptions{
+                GroupConstructorOptions: shared.GroupConstructorOptions{
+                    ObservableConstructorOptions: shared.ObservableConstructorOptions{
+                        MetricNameRoot: "telemetrygatewayexporter_queue_cleanup",
+                        MetricDescriptionRoot: "export queue cleanup",
+                        RangeWindow: model.Duration(30 * time.Minute),
+                    },
+                    Namespace: "Telemetry Gateway Exporter",
+                    DescriptionRoot: "Export queue cleanup job operations",
+                    Hidden: true, // TODO(@bobheadxi): not yet enabled by default, un-hide in 5.2.1
+                },
+                SharedObservationGroupOptions: shared.SharedObservationGroupOptions{
+                    Total: shared.NoAlertsOption("none"),
+                    Duration: shared.NoAlertsOption("none"),
+                    ErrorRate: shared.NoAlertsOption("none"),
+                    Errors: shared.WarningOption(monitoring.Alert().Greater(0),
+                        "See worker logs in the `worker.telemetrygateway-exporter` log scope for more details."),
+                },
+            }),
+            shared.Observation.NewGroup(containerName, monitoring.ObservableOwnerDataAnalytics, shared.ObservationGroupOptions{
+                GroupConstructorOptions: shared.GroupConstructorOptions{
+                    ObservableConstructorOptions: shared.ObservableConstructorOptions{
+                        MetricNameRoot: "telemetrygatewayexporter_queue_metrics_reporter",
+                        MetricDescriptionRoot: "export backlog metrics reporting",
+                        RangeWindow: model.Duration(30 * time.Minute),
+                    },
+                    Namespace: "Telemetry Gateway Exporter",
+                    DescriptionRoot: "Export queue metrics reporting job operations",
+                    Hidden: true,
+                },
+                SharedObservationGroupOptions: shared.SharedObservationGroupOptions{
+                    Total: shared.NoAlertsOption("none"),
+                    Duration: shared.NoAlertsOption("none"),
+                    ErrorRate: shared.NoAlertsOption("none"),
+                    Errors: shared.WarningOption(monitoring.Alert().Greater(0),
+                        "See worker logs in the `worker.telemetrygateway-exporter` log scope for more details."),
+                },
+            }),
         },
     }
 }
diff --git a/sg.config.yaml b/sg.config.yaml
index 0c1cfe34a4b..d90c43846bb 100644
--- a/sg.config.yaml
+++ b/sg.config.yaml
@@ -720,10 +720,8 @@ commands:
         -v "$(pwd)"/dev/grafana/all:/sg_config_grafana/provisioning/datasources \
         grafana:candidate >"${GRAFANA_LOG_FILE}" 2>&1
     install: |
-      echo foobar
       mkdir -p "${GRAFANA_DISK}"
       mkdir -p "$(dirname ${GRAFANA_LOG_FILE})"
-      export CACHE=true
       docker inspect $CONTAINER >/dev/null 2>&1 && docker rm -f $CONTAINER
       bazel build //docker-images/grafana:image_tarball
       docker load --input $(bazel cquery //docker-images/grafana:image_tarball --output=files)
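The `Buckets` option added to the batch-size histogram in `exporter.go` relies on `prometheus.ExponentialBucketsRange` from `prometheus/client_golang`, which spreads a fixed number of bucket upper bounds geometrically between a minimum and maximum. A minimal, standalone sketch of what those ten buckets look like, assuming the default `TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE` of 5000 (illustrative only):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Ten bucket upper bounds growing geometrically from 1 up to the
	// default maximum export batch size of 5000, mirroring the histogram
	// configuration used for src_telemetrygatewayexporter_batch_size.
	buckets := prometheus.ExponentialBucketsRange(1, 5000, 10)
	fmt.Println(buckets)
	// Each bound is roughly 2.6x the previous one, so both near-empty and
	// full batches land in distinguishable buckets on the heatmap panel.
}
```

Geometric spacing keeps the heatmap panel readable across the full range of batch sizes without hand-tuning bucket boundaries when the configured maximum changes.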