executors: alert when no jobs processed but queue > 0 (#40570)

Co-authored-by: Erik Seliger <erikseliger@me.com>
2026-02-06 19:51:50 +00:00 · 2022-08-22 05:56:47 -07:00 · 2022-08-22 05:56:47 -07:00 · 3067b57eb4
commit 3067b57eb4
parent 186940d62a
4 changed files with 72 additions and 2 deletions
--- a/doc/admin/observability/alerts.md
+++ b/doc/admin/observability/alerts.md
@ -5920,6 +5920,43 @@ with your code hosts connections or networking issues affecting communication wi

 <br />

+## executor: executor_processor_total
+
+<p class="subtitle">handler operations every 5m</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> executor: 0 or fewer handler operations every 5m for 5m0s
+
+<details>
+<summary>Technical details</summary>
+
+Custom alert query: `
+		(sum(src_executor_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}) OR vector(0)) == 0
+			AND
+		(sum by (queue)(src_executor_total{job=~"^sourcegraph-executors.*"})) > 0
+	`
+
+</details>
+
+**Next steps**
+
+- Check to see the state of any compute VMs, they may be taking longer than expected to boot.
+- Make sure the executors appear under Site Admin > Executors.
+- Check the Grafana dashboard section for APIClient, it should do frequent requests to Dequeue and Heartbeat and those must not fail.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#executor-executor-processor-total).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_executor_executor_processor_total"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*</sub>
+
+<br />
+
 ## executor: executor_processor_error_rate

 <p class="subtitle">handler operation error rate over 5m</p>
--- a/doc/admin/observability/dashboards.md
+++ b/doc/admin/observability/dashboards.md
@ -17662,7 +17662,7 @@ Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",sg_job=~"^so

 <p class="subtitle">Handler operations every 5m</p>

-This panel has no related alerts.
+Refer to the [alerts reference](./alerts.md#executor-executor-processor-total) for 1 alert related to this panel.

 To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110` on your Sourcegraph instance.

--- a/monitoring/definitions/shared/codeintel.go
+++ b/monitoring/definitions/shared/codeintel.go
@ -271,6 +271,7 @@ func (codeIntelligence) NewExecutorQueueGroup(containerName string) monitoring.G
 			Namespace:       "executor",
 			DescriptionRoot: "Executor jobs",

+			// if updating this, also update in NewExecutorProcessorGroup
 			ObservableConstructorOptions: ObservableConstructorOptions{
 				MetricNameRoot:        "executor",
 				MetricDescriptionRoot: "unprocessed executor job",
@ -304,6 +305,12 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
 		Filters:               filters,
 	}

+	queueConstructorOptions := ObservableConstructorOptions{
+		MetricNameRoot:        "executor",
+		MetricDescriptionRoot: "unprocessed executor job",
+		By:                    []string{"queue"},
+	}
+
 	return Workerutil.NewGroup(containerName, monitoring.ObservableOwnerCodeIntel, WorkerutilGroupOptions{
 		GroupConstructorOptions: GroupConstructorOptions{
 			Namespace:       "executor",
@ -313,7 +320,17 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
 		},

 		SharedObservationGroupOptions: SharedObservationGroupOptions{
-			Total:    NoAlertsOption("none"),
+			Total: CriticalOption(
+				monitoring.Alert().
+					CustomQuery(Workerutil.QueueForwardProgress(containerName, constructorOptions, queueConstructorOptions)).
+					LessOrEqual(0).
+					// ~5min for scale-from-zero
+					For(time.Minute*5),
+				`
+				- Check to see the state of any compute VMs, they may be taking longer than expected to boot.
+				- Make sure the executors appear under Site Admin > Executors.
+				- Check the Grafana dashboard section for APIClient, it should do frequent requests to Dequeue and Heartbeat and those must not fail.
+			`),
 			Duration: NoAlertsOption("none"),
 			Errors:   NoAlertsOption("none"),
 			ErrorRate: CriticalOption(
--- a/monitoring/definitions/shared/workerutil.go
+++ b/monitoring/definitions/shared/workerutil.go
@ -78,6 +78,22 @@ func (workerutilConstructor) LastOverTimeErrorRate(containerName string, lookbac
 	return Standard.LastOverTimeErrorRate(containerName, lookbackWindow, options)
 }

+// QueueForwardProgress creates a queue-based workerutil-specific query that yields 0 when the queue is non-empty but the
+// number of processed records is zero. When that condition does not hold the comparison filters drop every sample, so
+// the query is intended to be paired with a `<= 0` alert threshold.
+//
+// handlerOptions describes the handler (processor) counter and queueOptions describes the queue size gauge. For each of
+// them the JobLabel and Filters fields are rendered into the metric selector and the By field into the aggregation
+// clause; containerName is passed through to the selector as well.
+//
+// NOTE(review): PromQL's AND only keeps left-hand samples whose label set exactly matches a right-hand sample, so both
+// sides presumably need to be aggregated by the same labels for the alert to be able to fire - confirm against the By
+// values callers pass.
+func (workerutilConstructor) QueueForwardProgress(containerName string, handlerOptions, queueOptions ObservableConstructorOptions) string {
+	handlerFilters := makeFilters(handlerOptions.JobLabel, containerName, handlerOptions.Filters...)
+	handlerBy, _ := makeBy(handlerOptions.By...)
+
+	queueFilters := makeFilters(queueOptions.JobLabel, containerName, queueOptions.Filters...)
+	queueBy, _ := makeBy(queueOptions.By...)
+
+	// The left-hand side must read the processor counter (src_<root>_processor_total). Using src_<root>_total there
+	// would compare the queue size metric against itself whenever both options share a MetricNameRoot.
+	return fmt.Sprintf(`
+		(sum%[1]s(src_%[2]s_processor_total{%[3]s}) OR vector(0)) == 0
+			AND
+		(sum%[4]s(src_%[5]s_total{%[6]s})) > 0
+	`, handlerBy, handlerOptions.MetricNameRoot, handlerFilters, queueBy, queueOptions.MetricNameRoot, queueFilters)
+}
+
 type WorkerutilGroupOptions struct {
 	GroupConstructorOptions
 	SharedObservationGroupOptions