executors: alert when no jobs processed but queue > 0 (#40570)

Co-authored-by: Erik Seliger <erikseliger@me.com>
This commit is contained in:
Noah S-C 2022-08-22 05:56:47 -07:00 committed by GitHub
parent 186940d62a
commit 3067b57eb4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 72 additions and 2 deletions

View File

@ -5920,6 +5920,43 @@ with your code hosts connections or networking issues affecting communication wi
<br />
## executor: executor_processor_total
<p class="subtitle">handler operations every 5m</p>
**Descriptions**
- <span class="badge badge-critical">critical</span> executor: less than or equal to 0 handler operations every 5m for 5m0s
<details>
<summary>Technical details</summary>
Custom alert query: `
(sum(src_executor_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}) OR vector(0)) == 0
AND
(sum by (queue)(src_executor_total{job=~"^sourcegraph-executors.*"})) > 0
`
</details>
**Next steps**
- Check the state of any compute VMs; they may be taking longer than expected to boot.
- Make sure the executors appear under Site Admin > Executors.
- Check the Grafana dashboard section for APIClient: it should make frequent requests to Dequeue and Heartbeat, and those requests must not fail.
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#executor-executor-processor-total).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"critical_executor_executor_processor_total"
]
```
<sub>*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*</sub>
<br />
## executor: executor_processor_error_rate
<p class="subtitle">handler operation error rate over 5m</p>

View File

@ -17662,7 +17662,7 @@ Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",sg_job=~"^so
<p class="subtitle">Handler operations every 5m</p>
This panel has no related alerts.
Refer to the [alerts reference](./alerts.md#executor-executor-processor-total) for 1 alert related to this panel.
To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110` on your Sourcegraph instance.

View File

@ -271,6 +271,7 @@ func (codeIntelligence) NewExecutorQueueGroup(containerName string) monitoring.G
Namespace: "executor",
DescriptionRoot: "Executor jobs",
// if updating this, also update in NewExecutorProcessorGroup
ObservableConstructorOptions: ObservableConstructorOptions{
MetricNameRoot: "executor",
MetricDescriptionRoot: "unprocessed executor job",
@ -304,6 +305,12 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
Filters: filters,
}
queueConstructorOptions := ObservableConstructorOptions{
MetricNameRoot: "executor",
MetricDescriptionRoot: "unprocessed executor job",
By: []string{"queue"},
}
return Workerutil.NewGroup(containerName, monitoring.ObservableOwnerCodeIntel, WorkerutilGroupOptions{
GroupConstructorOptions: GroupConstructorOptions{
Namespace: "executor",
@ -313,7 +320,17 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
},
SharedObservationGroupOptions: SharedObservationGroupOptions{
Total: NoAlertsOption("none"),
Total: CriticalOption(
monitoring.Alert().
CustomQuery(Workerutil.QueueForwardProgress(containerName, constructorOptions, queueConstructorOptions)).
LessOrEqual(0).
// ~5min for scale-from-zero
For(time.Minute*5),
`
- Check to see the state of any compute VMs, they may be taking longer than expected to boot.
- Make sure the executors appear under Site Admin > Executors.
- Check the Grafana dashboard section for APIClient, it should do frequent requests to Dequeue and Heartbeat and those must not fail.
`),
Duration: NoAlertsOption("none"),
Errors: NoAlertsOption("none"),
ErrorRate: CriticalOption(

View File

@ -78,6 +78,22 @@ func (workerutilConstructor) LastOverTimeErrorRate(containerName string, lookbac
return Standard.LastOverTimeErrorRate(containerName, lookbackWindow, options)
}
// QueueForwardProgress returns a workerutil-specific PromQL expression that combines a handler
// metric with a queue metric: the left-hand side keeps the summed handler total (defaulting to
// vector(0) when the series is absent) only when it equals 0, and it is AND-ed with the summed
// queue total being greater than 0. The expression therefore yields 0 when the queue is
// non-empty but no records have been processed.
//
// Filters and groupings for each side are derived independently from handlerOptions and
// queueOptions; containerName is passed to makeFilters for both sides.
//
// NOTE(review): PromQL's AND matches elements on their label sets, so the `by` clauses produced
// for the two sides presumably need to line up for the expression to return anything — confirm
// against the options passed by callers.
func (workerutilConstructor) QueueForwardProgress(containerName string, handlerOptions, queueOptions ObservableConstructorOptions) string {
	// Argument order: handler by-clause, handler metric root, handler filters,
	// queue by-clause, queue metric root, queue filters.
	const queryTemplate = `
(sum%[1]s(src_%[2]s_total{%[3]s}) OR vector(0)) == 0
AND
(sum%[4]s(src_%[5]s_total{%[6]s})) > 0
`

	// Left-hand side: the handler (processed records) metric.
	processedFilters := makeFilters(handlerOptions.JobLabel, containerName, handlerOptions.Filters...)
	// Only the first value returned by makeBy is used in the query.
	processedBy, _ := makeBy(handlerOptions.By...)

	// Right-hand side: the queue metric.
	pendingFilters := makeFilters(queueOptions.JobLabel, containerName, queueOptions.Filters...)
	pendingBy, _ := makeBy(queueOptions.By...)

	return fmt.Sprintf(
		queryTemplate,
		processedBy,
		handlerOptions.MetricNameRoot,
		processedFilters,
		pendingBy,
		queueOptions.MetricNameRoot,
		pendingFilters,
	)
}
type WorkerutilGroupOptions struct {
GroupConstructorOptions
SharedObservationGroupOptions