mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 19:51:50 +00:00
executors: alert when no jobs processed but queue > 0 (#40570)
Co-authored-by: Erik Seliger <erikseliger@me.com>
This commit is contained in:
parent
186940d62a
commit
3067b57eb4
@ -5920,6 +5920,43 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
|
||||
<br />
|
||||
|
||||
## executor: executor_processor_total
|
||||
|
||||
<p class="subtitle">handler operations every 5m</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> executor: 0 handler operations every 5m while the queue is non-empty, for 5m0s
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Custom alert query: `
|
||||
(sum(src_executor_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}) OR vector(0)) == 0
|
||||
AND
|
||||
(sum by (queue)(src_executor_total{job=~"^sourcegraph-executors.*"})) > 0
|
||||
`
|
||||
|
||||
</details>
|
||||
|
||||
**Next steps**
|
||||
|
||||
- Check to see the state of any compute VMs, they may be taking longer than expected to boot.
|
||||
- Make sure the executors appear under Site Admin > Executors.
|
||||
- Check the Grafana dashboard section for APIClient, it should do frequent requests to Dequeue and Heartbeat and those must not fail.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#executor-executor-processor-total).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_executor_executor_processor_total"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Code intelligence team](https://handbook.sourcegraph.com/departments/engineering/teams/code-intelligence).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## executor: executor_processor_error_rate
|
||||
|
||||
<p class="subtitle">handler operation error rate over 5m</p>
|
||||
|
||||
2
doc/admin/observability/dashboards.md
generated
2
doc/admin/observability/dashboards.md
generated
@ -17662,7 +17662,7 @@ Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",sg_job=~"^so
|
||||
|
||||
<p class="subtitle">Handler operations every 5m</p>
|
||||
|
||||
This panel has no related alerts.
|
||||
Refer to the [alerts reference](./alerts.md#executor-executor-processor-total) for 1 alert related to this panel.
|
||||
|
||||
To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110` on your Sourcegraph instance.
|
||||
|
||||
|
||||
@ -271,6 +271,7 @@ func (codeIntelligence) NewExecutorQueueGroup(containerName string) monitoring.G
|
||||
Namespace: "executor",
|
||||
DescriptionRoot: "Executor jobs",
|
||||
|
||||
// if updating this, also update in NewExecutorProcessorGroup
|
||||
ObservableConstructorOptions: ObservableConstructorOptions{
|
||||
MetricNameRoot: "executor",
|
||||
MetricDescriptionRoot: "unprocessed executor job",
|
||||
@ -304,6 +305,12 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
|
||||
Filters: filters,
|
||||
}
|
||||
|
||||
queueConstructorOptions := ObservableConstructorOptions{
|
||||
MetricNameRoot: "executor",
|
||||
MetricDescriptionRoot: "unprocessed executor job",
|
||||
By: []string{"queue"},
|
||||
}
|
||||
|
||||
return Workerutil.NewGroup(containerName, monitoring.ObservableOwnerCodeIntel, WorkerutilGroupOptions{
|
||||
GroupConstructorOptions: GroupConstructorOptions{
|
||||
Namespace: "executor",
|
||||
@ -313,7 +320,17 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
|
||||
},
|
||||
|
||||
SharedObservationGroupOptions: SharedObservationGroupOptions{
|
||||
Total: NoAlertsOption("none"),
|
||||
Total: CriticalOption(
|
||||
monitoring.Alert().
|
||||
CustomQuery(Workerutil.QueueForwardProgress(containerName, constructorOptions, queueConstructorOptions)).
|
||||
LessOrEqual(0).
|
||||
// ~5min for scale-from-zero
|
||||
For(time.Minute*5),
|
||||
`
|
||||
- Check to see the state of any compute VMs, they may be taking longer than expected to boot.
|
||||
- Make sure the executors appear under Site Admin > Executors.
|
||||
- Check the Grafana dashboard section for APIClient, it should do frequent requests to Dequeue and Heartbeat and those must not fail.
|
||||
`),
|
||||
Duration: NoAlertsOption("none"),
|
||||
Errors: NoAlertsOption("none"),
|
||||
ErrorRate: CriticalOption(
|
||||
|
||||
@ -78,6 +78,22 @@ func (workerutilConstructor) LastOverTimeErrorRate(containerName string, lookbac
|
||||
return Standard.LastOverTimeErrorRate(containerName, lookbackWindow, options)
|
||||
}
|
||||
|
||||
// QueueForwardProgress builds a PromQL expression for a queue-backed workerutil handler. The expression yields 0 when
|
||||
// the queue metric is above zero while the summed handler metric is zero (work is waiting but none is processed).
// When either comparison fails, the comparison filters drop the sample and the expression yields nothing.
//
// handlerOptions and queueOptions each supply the metric name root, job label, filters and `by` grouping for their
// side of the expression; both metric names are rendered as src_<MetricNameRoot>_total.
//
// NOTE(review): both sides use the `_total` suffix, so if handlerOptions and queueOptions share a MetricNameRoot the
// two sides select the same metric name — confirm this is intended.
// NOTE(review): PromQL `AND` keeps left-hand samples only when the right-hand side has an identical label set; if
// handlerOptions.By and queueOptions.By differ, the two sides may never match — confirm.
|
||||
func (workerutilConstructor) QueueForwardProgress(containerName string, handlerOptions, queueOptions ObservableConstructorOptions) string {
|
||||
handlerFilters := makeFilters(handlerOptions.JobLabel, containerName, handlerOptions.Filters...)
|
||||
// The second value returned by makeBy is not needed here.
handlerBy, _ := makeBy(handlerOptions.By...)
|
||||
|
||||
queueFilters := makeFilters(queueOptions.JobLabel, containerName, queueOptions.Filters...)
|
||||
queueBy, _ := makeBy(queueOptions.By...)
|
||||
|
||||
// `OR vector(0)` makes the handler side evaluate to 0 when the handler metric has no series at all.
return fmt.Sprintf(`
|
||||
(sum%[1]s(src_%[2]s_total{%[3]s}) OR vector(0)) == 0
|
||||
AND
|
||||
(sum%[4]s(src_%[5]s_total{%[6]s})) > 0
|
||||
`, handlerBy, handlerOptions.MetricNameRoot, handlerFilters, queueBy, queueOptions.MetricNameRoot, queueFilters)
|
||||
}
|
||||
|
||||
type WorkerutilGroupOptions struct {
|
||||
GroupConstructorOptions
|
||||
SharedObservationGroupOptions
|
||||
|
||||
Loading…
Reference in New Issue
Block a user