mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:11:48 +00:00
monitoring: Update syncer sync errors alerting (#24071)
In this commit we: 1. Add a Warning alert that fires if the sync error rate stays above 0.5 for 10 minutes. 2. Modify the Critical alert to fire if the sync error rate stays above 1 for 10 minutes. Historically, over approximately the last six months, the value of this metric has rarely gone above 0.5, and when it has, it recovered almost immediately. It has gone over 1 in only one instance. At the moment we get paged for this alert for what might be intermittent errors on the code host's end that are resolved immediately, which reduces the signal-to-noise ratio of our alerting. Link to production dashboard for reference: https://sourcegraph.com/-/debug/grafana/explore?orgId=1&left=%5B%221614537000000%22,%221629484199000%22,%22Prometheus%22,%7B%22expr%22:%22max%20by%20(family)%20(rate(src_repoupdater_syncer_sync_errors_total%7Bowner!%3D%5C%22user%5C%22%7D%5B5m%5D))%20OR%20on()%20vector(0)%22,%22datasource%22:%22Prometheus%22,%22exemplar%22:true,%22requestId%22:%22Q-6ba88982-7cc8-4c07-9135-56645d70c8ca-0A%22%7D%5D
This commit is contained in:
parent
4a05c4bd2d
commit
4d701f5d90
@ -2710,7 +2710,8 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> repo-updater: 0+ site level external service sync error rate for 10m0s
|
||||
- <span class="badge badge-warning">warning</span> repo-updater: 0.5+ site level external service sync error rate for 10m0s
|
||||
- <span class="badge badge-critical">critical</span> repo-updater: 1+ site level external service sync error rate for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
@ -2725,6 +2726,7 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_repo-updater_src_repoupdater_syncer_sync_errors_total",
|
||||
"critical_repo-updater_src_repoupdater_syncer_sync_errors_total"
|
||||
]
|
||||
```
|
||||
|
||||
@ -64,7 +64,8 @@ func RepoUpdater() *monitoring.Container {
|
||||
Name: "src_repoupdater_syncer_sync_errors_total",
|
||||
Description: "site level external service sync error rate",
|
||||
Query: `max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user"}[5m]))`,
|
||||
Critical: monitoring.Alert().Greater(0, nil).For(10 * time.Minute),
|
||||
Warning: monitoring.Alert().Greater(0.5, nil).For(10 * time.Minute),
|
||||
Critical: monitoring.Alert().Greater(1, nil).For(10 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number).With(monitoring.PanelOptions.ZeroIfNoData()),
|
||||
Owner: monitoring.ObservableOwnerCoreApplication,
|
||||
PossibleSolutions: `
|
||||
|
||||
Loading…
Reference in New Issue
Block a user