monitoring: Amend repo-updater critical alerts (#14757)

Increase alert threshold to 9 hours instead of 8 to give the syncer some breathing room.
Alert on the rate of change of `sched_update_queue_length` instead of its absolute value.

Co-authored-by: Robert Lin <robert@bobheadxi.dev>
This commit is contained in:
Ryan Slade 2020-10-15 14:02:06 +02:00 committed by GitHub
parent 3f1c5fdd98
commit 18d09bb021
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 23 deletions

View File

@ -3008,7 +3008,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: 28800s+ time since oldest sync for 10m0s_
- _repo-updater: 32400s+ time since oldest sync for 10m0s_
**Possible solutions:**
@ -3025,7 +3025,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 0 sync was started for 8h0m0s_
- _repo-updater: less than 0 sync was started for 9h0m0s_
**Possible solutions:**
@ -3076,7 +3076,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 0 repositories synced for 8h0m0s_
- _repo-updater: less than 0 repositories synced for 9h0m0s_
**Possible solutions:**
@ -3093,7 +3093,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 0 repositories sourced for 8h0m0s_
- _repo-updater: less than 0 repositories sourced for 9h0m0s_
**Possible solutions:**
@ -3144,7 +3144,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 8h0m0s_
- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 9h0m0s_
**Possible solutions:**
@ -3161,7 +3161,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 0 repositories scheduled due to user traffic for 8h0m0s_
- _repo-updater: less than 0 repositories scheduled due to user traffic for 9h0m0s_
**Possible solutions:**
@ -3195,11 +3195,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: 1000+ repositories queued for update for 5m0s_
- _repo-updater: 0+ rate of growth of update queue length over 5 minutes for 30m0s_
**Possible solutions:**
- Check repo-updater logs. The queue should drop as items are sent to GitServer
- Check repo-updater logs for indications that the queue is not being processed. The queue length should trend downwards over time as items are sent to GitServer
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -3212,7 +3212,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 0 scheduler loops for 8h0m0s_
- _repo-updater: less than 0 scheduler loops for 9h0m0s_
**Possible solutions:**

View File

@ -4,6 +4,9 @@ import (
"time"
)
// syncDurationThreshold is the shared time window for the repo-updater sync
// alerts defined in this file. It is deliberately a bit longer than
// maxSyncInterval (see cmd/repo-updater/repos/syncer.go).
const syncDurationThreshold time.Duration = 9 * time.Hour
func RepoUpdater() *Container {
return &Container{
Name: "repo-updater",
@ -35,9 +38,9 @@ func RepoUpdater() *Container {
Observable{
Name: "src_repoupdater_max_sync_backoff",
Description: "time since oldest sync",
Query: `sum(src_repoupdater_max_sync_backoff)`,
Query: `max(src_repoupdater_max_sync_backoff)`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(8 * time.Hour.Seconds()).For(10 * time.Minute),
Critical: Alert().GreaterOrEqual(syncDurationThreshold.Seconds()).For(10 * time.Minute),
PanelOptions: PanelOptions().Unit(Seconds),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Make sure there are external services added with valid tokens",
@ -49,7 +52,7 @@ func RepoUpdater() *Container {
Description: "sync was started",
Query: `sum by (family) (rate(src_repoupdater_syncer_start_sync[5m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
PanelOptions: PanelOptions().LegendFormat("{{family}}").Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs for errors.",
@ -81,7 +84,7 @@ func RepoUpdater() *Container {
Description: "repositories synced",
Query: `sum by (state) (rate(src_repoupdater_syncer_synced_repos_total[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
PanelOptions: PanelOptions().LegendFormat("{{state}}").Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check network connectivity to code hosts",
@ -91,7 +94,7 @@ func RepoUpdater() *Container {
Description: "repositories sourced",
Query: `sum(rate(src_repoupdater_source_repos_total[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check network connectivity to code hosts",
@ -126,7 +129,7 @@ func RepoUpdater() *Container {
Description: "repositories scheduled due to hitting a deadline",
Query: `sum(rate(src_repoupdater_sched_auto_fetch[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
@ -136,7 +139,7 @@ func RepoUpdater() *Container {
Description: "repositories scheduled due to user traffic",
Query: `sum(rate(src_repoupdater_sched_manual_fetch[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
@ -154,21 +157,22 @@ func RepoUpdater() *Container {
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
},
Observable{
Name: "sched_update_queue_length",
Description: "repositories queued for update",
Query: `sum(src_repoupdater_sched_update_queue_length)`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(1000).For(5 * time.Minute),
Name: "sched_update_queue_length",
Description: "rate of growth of update queue length over 5 minutes",
Query: `max(deriv(src_repoupdater_sched_update_queue_length[5m]))`,
DataMayNotExist: true,
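// Alert if the derivative is non-negative (i.e. the queue is not shrinking) for longer than 30 minutes.
// NOTE(review): GreaterOrEqual(0) also matches a derivative of exactly 0, so a queue whose
// length is flat would trigger this alert as well - confirm that is intended.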
Critical: Alert().GreaterOrEqual(0).For(30 * time.Minute),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. The queue should drop as items are sent to GitServer",
PossibleSolutions: "Check repo-updater logs for indications that the queue is not being processed. The queue length should trend downwards over time as items are sent to GitServer",
},
Observable{
Name: "sched_loops",
Description: "scheduler loops",
Query: `sum(rate(src_repoupdater_sched_loops[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts",