mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:51:57 +00:00
monitoring: Amend repo-updater critical alerts (#14757)
Increase the alert threshold to 9 hours instead of 8 to give the syncer some breathing room. Alert on the rate of change of `sched_update_queue_length`. Co-authored-by: Robert Lin <robert@bobheadxi.dev>
This commit is contained in:
parent
3f1c5fdd98
commit
18d09bb021
@ -3008,7 +3008,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: 28800s+ time since oldest sync for 10m0s_
|
||||
- _repo-updater: 32400s+ time since oldest sync for 10m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3025,7 +3025,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 0 sync was started for 8h0m0s_
|
||||
- _repo-updater: less than 0 sync was started for 9h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3076,7 +3076,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 0 repositories synced for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories synced for 9h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3093,7 +3093,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 0 repositories sourced for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories sourced for 9h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3144,7 +3144,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 9h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3161,7 +3161,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 0 repositories scheduled due to user traffic for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories scheduled due to user traffic for 9h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3195,11 +3195,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: 1000+ repositories queued for update for 5m0s_
|
||||
- _repo-updater: 0+ rate of growth of update queue length over 5 minutes for 30m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
- Check repo-updater logs. The queue should drop as items are sent to GitServer
|
||||
- Check repo-updater logs for indications that the queue is not being processed. The queue length should trend downwards over time as items are sent to GitServer
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
@ -3212,7 +3212,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 0 scheduler loops for 8h0m0s_
|
||||
- _repo-updater: less than 0 scheduler loops for 9h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
|
||||
@ -4,6 +4,9 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// This is set a bit longer than maxSyncInterval in cmd/repo-updater/repos/syncer.go
|
||||
// syncDurationThreshold is the shared time limit for the repo-updater sync alerts:
// it is the critical threshold (converted to seconds) for the time since the oldest
// sync, and the duration for which the sync/scheduler rate observables must stay at
// or below zero before their warning alerts fire.
const syncDurationThreshold = 9 * time.Hour
|
||||
|
||||
func RepoUpdater() *Container {
|
||||
return &Container{
|
||||
Name: "repo-updater",
|
||||
@ -35,9 +38,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "src_repoupdater_max_sync_backoff",
|
||||
Description: "time since oldest sync",
|
||||
Query: `sum(src_repoupdater_max_sync_backoff)`,
|
||||
Query: `max(src_repoupdater_max_sync_backoff)`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(8 * time.Hour.Seconds()).For(10 * time.Minute),
|
||||
Critical: Alert().GreaterOrEqual(syncDurationThreshold.Seconds()).For(10 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Seconds),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Make sure there are external services added with valid tokens",
|
||||
@ -49,7 +52,7 @@ func RepoUpdater() *Container {
|
||||
Description: "sync was started",
|
||||
Query: `sum by (family) (rate(src_repoupdater_syncer_start_sync[5m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{family}}").Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs for errors.",
|
||||
@ -81,7 +84,7 @@ func RepoUpdater() *Container {
|
||||
Description: "repositories synced",
|
||||
Query: `sum by (state) (rate(src_repoupdater_syncer_synced_repos_total[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{state}}").Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check network connectivity to code hosts",
|
||||
@ -91,7 +94,7 @@ func RepoUpdater() *Container {
|
||||
Description: "repositories sourced",
|
||||
Query: `sum(rate(src_repoupdater_source_repos_total[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check network connectivity to code hosts",
|
||||
@ -126,7 +129,7 @@ func RepoUpdater() *Container {
|
||||
Description: "repositories scheduled due to hitting a deadline",
|
||||
Query: `sum(rate(src_repoupdater_sched_auto_fetch[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
@ -136,7 +139,7 @@ func RepoUpdater() *Container {
|
||||
Description: "repositories scheduled due to user traffic",
|
||||
Query: `sum(rate(src_repoupdater_sched_manual_fetch[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
@ -154,21 +157,22 @@ func RepoUpdater() *Container {
|
||||
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
},
|
||||
Observable{
|
||||
Name: "sched_update_queue_length",
|
||||
Description: "repositories queued for update",
|
||||
Query: `sum(src_repoupdater_sched_update_queue_length)`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(1000).For(5 * time.Minute),
|
||||
Name: "sched_update_queue_length",
|
||||
Description: "rate of growth of update queue length over 5 minutes",
|
||||
Query: `max(deriv(src_repoupdater_sched_update_queue_length[5m]))`,
|
||||
DataMayNotExist: true,
|
||||
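// Alert if the derivative stays at or above zero (i.e. the queue is not shrinking) for longer than 30 minutes.
// NOTE(review): GreaterOrEqual(0) also matches a flat queue, including an empty one; confirm whether only strictly positive growth should alert.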
|
||||
Critical: Alert().GreaterOrEqual(0).For(30 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. The queue should drop as items are sent to GitServer",
|
||||
PossibleSolutions: "Check repo-updater logs for indications that the queue is not being processed. The queue length should trend downwards over time as items are sent to GitServer",
|
||||
},
|
||||
Observable{
|
||||
Name: "sched_loops",
|
||||
Description: "scheduler loops",
|
||||
Query: `sum(rate(src_repoupdater_sched_loops[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user