diff --git a/cmd/repo-updater/repos/syncer.go b/cmd/repo-updater/repos/syncer.go index 49ce4d7b544..8f963b7a934 100644 --- a/cmd/repo-updater/repos/syncer.go +++ b/cmd/repo-updater/repos/syncer.go @@ -836,6 +836,8 @@ func (s *Syncer) observe(ctx context.Context, family, title string) (context.Con tr, ctx := trace.New(ctx, family, title) return ctx, func(d *Diff, err *error) { + syncStarted.WithLabelValues(family).Inc() + now := s.Now() took := s.Now().Sub(began).Seconds() @@ -862,7 +864,6 @@ func (s *Syncer) observe(ctx context.Context, family, title string) (context.Con tr.LogFields(fields...) lastSync.WithLabelValues(family).Set(float64(now.Unix())) - syncStarted.WithLabelValues(family).Inc() success := err == nil || *err == nil syncDuration.WithLabelValues(strconv.FormatBool(success), family).Observe(took) diff --git a/doc/admin/observability/alert_solutions.md b/doc/admin/observability/alert_solutions.md index 951b236c1d2..c4a45abadee 100644 --- a/doc/admin/observability/alert_solutions.md +++ b/doc/admin/observability/alert_solutions.md @@ -2896,7 +2896,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 28800s+ time since oldest sync for 5m0s_ +- _repo-updater: 28800s+ time since oldest sync for 10m0s_ **Possible solutions:** @@ -2905,7 +2905,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt ```json "observability.silenceAlerts": [ - "warning_repo-updater_src_repoupdater_max_sync_backoff" + "critical_repo-updater_src_repoupdater_max_sync_backoff" ] ``` @@ -2913,11 +2913,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 100+ sync was started for 5m0s_ +- _repo-updater: less than 1 sync was started for 8h0m0s_ **Possible solutions:** -- None +- Check repo-updater logs for errors. 
Ignore this alert if only one code host connection is defined - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -2934,7 +2934,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** -- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host. +- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -2951,7 +2951,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** -- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host. +- Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -2964,10 +2964,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 1000+ repositories synced for 5m0s_ +- _repo-updater: less than 1 repositories synced for 8h0m0s_ **Possible solutions:** +- Check network connectivity to code hosts - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -2980,10 +2981,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 1000+ repositories sourced for 5m0s_ +- _repo-updater: less than 1 
repositories sourced for 8h0m0s_ **Possible solutions:** +- Check network connectivity to code hosts - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -2992,6 +2994,23 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt ] ``` +## repo-updater: user_added_repos + +**Descriptions:** + +- _repo-updater: 180000+ total number of user added repos for 5m0s_ + +**Possible solutions:** + +- Check for unusual spikes in user added repos. Each user is only allowed to add 2000 +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_repo-updater_user_added_repos" +] +``` + ## repo-updater: purge_failed **Descriptions:** @@ -3000,6 +3019,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** +- Check repo-updater's connectivity with gitserver and gitserver logs - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3008,30 +3028,15 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt ] ``` -## repo-updater: purge_success - -**Descriptions:** - -- _repo-updater: 10+ repositories purge succeeded for 5m0s_ - -**Possible solutions:** - -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_repo-updater_purge_success" -] -``` - ## repo-updater: sched_auto_fetch **Descriptions:** -- _repo-updater: 1000+ 
repositories scheduled due to hitting a deadline for 5m0s_ +- _repo-updater: less than 1 repositories scheduled due to hitting a deadline for 8h0m0s_ **Possible solutions:** +- Check repo-updater logs. This is expected to fire if there are no user added code hosts - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3044,10 +3049,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 1000+ repositories scheduled due to user traffic for 5m0s_ +- _repo-updater: less than 1 repositories scheduled due to user traffic for 8h0m0s_ **Possible solutions:** +- Check repo-updater logs. This is expected to fire if there are no user added code hosts - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3060,10 +3066,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 1000+ repositories managed by the scheduler for 5m0s_ +- _repo-updater: less than 1 repositories managed by the scheduler for 10m0s_ **Possible solutions:** +- Check repo-updater logs. This is expected to fire if there are no user added code hosts - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3080,11 +3087,12 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** +- Check repo-updater logs. 
The queue should drop as items are sent to GitServer - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_sched_update_queue_length" + "critical_repo-updater_sched_update_queue_length" ] ``` @@ -3092,10 +3100,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 10+ scheduler loops for 5m0s_ +- _repo-updater: less than 1 scheduler loops for 8h0m0s_ **Possible solutions:** +- Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3112,6 +3121,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** +- Check repo-updater logs for errors - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3250,12 +3260,12 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** -- None +- Check for spikes in external services, could be abuse - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_repo-updater_src_repoupdater_external_services_total" + "critical_repo-updater_src_repoupdater_external_services_total" ] ``` @@ -3267,7 +3277,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** -- 
None +- Check for spikes in external services, could be abuse - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3302,7 +3312,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** -- None +- Check repo-updater logs. Jobs older than 1 day should have been removed. - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -3319,7 +3329,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Possible solutions:** -- None +- Check repo-updater logs. Check code host connectivity - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json diff --git a/monitoring/repo_updater.go b/monitoring/repo_updater.go index 79c181bc2ed..a3c20fd86e7 100644 --- a/monitoring/repo_updater.go +++ b/monitoring/repo_updater.go @@ -37,7 +37,7 @@ func RepoUpdater() *Container { Description: "time since oldest sync", Query: `src_repoupdater_max_sync_backoff`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 8 * time.Hour.Seconds(), For: 5 * time.Minute}, + Critical: Alert{GreaterOrEqual: 8 * time.Hour.Seconds(), For: 10 * time.Minute}, PanelOptions: PanelOptions().LegendFormat("seconds").Unit(Seconds), Owner: ObservableOwnerCloud, PossibleSolutions: "Make sure there are external services added with valid tokens", @@ -47,12 +47,12 @@ func RepoUpdater() *Container { Observable{ Name: "syncer_sync_start", Description: "sync was started", - Query: `src_repoupdater_syncer_start_sync`, + Query: `rate(src_repoupdater_syncer_start_sync[5m])`, DataMayNotExist: true, - Warning: 
Alert{GreaterOrEqual: 100, For: 5 * time.Minute}, + Warning: Alert{LessOrEqual: 1, For: 8 * time.Hour}, PanelOptions: PanelOptions().LegendFormat("{{family}}-{{external_service_id}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "None", + PossibleSolutions: "Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined", }, Observable{ Name: "syncer_sync_duration", @@ -62,7 +62,7 @@ func RepoUpdater() *Container { Warning: Alert{GreaterOrEqual: 30, For: 5 * time.Minute}, PanelOptions: PanelOptions().LegendFormat("seconds").Unit(Seconds), Owner: ObservableOwnerCloud, - PossibleSolutions: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host.", + PossibleSolutions: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host", }, Observable{ Name: "source_duration", @@ -72,7 +72,7 @@ func RepoUpdater() *Container { Warning: Alert{GreaterOrEqual: 30, For: 5 * time.Minute}, PanelOptions: PanelOptions().LegendFormat("seconds").Unit(Seconds), Owner: ObservableOwnerCloud, - PossibleSolutions: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host.", + PossibleSolutions: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host", }, }, { @@ -81,20 +81,31 @@ func RepoUpdater() *Container { Description: "repositories synced", Query: `rate(src_repoupdater_syncer_synced_repos_total[1m])`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Warning: Alert{LessOrEqual: 1, For: 8 * time.Hour}, PanelOptions: PanelOptions().LegendFormat("{{state}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check network connectivity to code hosts", }, Observable{ Name: "sourced_repos", Description: "repositories sourced", Query: 
`rate(src_repoupdater_source_repos_total[1m])`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Warning: Alert{LessOrEqual: 1, For: 8 * time.Hour}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check network connectivity to code hosts", + }, + Observable{ + Name: "user_added_repos", + Description: "total number of user added repos", + Query: `src_repoupdater_user_repos_total`, + DataMayNotExist: true, + // 90% of our enforced limit + Critical: Alert{GreaterOrEqual: 200000 * 0.9, For: 5 * time.Minute}, + PanelOptions: PanelOptions().Unit(Number), + Owner: ObservableOwnerCloud, + PossibleSolutions: "Check for unusual spikes in user added repos. Each user is only allowed to add 2000", }, }, { @@ -106,17 +117,7 @@ func RepoUpdater() *Container { Warning: Alert{GreaterOrEqual: 1, For: 5 * time.Minute}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", - }, - Observable{ - Name: "purge_success", - Description: "repositories purge succeeded", - Query: `rate(src_repoupdater_purge_success[1m])`, - DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 10, For: 5 * time.Minute}, - PanelOptions: PanelOptions().Unit(Number), - Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check repo-updater's connectivity with gitserver and gitserver logs", }, }, { @@ -125,20 +126,20 @@ func RepoUpdater() *Container { Description: "repositories scheduled due to hitting a deadline", Query: `rate(src_repoupdater_sched_auto_fetch[1m])`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Warning: Alert{LessOrEqual: 1, For: 8 * time.Hour}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + 
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts", }, Observable{ Name: "sched_manual_fetch", Description: "repositories scheduled due to user traffic", Query: `rate(src_repoupdater_sched_manual_fetch[1m])`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Warning: Alert{LessOrEqual: 1, For: 8 * time.Hour}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts", }, }, { @@ -147,30 +148,30 @@ func RepoUpdater() *Container { Description: "repositories managed by the scheduler", Query: `src_repoupdater_sched_known_repos`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Warning: Alert{LessOrEqual: 1, For: 10 * time.Minute}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts", }, Observable{ Name: "sched_update_queue_length", Description: "repositories queued for update", Query: `src_repoupdater_sched_update_queue_length`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Critical: Alert{GreaterOrEqual: 1000, For: 5 * time.Minute}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check repo-updater logs. 
The queue should drop as items are sent to GitServer", }, Observable{ Name: "sched_loops", Description: "scheduler loops", Query: `rate(src_repoupdater_sched_loops[1m])`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 10, For: 5 * time.Minute}, // NOTE: There is really no point to have such warning + Warning: Alert{LessOrEqual: 1, For: 8 * time.Hour}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts", }, }, { @@ -182,7 +183,7 @@ func RepoUpdater() *Container { Critical: Alert{GreaterOrEqual: 1, For: time.Minute}, PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "none", + PossibleSolutions: "Check repo-updater logs for errors", }, }, }, @@ -288,10 +289,10 @@ func RepoUpdater() *Container { Description: "the total number of external services", Query: `src_repoupdater_external_services_total`, DataMayNotExist: true, - Warning: Alert{GreaterOrEqual: 20000, For: 1 * time.Hour}, + Critical: Alert{GreaterOrEqual: 20000, For: 1 * time.Hour}, PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "None", + PossibleSolutions: "Check for spikes in external services, could be abuse", }, Observable{ Name: "src_repoupdater_user_external_services_total", @@ -301,7 +302,7 @@ func RepoUpdater() *Container { Warning: Alert{GreaterOrEqual: 20000, For: 1 * time.Hour}, PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "None", + PossibleSolutions: "Check for spikes in external services, could be abuse", }, }, { @@ -326,7 +327,7 @@ func RepoUpdater() *Container { Warning: Alert{GreaterOrEqual: 100000, For: 1 * time.Hour}, PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: 
"None", + PossibleSolutions: "Check repo-updater logs. Jobs older than 1 day should have been removed.", }, Observable{ Name: "repoupdater_errored_sync_jobs_total", @@ -336,7 +337,7 @@ func RepoUpdater() *Container { Warning: Alert{GreaterOrEqual: 100, For: 1 * time.Hour}, PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "None", + PossibleSolutions: "Check repo-updater logs. Check code host connectivity", }, }, },