diff --git a/doc/admin/observability/alert_solutions.md b/doc/admin/observability/alert_solutions.md index c4a45abadee..cf8139397c2 100644 --- a/doc/admin/observability/alert_solutions.md +++ b/doc/admin/observability/alert_solutions.md @@ -2913,11 +2913,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 sync was started for 8h0m0s_ +- _repo-updater: less than 0 sync was started for 8h0m0s_ **Possible solutions:** -- Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined +- Check repo-updater logs for errors. - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json @@ -2964,7 +2964,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 repositories synced for 8h0m0s_ +- _repo-updater: less than 0 repositories synced for 8h0m0s_ **Possible solutions:** @@ -2981,7 +2981,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 repositories sourced for 8h0m0s_ +- _repo-updater: less than 0 repositories sourced for 8h0m0s_ **Possible solutions:** @@ -3015,7 +3015,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: 1+ repositories purge failed for 5m0s_ +- _repo-updater: 0+ repositories purge failed for 5m0s_ **Possible solutions:** @@ -3032,7 +3032,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 repositories scheduled due to hitting a deadline for 8h0m0s_ +- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 8h0m0s_ **Possible solutions:** @@ -3049,7 +3049,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 repositories scheduled due to user traffic for 8h0m0s_ +- _repo-updater: less than 0 repositories scheduled due to user traffic for 8h0m0s_ **Possible solutions:** @@ -3066,7 +3066,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 repositories managed by the scheduler for 10m0s_ +- _repo-updater: less than 0 repositories managed by the scheduler for 10m0s_ **Possible solutions:** @@ -3100,7 +3100,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt **Descriptions:** -- _repo-updater: less than 1 scheduler loops for 8h0m0s_ +- _repo-updater: less than 0 scheduler loops for 8h0m0s_ **Possible solutions:** diff --git a/monitoring/repo_updater.go b/monitoring/repo_updater.go index 34985932a24..43e22e0f5c2 100644 --- a/monitoring/repo_updater.go +++ b/monitoring/repo_updater.go @@ -35,7 +35,7 @@ func RepoUpdater() *Container { Observable{ Name: "src_repoupdater_max_sync_backoff", Description: "time since oldest sync", - Query: `src_repoupdater_max_sync_backoff`, + Query: `sum(src_repoupdater_max_sync_backoff)`, DataMayNotExist: true, Critical: Alert().GreaterOrEqual(8 * time.Hour.Seconds()).For(10 * time.Minute), PanelOptions: PanelOptions().Unit(Seconds), @@ -47,17 +47,17 @@ func RepoUpdater() *Container { Observable{ Name: "syncer_sync_start", Description: "sync was started", - Query: `rate(src_repoupdater_syncer_start_sync[5m])`, + Query: `sum by (family) (rate(src_repoupdater_syncer_start_sync[5m]))`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(8 * time.Hour), + Warning: Alert().LessOrEqual(0).For(8 * time.Hour), PanelOptions: PanelOptions().LegendFormat("{{family}}").Unit(Number), Owner: ObservableOwnerCloud, - PossibleSolutions: "Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined", + PossibleSolutions: "Check repo-updater logs for errors.", }, Observable{ Name: "syncer_sync_duration", Description: "95th repositories sync duration", - Query: `histogram_quantile(0.95, rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m]))`, + Query: `histogram_quantile(0.95, sum by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute), PanelOptions: PanelOptions().LegendFormat("{{family}}-{{success}}").Unit(Seconds), @@ -67,7 +67,7 @@ func RepoUpdater() *Container { Observable{ Name: "source_duration", Description: "95th repositories source duration", - Query: `histogram_quantile(0.95, rate(src_repoupdater_source_duration_seconds_bucket[1m]))`, + Query: `histogram_quantile(0.95, sum by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute), PanelOptions: PanelOptions().Unit(Seconds), @@ -79,9 +79,9 @@ func RepoUpdater() *Container { Observable{ Name: "syncer_synced_repos", Description: "repositories synced", - Query: `rate(src_repoupdater_syncer_synced_repos_total[1m])`, + Query: `sum by (state) (rate(src_repoupdater_syncer_synced_repos_total[1m]))`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(8 * time.Hour), + Warning: Alert().LessOrEqual(0).For(8 * time.Hour), PanelOptions: PanelOptions().LegendFormat("{{state}}").Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check network connectivity to code hosts", @@ -89,9 +89,9 @@ func RepoUpdater() *Container { Observable{ Name: "sourced_repos", Description: "repositories sourced", - Query: `rate(src_repoupdater_source_repos_total[1m])`, + Query: `sum(rate(src_repoupdater_source_repos_total[1m]))`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(8 * time.Hour), + Warning: Alert().LessOrEqual(0).For(8 * time.Hour), PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check network connectivity to code hosts", @@ -99,7 +99,7 @@ func RepoUpdater() *Container { Observable{ Name: "user_added_repos", Description: "total number of user added repos", - Query: `src_repoupdater_user_repos_total`, + Query: `sum(src_repoupdater_user_repos_total)`, DataMayNotExist: true, // 90% of our enforced limit Critical: Alert().GreaterOrEqual(200000 * 0.9).For(5 * time.Minute), @@ -112,9 +112,9 @@ func RepoUpdater() *Container { Observable{ Name: "purge_failed", Description: "repositories purge failed", - Query: `rate(src_repoupdater_purge_failed[1m])`, + Query: `sum(rate(src_repoupdater_purge_failed[1m]))`, DataMayNotExist: true, - Warning: Alert().GreaterOrEqual(1).For(5 * time.Minute), + Warning: Alert().GreaterOrEqual(0).For(5 * time.Minute), PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater's connectivity with gitserver and gitserver logs", @@ -124,9 +124,9 @@ func RepoUpdater() *Container { Observable{ Name: "sched_auto_fetch", Description: "repositories scheduled due to hitting a deadline", - Query: `rate(src_repoupdater_sched_auto_fetch[1m])`, + Query: `sum(rate(src_repoupdater_sched_auto_fetch[1m]))`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(8 * time.Hour), + Warning: Alert().LessOrEqual(0).For(8 * time.Hour), PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts", @@ -134,9 +134,9 @@ func RepoUpdater() *Container { Observable{ Name: "sched_manual_fetch", Description: "repositories scheduled due to user traffic", - Query: `rate(src_repoupdater_sched_manual_fetch[1m])`, + Query: `sum(rate(src_repoupdater_sched_manual_fetch[1m]))`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(8 * time.Hour), + Warning: Alert().LessOrEqual(0).For(8 * time.Hour), PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts", @@ -146,9 +146,9 @@ func RepoUpdater() *Container { Observable{ Name: "sched_known_repos", Description: "repositories managed by the scheduler", - Query: `src_repoupdater_sched_known_repos`, + Query: `sum(src_repoupdater_sched_known_repos)`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(10 * time.Minute), + Warning: Alert().LessOrEqual(0).For(10 * time.Minute), PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts", @@ -156,7 +156,7 @@ func RepoUpdater() *Container { Observable{ Name: "sched_update_queue_length", Description: "repositories queued for update", - Query: `src_repoupdater_sched_update_queue_length`, + Query: `sum(src_repoupdater_sched_update_queue_length)`, DataMayNotExist: true, Critical: Alert().GreaterOrEqual(1000).For(5 * time.Minute), PanelOptions: PanelOptions().Unit(Number), @@ -166,9 +166,9 @@ func RepoUpdater() *Container { Observable{ Name: "sched_loops", Description: "scheduler loops", - Query: `rate(src_repoupdater_sched_loops[1m])`, + Query: `sum(rate(src_repoupdater_sched_loops[1m]))`, DataMayNotExist: true, - Warning: Alert().LessOrEqual(1).For(8 * time.Hour), + Warning: Alert().LessOrEqual(0).For(8 * time.Hour), PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts", @@ -178,7 +178,7 @@ func RepoUpdater() *Container { Observable{ Name: "sched_error", Description: "repositories schedule error rate", - Query: `rate(src_repoupdater_sched_error[1m])`, + Query: `sum(rate(src_repoupdater_sched_error[1m]))`, DataMayNotExist: true, Critical: Alert().GreaterOrEqual(1).For(time.Minute), PanelOptions: PanelOptions().Unit(Number), @@ -196,7 +196,7 @@ func RepoUpdater() *Container { Observable{ Name: "perms_syncer_perms", Description: "time gap between least and most up to date permissions", - Query: `src_repoupdater_perms_syncer_perms_gap_seconds`, + Query: `sum by (type) (src_repoupdater_perms_syncer_perms_gap_seconds)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual((3 * 24 * time.Hour).Seconds()).For(5 * time.Minute), // 3 days PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Seconds), @@ -206,7 +206,7 @@ func RepoUpdater() *Container { Observable{ Name: "perms_syncer_stale_perms", Description: "number of entities with stale permissions", - Query: `src_repoupdater_perms_syncer_stale_perms`, + Query: `sum by (type) (src_repoupdater_perms_syncer_stale_perms)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute), PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), @@ -216,7 +216,7 @@ func RepoUpdater() *Container { Observable{ Name: "perms_syncer_no_perms", Description: "number of entities with no permissions", - Query: `src_repoupdater_perms_syncer_no_perms`, + Query: `sum by (type) (src_repoupdater_perms_syncer_no_perms)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute), PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), @@ -231,7 +231,7 @@ func RepoUpdater() *Container { Observable{ Name: "perms_syncer_sync_duration", Description: "95th permissions sync duration", - Query: `histogram_quantile(0.95, rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m]))`, + Query: `histogram_quantile(0.95, sum by (le, type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m])))`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute), PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Seconds), @@ -241,7 +241,7 @@ func RepoUpdater() *Container { Observable{ Name: "perms_syncer_queue_size", Description: "permissions sync queued items", - Query: `src_repoupdater_perms_syncer_queue_size`, + Query: `sum(src_repoupdater_perms_syncer_queue_size)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute), PanelOptions: PanelOptions().Unit(Number), @@ -256,7 +256,7 @@ func RepoUpdater() *Container { Observable{ Name: "authz_filter_duration", Description: "95th authorization duration", - Query: `histogram_quantile(0.95, rate(src_frontend_authz_filter_duration_seconds_bucket{success="true"}[1m]))`, + Query: `histogram_quantile(0.95, sum by (le) (rate(src_frontend_authz_filter_duration_seconds_bucket{success="true"}[1m])))`, DataMayNotExist: true, Critical: Alert().GreaterOrEqual(1).For(time.Minute), PanelOptions: PanelOptions().Unit(Seconds), @@ -266,7 +266,7 @@ func RepoUpdater() *Container { Observable{ Name: "perms_syncer_sync_errors", Description: "permissions sync error rate", - Query: `rate(src_repoupdater_perms_syncer_sync_errors_total[1m]) / rate(src_repoupdater_perms_syncer_sync_duration_seconds_count[1m])`, + Query: `sum by (type) (rate(src_repoupdater_perms_syncer_sync_errors_total[1m])) / sum by (type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_count[1m]))`, DataMayNotExist: true, Critical: Alert().GreaterOrEqual(1).For(time.Minute), PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), @@ -287,20 +287,20 @@ func RepoUpdater() *Container { Observable{ Name: "src_repoupdater_external_services_total", Description: "the total number of external services", - Query: `src_repoupdater_external_services_total`, + Query: `sum(src_repoupdater_external_services_total)`, DataMayNotExist: true, Critical: Alert().GreaterOrEqual(20000).For(1 * time.Hour), - PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), + PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check for spikes in external services, could be abuse", }, Observable{ Name: "src_repoupdater_user_external_services_total", Description: "the total number of user added external services", - Query: `src_repoupdater_user_external_services_total`, + Query: `sum(src_repoupdater_user_external_services_total)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(20000).For(1 * time.Hour), - PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), + PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check for spikes in external services, could be abuse", }, @@ -309,10 +309,10 @@ func RepoUpdater() *Container { Observable{ Name: "repoupdater_queued_sync_jobs_total", Description: "the total number of queued sync jobs", - Query: `src_repoupdater_queued_sync_jobs_total`, + Query: `sum(src_repoupdater_queued_sync_jobs_total)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(100).For(1 * time.Hour), - PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), + PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: ` - **Check if jobs are failing to sync:** "SELECT * FROM external_service_sync_jobs WHERE state = 'errored'"; @@ -322,20 +322,20 @@ func RepoUpdater() *Container { Observable{ Name: "repoupdater_completed_sync_jobs_total", Description: "the total number of completed sync jobs", - Query: `src_repoupdater_completed_sync_jobs_total`, + Query: `sum(src_repoupdater_completed_sync_jobs_total)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(100000).For(1 * time.Hour), - PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), + PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater logs. Jobs older than 1 day should have been removed.", }, Observable{ Name: "repoupdater_errored_sync_jobs_total", Description: "the total number of errored sync jobs", - Query: `src_repoupdater_errored_sync_jobs_total`, + Query: `sum(src_repoupdater_errored_sync_jobs_total)`, DataMayNotExist: true, Warning: Alert().GreaterOrEqual(100).For(1 * time.Hour), - PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number), + PanelOptions: PanelOptions().Unit(Number), Owner: ObservableOwnerCloud, PossibleSolutions: "Check repo-updater logs. Check code host connectivity", },