monitoring: Properly aggregate repo-updater metrics (#14704)

* monitoring: Properly aggregate repo-updater metrics

Since we only have one repo-updater instance, we should sum the
results.

* Alert on zero values where appropriate
This commit is contained in:
Ryan Slade 2020-10-14 15:54:45 +02:00 committed by GitHub
parent 742ced7e54
commit 452ea432b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 49 additions and 49 deletions

View File

@ -2913,11 +2913,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 sync was started for 8h0m0s_
- _repo-updater: less than 0 sync was started for 8h0m0s_
**Possible solutions:**
- Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined
- Check repo-updater logs for errors.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -2964,7 +2964,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 repositories synced for 8h0m0s_
- _repo-updater: less than 0 repositories synced for 8h0m0s_
**Possible solutions:**
@ -2981,7 +2981,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 repositories sourced for 8h0m0s_
- _repo-updater: less than 0 repositories sourced for 8h0m0s_
**Possible solutions:**
@ -3015,7 +3015,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: 1+ repositories purge failed for 5m0s_
- _repo-updater: 0+ repositories purge failed for 5m0s_
**Possible solutions:**
@ -3032,7 +3032,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 repositories scheduled due to hitting a deadline for 8h0m0s_
- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 8h0m0s_
**Possible solutions:**
@ -3049,7 +3049,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 repositories scheduled due to user traffic for 8h0m0s_
- _repo-updater: less than 0 repositories scheduled due to user traffic for 8h0m0s_
**Possible solutions:**
@ -3066,7 +3066,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 repositories managed by the scheduler for 10m0s_
- _repo-updater: less than 0 repositories managed by the scheduler for 10m0s_
**Possible solutions:**
@ -3100,7 +3100,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
**Descriptions:**
- _repo-updater: less than 1 scheduler loops for 8h0m0s_
- _repo-updater: less than 0 scheduler loops for 8h0m0s_
**Possible solutions:**

View File

@ -35,7 +35,7 @@ func RepoUpdater() *Container {
Observable{
Name: "src_repoupdater_max_sync_backoff",
Description: "time since oldest sync",
Query: `src_repoupdater_max_sync_backoff`,
Query: `sum(src_repoupdater_max_sync_backoff)`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(8 * time.Hour.Seconds()).For(10 * time.Minute),
PanelOptions: PanelOptions().Unit(Seconds),
@ -47,17 +47,17 @@ func RepoUpdater() *Container {
Observable{
Name: "syncer_sync_start",
Description: "sync was started",
Query: `rate(src_repoupdater_syncer_start_sync[5m])`,
Query: `sum by (family) (rate(src_repoupdater_syncer_start_sync[5m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{family}}").Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined",
PossibleSolutions: "Check repo-updater logs for errors.",
},
Observable{
Name: "syncer_sync_duration",
Description: "95th repositories sync duration",
Query: `histogram_quantile(0.95, rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m]))`,
Query: `histogram_quantile(0.95, sum by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute),
PanelOptions: PanelOptions().LegendFormat("{{family}}-{{success}}").Unit(Seconds),
@ -67,7 +67,7 @@ func RepoUpdater() *Container {
Observable{
Name: "source_duration",
Description: "95th repositories source duration",
Query: `histogram_quantile(0.95, rate(src_repoupdater_source_duration_seconds_bucket[1m]))`,
Query: `histogram_quantile(0.95, sum by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute),
PanelOptions: PanelOptions().Unit(Seconds),
@ -79,9 +79,9 @@ func RepoUpdater() *Container {
Observable{
Name: "syncer_synced_repos",
Description: "repositories synced",
Query: `rate(src_repoupdater_syncer_synced_repos_total[1m])`,
Query: `sum by (state) (rate(src_repoupdater_syncer_synced_repos_total[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{state}}").Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check network connectivity to code hosts",
@ -89,9 +89,9 @@ func RepoUpdater() *Container {
Observable{
Name: "sourced_repos",
Description: "repositories sourced",
Query: `rate(src_repoupdater_source_repos_total[1m])`,
Query: `sum(rate(src_repoupdater_source_repos_total[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check network connectivity to code hosts",
@ -99,7 +99,7 @@ func RepoUpdater() *Container {
Observable{
Name: "user_added_repos",
Description: "total number of user added repos",
Query: `src_repoupdater_user_repos_total`,
Query: `sum(src_repoupdater_user_repos_total)`,
DataMayNotExist: true,
// 90% of our enforced limit
Critical: Alert().GreaterOrEqual(200000 * 0.9).For(5 * time.Minute),
@ -112,9 +112,9 @@ func RepoUpdater() *Container {
Observable{
Name: "purge_failed",
Description: "repositories purge failed",
Query: `rate(src_repoupdater_purge_failed[1m])`,
Query: `sum(rate(src_repoupdater_purge_failed[1m]))`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(1).For(5 * time.Minute),
Warning: Alert().GreaterOrEqual(0).For(5 * time.Minute),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater's connectivity with gitserver and gitserver logs",
@ -124,9 +124,9 @@ func RepoUpdater() *Container {
Observable{
Name: "sched_auto_fetch",
Description: "repositories scheduled due to hitting a deadline",
Query: `rate(src_repoupdater_sched_auto_fetch[1m])`,
Query: `sum(rate(src_repoupdater_sched_auto_fetch[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
@ -134,9 +134,9 @@ func RepoUpdater() *Container {
Observable{
Name: "sched_manual_fetch",
Description: "repositories scheduled due to user traffic",
Query: `rate(src_repoupdater_sched_manual_fetch[1m])`,
Query: `sum(rate(src_repoupdater_sched_manual_fetch[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
@ -146,9 +146,9 @@ func RepoUpdater() *Container {
Observable{
Name: "sched_known_repos",
Description: "repositories managed by the scheduler",
Query: `src_repoupdater_sched_known_repos`,
Query: `sum(src_repoupdater_sched_known_repos)`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(10 * time.Minute),
Warning: Alert().LessOrEqual(0).For(10 * time.Minute),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
@ -156,7 +156,7 @@ func RepoUpdater() *Container {
Observable{
Name: "sched_update_queue_length",
Description: "repositories queued for update",
Query: `src_repoupdater_sched_update_queue_length`,
Query: `sum(src_repoupdater_sched_update_queue_length)`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(1000).For(5 * time.Minute),
PanelOptions: PanelOptions().Unit(Number),
@ -166,9 +166,9 @@ func RepoUpdater() *Container {
Observable{
Name: "sched_loops",
Description: "scheduler loops",
Query: `rate(src_repoupdater_sched_loops[1m])`,
Query: `sum(rate(src_repoupdater_sched_loops[1m]))`,
DataMayNotExist: true,
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts",
@ -178,7 +178,7 @@ func RepoUpdater() *Container {
Observable{
Name: "sched_error",
Description: "repositories schedule error rate",
Query: `rate(src_repoupdater_sched_error[1m])`,
Query: `sum(rate(src_repoupdater_sched_error[1m]))`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(1).For(time.Minute),
PanelOptions: PanelOptions().Unit(Number),
@ -196,7 +196,7 @@ func RepoUpdater() *Container {
Observable{
Name: "perms_syncer_perms",
Description: "time gap between least and most up to date permissions",
Query: `src_repoupdater_perms_syncer_perms_gap_seconds`,
Query: `sum by (type) (src_repoupdater_perms_syncer_perms_gap_seconds)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual((3 * 24 * time.Hour).Seconds()).For(5 * time.Minute), // 3 days
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Seconds),
@ -206,7 +206,7 @@ func RepoUpdater() *Container {
Observable{
Name: "perms_syncer_stale_perms",
Description: "number of entities with stale permissions",
Query: `src_repoupdater_perms_syncer_stale_perms`,
Query: `sum by (type) (src_repoupdater_perms_syncer_stale_perms)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
@ -216,7 +216,7 @@ func RepoUpdater() *Container {
Observable{
Name: "perms_syncer_no_perms",
Description: "number of entities with no permissions",
Query: `src_repoupdater_perms_syncer_no_perms`,
Query: `sum by (type) (src_repoupdater_perms_syncer_no_perms)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
@ -231,7 +231,7 @@ func RepoUpdater() *Container {
Observable{
Name: "perms_syncer_sync_duration",
Description: "95th permissions sync duration",
Query: `histogram_quantile(0.95, rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m]))`,
Query: `histogram_quantile(0.95, sum by (le, type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m])))`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Seconds),
@ -241,7 +241,7 @@ func RepoUpdater() *Container {
Observable{
Name: "perms_syncer_queue_size",
Description: "permissions sync queued items",
Query: `src_repoupdater_perms_syncer_queue_size`,
Query: `sum(src_repoupdater_perms_syncer_queue_size)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute),
PanelOptions: PanelOptions().Unit(Number),
@ -256,7 +256,7 @@ func RepoUpdater() *Container {
Observable{
Name: "authz_filter_duration",
Description: "95th authorization duration",
Query: `histogram_quantile(0.95, rate(src_frontend_authz_filter_duration_seconds_bucket{success="true"}[1m]))`,
Query: `histogram_quantile(0.95, sum by (le) (rate(src_frontend_authz_filter_duration_seconds_bucket{success="true"}[1m])))`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(1).For(time.Minute),
PanelOptions: PanelOptions().Unit(Seconds),
@ -266,7 +266,7 @@ func RepoUpdater() *Container {
Observable{
Name: "perms_syncer_sync_errors",
Description: "permissions sync error rate",
Query: `rate(src_repoupdater_perms_syncer_sync_errors_total[1m]) / rate(src_repoupdater_perms_syncer_sync_duration_seconds_count[1m])`,
Query: `sum by (type) (rate(src_repoupdater_perms_syncer_sync_errors_total[1m])) / sum by (type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_count[1m]))`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(1).For(time.Minute),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
@ -287,20 +287,20 @@ func RepoUpdater() *Container {
Observable{
Name: "src_repoupdater_external_services_total",
Description: "the total number of external services",
Query: `src_repoupdater_external_services_total`,
Query: `sum(src_repoupdater_external_services_total)`,
DataMayNotExist: true,
Critical: Alert().GreaterOrEqual(20000).For(1 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check for spikes in external services, could be abuse",
},
Observable{
Name: "src_repoupdater_user_external_services_total",
Description: "the total number of user added external services",
Query: `src_repoupdater_user_external_services_total`,
Query: `sum(src_repoupdater_user_external_services_total)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(20000).For(1 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check for spikes in external services, could be abuse",
},
@ -309,10 +309,10 @@ func RepoUpdater() *Container {
Observable{
Name: "repoupdater_queued_sync_jobs_total",
Description: "the total number of queued sync jobs",
Query: `src_repoupdater_queued_sync_jobs_total`,
Query: `sum(src_repoupdater_queued_sync_jobs_total)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(100).For(1 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: `
- **Check if jobs are failing to sync:** "SELECT * FROM external_service_sync_jobs WHERE state = 'errored'";
@ -322,20 +322,20 @@ func RepoUpdater() *Container {
Observable{
Name: "repoupdater_completed_sync_jobs_total",
Description: "the total number of completed sync jobs",
Query: `src_repoupdater_completed_sync_jobs_total`,
Query: `sum(src_repoupdater_completed_sync_jobs_total)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(100000).For(1 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. Jobs older than 1 day should have been removed.",
},
Observable{
Name: "repoupdater_errored_sync_jobs_total",
Description: "the total number of errored sync jobs",
Query: `src_repoupdater_errored_sync_jobs_total`,
Query: `sum(src_repoupdater_errored_sync_jobs_total)`,
DataMayNotExist: true,
Warning: Alert().GreaterOrEqual(100).For(1 * time.Hour),
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
PanelOptions: PanelOptions().Unit(Number),
Owner: ObservableOwnerCloud,
PossibleSolutions: "Check repo-updater logs. Check code host connectivity",
},