mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:11:48 +00:00
monitoring: Properly aggregate repo-updater metrics (#14704)
* monitoring: Properly aggregate repo-updater metrics. Since we only have one instance, we should sum the results. * Alert on zero values where appropriate.
This commit is contained in:
parent
742ced7e54
commit
452ea432b2
@ -2913,11 +2913,11 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 sync was started for 8h0m0s_
|
||||
- _repo-updater: less than 0 sync was started for 8h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
- Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined
|
||||
- Check repo-updater logs for errors.
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
@ -2964,7 +2964,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 repositories synced for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories synced for 8h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -2981,7 +2981,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 repositories sourced for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories sourced for 8h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3015,7 +3015,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: 1+ repositories purge failed for 5m0s_
|
||||
- _repo-updater: 0+ repositories purge failed for 5m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3032,7 +3032,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 repositories scheduled due to hitting a deadline for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories scheduled due to hitting a deadline for 8h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3049,7 +3049,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 repositories scheduled due to user traffic for 8h0m0s_
|
||||
- _repo-updater: less than 0 repositories scheduled due to user traffic for 8h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3066,7 +3066,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 repositories managed by the scheduler for 10m0s_
|
||||
- _repo-updater: less than 0 repositories managed by the scheduler for 10m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
@ -3100,7 +3100,7 @@ To learn more about Sourcegraph's alerting, see [our alerting documentation](htt
|
||||
|
||||
**Descriptions:**
|
||||
|
||||
- _repo-updater: less than 1 scheduler loops for 8h0m0s_
|
||||
- _repo-updater: less than 0 scheduler loops for 8h0m0s_
|
||||
|
||||
**Possible solutions:**
|
||||
|
||||
|
||||
@ -35,7 +35,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "src_repoupdater_max_sync_backoff",
|
||||
Description: "time since oldest sync",
|
||||
Query: `src_repoupdater_max_sync_backoff`,
|
||||
Query: `sum(src_repoupdater_max_sync_backoff)`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(8 * time.Hour.Seconds()).For(10 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Seconds),
|
||||
@ -47,17 +47,17 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "syncer_sync_start",
|
||||
Description: "sync was started",
|
||||
Query: `rate(src_repoupdater_syncer_start_sync[5m])`,
|
||||
Query: `sum by (family) (rate(src_repoupdater_syncer_start_sync[5m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{family}}").Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs for errors. Ignore this alert if only one code host connection is defined",
|
||||
PossibleSolutions: "Check repo-updater logs for errors.",
|
||||
},
|
||||
Observable{
|
||||
Name: "syncer_sync_duration",
|
||||
Description: "95th repositories sync duration",
|
||||
Query: `histogram_quantile(0.95, rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m]))`,
|
||||
Query: `histogram_quantile(0.95, sum by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{family}}-{{success}}").Unit(Seconds),
|
||||
@ -67,7 +67,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "source_duration",
|
||||
Description: "95th repositories source duration",
|
||||
Query: `histogram_quantile(0.95, rate(src_repoupdater_source_duration_seconds_bucket[1m]))`,
|
||||
Query: `histogram_quantile(0.95, sum by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Seconds),
|
||||
@ -79,9 +79,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "syncer_synced_repos",
|
||||
Description: "repositories synced",
|
||||
Query: `rate(src_repoupdater_syncer_synced_repos_total[1m])`,
|
||||
Query: `sum by (state) (rate(src_repoupdater_syncer_synced_repos_total[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{state}}").Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check network connectivity to code hosts",
|
||||
@ -89,9 +89,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sourced_repos",
|
||||
Description: "repositories sourced",
|
||||
Query: `rate(src_repoupdater_source_repos_total[1m])`,
|
||||
Query: `sum(rate(src_repoupdater_source_repos_total[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check network connectivity to code hosts",
|
||||
@ -99,7 +99,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "user_added_repos",
|
||||
Description: "total number of user added repos",
|
||||
Query: `src_repoupdater_user_repos_total`,
|
||||
Query: `sum(src_repoupdater_user_repos_total)`,
|
||||
DataMayNotExist: true,
|
||||
// 90% of our enforced limit
|
||||
Critical: Alert().GreaterOrEqual(200000 * 0.9).For(5 * time.Minute),
|
||||
@ -112,9 +112,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "purge_failed",
|
||||
Description: "repositories purge failed",
|
||||
Query: `rate(src_repoupdater_purge_failed[1m])`,
|
||||
Query: `sum(rate(src_repoupdater_purge_failed[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(1).For(5 * time.Minute),
|
||||
Warning: Alert().GreaterOrEqual(0).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater's connectivity with gitserver and gitserver logs",
|
||||
@ -124,9 +124,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sched_auto_fetch",
|
||||
Description: "repositories scheduled due to hitting a deadline",
|
||||
Query: `rate(src_repoupdater_sched_auto_fetch[1m])`,
|
||||
Query: `sum(rate(src_repoupdater_sched_auto_fetch[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
@ -134,9 +134,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sched_manual_fetch",
|
||||
Description: "repositories scheduled due to user traffic",
|
||||
Query: `rate(src_repoupdater_sched_manual_fetch[1m])`,
|
||||
Query: `sum(rate(src_repoupdater_sched_manual_fetch[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
@ -146,9 +146,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sched_known_repos",
|
||||
Description: "repositories managed by the scheduler",
|
||||
Query: `src_repoupdater_sched_known_repos`,
|
||||
Query: `sum(src_repoupdater_sched_known_repos)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(10 * time.Minute),
|
||||
Warning: Alert().LessOrEqual(0).For(10 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
@ -156,7 +156,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sched_update_queue_length",
|
||||
Description: "repositories queued for update",
|
||||
Query: `src_repoupdater_sched_update_queue_length`,
|
||||
Query: `sum(src_repoupdater_sched_update_queue_length)`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(1000).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
@ -166,9 +166,9 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sched_loops",
|
||||
Description: "scheduler loops",
|
||||
Query: `rate(src_repoupdater_sched_loops[1m])`,
|
||||
Query: `sum(rate(src_repoupdater_sched_loops[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().LessOrEqual(1).For(8 * time.Hour),
|
||||
Warning: Alert().LessOrEqual(0).For(8 * time.Hour),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts",
|
||||
@ -178,7 +178,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "sched_error",
|
||||
Description: "repositories schedule error rate",
|
||||
Query: `rate(src_repoupdater_sched_error[1m])`,
|
||||
Query: `sum(rate(src_repoupdater_sched_error[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(1).For(time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
@ -196,7 +196,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "perms_syncer_perms",
|
||||
Description: "time gap between least and most up to date permissions",
|
||||
Query: `src_repoupdater_perms_syncer_perms_gap_seconds`,
|
||||
Query: `sum by (type) (src_repoupdater_perms_syncer_perms_gap_seconds)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual((3 * 24 * time.Hour).Seconds()).For(5 * time.Minute), // 3 days
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Seconds),
|
||||
@ -206,7 +206,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "perms_syncer_stale_perms",
|
||||
Description: "number of entities with stale permissions",
|
||||
Query: `src_repoupdater_perms_syncer_stale_perms`,
|
||||
Query: `sum by (type) (src_repoupdater_perms_syncer_stale_perms)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
@ -216,7 +216,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "perms_syncer_no_perms",
|
||||
Description: "number of entities with no permissions",
|
||||
Query: `src_repoupdater_perms_syncer_no_perms`,
|
||||
Query: `sum by (type) (src_repoupdater_perms_syncer_no_perms)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
@ -231,7 +231,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "perms_syncer_sync_duration",
|
||||
Description: "95th permissions sync duration",
|
||||
Query: `histogram_quantile(0.95, rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m]))`,
|
||||
Query: `histogram_quantile(0.95, sum by (le, type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m])))`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(30).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Seconds),
|
||||
@ -241,7 +241,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "perms_syncer_queue_size",
|
||||
Description: "permissions sync queued items",
|
||||
Query: `src_repoupdater_perms_syncer_queue_size`,
|
||||
Query: `sum(src_repoupdater_perms_syncer_queue_size)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
@ -256,7 +256,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "authz_filter_duration",
|
||||
Description: "95th authorization duration",
|
||||
Query: `histogram_quantile(0.95, rate(src_frontend_authz_filter_duration_seconds_bucket{success="true"}[1m]))`,
|
||||
Query: `histogram_quantile(0.95, sum by (le) (rate(src_frontend_authz_filter_duration_seconds_bucket{success="true"}[1m])))`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(1).For(time.Minute),
|
||||
PanelOptions: PanelOptions().Unit(Seconds),
|
||||
@ -266,7 +266,7 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "perms_syncer_sync_errors",
|
||||
Description: "permissions sync error rate",
|
||||
Query: `rate(src_repoupdater_perms_syncer_sync_errors_total[1m]) / rate(src_repoupdater_perms_syncer_sync_duration_seconds_count[1m])`,
|
||||
Query: `sum by (type) (rate(src_repoupdater_perms_syncer_sync_errors_total[1m])) / sum by (type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_count[1m]))`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(1).For(time.Minute),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
@ -287,20 +287,20 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "src_repoupdater_external_services_total",
|
||||
Description: "the total number of external services",
|
||||
Query: `src_repoupdater_external_services_total`,
|
||||
Query: `sum(src_repoupdater_external_services_total)`,
|
||||
DataMayNotExist: true,
|
||||
Critical: Alert().GreaterOrEqual(20000).For(1 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check for spikes in external services, could be abuse",
|
||||
},
|
||||
Observable{
|
||||
Name: "src_repoupdater_user_external_services_total",
|
||||
Description: "the total number of user added external services",
|
||||
Query: `src_repoupdater_user_external_services_total`,
|
||||
Query: `sum(src_repoupdater_user_external_services_total)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(20000).For(1 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check for spikes in external services, could be abuse",
|
||||
},
|
||||
@ -309,10 +309,10 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "repoupdater_queued_sync_jobs_total",
|
||||
Description: "the total number of queued sync jobs",
|
||||
Query: `src_repoupdater_queued_sync_jobs_total`,
|
||||
Query: `sum(src_repoupdater_queued_sync_jobs_total)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(100).For(1 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: `
|
||||
- **Check if jobs are failing to sync:** "SELECT * FROM external_service_sync_jobs WHERE state = 'errored'";
|
||||
@ -322,20 +322,20 @@ func RepoUpdater() *Container {
|
||||
Observable{
|
||||
Name: "repoupdater_completed_sync_jobs_total",
|
||||
Description: "the total number of completed sync jobs",
|
||||
Query: `src_repoupdater_completed_sync_jobs_total`,
|
||||
Query: `sum(src_repoupdater_completed_sync_jobs_total)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(100000).For(1 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. Jobs older than 1 day should have been removed.",
|
||||
},
|
||||
Observable{
|
||||
Name: "repoupdater_errored_sync_jobs_total",
|
||||
Description: "the total number of errored sync jobs",
|
||||
Query: `src_repoupdater_errored_sync_jobs_total`,
|
||||
Query: `sum(src_repoupdater_errored_sync_jobs_total)`,
|
||||
DataMayNotExist: true,
|
||||
Warning: Alert().GreaterOrEqual(100).For(1 * time.Hour),
|
||||
PanelOptions: PanelOptions().LegendFormat("{{type}}").Unit(Number),
|
||||
PanelOptions: PanelOptions().Unit(Number),
|
||||
Owner: ObservableOwnerCloud,
|
||||
PossibleSolutions: "Check repo-updater logs. Check code host connectivity",
|
||||
},
|
||||
|
||||
Loading…
Reference in New Issue
Block a user