mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:11:49 +00:00
Update o11y ownership from IAM/repo-mgmt to Source (#54368)
Does what it says in the title. ## Test plan - Ran `sg generate`
This commit is contained in:
parent
485b6d07ff
commit
e6323ee6e4
@ -218,7 +218,7 @@ Generated query for warning alert: `max((sum by (alert_type) (increase(src_graph
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Identity and Access Management team](https://handbook.sourcegraph.com/departments/engineering/teams/iam).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -251,7 +251,7 @@ Generated query for warning alert: `max((histogram_quantile(0.9, sum by (le) (ra
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -694,7 +694,7 @@ Generated query for warning alert: `max((sum by (code) (increase(searcher_servic
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Identity and Access Management team](https://handbook.sourcegraph.com/departments/engineering/teams/iam).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -724,7 +724,7 @@ Generated query for warning alert: `max((sum by (category) (increase(src_fronten
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -754,7 +754,7 @@ Generated query for warning alert: `max((histogram_quantile(0.99, sum by (le, ca
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -849,7 +849,7 @@ Generated query for critical alert: `max((max by (owner) (observability_test_met
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1422,7 +1422,7 @@ Generated query for critical alert: `max((histogram_quantile(0.9, sum by (le) (l
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1459,7 +1459,7 @@ Generated query for critical alert: `min(((src_gitserver_disk_space_available /
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1493,7 +1493,7 @@ Generated query for critical alert: `max((sum by (instance, cmd) (src_gitserver_
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1526,7 +1526,7 @@ Generated query for warning alert: `max((sum(src_gitserver_clone_queue)) >= 25)`
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1563,7 +1563,7 @@ Generated query for warning alert: `max((sum(src_gitserver_lsremote_queue)) >= 2
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1631,7 +1631,7 @@ Generated query for critical alert: `max((sum by (app_name, db_name) (increase(s
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1663,7 +1663,7 @@ Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1695,7 +1695,7 @@ Generated query for warning alert: `max((cadvisor_container_memory_usage_percent
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1727,7 +1727,7 @@ Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_contai
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1759,7 +1759,7 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_us
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1789,7 +1789,7 @@ Generated query for warning alert: `max((max by (name) (container_oom_events_tot
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1819,7 +1819,7 @@ Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1851,7 +1851,7 @@ Generated query for warning alert: `max((max by (instance) (go_gc_duration_secon
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -1883,7 +1883,7 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*gitserver"})
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4090,7 +4090,7 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4129,7 +4129,7 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4162,7 +4162,7 @@ Generated query for critical alert: `max((max by (family) (rate(src_repoupdater_
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4193,7 +4193,7 @@ Generated query for warning alert: `min((max by (family) (rate(src_repoupdater_s
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4224,7 +4224,7 @@ Generated query for warning alert: `max((histogram_quantile(0.95, max by (le, fa
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4255,7 +4255,7 @@ Generated query for warning alert: `max((histogram_quantile(0.95, max by (le) (r
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4286,7 +4286,7 @@ Generated query for warning alert: `max((max(rate(src_repoupdater_syncer_synced_
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4317,7 +4317,7 @@ Generated query for warning alert: `min((max(rate(src_repoupdater_source_repos_t
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4348,7 +4348,7 @@ Generated query for warning alert: `max((max(rate(src_repoupdater_purge_failed[1
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4379,7 +4379,7 @@ Generated query for warning alert: `min((max(rate(src_repoupdater_sched_auto_fet
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4410,7 +4410,7 @@ Generated query for warning alert: `min((max(src_repoupdater_sched_known_repos))
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4441,7 +4441,7 @@ Generated query for critical alert: `max((max(deriv(src_repoupdater_sched_update
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4473,7 +4473,7 @@ Generated query for warning alert: `min((max(rate(src_repoupdater_sched_loops[1m
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4504,7 +4504,7 @@ Generated query for warning alert: `max((max(src_repoupdater_stale_repos)) >= 1)
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4536,7 +4536,7 @@ Generated query for critical alert: `max((max(rate(src_repoupdater_sched_error[1
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Identity and Access Management team](https://handbook.sourcegraph.com/departments/engineering/teams/iam).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4567,7 +4567,7 @@ Generated query for warning alert: `max((max by (type) (src_repoupdater_perms_sy
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Identity and Access Management team](https://handbook.sourcegraph.com/departments/engineering/teams/iam).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4599,7 +4599,7 @@ Generated query for warning alert: `max((histogram_quantile(0.95, max by (le, ty
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Identity and Access Management team](https://handbook.sourcegraph.com/departments/engineering/teams/iam).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4630,7 +4630,7 @@ Generated query for critical alert: `max((max by (type) (ceil(rate(src_repoupdat
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4662,7 +4662,7 @@ Generated query for critical alert: `max((max(src_repoupdater_external_services_
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4693,7 +4693,7 @@ Generated query for warning alert: `max((max(src_repoupdater_queued_sync_jobs_to
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4724,7 +4724,7 @@ Generated query for warning alert: `max((max(src_repoupdater_completed_sync_jobs
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4755,7 +4755,7 @@ Generated query for warning alert: `max((max(src_repoupdater_errored_sync_jobs_p
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4786,7 +4786,7 @@ Generated query for warning alert: `min((max by (name) (src_github_rate_limit_re
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4817,7 +4817,7 @@ Generated query for warning alert: `min((max by (name) (src_github_rate_limit_re
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4848,7 +4848,7 @@ Generated query for warning alert: `min((max by (name) (src_github_rate_limit_re
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4885,7 +4885,7 @@ Generated query for critical alert: `min((max by (name) (src_gitlab_rate_limit_r
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4953,7 +4953,7 @@ Generated query for critical alert: `max((sum by (app_name, db_name) (increase(s
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -4985,7 +4985,7 @@ Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5017,7 +5017,7 @@ Generated query for critical alert: `max((cadvisor_container_memory_usage_percen
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5049,7 +5049,7 @@ Generated query for warning alert: `max((quantile_over_time(0.9, cadvisor_contai
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5081,7 +5081,7 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5113,7 +5113,7 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_cpu_us
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5145,7 +5145,7 @@ Generated query for warning alert: `max((max_over_time(cadvisor_container_memory
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5175,7 +5175,7 @@ Generated query for warning alert: `max((max by (name) (container_oom_events_tot
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5205,7 +5205,7 @@ Generated query for warning alert: `max((max by (instance) (go_goroutines{job=~"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
@ -5237,7 +5237,7 @@ Generated query for warning alert: `max((max by (instance) (go_gc_duration_secon
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Repo Management team](https://handbook.sourcegraph.com/departments/engineering/teams/repo-management).*</sub>
|
||||
<sub>*Managed by the [Sourcegraph Source team](https://handbook.sourcegraph.com/departments/engineering/teams/source).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
368
doc/admin/observability/dashboards.md
generated
368
doc/admin/observability/dashboards.md
generated
File diff suppressed because it is too large
Load Diff
@ -155,7 +155,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.9, sum by(le) (rate(src_http_request_duration_seconds_bucket{route!="raw",route!="blob",route!~"graphql.*"}[10m])))`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(2),
|
||||
Panel: monitoring.Panel().LegendFormat("latency").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Confirm that the Sourcegraph frontend has enough CPU/memory using the provisioning panels.
|
||||
- Investigate potential sources of latency by selecting Explore and modifying the 'sum by(le)' section to include additional labels: for example, 'sum by(le, job)' or 'sum by (le, instance)'.
|
||||
@ -168,7 +168,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.9, sum by(le) (rate(src_http_request_duration_seconds_bucket{route="blob"}[10m])))`,
|
||||
Critical: monitoring.Alert().GreaterOrEqual(5),
|
||||
Panel: monitoring.Panel().LegendFormat("latency").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Confirm that the Sourcegraph frontend has enough CPU/memory using the provisioning panels.
|
||||
- Trace a request to see what the slowest part is: https://docs.sourcegraph.com/admin/observability/tracing
|
||||
@ -451,7 +451,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum by(category) (increase(src_frontend_internal_request_duration_seconds_count{code!~"2.."}[5m])) / ignoring(code) group_left sum(increase(src_frontend_internal_request_duration_seconds_count[5m])) * 100`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(5).For(15 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{category}}").Unit(monitoring.Percentage),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- May not be a substantial issue, check the 'frontend' logs for potential causes.
|
||||
`,
|
||||
@ -464,7 +464,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.99, sum by (le,category)(rate(src_gitserver_request_duration_seconds_bucket{job=~"(sourcegraph-)?frontend"}[5m])))`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(20),
|
||||
Panel: monitoring.Panel().LegendFormat("{{category}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "none",
|
||||
},
|
||||
{
|
||||
@ -473,7 +473,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend",code!~"2.."}[5m])) / ignoring(code) group_left sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend"}[5m])) * 100`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(5).For(15 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{category}}").Unit(monitoring.Percentage),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "none",
|
||||
},
|
||||
},
|
||||
@ -510,7 +510,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Rate (QPS) of requests to sign-in`,
|
||||
},
|
||||
{
|
||||
@ -519,7 +519,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-in",method="post"}[5m])) by (le))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Milliseconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `99% percentile of sign-in latency`,
|
||||
},
|
||||
{
|
||||
@ -528,7 +528,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-in",method="post"}[5m]))*100`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Percentage),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Percentage of sign-in requests grouped by http code`,
|
||||
},
|
||||
},
|
||||
@ -540,7 +540,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Rate (QPS) of requests to sign-up`,
|
||||
},
|
||||
{
|
||||
@ -550,7 +550,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-up",method="post"}[5m])) by (le))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Milliseconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `99% percentile of sign-up latency`,
|
||||
},
|
||||
{
|
||||
@ -559,7 +559,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-up",method="post"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))*100`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Percentage),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Percentage of sign-up requests grouped by http code`,
|
||||
},
|
||||
},
|
||||
@ -570,7 +570,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Rate (QPS) of requests to sign-out`,
|
||||
},
|
||||
{
|
||||
@ -579,7 +579,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.99, sum(rate(src_http_request_duration_seconds_bucket{route="sign-out"}[5m])) by (le))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Milliseconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `99% percentile of sign-out latency`,
|
||||
},
|
||||
{
|
||||
@ -588,7 +588,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: ` sum by (code)(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))/ ignoring (code) group_left sum(irate(src_http_request_duration_seconds_count{route="sign-out"}[5m]))*100`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Percentage),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Percentage of sign-out requests grouped by http code`,
|
||||
},
|
||||
},
|
||||
@ -599,7 +599,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum(rate(src_frontend_account_failed_sign_in_attempts_total[1m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Failed sign-in attempts per minute`,
|
||||
},
|
||||
{
|
||||
@ -608,7 +608,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum(rate(src_frontend_account_lockouts_total[1m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Account lockouts per minute`,
|
||||
},
|
||||
},
|
||||
@ -644,7 +644,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Warning: monitoring.Alert().GreaterOrEqual(15000).For(5 * time.Minute),
|
||||
Critical: monitoring.Alert().GreaterOrEqual(30000).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Revert recent commits that cause extensive listing from "external_services" and/or "user_external_accounts" tables.
|
||||
`,
|
||||
@ -655,7 +655,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `min by (kubernetes_name) (src_encryption_cache_hit_total/(src_encryption_cache_hit_total+src_encryption_cache_miss_total))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
- Encryption cache hit ratio (hits/(hits+misses)) - minimum across all instances of a workload.
|
||||
`,
|
||||
@ -666,7 +666,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: `sum by (kubernetes_name) (irate(src_encryption_cache_eviction_total[5m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
- Rate of encryption cache evictions (caused by cache exceeding its maximum size) - sum across all instances of a workload
|
||||
`,
|
||||
@ -1032,7 +1032,7 @@ func Frontend() *monitoring.Dashboard {
|
||||
Query: "histogram_quantile(0.95, sum (rate(src_http_request_duration_seconds_bucket{route=~\"webhooks|github.webhooks|gitlab.webhooks|bitbucketServer.webhooks|bitbucketCloud.webhooks\"}[5m])) by (le, route))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("duration").Unit(monitoring.Seconds).With(monitoring.PanelOptions.NoLegend()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
p95 response time to incoming webhook requests from code hosts.
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.Bytes).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -63,7 +63,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -77,7 +77,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.Percentage).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -88,7 +88,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -110,7 +110,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").
|
||||
Unit(monitoring.Percentage).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
Indicates disk space remaining for each gitserver instance, which is used to determine when to start evicting least-used repository clones from disk (default 10%, configured by 'SRC_REPOS_DESIRED_PERCENT_FREE').
|
||||
`,
|
||||
@ -129,7 +129,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.ReadsPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -141,7 +141,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.WritesPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -155,7 +155,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.ReadsPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -167,7 +167,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.WritesPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -181,7 +181,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.ReadsPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -193,7 +193,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{container_label_io_kubernetes_pod_name}}").
|
||||
Unit(monitoring.WritesPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
`,
|
||||
},
|
||||
@ -207,7 +207,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Critical: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}} {{cmd}}").
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
A high value signals load.
|
||||
`,
|
||||
@ -225,7 +225,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Interpretation: "per second rate per command across all instances",
|
||||
Panel: monitoring.Panel().LegendFormat("{{cmd}}").
|
||||
With(monitoring.PanelOptions.LegendOnRight()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -235,7 +235,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(src_gitserver_clone_queue)",
|
||||
Warning: monitoring.Alert().GreaterOrEqual(25),
|
||||
Panel: monitoring.Panel().LegendFormat("queue size"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- **If you just added several repositories**, the warning may be expected.
|
||||
- **Check which repositories need cloning**, by visiting e.g. https://sourcegraph.example.com/site-admin/repositories?filter=not-cloned
|
||||
@ -247,7 +247,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(src_gitserver_lsremote_queue)",
|
||||
Warning: monitoring.Alert().GreaterOrEqual(25),
|
||||
Panel: monitoring.Panel().LegendFormat("queue size"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- **Check the code host status indicator for errors:** on the Sourcegraph app homepage, when signed in as an admin click the cloud icon in the top right corner of the page.
|
||||
- **Check if the issue continues to happen after 30 minutes**, it may be temporary.
|
||||
@ -262,7 +262,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "max(src_gitserver_echo_duration_seconds)",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("running commands").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
A high value here likely indicates a problem, especially if consistently high.
|
||||
You can query for individual commands using 'sum by (cmd)(src_gitserver_exec_running)' in Grafana ('/-/debug/grafana') to see if a specific Git Server command might be spiking in frequency.
|
||||
@ -273,7 +273,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
- **Kubernetes and Docker Compose:** Check that you are running a similar number of git server replicas and that their CPU/memory limits are allocated according to what is shown in the [Sourcegraph resource estimator](../deploy/resource_estimator.md).
|
||||
`,
|
||||
},
|
||||
shared.FrontendInternalAPIErrorResponses("gitserver", monitoring.ObservableOwnerRepoManagement).Observable(),
|
||||
shared.FrontendInternalAPIErrorResponses("gitserver", monitoring.ObservableOwnerSource).Observable(),
|
||||
},
|
||||
{
|
||||
{
|
||||
@ -282,7 +282,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "src_gitserver_repo_count",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("repo count"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
MultiInstance: true,
|
||||
Interpretation: `
|
||||
This metric is only for informational purposes. It indicates the total number of repositories on gitserver.
|
||||
@ -306,7 +306,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`false`}[5m])) by (le))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{le}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `A high value means any internal service trying to clone a repo from gitserver is slowed down.`,
|
||||
},
|
||||
{
|
||||
@ -315,7 +315,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`false`, instance=~`${shard:regex}`}[5m])) by (le, instance))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `A high value means any internal service trying to clone a repo from gitserver is slowed down.`,
|
||||
},
|
||||
},
|
||||
@ -326,7 +326,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`true`}[5m])) by (le))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{le}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `95th percentile gitservice error request duration aggregate`,
|
||||
},
|
||||
{
|
||||
@ -335,7 +335,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{type=`gitserver`, error=`true`, instance=~`${shard:regex}`}[5m])) by (le, instance))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `95th percentile gitservice error request duration per shard`,
|
||||
},
|
||||
},
|
||||
@ -346,7 +346,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`false`}[5m]))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("gitservers").Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Aggregate gitservice request rate`,
|
||||
},
|
||||
{
|
||||
@ -355,7 +355,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`false`, instance=~`${shard:regex}`}[5m]))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Per shard gitservice request rate`,
|
||||
},
|
||||
},
|
||||
@ -366,7 +366,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`true`}[5m]))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("gitservers").Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Aggregate gitservice request error rate`,
|
||||
},
|
||||
{
|
||||
@ -375,7 +375,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(rate(src_gitserver_gitservice_duration_seconds_count{type=`gitserver`, error=`true`, instance=~`${shard:regex}`}[5m]))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Per shard gitservice request error rate`,
|
||||
},
|
||||
},
|
||||
@ -386,7 +386,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(src_gitserver_gitservice_running{type=`gitserver`})",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("gitservers").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Aggregate gitservice requests running`,
|
||||
},
|
||||
{
|
||||
@ -395,7 +395,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum(src_gitserver_gitservice_running{type=`gitserver`, instance=~`${shard:regex}`}) by (instance)",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.RequestsPerSecond),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `Per shard gitservice requests running`,
|
||||
},
|
||||
},
|
||||
@ -412,7 +412,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "max by (instance) (src_gitserver_janitor_running)",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("janitor process running").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "1, if the janitor process is currently running",
|
||||
},
|
||||
},
|
||||
@ -423,7 +423,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "histogram_quantile(0.95, sum(rate(src_gitserver_janitor_job_duration_seconds_bucket[5m])) by (le, job_name))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{job_name}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "95th percentile job run duration",
|
||||
},
|
||||
},
|
||||
@ -434,7 +434,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: `sum by (job_name) (rate(src_gitserver_janitor_job_duration_seconds_count{success="false"}[5m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{job_name}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "the rate of failures over 5m (by job)",
|
||||
},
|
||||
},
|
||||
@ -445,7 +445,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum by (instance) (rate(src_gitserver_repos_removed_disk_pressure[5m]))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "Repositories removed due to disk pressure",
|
||||
},
|
||||
},
|
||||
@ -456,7 +456,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: "sum by (instance) (increase(src_gitserver_non_existing_repos_removed[5m]))",
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "Repositories removed because they are not defined in the DB",
|
||||
},
|
||||
},
|
||||
@ -467,7 +467,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: `sum by (reason) (rate(src_gitserver_maintenance_status{success="true"}[1h]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{reason}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "the rate of successful sg maintenance jobs and the reason why they were triggered",
|
||||
},
|
||||
},
|
||||
@ -478,7 +478,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
Query: `sum by (skipped) (rate(src_gitserver_prune_status{success="true"}[1h]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("skipped={{skipped}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "the rate of successful git prune jobs over 1h and whether they were skipped",
|
||||
},
|
||||
},
|
||||
@ -541,7 +541,7 @@ func GitServer() *monitoring.Dashboard {
|
||||
ServiceName: "gitserver",
|
||||
InstanceFilterRegex: `${shard:regex}`,
|
||||
},
|
||||
monitoring.ObservableOwnerRepoManagement,
|
||||
monitoring.ObservableOwnerSource,
|
||||
),
|
||||
|
||||
shared.NewGRPCServerMetricsGroup(
|
||||
@ -566,10 +566,10 @@ func GitServer() *monitoring.Dashboard {
|
||||
|
||||
shared.HTTP.NewHandlersGroup(containerName),
|
||||
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerRepoManagement, provisioningIndicatorsOptions),
|
||||
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSource, provisioningIndicatorsOptions),
|
||||
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@ -25,7 +25,7 @@ func GitHubProxy() *monitoring.Dashboard {
|
||||
Query: `max(github_proxy_waiting_requests)`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("requests waiting"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- **Check github-proxy logs for network connection issues.
|
||||
- **Check github status.`,
|
||||
|
||||
@ -53,7 +53,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(timestamp(vector(time()))) - max(src_repoupdater_syncer_sync_last_time)`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
A high value here indicates issues synchronizing repo metadata.
|
||||
If the value is persistently high, make sure all external services have valid tokens.
|
||||
@ -65,7 +65,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_max_sync_backoff)`,
|
||||
Critical: monitoring.Alert().GreaterOrEqual(syncDurationThreshold.Seconds()).For(10 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: fmt.Sprintf(`
|
||||
An alert here indicates that no code host connections have synced in at least %v. This indicates that there could be a configuration issue
|
||||
with your code hosts connections or networking issues affecting communication with your code hosts.
|
||||
@ -83,7 +83,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Warning: monitoring.Alert().Greater(0.5).For(10 * time.Minute),
|
||||
Critical: monitoring.Alert().Greater(1).For(10 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{family}}").Unit(monitoring.Number).With(monitoring.PanelOptions.ZeroIfNoData()),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
An alert here indicates errors syncing site level repo metadata with code hosts. This indicates that there could be a configuration issue
|
||||
with your code hosts connections or networking issues affecting communication with your code hosts.
|
||||
@ -102,7 +102,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: fmt.Sprintf(`max by (family) (rate(src_repoupdater_syncer_start_sync{family="Syncer.SyncExternalService"}[%s]))`, syncDurationThreshold.String()),
|
||||
Warning: monitoring.Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
Panel: monitoring.Panel().LegendFormat("Family: {{family}} Owner: {{owner}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs for errors.",
|
||||
},
|
||||
{
|
||||
@ -111,7 +111,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.95, max by (le, family, success) (rate(src_repoupdater_syncer_sync_duration_seconds_bucket[1m])))`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(30).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{family}}-{{success}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host",
|
||||
},
|
||||
{
|
||||
@ -120,7 +120,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.95, max by (le) (rate(src_repoupdater_source_duration_seconds_bucket[1m])))`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(30).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host",
|
||||
},
|
||||
},
|
||||
@ -133,7 +133,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
AggregateBy(monitoring.AggregatorMax).
|
||||
For(syncDurationThreshold),
|
||||
Panel: monitoring.Panel().LegendFormat("{{state}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check network connectivity to code hosts",
|
||||
},
|
||||
{
|
||||
@ -142,7 +142,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_source_repos_total[1m]))`,
|
||||
Warning: monitoring.Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check network connectivity to code hosts",
|
||||
},
|
||||
},
|
||||
@ -153,7 +153,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_purge_failed[1m]))`,
|
||||
Warning: monitoring.Alert().Greater(0).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater's connectivity with gitserver and gitserver logs",
|
||||
},
|
||||
},
|
||||
@ -164,7 +164,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_sched_auto_fetch[1m]))`,
|
||||
Warning: monitoring.Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs.",
|
||||
},
|
||||
{
|
||||
@ -173,7 +173,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_sched_manual_fetch[1m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
Check repo-updater logs if this value is persistently high.
|
||||
This does not indicate anything if there are no user added code hosts.
|
||||
@ -187,7 +187,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_sched_known_repos)`,
|
||||
Warning: monitoring.Alert().LessOrEqual(0).For(10 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs. This is expected to fire if there are no user added code hosts",
|
||||
},
|
||||
{
|
||||
@ -197,7 +197,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
// Alert if the derivative is positive for longer than 120 minutes
|
||||
Critical: monitoring.Alert().Greater(0).For(120 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs for indications that the queue is not being processed. The queue length should trend downwards over time as items are sent to GitServer",
|
||||
},
|
||||
{
|
||||
@ -206,7 +206,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_sched_loops[1m]))`,
|
||||
Warning: monitoring.Alert().LessOrEqual(0).For(syncDurationThreshold),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs for errors. This is expected to fire if there are no user added code hosts",
|
||||
},
|
||||
},
|
||||
@ -217,7 +217,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_stale_repos)`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(1).For(25 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
Check repo-updater logs for errors.
|
||||
Check for rows in gitserver_repos where LastError is not an empty string.
|
||||
@ -229,7 +229,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_sched_error[1m]))`,
|
||||
Critical: monitoring.Alert().GreaterOrEqual(1).For(25 * time.Minute),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs for errors",
|
||||
},
|
||||
},
|
||||
@ -245,7 +245,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "total number of user permissions syncs",
|
||||
Query: `sum(src_repoupdater_perms_syncer_success_syncs{type="user"})`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the total number of user permissions sync completed.",
|
||||
},
|
||||
@ -254,7 +254,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "number of user permissions syncs [5m]",
|
||||
Query: `sum(increase(src_repoupdater_perms_syncer_success_syncs{type="user"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the number of users permissions syncs completed.",
|
||||
},
|
||||
@ -263,7 +263,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "number of first user permissions syncs [5m]",
|
||||
Query: `sum(increase(src_repoupdater_perms_syncer_initial_syncs{type="user"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the number of permissions syncs done for the first time for the user.",
|
||||
},
|
||||
@ -275,7 +275,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "total number of repo permissions syncs",
|
||||
Query: `sum(src_repoupdater_perms_syncer_success_syncs{type="repo"})`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the total number of repo permissions sync completed.",
|
||||
},
|
||||
@ -284,7 +284,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "number of repo permissions syncs over 5m",
|
||||
Query: `sum(increase(src_repoupdater_perms_syncer_success_syncs{type="repo"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the number of repos permissions syncs completed.",
|
||||
},
|
||||
@ -293,7 +293,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "number of first repo permissions syncs over 5m",
|
||||
Query: `sum(increase(src_repoupdater_perms_syncer_initial_syncs{type="repo"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the number of permissions syncs done for the first time for the repo.",
|
||||
},
|
||||
@ -304,7 +304,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "max duration between two consecutive permissions sync for user",
|
||||
Query: `max(max_over_time (src_repoupdater_perms_syncer_perms_consecutive_sync_delay{type="user"} [1m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("seconds").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the max delay between two consecutive permissions sync for a user during the period.",
|
||||
},
|
||||
@ -313,7 +313,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "max duration between two consecutive permissions sync for repo",
|
||||
Query: `max(max_over_time (src_repoupdater_perms_syncer_perms_consecutive_sync_delay{type="repo"} [1m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("seconds").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the max delay between two consecutive permissions sync for a repo during the period.",
|
||||
},
|
||||
@ -324,7 +324,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "max duration between user creation and first permissions sync",
|
||||
Query: `max(max_over_time(src_repoupdater_perms_syncer_perms_first_sync_delay{type="user"}[1m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("seconds").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the max delay between user creation and their permissions sync",
|
||||
},
|
||||
@ -333,7 +333,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "max duration between repo creation and first permissions sync over 1m",
|
||||
Query: `max(max_over_time(src_repoupdater_perms_syncer_perms_first_sync_delay{type="repo"}[1m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("seconds").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the max delay between repo creation and their permissions sync",
|
||||
},
|
||||
@ -344,7 +344,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "number of permissions found during user/repo permissions sync",
|
||||
Query: `sum by (type) (src_repoupdater_perms_syncer_perms_found)`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the number permissions found during users/repos permissions sync.",
|
||||
},
|
||||
@ -353,7 +353,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "average number of permissions found during permissions sync per user/repo",
|
||||
Query: `avg by (type) (src_repoupdater_perms_syncer_perms_found)`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the average number permissions found during permissions sync per user/repo.",
|
||||
},
|
||||
@ -365,7 +365,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max by (type) (src_repoupdater_perms_syncer_outdated_perms)`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- **Enabled permissions for the first time:** Wait for few minutes and see if the number goes down.
|
||||
- **Otherwise:** Increase the API rate limit to [GitHub](https://docs.sourcegraph.com/admin/external_service/github#github-com-rate-limits), [GitLab](https://docs.sourcegraph.com/admin/external_service/gitlab#internal-rate-limits) or [Bitbucket Server](https://docs.sourcegraph.com/admin/external_service/bitbucket_server#internal-rate-limits).
|
||||
@ -379,7 +379,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `histogram_quantile(0.95, max by (le, type) (rate(src_repoupdater_perms_syncer_sync_duration_seconds_bucket[1m])))`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(30).For(5 * time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check the network latency is reasonable (<50ms) between the Sourcegraph and the code host.",
|
||||
},
|
||||
},
|
||||
@ -390,7 +390,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max by (type) (ceil(rate(src_repoupdater_perms_syncer_sync_errors_total[1m])))`,
|
||||
Critical: monitoring.Alert().GreaterOrEqual(1).For(time.Minute),
|
||||
Panel: monitoring.Panel().LegendFormat("{{type}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Check the network connectivity the Sourcegraph and the code host.
|
||||
- Check if API rate limit quota is exhausted on the code host.
|
||||
@ -402,7 +402,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(rate(src_repoupdater_perms_syncer_schedule_repos_total[1m]))`,
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerIAM,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: `
|
||||
Indicates how many repositories have been scheduled for a permissions sync.
|
||||
More about repository permissions synchronization [here](https://docs.sourcegraph.com/admin/permissions/syncing#scheduling)
|
||||
@ -422,7 +422,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_external_services_total)`,
|
||||
Critical: monitoring.Alert().GreaterOrEqual(20000).For(1 * time.Hour),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check for spikes in external services, could be abuse",
|
||||
},
|
||||
},
|
||||
@ -433,7 +433,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_queued_sync_jobs_total)`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(100).For(1 * time.Hour),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- **Check if jobs are failing to sync:** "SELECT * FROM external_service_sync_jobs WHERE state = 'errored'";
|
||||
- **Increase the number of workers** using the 'repoConcurrentExternalServiceSyncers' site config.
|
||||
@ -445,7 +445,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_completed_sync_jobs_total)`,
|
||||
Warning: monitoring.Alert().GreaterOrEqual(100000).For(1 * time.Hour),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs. Jobs older than 1 day should have been removed.",
|
||||
},
|
||||
{
|
||||
@ -454,7 +454,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max(src_repoupdater_errored_sync_jobs_percentage)`,
|
||||
Warning: monitoring.Alert().Greater(10).For(1 * time.Hour),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Percentage),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: "Check repo-updater logs. Check code host connectivity",
|
||||
},
|
||||
},
|
||||
@ -466,7 +466,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
// 5% of initial limit of 5000
|
||||
Warning: monitoring.Alert().LessOrEqual(250),
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Consider creating a new token for the indicated resource (the 'name' label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure.
|
||||
`,
|
||||
@ -478,7 +478,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
// 5% of initial limit of 5000
|
||||
Warning: monitoring.Alert().LessOrEqual(250),
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Consider creating a new token for the indicated resource (the 'name' label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure.
|
||||
`,
|
||||
@ -489,7 +489,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Query: `max by (name) (src_github_rate_limit_remaining_v2{resource="search"})`,
|
||||
Warning: monitoring.Alert().LessOrEqual(5),
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `
|
||||
- Consider creating a new token for the indicated resource (the 'name' label for series below the threshold in the dashboard) under a dedicated machine user to reduce rate limit pressure.
|
||||
`,
|
||||
@ -501,7 +501,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "time spent waiting for the GitHub graphql API rate limiter",
|
||||
Query: `max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="graphql"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates how long we're waiting on the rate limit once it has been exceeded",
|
||||
},
|
||||
@ -510,7 +510,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "time spent waiting for the GitHub rest API rate limiter",
|
||||
Query: `max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="rest"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates how long we're waiting on the rate limit once it has been exceeded",
|
||||
},
|
||||
@ -519,7 +519,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "time spent waiting for the GitHub search API rate limiter",
|
||||
Query: `max by(name) (rate(src_github_rate_limit_wait_duration_seconds{resource="search"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates how long we're waiting on the rate limit once it has been exceeded",
|
||||
},
|
||||
@ -532,7 +532,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
// 5% of initial limit of 600
|
||||
Critical: monitoring.Alert().LessOrEqual(30),
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NextSteps: `Try restarting the pod to get a different public IP.`,
|
||||
},
|
||||
{
|
||||
@ -540,7 +540,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "time spent waiting for the GitLab rest API rate limiter",
|
||||
Query: `max by (name) (rate(src_gitlab_rate_limit_wait_duration_seconds{resource="rest"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates how long we're waiting on the rate limit once it has been exceeded",
|
||||
},
|
||||
@ -551,7 +551,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "95th percentile time spent successfully waiting on our internal rate limiter",
|
||||
Query: `histogram_quantile(0.95, sum(rate(src_internal_rate_limit_wait_duration_bucket{failed="false"}[5m])) by (le, urn))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{urn}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates how long we're waiting on our internal rate limiter when communicating with a code host",
|
||||
},
|
||||
@ -560,7 +560,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
Description: "rate of failures waiting on our internal rate limiter",
|
||||
Query: `sum by (urn) (rate(src_internal_rate_limit_wait_duration_count{failed="true"}[5m]))`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{urn}}"),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
NoAlert: true,
|
||||
Interpretation: "The rate at which we fail our internal rate limiter.",
|
||||
},
|
||||
@ -581,7 +581,7 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
}, monitoring.ObservableOwnerRepoManagement),
|
||||
}, monitoring.ObservableOwnerSource),
|
||||
|
||||
shared.NewGRPCInternalErrorMetricsGroup(
|
||||
shared.GRPCInternalErrorMetricsOptions{
|
||||
@ -589,15 +589,15 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
RawGRPCServiceName: "repoupdater.v1.RepoUpdaterService",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerRepoManagement),
|
||||
}, monitoring.ObservableOwnerSource),
|
||||
|
||||
shared.HTTP.NewHandlersGroup(containerName),
|
||||
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, containerMonitoringOptions),
|
||||
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerRepoManagement, nil),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSource, containerMonitoringOptions),
|
||||
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@ type gitServer struct{}
|
||||
// src_gitserver_api_duration_seconds_bucket
|
||||
// src_gitserver_api_errors_total
|
||||
func (gitServer) NewAPIGroup(containerName string) monitoring.Group {
|
||||
return Observation.NewGroup(containerName, monitoring.ObservableOwnerRepoManagement, ObservationGroupOptions{
|
||||
return Observation.NewGroup(containerName, monitoring.ObservableOwnerSource, ObservationGroupOptions{
|
||||
GroupConstructorOptions: GroupConstructorOptions{
|
||||
Namespace: "gitserver",
|
||||
DescriptionRoot: "Gitserver API (powered by internal/observation)",
|
||||
@ -46,7 +46,7 @@ func (gitServer) NewAPIGroup(containerName string) monitoring.Group {
|
||||
// src_gitserver_client_duration_seconds_bucket
|
||||
// src_gitserver_client_errors_total
|
||||
func (gitServer) NewClientGroup(containerName string) monitoring.Group {
|
||||
return Observation.NewGroup(containerName, monitoring.ObservableOwnerRepoManagement, ObservationGroupOptions{
|
||||
return Observation.NewGroup(containerName, monitoring.ObservableOwnerSource, ObservationGroupOptions{
|
||||
GroupConstructorOptions: GroupConstructorOptions{
|
||||
Namespace: "gitserver",
|
||||
DescriptionRoot: "Gitserver Client",
|
||||
@ -84,7 +84,7 @@ func (gitServer) NewBatchLogSemaphoreWait(containerName string) monitoring.Group
|
||||
NoAlertsOption("none")(Observation.Duration(ObservableConstructorOptions{
|
||||
MetricNameRoot: "batch_log_semaphore_wait",
|
||||
MetricDescriptionRoot: "batch log semaphore",
|
||||
})(containerName, monitoring.ObservableOwnerRepoManagement)).Observable(),
|
||||
})(containerName, monitoring.ObservableOwnerSource)).Observable(),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@ -22,7 +22,7 @@ func (http) NewHandlersGroup(name string) monitoring.Group {
|
||||
Query: fmt.Sprintf("sum by (route) (rate(src_http_request_duration_seconds_count{app=\"%s\",code=~\"2..\"}[5m]))", name),
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{route}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "The number of healthy HTTP requests per second to internal HTTP api",
|
||||
},
|
||||
{
|
||||
@ -31,7 +31,7 @@ func (http) NewHandlersGroup(name string) monitoring.Group {
|
||||
Query: fmt.Sprintf("sum by (route) (rate(src_http_request_duration_seconds_count{app=\"%s\",code!~\"2..\"}[5m]))", name),
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{route}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "The number of unhealthy HTTP requests per second to internal HTTP api",
|
||||
},
|
||||
{
|
||||
@ -40,7 +40,7 @@ func (http) NewHandlersGroup(name string) monitoring.Group {
|
||||
Query: fmt.Sprintf("sum by (code) (rate(src_http_request_duration_seconds_count{app=\"%s\"}[5m]))", name),
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{code}}").Unit(monitoring.Number),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "The number of HTTP requests per second by code",
|
||||
},
|
||||
},
|
||||
@ -51,7 +51,7 @@ func (http) NewHandlersGroup(name string) monitoring.Group {
|
||||
Query: fmt.Sprintf("histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app=\"%s\",code=~\"2..\"}[5m])) by (le, route))", name),
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{route}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "The 95th percentile duration by route when the status code is 200 ",
|
||||
},
|
||||
{
|
||||
@ -60,7 +60,7 @@ func (http) NewHandlersGroup(name string) monitoring.Group {
|
||||
Query: fmt.Sprintf("histogram_quantile(0.95, sum(rate(src_http_request_duration_seconds_bucket{app=\"%s\",code!~\"2..\"}[5m])) by (le, route))", name),
|
||||
NoAlert: true,
|
||||
Panel: monitoring.Panel().LegendFormat("{{route}}").Unit(monitoring.Seconds),
|
||||
Owner: monitoring.ObservableOwnerRepoManagement,
|
||||
Owner: monitoring.ObservableOwnerSource,
|
||||
Interpretation: "The 95th percentile duration by route when the status code is not 200 ",
|
||||
},
|
||||
},
|
||||
|
||||
@ -93,7 +93,7 @@ func Worker() *monitoring.Dashboard {
|
||||
Panel: monitoring.Panel().LegendFormat("{{tableName}}").Unit(monitoring.Percentage).Min(0).Max(100),
|
||||
Owner: owner,
|
||||
}
|
||||
}(monitoring.ObservableOwnerRepoManagement).WithNoAlerts(`
|
||||
}(monitoring.ObservableOwnerSource).WithNoAlerts(`
|
||||
Percentage of encrypted database records
|
||||
`).Observable(),
|
||||
|
||||
@ -101,7 +101,7 @@ func Worker() *monitoring.Dashboard {
|
||||
MetricNameRoot: "records_encrypted",
|
||||
MetricDescriptionRoot: "database",
|
||||
By: []string{"tableName"},
|
||||
})(containerName, monitoring.ObservableOwnerRepoManagement).WithNoAlerts(`
|
||||
})(containerName, monitoring.ObservableOwnerSource).WithNoAlerts(`
|
||||
Number of encrypted database records every 5m
|
||||
`).Observable(),
|
||||
|
||||
@ -109,14 +109,14 @@ func Worker() *monitoring.Dashboard {
|
||||
MetricNameRoot: "records_decrypted",
|
||||
MetricDescriptionRoot: "database",
|
||||
By: []string{"tableName"},
|
||||
})(containerName, monitoring.ObservableOwnerRepoManagement).WithNoAlerts(`
|
||||
})(containerName, monitoring.ObservableOwnerSource).WithNoAlerts(`
|
||||
Number of encrypted database records every 5m
|
||||
`).Observable(),
|
||||
|
||||
shared.Observation.Errors(shared.ObservableConstructorOptions{
|
||||
MetricNameRoot: "record_encryption",
|
||||
MetricDescriptionRoot: "encryption",
|
||||
})(containerName, monitoring.ObservableOwnerRepoManagement).WithNoAlerts(`
|
||||
})(containerName, monitoring.ObservableOwnerSource).WithNoAlerts(`
|
||||
Number of database record encryption/decryption errors every 5m
|
||||
`).Observable(),
|
||||
},
|
||||
|
||||
@ -501,10 +501,10 @@ var (
|
||||
handbookSlug: "security",
|
||||
teamName: "Security",
|
||||
}
|
||||
ObservableOwnerRepoManagement = ObservableOwner{
|
||||
identifier: "repo-management",
|
||||
handbookSlug: "repo-management",
|
||||
teamName: "Repo Management",
|
||||
ObservableOwnerSource = ObservableOwner{
|
||||
identifier: "source",
|
||||
handbookSlug: "source",
|
||||
teamName: "Source",
|
||||
}
|
||||
ObservableOwnerCodeInsights = ObservableOwner{
|
||||
identifier: "code-insights",
|
||||
@ -516,11 +516,6 @@ var (
|
||||
handbookSlug: "devops",
|
||||
teamName: "Cloud DevOps",
|
||||
}
|
||||
ObservableOwnerIAM = ObservableOwner{
|
||||
identifier: "iam",
|
||||
handbookSlug: "iam",
|
||||
teamName: "Identity and Access Management",
|
||||
}
|
||||
ObservableOwnerDataAnalytics = ObservableOwner{
|
||||
identifier: "data-analytics",
|
||||
handbookSlug: "data-analytics",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user