monitoring: fix alert definition for site configuration by adding scrape job label (#59687)
We discovered recently that the definition for the alert that fires when the site configuration hasn't been fetched within 5 minutes strips out the regex that targets individual services (since that regex uses a Grafana variable). As a result, every instance of this alert fires whenever any single service trips the threshold.

This PR fixes the issue by adding a new `job` filter for this alert that targets only the services with the matching Prometheus scrape target name. This works around the previous issue by using a fixed value for the `job` filter instead of a dynamic Grafana value. The job filter generally looks like `job=~".*$container_name"` (following the strategy from https://sourcegraph.com/github.com/sourcegraph/sourcegraph@9a780f2e694238b5326e3e121d6a1828463001b9/-/blob/monitoring/monitoring/monitoring.go?L161), except where the existing dashboards already used different logic for a service:

- `frontend`: already used `job=~"(sourcegraph-)?frontend"` for some metrics, so I reused it here
- `worker`: already used `job=~"^worker.*"` in some metrics, so I reused it and standardized the other existing panels on the same shared variable

## Test plan

I eyeballed the generated alerts.md and dashboards.md to verify that my changes looked correct (that is, my refactors resulted in either no diff, or a diff that still contained valid regexes).
commit 616e3df4b9 (parent 9a780f2e69)

doc/admin/observability/alerts.md (generated)
@@ -639,7 +639,7 @@ Generated query for warning alert: `max((sum by (alert_type) (increase(src_graph
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~"(sourcegraph-)?frontend"}[1m]))) >= 300)`

 </details>

@@ -1567,7 +1567,7 @@ Generated query for warning alert: `max((sum(src_gitserver_lsremote_queue)) >= 2
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*gitserver"}[1m]))) >= 300)`

 </details>

@@ -3144,9 +3144,9 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*redis-store"}
 <details>
 <summary>Technical details</summary>

-Generated query for warning alert: `(min((sum(src_worker_jobs{job="worker",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job="worker",job_name="codeintel-upload-janitor"})) == 1)`
+Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) == 1)`

-Generated query for critical alert: `(min((sum(src_worker_jobs{job="worker",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job="worker",job_name="codeintel-upload-janitor"})) == 1)`
+Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-upload-janitor"})) == 1)`

 </details>

@@ -3182,9 +3182,9 @@ Generated query for critical alert: `(min((sum(src_worker_jobs{job="worker",job_
 <details>
 <summary>Technical details</summary>

-Generated query for warning alert: `(min((sum(src_worker_jobs{job="worker",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job="worker",job_name="codeintel-commitgraph-updater"})) == 1)`
+Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) == 1)`

-Generated query for critical alert: `(min((sum(src_worker_jobs{job="worker",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job="worker",job_name="codeintel-commitgraph-updater"})) == 1)`
+Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-commitgraph-updater"})) == 1)`

 </details>

@@ -3220,9 +3220,9 @@ Generated query for critical alert: `(min((sum(src_worker_jobs{job="worker",job_
 <details>
 <summary>Technical details</summary>

-Generated query for warning alert: `(min((sum(src_worker_jobs{job="worker",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job="worker",job_name="codeintel-autoindexing-scheduler"})) == 1)`
+Generated query for warning alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) == 1)`

-Generated query for critical alert: `(min((sum(src_worker_jobs{job="worker",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job="worker",job_name="codeintel-autoindexing-scheduler"})) == 1)`
+Generated query for critical alert: `(min((sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) < 1)) or (absent(sum(src_worker_jobs{job=~"^worker.*",job_name="codeintel-autoindexing-scheduler"})) == 1)`

 </details>

@@ -3765,7 +3765,7 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*worker"}) / c
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~"^worker.*"}[1m]))) >= 300)`

 </details>

@@ -4497,7 +4497,7 @@ Generated query for critical alert: `min((max by (name) (src_gitlab_rate_limit_r
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*repo-updater"}[1m]))) >= 300)`

 </details>

@@ -4942,7 +4942,7 @@ Generated query for warning alert: `max((sum by (code) (increase(searcher_servic
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*searcher"}[1m]))) >= 300)`

 </details>

@@ -5327,7 +5327,7 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*searcher"}) /
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*symbols"}[1m]))) >= 300)`

 </details>

@@ -7710,7 +7710,7 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*otel-collecto
 <details>
 <summary>Technical details</summary>

-Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~".*embeddings"}[1m]))) >= 300)`

 </details>

doc/admin/observability/dashboards.md (generated, 36 lines changed)
@@ -398,7 +398,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100300`
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${internalInstance:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`(sourcegraph-)?frontend`,instance=~`${internalInstance:regex}`}`

 </details>

@@ -417,7 +417,7 @@ To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=100301`
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${internalInstance:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`(sourcegraph-)?frontend`,instance=~`${internalInstance:regex}`}[1m]))`

 </details>

@@ -7337,7 +7337,7 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10110
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${shard:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`,instance=~`${shard:regex}`}`

 </details>

@@ -7356,7 +7356,7 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10110
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${shard:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`,instance=~`${shard:regex}`}[1m]))`

 </details>

@@ -10151,7 +10151,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100000` on
 <details>
 <summary>Technical details</summary>

-Query: `sum by (job_name) (src_worker_jobs{job="worker"})`
+Query: `sum by (job_name) (src_worker_jobs{job=~"^worker.*"})`

 </details>

@@ -10170,7 +10170,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100010` on
 <details>
 <summary>Technical details</summary>

-Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-upload-janitor"})`
+Query: `sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-upload-janitor"})`

 </details>

@@ -10189,7 +10189,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100011` on
 <details>
 <summary>Technical details</summary>

-Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-commitgraph-updater"})`
+Query: `sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-commitgraph-updater"})`

 </details>

@@ -10208,7 +10208,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100012` on
 <details>
 <summary>Technical details</summary>

-Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-autoindexing-scheduler"})`
+Query: `sum (src_worker_jobs{job=~"^worker.*", job_name="codeintel-autoindexing-scheduler"})`

 </details>

@@ -13918,7 +13918,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103800` on
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`^worker.*`,instance=~`${instance:regex}`}`

 </details>

@@ -13937,7 +13937,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=103801` on
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`^worker.*`,instance=~`${instance:regex}`}[1m]))`

 </details>

@@ -15856,7 +15856,7 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`.*repo-updater`,instance=~`${instance:regex}`}`

 </details>

@@ -15875,7 +15875,7 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*repo-updater`,instance=~`${instance:regex}`}[1m]))`

 </details>

@@ -17442,7 +17442,7 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100700`
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`${instance:regex}`}`

 </details>

@@ -17461,7 +17461,7 @@ To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=100701`
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*searcher`,instance=~`${instance:regex}`}[1m]))`

 </details>

@@ -19221,7 +19221,7 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100800` o
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`.*symbols`,instance=~`${instance:regex}`}`

 </details>

@@ -19240,7 +19240,7 @@ To see this panel, visit `/-/debug/grafana/d/symbols/symbols?viewPanel=100801` o
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*symbols`,instance=~`${instance:regex}`}[1m]))`

 </details>

@@ -32013,7 +32013,7 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100
 <details>
 <summary>Technical details</summary>

-Query: `src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}`
+Query: `src_conf_client_time_since_last_successful_update_seconds{job=~`.*embeddings`,instance=~`${instance:regex}`}`

 </details>

@@ -32032,7 +32032,7 @@ To see this panel, visit `/-/debug/grafana/d/embeddings/embeddings?viewPanel=100
 <details>
 <summary>Technical details</summary>

-Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{instance=~`${instance:regex}`}[1m]))`
+Query: `max(max_over_time(src_conf_client_time_since_last_successful_update_seconds{job=~`.*embeddings`,instance=~`${instance:regex}`}[1m]))`

 </details>

@@ -1,6 +1,8 @@
 package definitions

 import (
+	"fmt"
+
 	"github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
 	"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
 )
@@ -8,6 +10,8 @@ import (
 func Embeddings() *monitoring.Dashboard {
 	const containerName = "embeddings"

+	scrapeJobRegex := fmt.Sprintf(".*%s", containerName)
+
 	return &monitoring.Dashboard{
 		Name:  "embeddings",
 		Title: "Embeddings",
@@ -28,6 +32,7 @@ func Embeddings() *monitoring.Dashboard {
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "embeddings",
 				InstanceFilterRegex: `${instance:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName, monitoring.ObservableOwnerCody),
 			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
@@ -17,6 +17,8 @@ func Frontend() *monitoring.Dashboard {

 		grpcZoektConfigurationServiceName = "sourcegraph.zoekt.configuration.v1.ZoektConfigurationService"
 		grpcInternalAPIServiceName        = "api.internalapi.v1.ConfigService"
+
+		scrapeJobRegex = `(sourcegraph-)?frontend`
 	)

 	var sentinelSamplingIntervals []string
@@ -340,6 +342,7 @@ func Frontend() *monitoring.Dashboard {
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "frontend",
 				InstanceFilterRegex: `${internalInstance:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),

 			shared.CodeIntelligence.NewResolversGroup(containerName),
@@ -493,7 +496,7 @@ func Frontend() *monitoring.Dashboard {
 					{
 						Name:        "99th_percentile_gitserver_duration",
 						Description: "99th percentile successful gitserver query duration over 5m",
-						Query:       `histogram_quantile(0.99, sum by (le,category)(rate(src_gitserver_request_duration_seconds_bucket{job=~"(sourcegraph-)?frontend"}[5m])))`,
+						Query:       fmt.Sprintf(`histogram_quantile(0.99, sum by (le,category)(rate(src_gitserver_request_duration_seconds_bucket{job=~%q}[5m])))`, scrapeJobRegex),
 						Warning:     monitoring.Alert().GreaterOrEqual(20),
 						Panel:       monitoring.Panel().LegendFormat("{{category}}").Unit(monitoring.Seconds),
 						Owner:       monitoring.ObservableOwnerSource,
@@ -502,7 +505,7 @@ func Frontend() *monitoring.Dashboard {
 					{
 						Name:        "gitserver_error_responses",
 						Description: "gitserver error responses every 5m",
-						Query:       `sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend",code!~"2.."}[5m])) / ignoring(code) group_left sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~"(sourcegraph-)?frontend"}[5m])) * 100`,
+						Query:       fmt.Sprintf(`sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~%q,code!~"2.."}[5m])) / ignoring(code) group_left sum by (category)(increase(src_gitserver_request_duration_seconds_count{job=~%q}[5m])) * 100`, scrapeJobRegex, scrapeJobRegex),
 						Warning:     monitoring.Alert().GreaterOrEqual(5).For(15 * time.Minute),
 						Panel:       monitoring.Panel().LegendFormat("{{category}}").Unit(monitoring.Percentage),
 						Owner:       monitoring.ObservableOwnerSource,
@@ -14,6 +14,8 @@ func GitServer() *monitoring.Dashboard {
 		grpcServiceName = "gitserver.v1.GitserverService"
 	)

+	scrapeJobRegex := fmt.Sprintf(".*%s", containerName)
+
 	gitserverHighMemoryNoAlertTransformer := func(observable shared.Observable) shared.Observable {
 		return observable.WithNoAlerts(`Git Server is expected to use up all the memory it is provided.`)
 	}
@@ -568,6 +570,7 @@ func GitServer() *monitoring.Dashboard {
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "gitserver",
 				InstanceFilterRegex: `${shard:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),

 			shared.CodeIntelligence.NewCoursierGroup(containerName),
@@ -17,6 +17,8 @@ func RepoUpdater() *monitoring.Dashboard {
 		syncDurationThreshold = 9 * time.Hour
 	)

+	scrapeJobRegex := fmt.Sprintf(".*%s", containerName)
+
 	containerMonitoringOptions := &shared.ContainerMonitoringGroupOptions{
 		MemoryUsage: func(observable shared.Observable) shared.Observable {
 			return observable.WithWarning(nil).WithCritical(monitoring.Alert().GreaterOrEqual(90).For(10 * time.Minute))
@@ -432,6 +434,7 @@ func RepoUpdater() *monitoring.Dashboard {
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "repo_updater",
 				InstanceFilterRegex: `${instance:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),
 			shared.HTTP.NewHandlersGroup(containerName),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName, monitoring.ObservableOwnerSource),
@@ -15,6 +15,8 @@ func Searcher() *monitoring.Dashboard {
 		grpcServiceName = "searcher.v1.SearcherService"
 	)

+	scrapeJobRegex := fmt.Sprintf(".*%s", containerName)
+
 	grpcMethodVariable := shared.GRPCMethodVariable("searcher", grpcServiceName)

 	// instanceSelector is a helper for inserting the instance selector.
@@ -253,6 +255,7 @@ regularly above 0 it is a sign for further investigation.`,
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "searcher",
 				InstanceFilterRegex: `${instance:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg),
 			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
@@ -6,6 +6,7 @@ import (
 	"time"

 	"github.com/iancoleman/strcase"
+
 	"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
 )
@@ -20,6 +21,12 @@ type SiteConfigurationMetricsOptions struct {
 	//
 	// Example: (gitserver-0 | gitserver-1)
 	InstanceFilterRegex string
+
+	// JobFilterRegex is the PromQL regex that's used to filter the
+	// site configuration client metrics to only those emitted by the Prometheus scrape job(s) we're interested in.
+	//
+	// Example: `.*gitserver`
+	JobFilterRegex string
 }

 // NewSiteConfigurationClientMetricsGroup creates a group containing site configuration fetching latency statistics for the service
@@ -27,6 +34,8 @@ type SiteConfigurationMetricsOptions struct {
 func NewSiteConfigurationClientMetricsGroup(opts SiteConfigurationMetricsOptions, owner monitoring.ObservableOwner) monitoring.Group {
 	opts.HumanServiceName = strcase.ToSnake(opts.HumanServiceName)

+	jobFilter := fmt.Sprintf("job=~`%s`", opts.JobFilterRegex)
+
 	metric := func(base string, labelFilters ...string) string {
 		metric := base

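The body of the `metric` helper is truncated in the hunk above. Judging by its call sites, it presumably splices the supplied label filters (the instance filter and the new `jobFilter`) into a selector on the base metric name. A sketch under that assumption (not the repository's exact code):

```go
package main

import (
	"fmt"
	"strings"
)

// metricSelector mimics what the truncated metric helper presumably
// does: join the label filters and wrap them in a PromQL selector.
// This is inferred from the call sites, not copied from the repo.
func metricSelector(base string, labelFilters ...string) string {
	if len(labelFilters) == 0 {
		return base
	}
	return fmt.Sprintf("%s{%s}", base, strings.Join(labelFilters, ","))
}

func main() {
	jobFilter := fmt.Sprintf("job=~`%s`", ".*gitserver")
	fmt.Println(metricSelector("src_conf_client_time_since_last_successful_update_seconds", jobFilter))
	// Output: src_conf_client_time_since_last_successful_update_seconds{job=~`.*gitserver`}
}
```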
@@ -49,7 +58,7 @@ func NewSiteConfigurationClientMetricsGroup(opts SiteConfigurationMetricsOptions
 				{
 					Name:        fmt.Sprintf("%s_site_configuration_duration_since_last_successful_update_by_instance", opts.HumanServiceName),
 					Description: "duration since last successful site configuration update (by instance)",
-					Query:       metric("src_conf_client_time_since_last_successful_update_seconds"),
+					Query:       metric("src_conf_client_time_since_last_successful_update_seconds", jobFilter),
 					Panel:       monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
 					Owner:       owner,
 					NoAlert:     true,
@@ -58,7 +67,7 @@ func NewSiteConfigurationClientMetricsGroup(opts SiteConfigurationMetricsOptions
 				{
 					Name:        fmt.Sprintf("%s_site_configuration_duration_since_last_successful_update_by_instance", opts.HumanServiceName),
 					Description: fmt.Sprintf("maximum duration since last successful site configuration update (all %q instances)", opts.HumanServiceName),
-					Query:       fmt.Sprintf("max(max_over_time(%s[1m]))", metric("src_conf_client_time_since_last_successful_update_seconds")),
+					Query:       fmt.Sprintf("max(max_over_time(%s[1m]))", metric("src_conf_client_time_since_last_successful_update_seconds", jobFilter)),
 					Panel:       monitoring.Panel().Unit(monitoring.Seconds),
 					Owner:       owner,
 					Critical:    monitoring.Alert().GreaterOrEqual((5 * time.Minute).Seconds()),
@@ -13,6 +13,8 @@ func Symbols() *monitoring.Dashboard {
 		grpcServiceName = "symbols.v1.SymbolsService"
 	)

+	scrapeJobRegex := fmt.Sprintf(".*%s", containerName)
+
 	grpcMethodVariable := shared.GRPCMethodVariable("symbols", grpcServiceName)

 	return &monitoring.Dashboard{
@@ -70,6 +72,7 @@ func Symbols() *monitoring.Dashboard {
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "symbols",
 				InstanceFilterRegex: `${instance:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg),
 			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
@@ -11,6 +11,8 @@ import (
 func Worker() *monitoring.Dashboard {
 	const containerName = "worker"

+	scrapeJobRegex := fmt.Sprintf("^%s.*", containerName)
+
 	workerJobs := []struct {
 		Name  string
 		Owner monitoring.ObservableOwner
@@ -25,7 +27,7 @@ func Worker() *monitoring.Dashboard {
 		activeJobObservables = append(activeJobObservables, monitoring.Observable{
 			Name:          fmt.Sprintf("worker_job_%s_count", job.Name),
 			Description:   fmt.Sprintf("number of worker instances running the %s job", job.Name),
-			Query:         fmt.Sprintf(`sum (src_worker_jobs{job="worker", job_name="%s"})`, job.Name),
+			Query:         fmt.Sprintf(`sum (src_worker_jobs{job=~%q, job_name="%s"})`, scrapeJobRegex, job.Name),
 			Panel:         monitoring.Panel().LegendFormat(fmt.Sprintf("instances running %s", job.Name)),
 			DataMustExist: true,
 			Warning:       monitoring.Alert().Less(1).For(1 * time.Minute),
@@ -66,7 +68,7 @@ func Worker() *monitoring.Dashboard {
 			{
 				Name:           "worker_job_count",
 				Description:    "number of worker instances running each job",
-				Query:          `sum by (job_name) (src_worker_jobs{job="worker"})`,
+				Query:          fmt.Sprintf(`sum by (job_name) (src_worker_jobs{job=~%q})`, scrapeJobRegex),
 				Panel:          monitoring.Panel().LegendFormat("instances running {{job_name}}"),
 				NoAlert:        true,
 				Interpretation: `
@@ -245,7 +247,7 @@ func Worker() *monitoring.Dashboard {
 					Name:          "insights_queue_unutilized_size",
 					Description:   "insights queue size that is not utilized (not processing)",
 					Owner:         monitoring.ObservableOwnerCodeInsights,
-					Query:         "max(src_query_runner_worker_total{job=~\"^worker.*\"}) > 0 and on(job) sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~\"^worker.*\",op=\"Dequeue\"}[5m])) < 1",
+					Query:         fmt.Sprintf("max(src_query_runner_worker_total{job=~%q}) > 0 and on(job) sum by (op)(increase(src_workerutil_dbworker_store_insights_query_runner_jobs_store_total{job=~%q,op=\"Dequeue\"}[5m])) < 1", scrapeJobRegex, scrapeJobRegex),
 					DataMustExist: false,
 					Warning:       monitoring.Alert().Greater(0.0).For(time.Minute * 30),
 					NextSteps:     "Verify code insights worker job has successfully started. Restart worker service and monitoring startup logs, looking for worker panics.",
@@ -270,6 +272,7 @@
 			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
 				HumanServiceName:    "worker",
 				InstanceFilterRegex: `${instance:regex}`,
+				JobFilterRegex:      scrapeJobRegex,
 			}, monitoring.ObservableOwnerInfraOrg),
 		},
 	}
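To complement the eyeball check in the test plan, the new job regexes can also be sanity-checked programmatically. Prometheus fully anchors `=~` label matchers, which the sketch below mimics; the job values fed in are hypothetical examples, not an inventory of real scrape targets:

```go
package main

import (
	"fmt"
	"regexp"
)

// matches mimics Prometheus label matching: =~ regexes are fully
// anchored, i.e. the pattern must match the entire label value.
func matches(pattern, value string) bool {
	return regexp.MustCompile("^(?:" + pattern + ")$").MatchString(value)
}

func main() {
	// Regexes introduced by this commit, checked against hypothetical
	// scrape-job names (illustrative values only).
	fmt.Println(matches(`(sourcegraph-)?frontend`, "sourcegraph-frontend")) // true
	fmt.Println(matches(`(sourcegraph-)?frontend`, "frontend"))             // true
	fmt.Println(matches(`^worker.*`, "worker"))                             // true
	fmt.Println(matches(`^worker.*`, "worker-executors"))                   // true
	fmt.Println(matches(`.*gitserver`, "gitserver"))                        // true
	fmt.Println(matches(`.*gitserver`, "worker"))                           // false
}
```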