mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:11:49 +00:00
conf: add metric and associated alert if clients fail to update site configuration within 5 minutes (#57682)
This commit is contained in:
parent
75cbd196f3
commit
9d34a48425
@ -612,6 +612,38 @@ Generated query for warning alert: `max((sum by (alert_type) (increase(src_graph
|
||||
|
||||
<br />
|
||||
|
||||
## frontend: frontend_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "frontend" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> frontend: 300s+ maximum duration since last successful site configuration update (all "frontend" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "frontend" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "frontend" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#frontend-frontend-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_frontend_frontend_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## frontend: internal_indexed_search_error_responses
|
||||
|
||||
<p class="subtitle">internal indexed search error responses every 5m</p>
|
||||
@ -1575,6 +1607,38 @@ Generated query for warning alert: `max((sum by (category) (increase(src_fronten
|
||||
|
||||
<br />
|
||||
|
||||
## gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "gitserver" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> gitserver: 300s+ maximum duration since last successful site configuration update (all "gitserver" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "gitserver" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "gitserver" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#gitserver-gitserver-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_gitserver_gitserver_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## gitserver: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
@ -3814,6 +3878,38 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*worker"}) / c
|
||||
|
||||
<br />
|
||||
|
||||
## worker: worker_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "worker" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> worker: 300s+ maximum duration since last successful site configuration update (all "worker" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "worker" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "worker" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_worker_worker_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## repo-updater: src_repoupdater_max_sync_backoff
|
||||
|
||||
<p class="subtitle">time since oldest sync</p>
|
||||
@ -4609,6 +4705,38 @@ Generated query for critical alert: `min((max by (name) (src_gitlab_rate_limit_r
|
||||
|
||||
<br />
|
||||
|
||||
## repo-updater: repo_updater_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "repo_updater" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> repo-updater: 300s+ maximum duration since last successful site configuration update (all "repo_updater" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "repo_updater" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "repo_updater" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#repo-updater-repo-updater-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_repo-updater_repo_updater_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## repo-updater: frontend_internal_api_error_responses
|
||||
|
||||
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
|
||||
@ -5058,6 +5186,38 @@ Generated query for warning alert: `max((sum by (code) (increase(searcher_servic
|
||||
|
||||
<br />
|
||||
|
||||
## searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "searcher" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> searcher: 300s+ maximum duration since last successful site configuration update (all "searcher" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "searcher" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "searcher" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#searcher-searcher-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_searcher_searcher_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## searcher: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
@ -5447,6 +5607,38 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*searcher"}) /
|
||||
|
||||
<br />
|
||||
|
||||
## symbols: symbols_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "symbols" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> symbols: 300s+ maximum duration since last successful site configuration update (all "symbols" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "symbols" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "symbols" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#symbols-symbols-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_symbols_symbols_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## symbols: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
@ -7831,6 +8023,38 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*otel-collecto
|
||||
|
||||
<br />
|
||||
|
||||
## embeddings: embeddings_site_configuration_duration_since_last_successful_update_by_instance
|
||||
|
||||
<p class="subtitle">maximum duration since last successful site configuration update (all "embeddings" instances)</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-critical">critical</span> embeddings: 300s+ maximum duration since last successful site configuration update (all "embeddings" instances)
|
||||
|
||||
**Next steps**
|
||||
|
||||
- This indicates that one or more "embeddings" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the "embeddings" logs, as well as frontend's logs.
|
||||
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#embeddings-embeddings-site-configuration-duration-since-last-successful-update-by-instance).
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"critical_embeddings_embeddings_site_configuration_duration_since_last_successful_update_by_instance"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
|
||||
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
|
||||
|
||||
</details>
|
||||
|
||||
<br />
|
||||
|
||||
## embeddings: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
1318
doc/admin/observability/dashboards.md
generated
1318
doc/admin/observability/dashboards.md
generated
File diff suppressed because it is too large
Load Diff
@ -43,6 +43,8 @@ go_library(
|
||||
"@com_github_getsentry_sentry_go//:sentry-go",
|
||||
"@com_github_grafana_regexp//:regexp",
|
||||
"@com_github_hashicorp_cronexpr//:cronexpr",
|
||||
"@com_github_prometheus_client_golang//prometheus",
|
||||
"@com_github_prometheus_client_golang//prometheus/promauto",
|
||||
"@com_github_sourcegraph_jsonx//:jsonx",
|
||||
"@com_github_sourcegraph_log//:log",
|
||||
"@com_github_xeipuuv_gojsonschema//:gojsonschema",
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/sourcegraph/log"
|
||||
"github.com/sourcegraph/sourcegraph/internal/api/internalapi"
|
||||
"github.com/sourcegraph/sourcegraph/internal/conf/conftypes"
|
||||
@ -28,6 +29,10 @@ type client struct {
|
||||
// should be closed when future queries to the client return the most up to date
|
||||
// configuration.
|
||||
sourceUpdates <-chan chan struct{}
|
||||
|
||||
// metricDurationSinceLastSuccessfulUpdateSeconds measures the duration in seconds since the client's
|
||||
// last successful update from the configuration source
|
||||
metricDurationSinceLastSuccessfulUpdateSeconds prometheus.Gauge
|
||||
}
|
||||
|
||||
var _ conftypes.UnifiedQuerier = &client{}
|
||||
@ -47,7 +52,12 @@ func DefaultClient() *client {
|
||||
// MockClient returns a client in the same basic configuration as the DefaultClient, but is not limited to a global singleton.
|
||||
// This is useful to mock configuration in tests without race conditions modifying values when running tests in parallel.
|
||||
func MockClient() *client {
|
||||
return &client{store: newStore()}
|
||||
return &client{
|
||||
store: newStore(),
|
||||
metricDurationSinceLastSuccessfulUpdateSeconds: prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "src_mock_conf_client_time_since_last_successful_update_seconds",
|
||||
Help: "Time since the last successful update of the configuration by the mock conf client"}),
|
||||
}
|
||||
}
|
||||
|
||||
// Raw returns a copy of the raw configuration.
|
||||
@ -277,8 +287,12 @@ func (c *client) continuouslyUpdate(optOnlySetByTests *continuousUpdateOptions)
|
||||
// error on this initial attempt.
|
||||
_ = c.fetchAndUpdate(opts.logger)
|
||||
|
||||
start := time.Now()
|
||||
lastSuccessfulUpdate := time.Now()
|
||||
for {
|
||||
if c.metricDurationSinceLastSuccessfulUpdateSeconds != nil { // Update configuration latency at the top of the loop
|
||||
c.metricDurationSinceLastSuccessfulUpdateSeconds.Set(time.Since(lastSuccessfulUpdate).Seconds())
|
||||
}
|
||||
|
||||
logger := opts.logger
|
||||
|
||||
// signalDoneReading, if set, indicates that we were prompted to update because
|
||||
@ -293,19 +307,27 @@ func (c *client) continuouslyUpdate(optOnlySetByTests *continuousUpdateOptions)
|
||||
logger = logger.With(log.String("triggered_by", "waitForSleep"))
|
||||
}
|
||||
|
||||
if c.metricDurationSinceLastSuccessfulUpdateSeconds != nil { // Update configuration latency after sleeping
|
||||
c.metricDurationSinceLastSuccessfulUpdateSeconds.Set(time.Since(lastSuccessfulUpdate).Seconds())
|
||||
}
|
||||
|
||||
logger.Debug("checking for updates")
|
||||
err := c.fetchAndUpdate(logger)
|
||||
if err != nil {
|
||||
// Suppress log messages for errors caused by the frontend being unreachable until we've
|
||||
// given the frontend enough time to initialize (in case other services start up before
|
||||
// the frontend), to reduce log spam.
|
||||
if time.Since(start) > opts.delayBeforeUnreachableLog || !isFrontendUnreachableError(err) {
|
||||
if time.Since(lastSuccessfulUpdate) > opts.delayBeforeUnreachableLog || !isFrontendUnreachableError(err) {
|
||||
logger.Error("received error during background config update", log.Error(err))
|
||||
}
|
||||
} else {
|
||||
// We successfully fetched the config, we reset the timer to give
|
||||
// frontend time if it needs to restart
|
||||
start = time.Now()
|
||||
lastSuccessfulUpdate = time.Now()
|
||||
}
|
||||
|
||||
if c.metricDurationSinceLastSuccessfulUpdateSeconds != nil { // Record the update latency after the fetch
|
||||
c.metricDurationSinceLastSuccessfulUpdateSeconds.Set(time.Since(lastSuccessfulUpdate).Seconds())
|
||||
}
|
||||
|
||||
// Indicate that we are done reading, if we were prompted to update by the updates
|
||||
|
||||
@ -10,6 +10,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/sourcegraph/jsonx"
|
||||
sglog "github.com/sourcegraph/log"
|
||||
|
||||
@ -106,7 +108,13 @@ func getModeUncached() configurationMode {
|
||||
var configurationServerFrontendOnlyInitialized = make(chan struct{})
|
||||
|
||||
func initDefaultClient() *client {
|
||||
defaultClient := &client{store: newStore()}
|
||||
defaultClient := &client{
|
||||
store: newStore(),
|
||||
metricDurationSinceLastSuccessfulUpdateSeconds: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "src_conf_client_time_since_last_successful_update_seconds",
|
||||
Help: "Time since the last successful update of the configuration by the conf client",
|
||||
}),
|
||||
}
|
||||
|
||||
mode := getMode()
|
||||
// Don't kickoff the background updaters for the client/server
|
||||
|
||||
@ -12,7 +12,23 @@ func Embeddings() *monitoring.Dashboard {
|
||||
Name: "embeddings",
|
||||
Title: "Embeddings",
|
||||
Description: "Handles embeddings searches.",
|
||||
Variables: []monitoring.ContainerVariable{
|
||||
{
|
||||
Label: "instance",
|
||||
Name: "instance",
|
||||
OptionsLabelValues: monitoring.ContainerVariableOptionsLabelValues{
|
||||
Query: "src_embeddings_cache_hit_count",
|
||||
LabelName: "instance",
|
||||
ExampleOption: "embeddings:6099",
|
||||
},
|
||||
Multi: true,
|
||||
},
|
||||
},
|
||||
Groups: []monitoring.Group{
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "embeddings",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
||||
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
|
||||
|
||||
@ -333,6 +333,11 @@ func Frontend() *monitoring.Dashboard {
|
||||
},
|
||||
},
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "frontend",
|
||||
InstanceFilterRegex: `${internalInstance:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
|
||||
shared.CodeIntelligence.NewResolversGroup(containerName),
|
||||
shared.CodeIntelligence.NewAutoIndexEnqueuerGroup(containerName),
|
||||
shared.CodeIntelligence.NewDBStoreGroup(containerName),
|
||||
|
||||
@ -554,6 +554,11 @@ func GitServer() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "gitserver",
|
||||
InstanceFilterRegex: `${shard:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
|
||||
shared.CodeIntelligence.NewCoursierGroup(containerName),
|
||||
shared.CodeIntelligence.NewNpmGroup(containerName),
|
||||
|
||||
|
||||
@ -594,6 +594,10 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSource),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "repo_updater",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
shared.HTTP.NewHandlersGroup(containerName),
|
||||
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
|
||||
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
||||
|
||||
@ -240,6 +240,10 @@ regularly above 0 it is a sign for further investigation.`,
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "searcher",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
||||
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
||||
|
||||
@ -28,6 +28,7 @@ go_library(
|
||||
"provisioning.go",
|
||||
"queues.go",
|
||||
"shared.go",
|
||||
"site_configuration.go",
|
||||
"standard.go",
|
||||
"usage_data_pipeline.go",
|
||||
"workerutil.go",
|
||||
|
||||
73
monitoring/definitions/shared/site_configuration.go
Normal file
73
monitoring/definitions/shared/site_configuration.go
Normal file
@ -0,0 +1,73 @@
|
||||
package shared
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/iancoleman/strcase"
|
||||
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
|
||||
)
|
||||
|
||||
type SiteConfigurationMetricsOptions struct {
|
||||
// HumanServiceName is the short, lowercase, snake_case, human-readable name of the service that we're gathering metrics for.
|
||||
//
|
||||
// Example: "gitserver"
|
||||
HumanServiceName string
|
||||
|
||||
// InstanceFilterRegex is the PromQL regex that's used to filter the
|
||||
// site configuration client metrics to only those emitted by the instance(s) that we're interested in.
|
||||
//
|
||||
// Example: (gitserver-0 | gitserver-1)
|
||||
InstanceFilterRegex string
|
||||
}
|
||||
|
||||
// NewSiteConfigurationClientMetricsGroup creates a group containing site configuration fetching latency statistics for the service
|
||||
// specified in the given options.
|
||||
func NewSiteConfigurationClientMetricsGroup(opts SiteConfigurationMetricsOptions, owner monitoring.ObservableOwner) monitoring.Group {
|
||||
opts.HumanServiceName = strcase.ToSnake(opts.HumanServiceName)
|
||||
|
||||
metric := func(base string, labelFilters ...string) string {
|
||||
metric := base
|
||||
|
||||
instanceLabelFilter := fmt.Sprintf("instance=~`%s`", opts.InstanceFilterRegex)
|
||||
|
||||
labelFilters = append(labelFilters, instanceLabelFilter)
|
||||
|
||||
if len(labelFilters) > 0 {
|
||||
metric = fmt.Sprintf("%s{%s}", metric, strings.Join(labelFilters, ","))
|
||||
}
|
||||
|
||||
return metric
|
||||
}
|
||||
|
||||
return monitoring.Group{
|
||||
Title: "Site configuration client update latency",
|
||||
Hidden: true,
|
||||
Rows: []monitoring.Row{
|
||||
{
|
||||
{
|
||||
Name: fmt.Sprintf("%s_site_configuration_duration_since_last_successful_update_by_instance", opts.HumanServiceName),
|
||||
Description: "duration since last successful site configuration update (by instance)",
|
||||
Query: metric("src_conf_client_time_since_last_successful_update_seconds"),
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
Owner: owner,
|
||||
NoAlert: true,
|
||||
Interpretation: fmt.Sprintf("The duration since the configuration client used by the %q service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration.", opts.HumanServiceName),
|
||||
},
|
||||
{
|
||||
Name: fmt.Sprintf("%s_site_configuration_duration_since_last_successful_update_by_instance", opts.HumanServiceName),
|
||||
Description: fmt.Sprintf("maximum duration since last successful site configuration update (all %q instances)", opts.HumanServiceName),
|
||||
Query: fmt.Sprintf("max(max_over_time(%s[1m]))", metric("src_conf_client_time_since_last_successful_update_seconds")),
|
||||
Panel: monitoring.Panel().Unit(monitoring.Seconds),
|
||||
Owner: owner,
|
||||
Critical: monitoring.Alert().GreaterOrEqual((5 * time.Minute).Seconds()),
|
||||
NextSteps: fmt.Sprintf(`
|
||||
- This indicates that one or more %q instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
|
||||
- Check for relevant errors in the %q logs, as well as frontend's logs.
|
||||
`, opts.HumanServiceName, opts.HumanServiceName),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
@ -58,6 +58,10 @@ func Symbols() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerCodeIntel),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "symbols",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
||||
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
|
||||
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
|
||||
|
||||
@ -127,6 +127,18 @@ func Worker() *monitoring.Dashboard {
|
||||
Name: "worker",
|
||||
Title: "Worker",
|
||||
Description: "Manages background processes.",
|
||||
Variables: []monitoring.ContainerVariable{
|
||||
{
|
||||
Label: "Instance",
|
||||
Name: "instance",
|
||||
OptionsLabelValues: monitoring.ContainerVariableOptionsLabelValues{
|
||||
Query: "src_worker_jobs",
|
||||
LabelName: "instance",
|
||||
ExampleOption: "worker:6089",
|
||||
},
|
||||
Multi: true,
|
||||
},
|
||||
},
|
||||
Groups: []monitoring.Group{
|
||||
// src_worker_jobs
|
||||
activeJobsGroup,
|
||||
@ -251,6 +263,11 @@ func Worker() *monitoring.Dashboard {
|
||||
shared.SourcegraphOwn.NewOwnRepoIndexerWorkerGroup(containerName),
|
||||
shared.SourcegraphOwn.NewOwnRepoIndexerResetterGroup(containerName),
|
||||
shared.SourcegraphOwn.NewOwnRepoIndexerSchedulerGroup(containerName),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "worker",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
}, monitoring.ObservableOwnerDevOps),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user