conf: add metric and associated alert if clients fail to update site configuration within 5 minutes (#57682)

2026-02-06 17:11:49 +00:00 · 2023-10-18 16:53:55 -07:00 · 2023-10-18 16:53:55 -07:00 · 9d34a48425
commit 9d34a48425
parent 75cbd196f3
14 changed files with 1196 additions and 517 deletions
--- a/doc/admin/observability/alerts.md
+++ b/doc/admin/observability/alerts.md
@ -612,6 +612,38 @@ Generated query for warning alert: `max((sum by (alert_type) (increase(src_graph

 <br />

+## frontend: frontend_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "frontend" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> frontend: 300s+ maximum duration since last successful site configuration update (all "frontend" instances)
+
+**Next steps**
+
+- This indicates that one or more "frontend" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "frontend" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#frontend-frontend-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_frontend_frontend_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## frontend: internal_indexed_search_error_responses

 <p class="subtitle">internal indexed search error responses every 5m</p>
@ -1575,6 +1607,38 @@ Generated query for warning alert: `max((sum by (category) (increase(src_fronten

 <br />

+## gitserver: gitserver_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "gitserver" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> gitserver: 300s+ maximum duration since last successful site configuration update (all "gitserver" instances)
+
+**Next steps**
+
+- This indicates that one or more "gitserver" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "gitserver" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#gitserver-gitserver-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_gitserver_gitserver_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## gitserver: mean_blocked_seconds_per_conn_request

 <p class="subtitle">mean blocked seconds per conn request</p>
@ -3814,6 +3878,38 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*worker"}) / c

 <br />

+## worker: worker_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "worker" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> worker: 300s+ maximum duration since last successful site configuration update (all "worker" instances)
+
+**Next steps**
+
+- This indicates that one or more "worker" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "worker" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_worker_worker_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## repo-updater: src_repoupdater_max_sync_backoff

 <p class="subtitle">time since oldest sync</p>
@ -4609,6 +4705,38 @@ Generated query for critical alert: `min((max by (name) (src_gitlab_rate_limit_r

 <br />

+## repo-updater: repo_updater_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "repo_updater" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> repo-updater: 300s+ maximum duration since last successful site configuration update (all "repo_updater" instances)
+
+**Next steps**
+
+- This indicates that one or more "repo_updater" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "repo_updater" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#repo-updater-repo-updater-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_repo-updater_repo_updater_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## repo-updater: frontend_internal_api_error_responses

 <p class="subtitle">frontend-internal API error responses every 5m by route</p>
@ -5058,6 +5186,38 @@ Generated query for warning alert: `max((sum by (code) (increase(searcher_servic

 <br />

+## searcher: searcher_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "searcher" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> searcher: 300s+ maximum duration since last successful site configuration update (all "searcher" instances)
+
+**Next steps**
+
+- This indicates that one or more "searcher" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "searcher" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#searcher-searcher-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_searcher_searcher_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## searcher: mean_blocked_seconds_per_conn_request

 <p class="subtitle">mean blocked seconds per conn request</p>
@ -5447,6 +5607,38 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*searcher"}) /

 <br />

+## symbols: symbols_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "symbols" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> symbols: 300s+ maximum duration since last successful site configuration update (all "symbols" instances)
+
+**Next steps**
+
+- This indicates that one or more "symbols" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "symbols" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#symbols-symbols-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_symbols_symbols_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## symbols: mean_blocked_seconds_per_conn_request

 <p class="subtitle">mean blocked seconds per conn request</p>
@ -7831,6 +8023,38 @@ Generated query for critical alert: `min((sum by (app) (up{app=~".*otel-collecto

 <br />

+## embeddings: embeddings_site_configuration_duration_since_last_successful_update_by_instance
+
+<p class="subtitle">maximum duration since last successful site configuration update (all "embeddings" instances)</p>
+
+**Descriptions**
+
+- <span class="badge badge-critical">critical</span> embeddings: 300s+ maximum duration since last successful site configuration update (all "embeddings" instances)
+
+**Next steps**
+
+- This indicates that one or more "embeddings" instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+- Check for relevant errors in the "embeddings" logs, as well as frontend's logs.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#embeddings-embeddings-site-configuration-duration-since-last-successful-update-by-instance).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_embeddings_embeddings_site_configuration_duration_since_last_successful_update_by_instance"
+]
+```
+
+<sub>*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*</sub>
+
+<details>
+<summary>Technical details</summary>
+
+Generated query for critical alert: `max((max(max_over_time(src_conf_client_time_since_last_successful_update_seconds[1m]))) >= 300)`
+
+</details>
+
+<br />
+
 ## embeddings: mean_blocked_seconds_per_conn_request

 <p class="subtitle">mean blocked seconds per conn request</p>
--- a/doc/admin/observability/dashboards.md
+++ b/doc/admin/observability/dashboards.md
--- a/internal/conf/BUILD.bazel
+++ b/internal/conf/BUILD.bazel
@ -43,6 +43,8 @@ go_library(
        "@com_github_getsentry_sentry_go//:sentry-go",
        "@com_github_grafana_regexp//:regexp",
        "@com_github_hashicorp_cronexpr//:cronexpr",
+        "@com_github_prometheus_client_golang//prometheus",
+        "@com_github_prometheus_client_golang//prometheus/promauto",
        "@com_github_sourcegraph_jsonx//:jsonx",
        "@com_github_sourcegraph_log//:log",
        "@com_github_xeipuuv_gojsonschema//:gojsonschema",
--- a/internal/conf/client.go
+++ b/internal/conf/client.go
@ -8,6 +8,7 @@ import (
 	"sync/atomic"
 	"time"

+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sourcegraph/log"
 	"github.com/sourcegraph/sourcegraph/internal/api/internalapi"
 	"github.com/sourcegraph/sourcegraph/internal/conf/conftypes"
@ -28,6 +29,10 @@ type client struct {
 	// should be closed when future queries to the client returns the most up to date
 	// configuration.
 	sourceUpdates <-chan chan struct{}
+
+	// metricDurationSinceLastSuccessfulUpdateSeconds measures the duration in seconds since the client's
+	// last successful update from the configuration source
+	metricDurationSinceLastSuccessfulUpdateSeconds prometheus.Gauge
 }

 var _ conftypes.UnifiedQuerier = &client{}
@ -47,7 +52,12 @@ func DefaultClient() *client {
 // MockClient returns a client in the same basic configuration as the DefaultClient, but is not limited to a global singleton.
 // This is useful to mock configuration in tests without race conditions modifying values when running tests in parallel.
 func MockClient() *client {
-	return &client{store: newStore()}
+	return &client{
+		store: newStore(),
+		metricDurationSinceLastSuccessfulUpdateSeconds: prometheus.NewGauge(prometheus.GaugeOpts{
+			Name: "src_mock_conf_client_time_since_last_successful_update_seconds",
+			Help: "Time since the last successful update of the configuration by the mock conf client"}),
+	}
 }

 // Raw returns a copy of the raw configuration.
@ -277,8 +287,12 @@ func (c *client) continuouslyUpdate(optOnlySetByTests *continuousUpdateOptions)
 	// error on this initial attempt.
 	_ = c.fetchAndUpdate(opts.logger)

-	start := time.Now()
+	lastSuccessfulUpdate := time.Now()
 	for {
+		if c.metricDurationSinceLastSuccessfulUpdateSeconds != nil { // Update configuration latency at the top of the loop
+			c.metricDurationSinceLastSuccessfulUpdateSeconds.Set(time.Since(lastSuccessfulUpdate).Seconds())
+		}
+
 		logger := opts.logger

 		// signalDoneReading, if set, indicates that we were prompted to update because
@ -293,19 +307,27 @@ func (c *client) continuouslyUpdate(optOnlySetByTests *continuousUpdateOptions)
 			logger = logger.With(log.String("triggered_by", "waitForSleep"))
 		}

+		if c.metricDurationSinceLastSuccessfulUpdateSeconds != nil { // Update configuration latency after sleeping
+			c.metricDurationSinceLastSuccessfulUpdateSeconds.Set(time.Since(lastSuccessfulUpdate).Seconds())
+		}
+
 		logger.Debug("checking for updates")
 		err := c.fetchAndUpdate(logger)
 		if err != nil {
 			// Suppress log messages for errors caused by the frontend being unreachable until we've
 			// given the frontend enough time to initialize (in case other services start up before
 			// the frontend), to reduce log spam.
-			if time.Since(start) > opts.delayBeforeUnreachableLog || !isFrontendUnreachableError(err) {
+			if time.Since(lastSuccessfulUpdate) > opts.delayBeforeUnreachableLog || !isFrontendUnreachableError(err) {
 				logger.Error("received error during background config update", log.Error(err))
 			}
 		} else {
 			// We successfully fetched the config, we reset the timer to give
 			// frontend time if it needs to restart
-			start = time.Now()
+			lastSuccessfulUpdate = time.Now()
+		}
+
+		if c.metricDurationSinceLastSuccessfulUpdateSeconds != nil { // Record the update latency after the fetch
+			c.metricDurationSinceLastSuccessfulUpdateSeconds.Set(time.Since(lastSuccessfulUpdate).Seconds())
 		}

 		// Indicate that we are done reading, if we were prompted to update by the updates
--- a/internal/conf/conf.go
+++ b/internal/conf/conf.go
@ -10,6 +10,8 @@ import (
 	"sync"
 	"time"

+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
 	"github.com/sourcegraph/jsonx"
 	sglog "github.com/sourcegraph/log"

@ -106,7 +108,13 @@ func getModeUncached() configurationMode {
 var configurationServerFrontendOnlyInitialized = make(chan struct{})

 func initDefaultClient() *client {
-	defaultClient := &client{store: newStore()}
+	defaultClient := &client{
+		store: newStore(),
+		metricDurationSinceLastSuccessfulUpdateSeconds: promauto.NewGauge(prometheus.GaugeOpts{
+			Name: "src_conf_client_time_since_last_successful_update_seconds",
+			Help: "Time since the last successful update of the configuration by the conf client",
+		}),
+	}

 	mode := getMode()
 	// Don't kickoff the background updaters for the client/server
--- a/monitoring/definitions/embeddings.go
+++ b/monitoring/definitions/embeddings.go
@ -12,7 +12,23 @@ func Embeddings() *monitoring.Dashboard {
 		Name:        "embeddings",
 		Title:       "Embeddings",
 		Description: "Handles embeddings searches.",
+		Variables: []monitoring.ContainerVariable{
+			{
+				Label: "instance",
+				Name:  "instance",
+				OptionsLabelValues: monitoring.ContainerVariableOptionsLabelValues{
+					Query:         "src_embeddings_cache_hit_count",
+					LabelName:     "instance",
+					ExampleOption: "embeddings:6099",
+				},
+				Multi: true,
+			},
+		},
 		Groups: []monitoring.Group{
+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "embeddings",
+				InstanceFilterRegex: `${instance:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName),
 			shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
 			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
--- a/monitoring/definitions/frontend.go
+++ b/monitoring/definitions/frontend.go
@ -333,6 +333,11 @@ func Frontend() *monitoring.Dashboard {
 				},
 			},

+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "frontend",
+				InstanceFilterRegex: `${internalInstance:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
+
 			shared.CodeIntelligence.NewResolversGroup(containerName),
 			shared.CodeIntelligence.NewAutoIndexEnqueuerGroup(containerName),
 			shared.CodeIntelligence.NewDBStoreGroup(containerName),
--- a/monitoring/definitions/git_server.go
+++ b/monitoring/definitions/git_server.go
@ -554,6 +554,11 @@ func GitServer() *monitoring.Dashboard {
 					MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
 				}, monitoring.ObservableOwnerSearchCore),

+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "gitserver",
+				InstanceFilterRegex: `${shard:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
+
 			shared.CodeIntelligence.NewCoursierGroup(containerName),
 			shared.CodeIntelligence.NewNpmGroup(containerName),

--- a/monitoring/definitions/repo_updater.go
+++ b/monitoring/definitions/repo_updater.go
@ -594,6 +594,10 @@ func RepoUpdater() *monitoring.Dashboard {
 					MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
 				}, monitoring.ObservableOwnerSource),

+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "repo_updater",
+				InstanceFilterRegex: `${instance:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
 			shared.HTTP.NewHandlersGroup(containerName),
 			shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSource, nil),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName),
--- a/monitoring/definitions/searcher.go
+++ b/monitoring/definitions/searcher.go
@ -240,6 +240,10 @@ regularly above 0 it is a sign for further investigation.`,

 					MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
 				}, monitoring.ObservableOwnerSearchCore),
+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "searcher",
+				InstanceFilterRegex: `${instance:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName),
 			shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
 			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
--- a/monitoring/definitions/shared/BUILD.bazel
+++ b/monitoring/definitions/shared/BUILD.bazel
@ -28,6 +28,7 @@ go_library(
        "provisioning.go",
        "queues.go",
        "shared.go",
+        "site_configuration.go",
        "standard.go",
        "usage_data_pipeline.go",
        "workerutil.go",
--- a/monitoring/definitions/shared/site_configuration.go
+++ b/monitoring/definitions/shared/site_configuration.go
@ -0,0 +1,73 @@
+package shared
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/iancoleman/strcase"
+	"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
+)
+
+type SiteConfigurationMetricsOptions struct {
+	// HumanServiceName is the short, lowercase, snake_case, human-readable name of the service that we're gathering metrics for.
+	//
+	// Example: "gitserver"
+	HumanServiceName string
+
+	// InstanceFilterRegex is the PromQL regex that's used to filter the
+	// site configuration client metrics to only those emitted by the instance(s) that we're interested in.
+	//
+	// Example: (gitserver-0 | gitserver-1)
+	InstanceFilterRegex string
+}
+
+// NewSiteConfigurationClientMetricsGroup creates a group containing site configuration fetching latency statistics for the service
+// specified in the given options.
+//
+// opts.HumanServiceName is converted to snake_case and used in the observable names, descriptions and
+// next steps. opts.InstanceFilterRegex is applied as an `instance=~` label matcher in both queries.
+// owner is assigned to both observables.
+//
+// The returned group is hidden and holds a single row with two panels: the per-instance duration
+// (no alert) and the maximum over one minute across the matched instances, which carries a critical
+// alert once the value reaches 5 minutes.
+func NewSiteConfigurationClientMetricsGroup(opts SiteConfigurationMetricsOptions, owner monitoring.ObservableOwner) monitoring.Group {
+	opts.HumanServiceName = strcase.ToSnake(opts.HumanServiceName)
+
+	// metric builds a selector for base from the given labelFilters plus the instance matcher.
+	metric := func(base string, labelFilters ...string) string {
+		instanceLabelFilter := fmt.Sprintf("instance=~`%s`", opts.InstanceFilterRegex)
+		labelFilters = append(labelFilters, instanceLabelFilter)
+
+		// labelFilters always contains at least the instance matcher, so the selector is never empty.
+		return fmt.Sprintf("%s{%s}", base, strings.Join(labelFilters, ","))
+	}
+
+	return monitoring.Group{
+		Title:  "Site configuration client update latency",
+		Hidden: true,
+		Rows: []monitoring.Row{
+			{
+				{
+					Name:           fmt.Sprintf("%s_site_configuration_duration_since_last_successful_update_by_instance", opts.HumanServiceName),
+					Description:    "duration since last successful site configuration update (by instance)",
+					Query:          metric("src_conf_client_time_since_last_successful_update_seconds"),
+					Panel:          monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
+					Owner:          owner,
+					NoAlert:        true,
+					Interpretation: fmt.Sprintf("The duration since the configuration client used by the %q service last successfully updated its site configuration. Long durations could indicate issues updating the site configuration.", opts.HumanServiceName),
+				},
+				// NOTE(review): this observable reuses the Name of the per-instance panel above even though it
+				// aggregates across instances. Renaming it would change the generated alert identifier, so
+				// confirm with the owners before changing it.
+				{
+					Name:        fmt.Sprintf("%s_site_configuration_duration_since_last_successful_update_by_instance", opts.HumanServiceName),
+					Description: fmt.Sprintf("maximum duration since last successful site configuration update (all %q instances)", opts.HumanServiceName),
+					Query:       fmt.Sprintf("max(max_over_time(%s[1m]))", metric("src_conf_client_time_since_last_successful_update_seconds")),
+					Panel:       monitoring.Panel().Unit(monitoring.Seconds),
+					Owner:       owner,
+					Critical:    monitoring.Alert().GreaterOrEqual((5 * time.Minute).Seconds()),
+					// Keep this text free of apostrophes: single quotes appear to be rendered as backticks
+					// in the generated alert docs, which garbles possessives.
+					NextSteps: fmt.Sprintf(`
+								- This indicates that one or more %q instances have not successfully updated the site configuration in over 5 minutes. This could be due to networking issues between services or problems with the site configuration service itself.
+								- Check for relevant errors in the %q logs, as well as the frontend logs.
+							`, opts.HumanServiceName, opts.HumanServiceName),
+				},
+			},
+		},
+	}
+}
--- a/monitoring/definitions/symbols.go
+++ b/monitoring/definitions/symbols.go
@ -58,6 +58,10 @@ func Symbols() *monitoring.Dashboard {
 					MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
 				}, monitoring.ObservableOwnerCodeIntel),

+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "symbols",
+				InstanceFilterRegex: `${instance:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
 			shared.NewDatabaseConnectionsMonitoringGroup(containerName),
 			shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
 			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
--- a/monitoring/definitions/worker.go
+++ b/monitoring/definitions/worker.go
@ -127,6 +127,18 @@ func Worker() *monitoring.Dashboard {
 		Name:        "worker",
 		Title:       "Worker",
 		Description: "Manages background processes.",
+		Variables: []monitoring.ContainerVariable{
+			{
+				Label: "Instance",
+				Name:  "instance",
+				OptionsLabelValues: monitoring.ContainerVariableOptionsLabelValues{
+					Query:         "src_worker_jobs",
+					LabelName:     "instance",
+					ExampleOption: "worker:6089",
+				},
+				Multi: true,
+			},
+		},
 		Groups: []monitoring.Group{
 			// src_worker_jobs
 			activeJobsGroup,
@ -251,6 +263,11 @@ func Worker() *monitoring.Dashboard {
 			shared.SourcegraphOwn.NewOwnRepoIndexerWorkerGroup(containerName),
 			shared.SourcegraphOwn.NewOwnRepoIndexerResetterGroup(containerName),
 			shared.SourcegraphOwn.NewOwnRepoIndexerSchedulerGroup(containerName),
+
+			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
+				HumanServiceName:    "worker",
+				InstanceFilterRegex: `${instance:regex}`,
+			}, monitoring.ObservableOwnerDevOps),
 		},
 	}
 }