We discovered recently that the definition for the alert that fires if the site configuration hasn't been fetched within 5 minutes strips out the regex that targets individual services (since it uses a Grafana variable). As a result, every instance of this alert fires if any individual service trips over the threshold.

This PR fixes the issue by adding a new `job` filter to this alert that targets only the services with the matching Prometheus scrape target name. This works around the previous issue by using a fixed value for the `job` filter instead of a dynamic Grafana value. The job filter generally looks like `job=~.*$container_name` (following the strategy from https://sourcegraph.com/github.com/sourcegraph/sourcegraph@9a780f2e694238b5326e3e121d6a1828463001b9/-/blob/monitoring/monitoring/monitoring.go?L161), except where I noticed that the existing dashboard for a service already used different logic (see the sketch below). For example:

- `frontend`: already used `job=~"(sourcegraph-)?frontend"` for some metrics, so I used it again here
- `worker`: already used `job=~"^worker.*"` in some metrics, so I used it again and standardized the other existing panels to use the same shared variable

## Test plan

I eyeballed the generated alert.md and dashboards.md to verify that my changes looked correct (that is, my refactors resulted in either no diff, or the diff I generated still looked like valid regex).
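For reference, here is a minimal sketch (not the actual shared helper in the monitoring package) of the per-service `job` filter selection described above; the `frontend` and `worker` patterns are the ones quoted in the list, and everything else falls back to the generic `.*<container_name>` form:

```go
package definitions

import "fmt"

// jobFilterRegexFor is a hypothetical helper illustrating how the job filter
// regex is chosen per service in this PR; it is not the code that ships.
func jobFilterRegexFor(containerName string) string {
	switch containerName {
	case "frontend":
		// The frontend dashboard already used this pattern for some metrics.
		return `(sourcegraph-)?frontend`
	case "worker":
		// The worker dashboard already used this pattern for some metrics.
		return `^worker.*`
	default:
		// Generic case: a fixed value instead of the Grafana $container_name variable.
		return fmt.Sprintf(".*%s", containerName)
	}
}
```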
67 lines
2.5 KiB
Go
package definitions

import (
	"fmt"

	"github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
	"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
)

func Embeddings() *monitoring.Dashboard {
	const containerName = "embeddings"

	// Fixed Prometheus job matcher for this service's scrape target, used in
	// place of a dynamic Grafana variable (see the description above).
	scrapeJobRegex := fmt.Sprintf(".*%s", containerName)

	return &monitoring.Dashboard{
		Name:        "embeddings",
		Title:       "Embeddings",
		Description: "Handles embeddings searches.",
		Variables: []monitoring.ContainerVariable{
			{
				Label: "instance",
				Name:  "instance",
				OptionsLabelValues: monitoring.ContainerVariableOptionsLabelValues{
					Query:         "src_embeddings_cache_hit_count",
					LabelName:     "instance",
					ExampleOption: "embeddings:6099",
				},
				Multi: true,
			},
		},
		Groups: []monitoring.Group{
			shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
				HumanServiceName:    "embeddings",
				InstanceFilterRegex: `${instance:regex}`,
				JobFilterRegex:      scrapeJobRegex,
			}, monitoring.ObservableOwnerInfraOrg),
			shared.NewDatabaseConnectionsMonitoringGroup(containerName, monitoring.ObservableOwnerCody),
			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
			shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCody, nil),
			shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
			shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCody, nil),
			{
				Title:  "Cache",
				Hidden: true,
				Rows: []monitoring.Row{{
					{
						Name:           "hit_ratio",
						Description:    "hit ratio of the embeddings cache",
						Query:          "rate(src_embeddings_cache_hit_count[30m]) / (rate(src_embeddings_cache_hit_count[30m]) + rate(src_embeddings_cache_miss_count[30m]))",
						NoAlert:        true,
						Interpretation: "A low hit rate indicates your cache is not well utilized. Consider increasing the cache size.",
						Panel:          monitoring.Panel().Unit(monitoring.Number),
					},
					{
						Name:           "missed_bytes",
						Description:    "bytes fetched due to a cache miss",
						Query:          "rate(src_embeddings_cache_miss_bytes[10m])",
						NoAlert:        true,
						Interpretation: "A high volume of misses indicates that many searches are not hitting the cache. Consider increasing the cache size.",
						Panel:          monitoring.Panel().Unit(monitoring.Bytes),
					},
				}},
			},
		},
	}
}
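As a usage note, the generic `.*embeddings` value produced by `scrapeJobRegex` works as a `job` matcher because Prometheus fully anchors regex label matchers. A small standalone check (hypothetical, not part of the repository) of which scrape job names it would match:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Prometheus anchors =~ label matchers, effectively wrapping the pattern
	// as ^(?:...)$, so job=~".*embeddings" matches job names ending in "embeddings".
	re := regexp.MustCompile(`^(?:.*embeddings)$`)
	for _, job := range []string{"embeddings", "sourcegraph-embeddings", "worker"} {
		fmt.Printf("%-24s matches: %v\n", job, re.MatchString(job))
	}
}
```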