mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:51:57 +00:00
In INC-264, certain alerts - such as [zoekt: less than 90% percentage pods available for 10m0s](https://opsg.in/a/i/sourcegraph/178a626f-0f28-4295-bee9-84da988bb473-1703759057681) - do not appear to end up going anywhere because the ObservableOwner is defunct.

This change adds _opt-in_ testing to report:

1. How many owners have valid Opsgenie teams
2. How many owners have valid handbook pages

In addition, we collect the ObservableOwners that pass the test and use them to generate configuration for `site.json` in Sourcegraph.com: https://github.com/sourcegraph/deploy-sourcegraph-cloud/pull/18338 - this helps ensure the list is valid and not deceptively high-coverage.

The results are not great, but **enforcing** that owners are valid isn't currently in scope:

```
6/10 ObservableOwners do not have valid Opsgenie teams
3/10 ObservableOwners do not point to valid handbook pages
```

I also removed some defunct/unused functionality/owners.

## Test plan

To run these tests:

```
export OPSGENIE_API_KEY="..."
go test -timeout 30s github.com/sourcegraph/sourcegraph/monitoring/monitoring -update -online
```
69 lines
2.7 KiB
Go
69 lines
2.7 KiB
Go
package definitions
|
|
|
|
import (
|
|
"github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
|
|
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
|
|
)
|
|
|
|
func SyntectServer() *monitoring.Dashboard {
|
|
const containerName = "syntect-server"
|
|
|
|
return &monitoring.Dashboard{
|
|
Name: "syntect-server",
|
|
Title: "Syntect Server",
|
|
Description: "Handles syntax highlighting for code files.",
|
|
NoSourcegraphDebugServer: true, // This is third-party service
|
|
Groups: []monitoring.Group{
|
|
{
|
|
Title: "General",
|
|
Rows: []monitoring.Row{
|
|
{
|
|
{
|
|
Name: "syntax_highlighting_errors",
|
|
Description: "syntax highlighting errors every 5m",
|
|
Query: `sum(increase(src_syntax_highlighting_requests{status="error"}[5m])) / sum(increase(src_syntax_highlighting_requests[5m])) * 100`,
|
|
NoAlert: true,
|
|
Panel: monitoring.Panel().LegendFormat("error").Unit(monitoring.Percentage),
|
|
Owner: monitoring.ObservableOwnerCodeIntel,
|
|
Interpretation: "none",
|
|
},
|
|
{
|
|
Name: "syntax_highlighting_timeouts",
|
|
Description: "syntax highlighting timeouts every 5m",
|
|
Query: `sum(increase(src_syntax_highlighting_requests{status="timeout"}[5m])) / sum(increase(src_syntax_highlighting_requests[5m])) * 100`,
|
|
NoAlert: true,
|
|
Panel: monitoring.Panel().LegendFormat("timeout").Unit(monitoring.Percentage),
|
|
Owner: monitoring.ObservableOwnerCodeIntel,
|
|
Interpretation: "none",
|
|
},
|
|
},
|
|
{
|
|
{
|
|
Name: "syntax_highlighting_panics",
|
|
Description: "syntax highlighting panics every 5m",
|
|
Query: `sum(increase(src_syntax_highlighting_requests{status="panic"}[5m]))`,
|
|
NoAlert: true,
|
|
Panel: monitoring.Panel().LegendFormat("panic"),
|
|
Owner: monitoring.ObservableOwnerCodeIntel,
|
|
Interpretation: "none",
|
|
},
|
|
{
|
|
Name: "syntax_highlighting_worker_deaths",
|
|
Description: "syntax highlighter worker deaths every 5m",
|
|
Query: `sum(increase(src_syntax_highlighting_requests{status="hss_worker_timeout"}[5m]))`,
|
|
NoAlert: true,
|
|
Panel: monitoring.Panel().LegendFormat("worker death"),
|
|
Owner: monitoring.ObservableOwnerCodeIntel,
|
|
Interpretation: "none",
|
|
},
|
|
},
|
|
},
|
|
},
|
|
|
|
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
|
|
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
|
|
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
|
|
},
|
|
}
|
|
}
|