sourcegraph/monitoring/definitions/prometheus.go

package definitions

import (
	"fmt"

	"github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
	"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
)
func Prometheus() *monitoring.Dashboard {
	const (
		containerName = "prometheus"

		// ruleGroupInterpretation provides interpretation documentation for observables
		// that are per Prometheus rule_group.
		ruleGroupInterpretation = `Rules that Sourcegraph ships with are grouped under '/sg_config_prometheus'. [Custom rules are grouped under '/sg_prometheus_addons'](https://sourcegraph.com/docs/admin/observability/metrics#prometheus-configuration).`
	)

	return &monitoring.Dashboard{
		Name: "prometheus",
		Title: "Prometheus",
		Description: "Sourcegraph's all-in-one Prometheus and Alertmanager service.",
		NoSourcegraphDebugServer: true, // This is a third-party service.
		Groups: []monitoring.Group{
			{
				Title: "Metrics",
				Rows: []monitoring.Row{
					{
						{
							Name: "metrics_cardinality",
							Description: "metrics with highest cardinalities",
							Query: `topk(10, count by (__name__, job)({__name__!=""}))`,
							Panel: monitoring.Panel().LegendFormat("{{__name__}} ({{job}})"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							NoAlert: true,
							Interpretation: "The 10 highest-cardinality metrics collected by this Prometheus instance.",
						},
						{
							Name: "samples_scraped",
							Description: "samples scraped by job",
							Query: `sum by(job) (scrape_samples_post_metric_relabeling{job!=""})`,
							Panel: monitoring.Panel().LegendFormat("{{job}}"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							NoAlert: true,
							Interpretation: "The number of samples scraped after metric relabeling was applied by this Prometheus instance.",
						},
					},
					{
						{
							Name: "prometheus_rule_eval_duration",
							Description: "average prometheus rule group evaluation duration over 10m by rule group",
							Query: `sum by(rule_group) (avg_over_time(prometheus_rule_group_last_duration_seconds[10m]))`,
							Warning: monitoring.Alert().GreaterOrEqual(30), // standard prometheus_rule_group_interval_seconds
							Panel: monitoring.Panel().Unit(monitoring.Seconds).MinAuto().LegendFormat("{{rule_group}}"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							Interpretation: fmt.Sprintf(`
								A high value here indicates Prometheus rule evaluation is taking longer than expected.
								It might indicate that certain rule groups are taking too long to evaluate, or Prometheus is underprovisioned.
								%s
							`, ruleGroupInterpretation),
							NextSteps: fmt.Sprintf(`
								- Check the %s panels and try increasing resources for Prometheus if necessary.
								- If the rule group taking a long time to evaluate belongs to '/sg_prometheus_addons', try reducing the complexity of any custom Prometheus rules provided.
								- If the rule group taking a long time to evaluate belongs to '/sg_config_prometheus', please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new?assignees=&labels=&template=bug_report.md&title=).
							`, shared.TitleContainerMonitoring),
						},
						{
							Name: "prometheus_rule_eval_failures",
							Description: "failed prometheus rule evaluations over 5m by rule group",
							Query: `sum by(rule_group) (rate(prometheus_rule_evaluation_failures_total[5m]))`,
							Warning: monitoring.Alert().Greater(0),
							Panel: monitoring.Panel().LegendFormat("{{rule_group}}"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							Interpretation: ruleGroupInterpretation,
							NextSteps: `
								- Check Prometheus logs for messages related to rule group evaluation (generally with log field 'component="rule manager"').
								- If the rule group failing to evaluate belongs to '/sg_prometheus_addons', ensure any custom Prometheus configuration provided is valid.
								- If the rule group failing to evaluate belongs to '/sg_config_prometheus', please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new?assignees=&labels=&template=bug_report.md&title=).
							`,
						},
					},
				},
			},
			{
				Title: "Alerts",
				Rows: []monitoring.Row{
					{
						{
							Name: "alertmanager_notification_latency",
							Description: "alertmanager notification latency over 1m by integration",
							Query: `sum by(integration) (rate(alertmanager_notification_latency_seconds_sum[1m]))`,
							Warning: monitoring.Alert().GreaterOrEqual(1),
							Panel: monitoring.Panel().Unit(monitoring.Seconds).LegendFormat("{{integration}}"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							NextSteps: fmt.Sprintf(`
								- Check the %s panels and try increasing resources for Prometheus if necessary.
								- Ensure that your ['observability.alerts' configuration](https://sourcegraph.com/docs/admin/observability/alerting#setting-up-alerting) (in site configuration) is valid.
								- Check if the relevant alert integration service is experiencing downtime or issues.
							`, shared.TitleContainerMonitoring),
						},
						{
							Name: "alertmanager_notification_failures",
							Description: "failed alertmanager notifications over 1m by integration",
							Query: `sum by(integration) (rate(alertmanager_notifications_failed_total[1m]))`,
							Warning: monitoring.Alert().Greater(0),
							Panel: monitoring.Panel().LegendFormat("{{integration}}"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							NextSteps: `
								- Ensure that your ['observability.alerts' configuration](https://sourcegraph.com/docs/admin/observability/alerting#setting-up-alerting) (in site configuration) is valid.
								- Check if the relevant alert integration service is experiencing downtime or issues.
							`,
						},
					},
				},
			},
			{
				Title: "Internals",
				Hidden: true,
				Rows: []monitoring.Row{
					{
						{
							Name: "prometheus_config_status",
							Description: "prometheus configuration reload status",
							Query: `prometheus_config_last_reload_successful`,
							Warning: monitoring.Alert().Less(1),
							Panel: monitoring.Panel().LegendFormat("reload success").Max(1),
							Owner: monitoring.ObservableOwnerInfraOrg,
							Interpretation: "A '1' indicates Prometheus reloaded its configuration successfully.",
							NextSteps: `
								- Check Prometheus logs for messages related to configuration loading.
								- Ensure any [custom configuration you have provided Prometheus](https://sourcegraph.com/docs/admin/observability/metrics#prometheus-configuration) is valid.
							`,
						},
						{
							Name: "alertmanager_config_status",
							Description: "alertmanager configuration reload status",
							Query: `alertmanager_config_last_reload_successful`,
							Warning: monitoring.Alert().Less(1),
							Panel: monitoring.Panel().LegendFormat("reload success").Max(1),
							Owner: monitoring.ObservableOwnerInfraOrg,
							Interpretation: "A '1' indicates Alertmanager reloaded its configuration successfully.",
							NextSteps: "Ensure that your [`observability.alerts` configuration](https://sourcegraph.com/docs/admin/observability/alerting#setting-up-alerting) (in site configuration) is valid.",
						},
					},
					{
						{
							Name: "prometheus_tsdb_op_failure",
							Description: "prometheus tsdb failures by operation over 5m",
							Query: `increase(label_replace({__name__=~"prometheus_tsdb_(.*)_failed_total"}, "operation", "$1", "__name__", "(.+)s_failed_total")[5m:1m])`,
							Warning: monitoring.Alert().Greater(0),
							Panel: monitoring.Panel().LegendFormat("{{operation}}"),
							Owner: monitoring.ObservableOwnerInfraOrg,
							NextSteps: "Check Prometheus logs for messages related to the failing operation.",
						},
						{
							Name: "prometheus_target_sample_exceeded",
							Description: "prometheus scrapes that exceed the sample limit over 10m",
Query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m])",
Warning: monitoring.Alert().Greater(0),
Panel: monitoring.Panel().LegendFormat("rejected scrapes"),
Owner: monitoring.ObservableOwnerInfraOrg,
NextSteps: "Check Prometheus logs for messages related to target scrape failures.",
},
{
Name: "prometheus_target_sample_duplicate",
Description: "prometheus scrapes rejected due to duplicate timestamps over 10m",
Query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[10m])",
Warning: monitoring.Alert().Greater(0),
Panel: monitoring.Panel().LegendFormat("rejected scrapes"),
Owner: monitoring.ObservableOwnerInfraOrg,
NextSteps: "Check Prometheus logs for messages related to target scrape failures.",
},
},
},
},
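			// Shared groups appended to service dashboards: container monitoring,
			// provisioning indicators, and Kubernetes health.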
			shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
			shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
			shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
		},
	}
}