monitoring: dashboards docs improvements (#24563)

- Render Description more prominently because that's how panels are identified in dashboards
- Render link to panel and dashboards
- Render full query in collapsible section for reference
- Show number of alerts and indicate that no alerts are defined more prominently
- Alerts now always link to the panel docs
- Fix code-insights owner slug that led to handbook 404
- Fix some `MetricDescriptionRoot` using underscore instead of space
This commit is contained in:
Robert Lin 2021-09-02 16:05:36 -04:00 committed by GitHub
parent 586b0ee3a0
commit 4933ac4e81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 8733 additions and 1222 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -75,7 +75,7 @@ func (codeInsights) NewInsightsQueryRunnerResetterGroup(containerName string) mo
ObservableConstructorOptions: ObservableConstructorOptions{
MetricNameRoot: "insights_search_queue",
MetricDescriptionRoot: "insights_search_queue",
MetricDescriptionRoot: "insights search queue",
},
},

View File

@ -663,21 +663,21 @@ func (codeIntelligence) NewJanitorGroup(containerName string) monitoring.Group {
{
Standard.Count("records deleted")(ObservableConstructorOptions{
MetricNameRoot: "codeintel_background_upload_records_removed",
MetricDescriptionRoot: "lsif_upload",
MetricDescriptionRoot: "lsif upload",
})(containerName, monitoring.ObservableOwnerCodeIntel).WithNoAlerts(`
Number of LSIF upload records deleted due to expiration or unreachability every 5m
`).Observable(),
Standard.Count("records deleted")(ObservableConstructorOptions{
MetricNameRoot: "codeintel_background_index_records_removed",
MetricDescriptionRoot: "lsif_index",
MetricDescriptionRoot: "lsif index",
})(containerName, monitoring.ObservableOwnerCodeIntel).WithNoAlerts(`
Number of LSIF index records deleted due to expiration or unreachability every 5m
`).Observable(),
Standard.Count("data bundles deleted")(ObservableConstructorOptions{
MetricNameRoot: "codeintel_background_uploads_purged",
MetricDescriptionRoot: "lsif_upload",
MetricDescriptionRoot: "lsif upload",
})(containerName, monitoring.ObservableOwnerCodeIntel).WithNoAlerts(`
Number of LSIF upload data bundles purged from the codeintel-db database every 5m
`).Observable(),

View File

@ -112,7 +112,7 @@ func Worker() *monitoring.Container {
ObservableConstructorOptions: shared.ObservableConstructorOptions{
MetricNameRoot: "codeintel_background_upload",
MetricDescriptionRoot: "lsif_upload",
MetricDescriptionRoot: "lsif upload",
},
},
@ -132,7 +132,7 @@ func Worker() *monitoring.Container {
ObservableConstructorOptions: shared.ObservableConstructorOptions{
MetricNameRoot: "codeintel_background_index",
MetricDescriptionRoot: "lsif_index",
MetricDescriptionRoot: "lsif index",
},
},
@ -152,7 +152,7 @@ func Worker() *monitoring.Container {
ObservableConstructorOptions: shared.ObservableConstructorOptions{
MetricNameRoot: "codeintel_background_dependency_index",
MetricDescriptionRoot: "lsif_dependency_index",
MetricDescriptionRoot: "lsif dependency index",
},
},

View File

@ -78,20 +78,21 @@ func renderDocumentation(containers []*Container) (*documentation, error) {
for _, c := range containers {
fmt.Fprintf(&docs.dashboards, "## %s\n\n", c.Title)
fprintSubtitle(&docs.dashboards, c.Description)
fmt.Fprintf(&docs.dashboards, "To see this dashboard, visit `/-/debug/grafana/d/%[1]s/%[1]s` on your Sourcegraph instance.\n\n", c.Name)
for _, g := range c.Groups {
for gIndex, g := range c.Groups {
// the "General" group is top-level
if g.Title != "General" {
fmt.Fprintf(&docs.dashboards, "### %s: %s\n\n", c.Title, g.Title)
}
for _, r := range g.Rows {
for _, o := range r {
for rIndex, r := range g.Rows {
for oIndex, o := range r {
if err := docs.renderAlertSolutionEntry(c, o); err != nil {
return nil, errors.Errorf("error rendering alert solution entry %q %q: %w",
c.Name, o.Name, err)
}
if err := docs.renderDashboardPanelEntry(c, o); err != nil {
if err := docs.renderDashboardPanelEntry(c, g, o, observablePanelID(gIndex, rIndex, oIndex)); err != nil {
return nil, errors.Errorf("error rendering dashboard panel entry %q %q: %w",
c.Name, o.Name, err)
}
@ -140,16 +141,20 @@ func (d *documentation) renderAlertSolutionEntry(c *Container, o Observable) err
possibleSolutions, _ := toMarkdown(o.PossibleSolutions, true)
fmt.Fprintf(&d.alertSolutions, "%s\n", possibleSolutions)
}
if o.Interpretation != "" && o.Interpretation != "none" {
// indicate help is available in dashboards reference
fmt.Fprintf(&d.alertSolutions, "- More help interpreting this metric is available in the [dashboards reference](./%s#%s).\n",
dashboardsDocsFile, observableDocAnchor(c, o))
} else {
// just show the panel reference
fmt.Fprintf(&d.alertSolutions, "- Learn more about the related dashboard panel in the [dashboards reference](./%s#%s).\n",
dashboardsDocsFile, observableDocAnchor(c, o))
}
// add silencing configuration as another solution
fmt.Fprintf(&d.alertSolutions, "- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:\n\n")
fmt.Fprintf(&d.alertSolutions, "```json\n%s\n```\n\n", fmt.Sprintf(`"observability.silenceAlerts": [
%s
]`, strings.Join(prometheusAlertNames, ",\n")))
// add link to panel information IF there are additional details available
if o.Interpretation != "" && o.Interpretation != "none" {
fmt.Fprintf(&d.alertSolutions, "> NOTE: More help interpreting this metric is available in the [dashboards reference](./%s#%s).\n\n",
dashboardsDocsFile, observableDocAnchor(c, o))
}
if o.Owner != "" {
// add owner
fprintOwnedBy(&d.alertSolutions, o.Owner)
@ -159,23 +164,42 @@ func (d *documentation) renderAlertSolutionEntry(c *Container, o Observable) err
return nil
}
func (d *documentation) renderDashboardPanelEntry(c *Container, o Observable) error {
func (d *documentation) renderDashboardPanelEntry(c *Container, g Group, o Observable, panelID uint) error {
fprintObservableHeader(&d.dashboards, c, &o, 4)
fmt.Fprintf(&d.dashboards, "This panel indicates %s.\n\n", o.Description)
fprintSubtitle(&d.dashboards, fmt.Sprintf("%s\n\n", upperFirst(o.Description)))
// render interpretation reference if available
if o.Interpretation != "" && o.Interpretation != "none" {
interpretation, _ := toMarkdown(o.Interpretation, false)
fmt.Fprintf(&d.dashboards, "%s\n\n", interpretation)
}
// add link to alert solutions IF there is an alert attached
if !o.NoAlert {
fmt.Fprintf(&d.dashboards, "> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./%s#%s).\n\n",
alertSolutionsFile, observableDocAnchor(c, o))
fmt.Fprintf(&d.dashboards, "Refer to the [alert solutions reference](./%s#%s) for %s related to this panel.\n\n",
alertSolutionsFile, observableDocAnchor(c, o), pluralize("alert", o.alertsCount()))
} else {
fmt.Fprintf(&d.dashboards, "This panel has no related alerts.\n\n")
}
// how to get to this panel
fmt.Fprintf(&d.dashboards, "To see this panel, visit `/-/debug/grafana/d/%[1]s/%[1]s?viewPanel=%[2]d` on your Sourcegraph instance.\n\n",
c.Name, panelID)
if o.Owner != "" {
// add owner
fprintOwnedBy(&d.dashboards, o.Owner)
}
fmt.Fprintf(&d.dashboards, `
<details>
<summary>Technical details</summary>
Query: %s
</details>
`, fmt.Sprintf("`%s`", o.Query))
// render break for readability
fmt.Fprint(&d.dashboards, "\n<br />\n\n")
return nil

View File

@ -422,19 +422,21 @@ const (
// toMarkdown returns a Markdown string that also links to the owner's team page
func (o ObservableOwner) toMarkdown() string {
var teamName string
var slug string
// special cases for differences in how a team is named in ObservableOwner and how
// they are named in the handbook.
// see https://about.sourcegraph.com/company/team/org_chart#engineering
switch o {
case ObservableOwnerCodeIntel:
teamName = "code-intelligence"
slug = "code-intelligence"
case ObservableOwnerCodeInsights:
slug = "developer-insights/code-insights"
default:
teamName = string(o)
slug = strings.ReplaceAll(string(o), " ", "-")
}
slug := strings.ReplaceAll(teamName, " ", "-")
return fmt.Sprintf("[Sourcegraph %s team](https://about.sourcegraph.com/handbook/engineering/%s)", upperFirst(teamName), slug)
return fmt.Sprintf("[Sourcegraph %s team](https://about.sourcegraph.com/handbook/engineering/%s)",
upperFirst(string(o)), slug)
}
// Observable describes a metric about a container that can be observed. For example, memory usage.
@ -574,7 +576,7 @@ func (o Observable) validate() error {
return errors.New(`Panel.panelType must be "graph" or "heatmap"`)
}
allAlertsEmpty := o.Warning.isEmpty() && o.Critical.isEmpty()
allAlertsEmpty := o.alertsCount() == 0
if allAlertsEmpty || o.NoAlert {
// Ensure lack of alerts is intentional
if allAlertsEmpty && !o.NoAlert {
@ -619,6 +621,16 @@ func (o Observable) validate() error {
return nil
}
// alertsCount reports how many of the observable's alert levels (Warning and
// Critical) are non-empty. The result is 0, 1, or 2.
func (o Observable) alertsCount() int {
	var defined int
	// Each level contributes one to the total when its definition is not empty.
	for _, empty := range []bool{o.Warning.isEmpty(), o.Critical.isEmpty()} {
		if !empty {
			defined++
		}
	}
	return defined
}
// Alert provides a builder for defining alerting on an Observable.
func Alert() *ObservableAlertDefinition {
return &ObservableAlertDefinition{}

View File

@ -20,6 +20,13 @@ func withPeriod(s string) string {
return s
}
// pluralize renders count followed by noun, e.g. "1 alert" or "2 alerts".
// An "s" is appended to noun for every count other than exactly 1 (including 0).
func pluralize(noun string, count int) string {
	suffix := "s"
	if count == 1 {
		// the singular form takes no suffix
		suffix = ""
	}
	return fmt.Sprintf("%d %s%s", count, noun, suffix)
}
// StringPtr returns a pointer to a copy of the given string. It is a convenience
// for populating fields in APIs that take *string rather than string.
func StringPtr(s string) *string {
	ptr := &s
	return ptr
}