diff --git a/dev/grafana.sh b/dev/grafana.sh index 62f90fca932..8eb22e73cc0 100755 --- a/dev/grafana.sh +++ b/dev/grafana.sh @@ -12,6 +12,7 @@ fi IMAGE=sourcegraph/grafana:dev CONTAINER=grafana +PORT=3370 # docker containers must access things via docker host on non-linux platforms CONFIG_SUB_DIR="all" @@ -30,11 +31,6 @@ fi docker inspect $CONTAINER >/dev/null 2>&1 && docker rm -f $CONTAINER -# Generate Grafana dashboards -pushd monitoring >/dev/null || exit 1 -RELOAD=false go generate -popd >/dev/null || exit 1 - # Log file location: since we log outside of the Docker container, we should # log somewhere that's _not_ ~/.sourcegraph-dev/data/grafana, since that gets # volume mounted into the container and therefore has its own ownership @@ -48,8 +44,9 @@ mkdir -p "${GRAFANA_LOGS}" GRAFANA_LOG_FILE="${GRAFANA_LOGS}/grafana.log" # Quickly build image +echo "Grafana: building ${IMAGE}..." IMAGE=${IMAGE} CACHE=true ./docker-images/grafana/build.sh >"${GRAFANA_LOG_FILE}" 2>&1 || - (BUILD_EXIT_CODE=$? && echo "build failed; dumping log:" && cat "${GRAFANA_LOG_FILE}" && exit $BUILD_EXIT_CODE) + (BUILD_EXIT_CODE=$? && echo "Grafana build failed; dumping log:" && cat "${GRAFANA_LOG_FILE}" && exit $BUILD_EXIT_CODE) function finish() { GRAFANA_EXIT_CODE=$? @@ -66,6 +63,8 @@ function finish() { return $GRAFANA_EXIT_CODE } +echo "Grafana: serving on http://localhost:${PORT}" +echo "Grafana: note that logs are piped to ${GRAFANA_LOG_FILE}" docker run --rm ${DOCKER_USER} \ --name=${CONTAINER} \ --cpus=1 \ diff --git a/dev/prometheus.sh b/dev/prometheus.sh index 674f1d19c87..f8f13c1b425 100755 --- a/dev/prometheus.sh +++ b/dev/prometheus.sh @@ -12,6 +12,7 @@ if [ ! 
-e "${PROMETHEUS_DISK}" ]; then fi IMAGE=sourcegraph/prometheus:dev CONTAINER=prometheus +PORT=9090 CONFIG_DIR="$(pwd)/docker-images/prometheus/config" DOCKER_NET="" @@ -34,17 +35,15 @@ docker inspect $CONTAINER >/dev/null 2>&1 && docker rm -f $CONTAINER cp ${PROM_TARGETS} "${CONFIG_DIR}"/prometheus_targets.yml -pushd monitoring >/dev/null || exit 1 -RELOAD=false go generate -popd >/dev/null || exit 1 - +# Avoid cluttering dev/start.sh log output PROMETHEUS_LOGS="${HOME}/.sourcegraph-dev/logs/prometheus" mkdir -p "${PROMETHEUS_LOGS}" PROMETHEUS_LOG_FILE="${PROMETHEUS_LOGS}/prometheus.log" # Quickly build image +echo "Prometheus: building ${IMAGE}..." IMAGE=${IMAGE} CACHE=true ./docker-images/prometheus/build.sh >"${PROMETHEUS_LOG_FILE}" 2>&1 || - (BUILD_EXIT_CODE=$? && echo "build failed; dumping log:" && cat "${PROMETHEUS_LOG_FILE}" && exit $BUILD_EXIT_CODE) + (BUILD_EXIT_CODE=$? && echo "Prometheus build failed; dumping log:" && cat "${PROMETHEUS_LOG_FILE}" && exit $BUILD_EXIT_CODE) function finish() { PROMETHEUS_EXIT_CODE=$? 
@@ -58,6 +57,8 @@ function finish() { return $PROMETHEUS_EXIT_CODE } +echo "Prometheus: serving on http://localhost:${PORT}" +echo "Prometheus: note that logs are piped to ${PROMETHEUS_LOG_FILE}" docker run --rm ${DOCKER_NET} ${DOCKER_USER} \ --name=${CONTAINER} \ --cpus=1 \ @@ -66,4 +67,6 @@ docker run --rm ${DOCKER_NET} ${DOCKER_USER} \ -v "${PROMETHEUS_DISK}":/prometheus \ -v "${CONFIG_DIR}":/sg_prometheus_add_ons \ -e SRC_FRONTEND_INTERNAL="${SRC_FRONTEND_INTERNAL}" \ + -e DISABLE_SOURCEGRAPH_CONFIG="${DISABLE_SOURCEGRAPH_CONFIG:-""}" \ + -e DISABLE_ALERTMANAGER="${DISABLE_ALERTMANAGER:-""}" \ ${IMAGE} >"${PROMETHEUS_LOG_FILE}" 2>&1 || finish diff --git a/doc/admin/observability/alerting.md b/doc/admin/observability/alerting.md index 5b047aa3373..3f0058d4d06 100644 --- a/doc/admin/observability/alerting.md +++ b/doc/admin/observability/alerting.md @@ -4,7 +4,17 @@ Alerts can be configured to notify site admins when there is something wrong or ## Understanding alerts -See [alert solutions](alert_solutions.md) for possible solutions when alerts are firing, and learn more about alert labels, metrics, and dashboards in our [metrics guide](metrics.md). +Alerts fall in one of two severity levels: + +- critical: something is _definitively_ wrong with Sourcegraph. We suggest using a high-visibility notification channel for these alerts. + - **Examples:** Database inaccessible, running out of disk space, running out of memory. + - **Suggested action:** Page a site administrator to investigate. +- warning: something _could_ be wrong with Sourcegraph. We suggest checking in on these periodically, or using a notification channel that will not bother anyone if it is spammed. Over time, as warning alerts become stable and reliable across many Sourcegraph deployments, they will also be promoted to critical alerts in an update by Sourcegraph. + - **Examples:** High latency, high search timeouts. 
+ - **Suggested action:** Email a site administrator to investigate and monitor when convenient, and please let us know so that we can improve them. + +Refer to the [alert solutions reference](alert_solutions.md) for a complete list of Sourcegraph alerts, as well as possible solutions when these alerts are firing. +Learn more about metrics, dashboards, and alert labels in our [metrics guide](metrics.md). ## Setting up alerting diff --git a/doc/admin/observability/metrics.md b/doc/admin/observability/metrics.md index 19027a3a37b..ddccec209b6 100644 --- a/doc/admin/observability/metrics.md +++ b/doc/admin/observability/metrics.md @@ -32,7 +32,7 @@ More behavior can be controlled with [environmental variables](https://grafana.c For most use cases, you can access Grafana [through your Sourcegraph instance](#grafana). Follow the instructions below to access Grafana directly to, for example, edit configuration directly. -> NOTE: Most of the dashboards that Sourcegraph ships with is not configurable through the Grafana UI. +> NOTE: Most of the dashboards that Sourcegraph ships with are not configurable through the Grafana UI. > In general, we recommend [these configuration methods instead](#grafana-configuration). If you are using the [Kubernetes deployment option](../install/kubernetes/index.md), you can access Grafana directly using Kubernetes port forwarding to your local machine: @@ -66,33 +66,29 @@ For most use cases, you can query Prometheus through [Grafana](#grafana) using G #### High-level alerting metrics -Sourcegraph's metrics include a single high-level metric `alert_count` which indicates the number of `level=critical` and `level=warning` alerts each service has fired over time for each Sourcegraph service. This is the same metric presented on the **Overview** Grafana dashboard. 
+Sourcegraph's metrics include a single high-level metric `alert_count` which indicates the number of `level=critical` and `level=warning` alerts each service has fired over time for each Sourcegraph service. +This is the same metric presented on the **Overview** Grafana dashboard. -We provide [built-in alerting](./alerting.md) for these metrics. Refer to our [alert solutions reference](./alert_solutions.md) for details on specific alerts metrics. +> NOTE: We provide [built-in alerting](./alerting.md) for these alerting metrics to help monitor the health of your Sourcegraph instance. +> Refer to our [alert solutions reference](./alert_solutions.md) for details on specific alerts. -**Description:** The number of alerts each service has fired and their severity level. The severity levels are defined as follows: +**Description:** The number of alerts each service has fired for a given alert name and severity level. -- `critical`: something is _definitively_ wrong with Sourcegraph. We suggest using a high-visibility notification channel for these alerts. - - **Examples:** Database inaccessible, running out of disk space, running out of memory. - - **Suggested action:** Page a site administrator to investigate. -- `warning`: something _could_ be wrong with Sourcegraph. We suggest checking in on these periodically, or using a notification channel that will not bother anyone if it is spammed. Over time, as warning alerts become stable and reliable across many Sourcegraph deployments, they will also be promoted to critical alerts in an update by Sourcegraph. - - **Examples:** High latency, high search timeouts. - - **Suggested action:** Email a site administrator to investigate and monitor when convenient, and please let us know so that we can improve them. - -**Values:** - -- Although the values of `alert_count` are floating-point numbers, only their whole numbers have meaning. 
For example: `0.5` and `0.7` indicate no alerts are firing, while `1.2` indicates exactly one alert is firing and `3.0` indicates exactly three alerts firing. +**Values:** Although the values of `alert_count` are floating-point numbers, only their whole numbers have meaning. +For example, `0.5` and `0.7` indicate no alerts are firing, while `1.2` indicates exactly one alert is firing and `3.0` indicates exactly three alerts firing. **Labels:** -- `level`: either `critical` or `warning`, as defined above. -- `service_name`: the name of the service that fired the alert. -- `name`: the name of the alert that the service fired. -- `description`: a human-readable description of the alert. +| Label | Description | +|-------|-------------| +| `service_name` | the name of the service that fired the alert | +| `name` | the name of the alert that the service fired | +| `level` | either `critical` or `warning`, as defined [here](./alerting.md) | +| `description` | a human-readable description of the alert | #### Complete reference -A complete reference of Sourcegraph's vast set of Prometheus metrics is not yet available. If you are interested in this, please reach out by filing an issue or contacting us at support@sourcegraph.com. +A complete reference of Sourcegraph's vast set of Prometheus metrics is not yet available. If you are interested in this, please reach out by filing an issue or contacting us at [support@sourcegraph.com](mailto:support@sourcegraph.com). 
### Prometheus configuration diff --git a/doc/dev/background-information/index.md b/doc/dev/background-information/index.md index 631aebd8b9f..8e4eff35d00 100644 --- a/doc/dev/background-information/index.md +++ b/doc/dev/background-information/index.md @@ -15,6 +15,7 @@ - [Developing campaigns](campaigns/index.md) - [Developing code intelligence](codeintel/index.md) - [Developing code monitoring](codemonitoring/index.md) +- [Developing observability](observability/index.md) - [Dependencies and generated code](dependencies_and_codegen.md) ## Tools @@ -22,7 +23,6 @@ - [Renovate dependency updates](renovate.md) - [Using PostgreSQL](postgresql.md) -## Monitoring +## Other - [Telemetry](telemetry.md) -- [Observability](observability.md) diff --git a/doc/dev/background-information/observability.md b/doc/dev/background-information/observability.md deleted file mode 100644 index c2e17572053..00000000000 --- a/doc/dev/background-information/observability.md +++ /dev/null @@ -1,21 +0,0 @@ -# Observability developer documentation - -**For how to use Sourcegraph's observability, see [admin: Observability](../../admin/observability/index.md).** - -This documentation is for **developing** Sourcegraph's observability. - -## What type of observability should you add? - -**WIP** - -## How to add tracing? - -**WIP** - -## How to add Prometheus metrics? - -**WIP** - -## How to add Grafana dashboards? - -**WIP** diff --git a/doc/dev/background-information/observability/grafana.md b/doc/dev/background-information/observability/grafana.md new file mode 100644 index 00000000000..8247879703d --- /dev/null +++ b/doc/dev/background-information/observability/grafana.md @@ -0,0 +1,15 @@ +# Sourcegraph Grafana + +We ship a custom Grafana image as part of a standard Sourcegraph distribution. +Learn more about it in our [monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-grafana). + +Adding dashboards, panels, etc. 
to this image is handled by the [monitoring generator](./monitoring-generator.md). + +The image is defined in [`docker-images/grafana`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/docker-images/grafana). + +## Upgrading Grafana + +To upgrade Grafana, make the appropriate version change to the [`sourcegraph/grafana` Dockerfile](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+FROM+grafana/grafana::%5Bversion.%5D&patternType=structural) and: + +* Ensure the image still builds: `./docker-images/grafana/build.sh` +* [Run the monitoring stack locally](../../how-to/monitoring_local_dev.md) and verify that all generated Grafana dashboards still render correctly diff --git a/doc/dev/background-information/observability/index.md b/doc/dev/background-information/observability/index.md new file mode 100644 index 00000000000..8dad0806971 --- /dev/null +++ b/doc/dev/background-information/observability/index.md @@ -0,0 +1,34 @@ +# Developing observability + +This documentation is for generalized, usecase-agnostic development of Sourcegraph's observability. +Sourcegraph employees should also refer to the [handbook's observability section](https://about.sourcegraph.com/handbook/engineering/observability) for Sourcegraph-specific documentation. + +> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../../admin/observability/index.md). 
+ +## Overview + +Observability at Sourcegraph includes: + +| | Description | Examples | +|:--|------------|--------| +| **Monitoring** | how you know _when_ something is wrong | Dashboards & metrics, alerting, health checks | +| **Debugging** | how you debug _what_ is wrong | Tracing, logging | + +## Concepts + +- [Sourcegraph monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars) +- [Sourcegraph monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture) + +## Guides + +- [How to find monitoring](../../how-to/find_monitoring.md) +- [How to add monitoring](../../how-to/add_monitoring.md) +- [Set up local monitoring development](../../how-to/monitoring_local_dev.md) +- How to add observability (coming soon) + +## Components + +- [Monitoring generator](./monitoring-generator.md) +- [Sourcegraph Grafana](./grafana.md) +- [Sourcegraph Prometheus](./prometheus.md) +- [Observability for site administrators](../../../admin/observability/index.md) diff --git a/doc/dev/background-information/observability/monitoring-generator.md b/doc/dev/background-information/observability/monitoring-generator.md new file mode 100644 index 00000000000..ed20272696b --- /dev/null +++ b/doc/dev/background-information/observability/monitoring-generator.md @@ -0,0 +1,59 @@ +# Sourcegraph monitoring generator + +
+The monitoring generator manages converting monitoring definitions into integrations with Sourcegraph's monitoring ecosystem. +
+ +Its purpose is to help enable a [cohesive observability experience for site administrators](../../../admin/observability/index.md), codify [Sourcegraph's monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars), and make it easy for [developers to add monitoring for their Sourcegraph services](../../how-to/add_monitoring.md) by generating integrations with Sourcegraph's monitoring ecosystem for free. + +## Reference + +- [Usage and development](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/README.md) for developing the generator itself +- [Monitoring API](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md) for interacting with the generator library +- [How to add monitoring definitions](../../how-to/add_monitoring.md) for developers looking to add monitoring for their services + +## Features + +### Documentation generation + +The generator automatically creates documentation from monitoring definitions that customers and engineers can reference. +These include: + +- [Alert solutions reference](https://docs.sourcegraph.com/admin/observability/alert_solutions) +- [Dashboards reference](https://docs.sourcegraph.com/admin/observability/dashboards) + +Links to generated documentation can be provided in our other generated integrations - for example, [Slack alerts](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) will provide a link to the appropriate alert solutions entry, and [Grafana panels](#grafana-integration) will link to the appropriate dashboards reference entry. + +### Grafana integration + +The generator automatically generates and ships dashboards from monitoring definitions within the [Sourcegraph Grafana distribution](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-grafana). 
+ +It also takes care of the following: + +- Graphs within rows are sized appropriately +- Alerts visualization through the [`ObservableAlertDefinition` API](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observablealertdefinition): + - Overview graphs for alerts (both Sourcegraph-wide and per-service) + - Threshold lines for alerts of all levels are rendered in graphs +- Formatting of units, labels, and more (using either the defaults, or the [`ObservablePanelOptions` API](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observablepaneloptions)) +- Maintaining a uniform look and feel across all dashboards +- Providing links to [generated documentation](#documentation-generation) + +Links to generated dashboards can be provided in our other generated integrations - for example, [Slack alerts](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) will provide a link to the appropriate service's dashboard. + +### Prometheus integration + +The generator automatically generates and ships Prometheus recording rules and alerts within the [Sourcegraph Prometheus distribution](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-prometheus). +This includes the following, all with appropriate and consistent labels: + +- [`alert_count` recording rules](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#alert-count-metrics) +- Native Prometheus alerts, leveraged by our [Alertmanager integration](#alertmanager-integration) + +Generated Prometheus recording rules are leveraged by the [Grafana integration](#grafana-integration). 
+ +### Alertmanager integration + +The generator's [Prometheus integration](#prometheus-integration) is a critical part of [Sourcegraph's alerting capabilities](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#alert-notifications), which handles alert routing by level and formatting of alert messages to include links to [documentation](#documentation-generation) and [dashboards](#grafana-integration). +Learn more about using Sourcegraph alerting in the [alerting documentation](https://docs.sourcegraph.com/admin/observability/alerting). +This is possible due to the labels generated by the [Prometheus integration](#prometheus-integration). + +At Sourcegraph, extended routing based on team ownership (as defined by [`ObservableOwner`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observableowner)) is also used to route customer support requests and [on-call events through OpsGenie](https://about.sourcegraph.com/handbook/engineering/incidents/on_call). diff --git a/doc/dev/background-information/observability/prometheus.md b/doc/dev/background-information/observability/prometheus.md new file mode 100644 index 00000000000..a338ad40295 --- /dev/null +++ b/doc/dev/background-information/observability/prometheus.md @@ -0,0 +1,27 @@ +# Sourcegraph Prometheus + +We ship a custom Prometheus image as part of a standard Sourcegraph distribution. +It currently bundles Alertmanager as well as integrations to the Sourcegraph web application. +Learn more about it in our [monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-prometheus). + +Adding recording rules, alerts, etc. to this image is handled by the [monitoring generator](./monitoring-generator.md). + +The image is defined in [`docker-images/prometheus`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/docker-images/prometheus). 
+ +## Prom-wrapper + +The entrypoint of the image is a sidecar program called the prom-wrapper. +Learn more about it [here](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#prom-wrapper). + +The source code for this program is currently kept in [`docker-images/prometheus/cmd/prom-wrapper`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/docker-images/prometheus/cmd/prom-wrapper). + +## Upgrading Prometheus or Alertmanager + +To upgrade Prometheus or Alertmanager, make the appropriate version and sum changes to the [`sourcegraph/prometheus` Dockerfile](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:go.mod+prometheus/alertmanager+OR+prometheus/client_golang&patternType=literal) and make sure to: + +* Upgrade the [Alertmanager and Prometheus Go client dependencies](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:go.mod+prometheus/alertmanager+OR+prometheus/client_golang&patternType=literal) where appropriate +* Ensure the image still builds: `./docker-images/prometheus/build.sh` +* [Run the monitoring stack locally](../../how-to/monitoring_local_dev.md) and verify that: + * all Prometheus rules are evaluated successfully (`localhost:9090/rules`) + * Alertmanager starts up correctly (`localhost:9090/alertmanager/#/status`) + * [`observability.alerts` can be configured](../../../admin/observability/alerting.md) via the Sourcegraph web application diff --git a/doc/dev/how-to/add_monitoring.md b/doc/dev/how-to/add_monitoring.md new file mode 100644 index 00000000000..2175cd0f25b --- /dev/null +++ b/doc/dev/how-to/add_monitoring.md @@ -0,0 +1,166 @@ +# How to add monitoring + +This guide documents how to add monitoring to Sourcegraph's source code. 
+Sourcegraph employees should also refer to the [handbook's monitoring section](https://about.sourcegraph.com/handbook/engineering/observability/monitoring) for Sourcegraph-specific documentation. +The [developing observability page](../background-information/observability/index.md) contains relevant documentation as well. + +> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../admin/observability/index.md). + +## Metrics + +Service-side, metrics should be made available over HTTP for Prometheus to scrape. +By default, Prometheus expects metrics to be exported on `$SERVICEPORT/metrics` - for example, run your local Sourcegraph dev server and metrics should be available on `http://localhost:$SERVICEPORT/metrics`. +How this is configured varies across the various [Sourcegraph deployment options](../../admin/install/index.md) - see [tracking a new service](#tracking-a-new-service). + +### Tracking a new service + +In [deploy-sourcegraph](https://github.com/sourcegraph/deploy-sourcegraph), Prometheus uses the Kubernetes API to discover endpoints to scrape. Just add the following annotations to your service definition: + +```yaml +metadata: + annotations: + prometheus.io/port: "$SERVICEPORT" # replace with the port your service runs on + sourcegraph.prometheus/scrape: "true" +``` + +In [deploy-sourcegraph-docker](https://github.com/sourcegraph/deploy-sourcegraph-docker), Prometheus relies on targets defined in the [`prometheus_targets`](https://github.com/sourcegraph/deploy-sourcegraph-docker/blob/master/prometheus/prometheus_targets.yml) configuration file - you will need to add your service here. 
+ +## Alerts, dashboards, and documentation + +Creating alerts, dashboards, and documentation for monitoring is powered by the Sourcegraph monitoring generator, which requires monitoring to be defined in our [monitoring definitions package](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/monitoring/definitions). +The monitoring generator provides [a lot of features and integrations with the Sourcegraph monitoring ecosystem](../background-information/observability/monitoring-generator.md#features) for free. + +This section documents how to develop monitoring definitions for a Sourcegraph service. +To get started, you should read: + +- the [Sourcegraph monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars) for some of the principles we try to uphold when developing monitoring +- relevant [reference documentation for the monitoring generator](../background-information/observability/monitoring-generator.md) + +### Set up an observable + +Monitoring is built around "observables" - something you wish to observe. +The generator API exposes this concept through the [`Observable` type](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observable). + +You can decide where to put your new observable by looking for an existing dashboard that your information should go in. +Think "when this number shows something bad, which service logs are likely to be most relevant?". +If you are just editing an existing observable, you can skip this step. + +Existing dashboards can be viewed by either: + +- Visiting Grafana on an existing Sourcegraph instance that you have site admin permissions for, e.g. `example.sourcegraph.com/-/debug/grafana` - see the [metrics for site administrators documentation](../../admin/observability/metrics.md) for more details. 
+- [Running the monitoring stack locally](./monitoring_local_dev.md) + +Once you have found a home for your observable, open that service's monitoring definition (e.g. `monitoring/frontend.go`, `monitoring/git_server.go`) in your editor. +Declare your [`Observable`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24%40master+file:%5Emonitoring/+type+Observable&patternType=literal) by: + +- adding it to [an existing `Row` in the file](https://sourcegraph.com/github.com/sourcegraph/sourcegraph@64aa473/-/blob/monitoring/frontend.go#L12-43) +- adding a new [`Row`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24%40master+file:%5Emonitoring/+type+Row&patternType=literal) +- adding a new [`Group`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24%40master+file:%5Emonitoring/+type+Group&patternType=literal) entirely + +Here's an example `Observable` that we will use throughout this guide to get you started: + +```go +{ + Name: "some_metric_behaviour", + Description: "some behaviour of a metric", +} +``` + +### Write a query + +Use the Grafana Explore page on a Sourcegraph instance where you have site administrator access (`/-/debug/grafana/explore`) to start writing your Prometheus query. + +```diff +{ + Name: "some_metric_behaviour", +- Description: "some behaviour of a metric", ++ Description: "some behaviour of a metric over 5m", ++ Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`, +} +``` + +Make sure to update your description to reflect the query you end up with where relevant. + +### Configure panel options + +Panel options can be used to customize the visualization of your observable in Grafana. +This step is optional, but highly recommended. + +There are not many panel options (intentionally) to keep things simple. 
+The primary thing you'll use them for is to change the Grafana display from plain numbers to a unit like seconds: + +```diff +{ + Name: "some_metric_behaviour", + Description: "some behaviour of a metric over 5m", + Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`, ++ PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds), +} +``` + +### Add an alert + +Alerts can be defined at two levels: warning, and critical. +They are used to provide Sourcegraph health notifications for site administrators. +This step is optional, but highly recommended. + +To get started, make a guess about what a good or bad value for your query is. +It's OK if this isn't perfect, just do your best. +Then add an alert to your Observable, for example: + +```diff +{ + Name: "some_metric_behaviour", + Description: "some behaviour of a metric over 5m", + Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`, + PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds), ++ Warning: Alert{GreaterOrEqual: 20}, +} +``` + +This step is optional - if you opt not to include an alert, you must explicitly set `NoAlert: true` and provide [relevant documentation for this observable](#add-documentation). + +### Add documentation + +It's best if you also add some Markdown documentation with your best guess of what someone _might consider doing_ if they observe the alert firing (again, just your best guess is good enough here): + +```diff +{ + Name: "some_metric_behaviour", + Description: "some behaviour of a metric over 5m", + Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`, + Warning: Alert{GreaterOrEqual: 20}, + PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds), ++ PossibleSolutions: ` ++ - Look at 'SERVICE' logs for details on the slow search queries. 
++ `, +} +``` + +```diff +{ + Name: "some_metric_behaviour", + Description: "some behaviour of a metric over 5m", + Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`, + NoAlert: true, + PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds), ++ Interpretation: ` ++ This value might be high under X, Y, and Z conditions. ++ `, +} +``` + +> NOTE: In both `PossibleSolutions` and `Interpretation`, you can write plain Markdown with some slight modifications: single quotes are used instead of backticks for code formatting, and indentation will automatically be removed for you. + +### Validate your observable + +Run the monitoring generator from the root Sourcegraph directory: + +```sh +go generate ./monitoring/... +``` + +This will validate your Observable configuration and let you know of any changes you need to make if required. +If the generator runs successfully, you should now [run the monitoring stack locally](./monitoring_local_dev.md) to validate the output and results of your observable by hand. + +Once everything looks good, open a pull request with your observable to the main Sourcegraph codebase! 
diff --git a/doc/dev/how-to/documentation_implementation.md b/doc/dev/how-to/documentation_implementation.md index 7ee961e36b8..f8ad35696a3 100644 --- a/doc/dev/how-to/documentation_implementation.md +++ b/doc/dev/how-to/documentation_implementation.md @@ -26,7 +26,7 @@ You can preview the documentation site at http://localhost:5080 when running Sou You can also run the docsite on its own with the following command: ```sh -./dev/docsite.sh -config doc/docsite.json serve -http=localhost:5080 +yarn docsite:serve ``` ## Linking to documentation in-product diff --git a/doc/dev/how-to/find_monitoring.md b/doc/dev/how-to/find_monitoring.md new file mode 100644 index 00000000000..5d284fca5c9 --- /dev/null +++ b/doc/dev/how-to/find_monitoring.md @@ -0,0 +1,24 @@ +# How to find monitoring + +This guide documents how to find monitoring within Sourcegraph's source code. +Sourcegraph employees should also refer to the [handbook's monitoring section](https://about.sourcegraph.com/handbook/engineering/observability/monitoring) for Sourcegraph-specific documentation. +The [developing observability page](../background-information/observability/index.md) contains relevant documentation as well. + +> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../admin/observability/index.md). + +## Alerts + +Alerts are defined in the [`monitoring/definitions` package](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/monitoring/definitions) - for example, [querying for definitions of `Warning` or `Critical`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:monitoring/definitions+Warning:+:%5B_%5Cn%5D+OR+Critical:+:%5B_%5Cn%5D&patternType=structural) will surface all Sourcegraph alerts. 
+ +## Metrics + +You can use Sourcegraph itself to search for metrics definitions - for example, by [querying for usages of `prometheus.HistogramOpts`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+prometheus.HistogramOpts%7B+:%5B_%5D+%7D+&patternType=structural). + +Sometimes the metrics are hard to find because their name declarations are not literal strings, but are concatenated in code from variables. +In these cases you can try a specialized tool called [`promgrep`](https://github.com/sourcegraph/promgrep) to find them. + +```sh +go get github.com/sourcegraph/promgrep +# in the root `sourcegraph/sourcegraph` source directory +promgrep