doc(dev): monitoring how-to guides, observability background (#16965)

Co-authored-by: uwedeportivo <534011+uwedeportivo@users.noreply.github.com>
This commit is contained in:
Robert Lin 2020-12-24 14:51:42 +08:00 committed by GitHub
parent ef7f19a756
commit 6a6f0ab5b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 508 additions and 128 deletions

View File

@ -12,6 +12,7 @@ fi
IMAGE=sourcegraph/grafana:dev
CONTAINER=grafana
PORT=3370
# docker containers must access things via docker host on non-linux platforms
CONFIG_SUB_DIR="all"
@ -30,11 +31,6 @@ fi
docker inspect $CONTAINER >/dev/null 2>&1 && docker rm -f $CONTAINER
# Generate Grafana dashboards
pushd monitoring >/dev/null || exit 1
RELOAD=false go generate
popd >/dev/null || exit 1
# Log file location: since we log outside of the Docker container, we should
# log somewhere that's _not_ ~/.sourcegraph-dev/data/grafana, since that gets
# volume mounted into the container and therefore has its own ownership
@ -48,8 +44,9 @@ mkdir -p "${GRAFANA_LOGS}"
GRAFANA_LOG_FILE="${GRAFANA_LOGS}/grafana.log"
# Quickly build image
echo "Grafana: building ${IMAGE}..."
IMAGE=${IMAGE} CACHE=true ./docker-images/grafana/build.sh >"${GRAFANA_LOG_FILE}" 2>&1 ||
(BUILD_EXIT_CODE=$? && echo "build failed; dumping log:" && cat "${GRAFANA_LOG_FILE}" && exit $BUILD_EXIT_CODE)
(BUILD_EXIT_CODE=$? && echo "Grafana build failed; dumping log:" && cat "${GRAFANA_LOG_FILE}" && exit $BUILD_EXIT_CODE)
function finish() {
GRAFANA_EXIT_CODE=$?
@ -66,6 +63,8 @@ function finish() {
return $GRAFANA_EXIT_CODE
}
echo "Grafana: serving on http://localhost:${PORT}"
echo "Grafana: note that logs are piped to ${GRAFANA_LOG_FILE}"
docker run --rm ${DOCKER_USER} \
--name=${CONTAINER} \
--cpus=1 \

View File

@ -12,6 +12,7 @@ if [ ! -e "${PROMETHEUS_DISK}" ]; then
fi
IMAGE=sourcegraph/prometheus:dev
CONTAINER=prometheus
PORT=9090
CONFIG_DIR="$(pwd)/docker-images/prometheus/config"
DOCKER_NET=""
@ -34,17 +35,15 @@ docker inspect $CONTAINER >/dev/null 2>&1 && docker rm -f $CONTAINER
cp ${PROM_TARGETS} "${CONFIG_DIR}"/prometheus_targets.yml
pushd monitoring >/dev/null || exit 1
RELOAD=false go generate
popd >/dev/null || exit 1
# Avoid cluttering dev/start.sh log output
PROMETHEUS_LOGS="${HOME}/.sourcegraph-dev/logs/prometheus"
mkdir -p "${PROMETHEUS_LOGS}"
PROMETHEUS_LOG_FILE="${PROMETHEUS_LOGS}/prometheus.log"
# Quickly build image
echo "Prometheus: building ${IMAGE}..."
IMAGE=${IMAGE} CACHE=true ./docker-images/prometheus/build.sh >"${PROMETHEUS_LOG_FILE}" 2>&1 ||
(BUILD_EXIT_CODE=$? && echo "build failed; dumping log:" && cat "${PROMETHEUS_LOG_FILE}" && exit $BUILD_EXIT_CODE)
(BUILD_EXIT_CODE=$? && echo "Prometheus build failed; dumping log:" && cat "${PROMETHEUS_LOG_FILE}" && exit $BUILD_EXIT_CODE)
function finish() {
PROMETHEUS_EXIT_CODE=$?
@ -58,6 +57,8 @@ function finish() {
return $PROMETHEUS_EXIT_CODE
}
echo "Prometheus: serving on http://localhost:${PORT}"
echo "Prometheus: note that logs are piped to ${PROMETHEUS_LOG_FILE}"
docker run --rm ${DOCKER_NET} ${DOCKER_USER} \
--name=${CONTAINER} \
--cpus=1 \
@ -66,4 +67,6 @@ docker run --rm ${DOCKER_NET} ${DOCKER_USER} \
-v "${PROMETHEUS_DISK}":/prometheus \
-v "${CONFIG_DIR}":/sg_prometheus_add_ons \
-e SRC_FRONTEND_INTERNAL="${SRC_FRONTEND_INTERNAL}" \
-e DISABLE_SOURCEGRAPH_CONFIG="${DISABLE_SOURCEGRAPH_CONFIG:-""}" \
-e DISABLE_ALERTMANAGER="${DISABLE_ALERTMANAGER:-""}" \
${IMAGE} >"${PROMETHEUS_LOG_FILE}" 2>&1 || finish

View File

@ -4,7 +4,17 @@ Alerts can be configured to notify site admins when there is something wrong or
## Understanding alerts
See [alert solutions](alert_solutions.md) for possible solutions when alerts are firing, and learn more about alert labels, metrics, and dashboards in our [metrics guide](metrics.md).
Alerts fall in one of two severity levels:
- <span class="badge badge-critical">critical</span>: something is _definitively_ wrong with Sourcegraph. We suggest using a high-visibility notification channel for these alerts.
- **Examples:** Database inaccessible, running out of disk space, running out of memory.
- **Suggested action:** Page a site administrator to investigate.
- <span class="badge badge-warning">warning</span>: something _could_ be wrong with Sourcegraph. We suggest checking in on these periodically, or using a notification channel that will not bother anyone if it is spammed. Over time, as warning alerts become stable and reliable across many Sourcegraph deployments, they will also be promoted to critical alerts in an update by Sourcegraph.
- **Examples:** High latency, high search timeouts.
- **Suggested action:** Email a site administrator to investigate and monitor when convenient, and please let us know so that we can improve them.
Refer to the [alert solutions reference](alert_solutions.md) for a complete list of Sourcegraph alerts, as well as possible solutions when these alerts are firing.
Learn more about metrics, dashboards, and alert labels in our [metrics guide](metrics.md).
## Setting up alerting

View File

@ -32,7 +32,7 @@ More behavior can be controlled with [environmental variables](https://grafana.c
For most use cases, you can access Grafana [through your Sourcegraph instance](#grafana).
Follow the instructions below to access Grafana directly to, for example, edit configuration directly.
> NOTE: Most of the dashboards that Sourcegraph ships with is not configurable through the Grafana UI.
> NOTE: Most of the dashboards that Sourcegraph ships with are not configurable through the Grafana UI.
> In general, we recommend [these configuration methods instead](#grafana-configuration).
If you are using the [Kubernetes deployment option](../install/kubernetes/index.md), you can access Grafana directly using Kubernetes port forwarding to your local machine:
@ -66,33 +66,29 @@ For most use cases, you can query Prometheus through [Grafana](#grafana) using G
#### High-level alerting metrics
Sourcegraph's metrics include a single high-level metric `alert_count` which indicates the number of `level=critical` and `level=warning` alerts each service has fired over time for each Sourcegraph service. This is the same metric presented on the **Overview** Grafana dashboard.
Sourcegraph's metrics include a single high-level metric, `alert_count`, which indicates the number of `level=critical` and `level=warning` alerts each Sourcegraph service has fired over time.
This is the same metric presented on the **Overview** Grafana dashboard.
We provide [built-in alerting](./alerting.md) for these metrics. Refer to our [alert solutions reference](./alert_solutions.md) for details on specific alerts metrics.
> NOTE: We provide [built-in alerting](./alerting.md) for these alerting metrics to help monitor the health of your Sourcegraph instance.
> Refer to our [alert solutions reference](./alert_solutions.md) for details on specific alerts.
**Description:** The number of alerts each service has fired and their severity level. The severity levels are defined as follows:
**Description:** The number of alerts each service has fired for a given alert name and severity level.
- `critical`: something is _definitively_ wrong with Sourcegraph. We suggest using a high-visibility notification channel for these alerts.
- **Examples:** Database inaccessible, running out of disk space, running out of memory.
- **Suggested action:** Page a site administrator to investigate.
- `warning`: something _could_ be wrong with Sourcegraph. We suggest checking in on these periodically, or using a notification channel that will not bother anyone if it is spammed. Over time, as warning alerts become stable and reliable across many Sourcegraph deployments, they will also be promoted to critical alerts in an update by Sourcegraph.
- **Examples:** High latency, high search timeouts.
- **Suggested action:** Email a site administrator to investigate and monitor when convenient, and please let us know so that we can improve them.
**Values:**
- Although the values of `alert_count` are floating-point numbers, only their whole numbers have meaning. For example: `0.5` and `0.7` indicate no alerts are firing, while `1.2` indicates exactly one alert is firing and `3.0` indicates exactly three alerts firing.
**Values:** Although the values of `alert_count` are floating-point numbers, only their whole numbers have meaning.
For example, `0.5` and `0.7` indicate no alerts are firing, while `1.2` indicates exactly one alert is firing and `3.0` indicates exactly three alerts are firing.
**Labels:**
- `level`: either `critical` or `warning`, as defined above.
- `service_name`: the name of the service that fired the alert.
- `name`: the name of the alert that the service fired.
- `description`: a human-readable description of the alert.
| Label | Description |
|-------|-------------|
| `service_name` | the name of the service that fired the alert |
| `name` | the name of the alert that the service fired |
| `level` | either `critical` or `warning`, as defined [here](./alerting.md) |
| `description` | a human-readable description of the alert |
#### Complete reference
A complete reference of Sourcegraph's vast set of Prometheus metrics is not yet available. If you are interested in this, please reach out by filing an issue or contacting us at support@sourcegraph.com.
A complete reference of Sourcegraph's vast set of Prometheus metrics is not yet available. If you are interested in this, please reach out by filing an issue or contacting us at [support@sourcegraph.com](mailto:support@sourcegraph.com).
### Prometheus configuration

View File

@ -15,6 +15,7 @@
- [Developing campaigns](campaigns/index.md)
- [Developing code intelligence](codeintel/index.md)
- [Developing code monitoring](codemonitoring/index.md)
- [Developing observability](observability/index.md)
- [Dependencies and generated code](dependencies_and_codegen.md)
## Tools
@ -22,7 +23,6 @@
- [Renovate dependency updates](renovate.md)
- [Using PostgreSQL](postgresql.md)
## Monitoring
## Other
- [Telemetry](telemetry.md)
- [Observability](observability.md)

View File

@ -1,21 +0,0 @@
# Observability developer documentation
**For how to use Sourcegraph's observability, see [admin: Observability](../../admin/observability/index.md).**
This documentation is for **developing** Sourcegraph's observability.
## What type of observability should you add?
**WIP**
## How to add tracing?
**WIP**
## How to add Prometheus metrics?
**WIP**
## How to add Grafana dashboards?
**WIP**

View File

@ -0,0 +1,15 @@
# Sourcegraph Grafana
We ship a custom Grafana image as part of a standard Sourcegraph distribution.
Learn more about it in our [monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-grafana).
Adding dashboards, panels, etc. to this image is handled by the [monitoring generator](./monitoring-generator.md).
The image is defined in [`docker-images/grafana`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/docker-images/grafana).
## Upgrading Grafana
To upgrade Grafana, make the appropriate version change to the [`sourcegraph/grafana` Dockerfile](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+FROM+grafana/grafana::%5Bversion.%5D&patternType=structural) and:
* Ensure the image still builds: `./docker-images/grafana/build.sh`
* [Run the monitoring stack locally](../../how-to/monitoring_local_dev.md) and verify that all generated Grafana dashboards still render correctly

View File

@ -0,0 +1,34 @@
# Developing observability
This documentation is for generalized, use-case-agnostic development of Sourcegraph's observability.
Sourcegraph employees should also refer to the [handbook's observability section](https://about.sourcegraph.com/handbook/engineering/observability) for Sourcegraph-specific documentation.
> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../../admin/observability/index.md).
## Overview
Observability at Sourcegraph includes:
| | Description | Examples |
|:--|------------|--------|
| **Monitoring** | how you know _when_ something is wrong | Dashboards & metrics, alerting, health checks |
| **Debugging** | how you debug _what_ is wrong | Tracing, logging |
## Concepts
- [Sourcegraph monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars)
- [Sourcegraph monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture)
## Guides
- [How to find monitoring](../../how-to/find_monitoring.md)
- [How to add monitoring](../../how-to/add_monitoring.md)
- [Set up local monitoring development](../../how-to/monitoring_local_dev.md)
- How to add observability (coming soon)
## Components
- [Monitoring generator](./monitoring-generator.md)
- [Sourcegraph Grafana](./grafana.md)
- [Sourcegraph Prometheus](./prometheus.md)
- [Observability for site administrators](../../../admin/observability/index.md)

View File

@ -0,0 +1,59 @@
# Sourcegraph monitoring generator
<p class="lead">
The monitoring generator manages converting monitoring definitions into integrations with Sourcegraph's monitoring ecosystem.
</p>
Its purpose is to help enable a [cohesive observability experience for site administrators](../../../admin/observability/index.md), codify [Sourcegraph's monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars), and make it easy for [developers to add monitoring for their Sourcegraph services](../../how-to/add_monitoring.md) by generating integrations with Sourcegraph's monitoring ecosystem for free.
## Reference
- [Usage and development](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/README.md) for developing the generator itself
- [Monitoring API](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md) for interacting with the generator library
- [How to add monitoring definitions](../../how-to/add_monitoring.md) for developers looking to add monitoring for their services
## Features
### Documentation generation
The generator automatically creates documentation from monitoring definitions that customers and engineers can reference.
These include:
- [Alert solutions reference](https://docs.sourcegraph.com/admin/observability/alert_solutions)
- [Dashboards reference](https://docs.sourcegraph.com/admin/observability/dashboards)
Links to generated documentation can be provided in our other generated integrations - for example, [Slack alerts](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) will provide a link to the appropriate alert solutions entry, and [Grafana panels](#grafana-integration) will link to the appropriate dashboards reference entry.
### Grafana integration
The generator automatically generates and ships dashboards from monitoring definitions within the [Sourcegraph Grafana distribution](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-grafana).
It also takes care of the following:
- Graphs within rows are sized appropriately
- Alerts visualization through the [`ObservableAlertDefinition` API](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observablealertdefinition):
- Overview graphs for alerts (both Sourcegraph-wide and per-service)
- Threshold lines for alerts of all levels are rendered in graphs
- Formatting of units, labels, and more (using either the defaults, or the [`ObservablePanelOptions` API](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observablepaneloptions))
- Maintaining a uniform look and feel across all dashboards
- Providing links to [generated documentation](#documentation-generation)
Links to generated dashboards can be provided in our other generated integrations - for example, [Slack alerts](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) will provide a link to the appropriate service's dashboard.
### Prometheus integration
The generator automatically generates and ships Prometheus recording rules and alerts within the [Sourcegraph Prometheus distribution](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-prometheus).
This includes the following, all with appropriate and consistent labels:
- [`alert_count` recording rules](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#alert-count-metrics)
- Native Prometheus alerts, leveraged by our [Alertmanager integration](#alertmanager-integration)
Generated Prometheus recording rules are leveraged by the [Grafana integration](#grafana-integration).
### Alertmanager integration
The generator's [Prometheus integration](#prometheus-integration) is a critical part of [Sourcegraph's alerting capabilities](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#alert-notifications), which handles alert routing by level and formatting of alert messages to include links to [documentation](#documentation-generation) and [dashboards](#grafana-integration).
Learn more about using Sourcegraph alerting in the [alerting documentation](https://docs.sourcegraph.com/admin/observability/alerting).
This is possible due to the labels generated by the [Prometheus integration](#prometheus-integration).
At Sourcegraph, extended routing based on team ownership (as defined by [`ObservableOwner`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observableowner)) is also used to route customer support requests and [on-call events through OpsGenie](https://about.sourcegraph.com/handbook/engineering/incidents/on_call).

View File

@ -0,0 +1,27 @@
# Sourcegraph Prometheus
We ship a custom Prometheus image as part of a standard Sourcegraph distribution.
It currently bundles Alertmanager as well as integrations to the Sourcegraph web application.
Learn more about it in our [monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-prometheus).
Adding recording rules, alerts, etc. to this image is handled by the [monitoring generator](./monitoring-generator.md).
The image is defined in [`docker-images/prometheus`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/docker-images/prometheus).
## Prom-wrapper
The entrypoint of the image is a sidecar program called the prom-wrapper.
Learn more about it [here](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#prom-wrapper).
The source code for this program is currently kept in [`docker-images/prometheus/cmd/prom-wrapper`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/docker-images/prometheus/cmd/prom-wrapper).
## Upgrading Prometheus or Alertmanager
To upgrade Prometheus or Alertmanager, make the appropriate version and sum changes to the [`sourcegraph/prometheus` Dockerfile](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:go.mod+prometheus/alertmanager+OR+prometheus/client_golang&patternType=literal) and make sure to:
* Upgrade the [Alertmanager and Prometheus Go client dependencies](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:go.mod+prometheus/alertmanager+OR+prometheus/client_golang&patternType=literal) where appropriate
* Ensure the image still builds: `./docker-images/prometheus/build.sh`
* [Run the monitoring stack locally](../../how-to/monitoring_local_dev.md) and verify that:
* all Prometheus rules are evaluated successfully (`localhost:9090/rules`)
* Alertmanager starts up correctly (`localhost:9090/alertmanager/#/status`)
* [`observability.alerts` can be configured](../../../admin/observability/alerting.md) via the Sourcegraph web application

View File

@ -0,0 +1,166 @@
# How to add monitoring
This guide documents how to add monitoring to Sourcegraph's source code.
Sourcegraph employees should also refer to the [handbook's monitoring section](https://about.sourcegraph.com/handbook/engineering/observability/monitoring) for Sourcegraph-specific documentation.
The [developing observability page](../background-information/observability/index.md) contains relevant documentation as well.
> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../admin/observability/index.md).
## Metrics
Service-side, metrics should be made available over HTTP for Prometheus to scrape.
By default, Prometheus expects metrics to be exported on `$SERVICEPORT/metrics` - for example, run your local Sourcegraph dev server and metrics should be available on `http://localhost:$SERVICEPORT/metrics`.
How this is configured varies across the various [Sourcegraph deployment options](../../admin/install/index.md) - see [tracking a new service](#tracking-a-new-service).
### Tracking a new service
In [deploy-sourcegraph](https://github.com/sourcegraph/deploy-sourcegraph), Prometheus uses the Kubernetes API to discover endpoints to scrape. Just add the following annotations to your service definition:
```yaml
metadata:
annotations:
prometheus.io/port: "$SERVICEPORT" # replace with the port your service runs on
sourcegraph.prometheus/scrape: "true"
```
In [deploy-sourcegraph-docker](https://github.com/sourcegraph/deploy-sourcegraph-docker), Prometheus relies on targets defined in the [`prometheus_targets`](https://github.com/sourcegraph/deploy-sourcegraph-docker/blob/master/prometheus/prometheus_targets.yml) configuration file - you will need to add your service here.
## Alerts, dashboards, and documentation
Creating alerts, dashboards, and documentation for monitoring is powered by the Sourcegraph monitoring generator, which requires monitoring to be defined in our [monitoring definitions package](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/tree/monitoring/definitions).
The monitoring generator provides [a lot of features and integrations with the Sourcegraph monitoring ecosystem](../background-information/observability/monitoring-generator.md#features) for free.
This section documents how to develop monitoring definitions for a Sourcegraph service.
To get started, you should read:
- the [Sourcegraph monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars) for some of the principles we try to uphold when developing monitoring
- relevant [reference documentation for the monitoring generator](../background-information/observability/monitoring-generator.md)
### Set up an observable
Monitoring is built around "observables" - something you wish to observe.
The generator API exposes this concept through the [`Observable` type](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/monitoring/monitoring/README.md#type-observable).
You can decide where to put your new observable by looking for an existing dashboard that your information should go in.
Think "when this number shows something bad, which service logs are likely to be most relevant?".
If you are just editing an existing observable, find the dashboard it already belongs to.
Existing dashboards can be viewed by either:
- Visiting Grafana on an existing Sourcegraph instance that you have site admin permissions for, e.g. `example.sourcegraph.com/-/debug/grafana` - see the [metrics for site administrators documentation](../../admin/observability/metrics.md) for more details.
- [Running the monitoring stack locally](./monitoring_local_dev.md)
Once you have found a home for your observable, open that service's monitoring definition (e.g. `monitoring/frontend.go`, `monitoring/git_server.go`) in your editor.
Declare your [`Observable`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24%40master+file:%5Emonitoring/+type+Observable&patternType=literal) by:
- adding it to [an existing `Row` in the file](https://sourcegraph.com/github.com/sourcegraph/sourcegraph@64aa473/-/blob/monitoring/frontend.go#L12-43)
- adding a new [`Row`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24%40master+file:%5Emonitoring/+type+Row&patternType=literal)
- adding a new [`Group`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24%40master+file:%5Emonitoring/+type+Group&patternType=literal) entirely
Here's an example `Observable` that we will use throughout this guide to get you started:
```go
{
Name: "some_metric_behaviour",
Description: "some behaviour of a metric",
}
```
### Write a query
Use the Grafana Explore page on a Sourcegraph instance where you have site administrator access (`/-/debug/grafana/explore`) to start writing your Prometheus query.
```diff
{
Name: "some_metric_behaviour",
- Description: "some behaviour of a metric",
+ Description: "some behaviour of a metric over 5m",
+ Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`,
}
```
Make sure to update your description to reflect the query you end up with where relevant.
### Configure panel options
Panel options can be used to customize the visualization of your observable in Grafana.
This step is optional, but highly recommended.
There are not many panel options (intentionally) to keep things simple.
The primary thing you'll use them for is changing the Grafana display from plain numbers to a unit like seconds:
```diff
{
Name: "some_metric_behaviour",
Description: "some behaviour of a metric over 5m",
Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`,
+ PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds),
}
```
### Add an alert
Alerts can be defined at two levels: warning, and critical.
They are used to provide Sourcegraph health notifications for site administrators.
This step is optional, but highly recommended.
To get started, make a guess about what a good or bad value for your query is.
It's OK if this isn't perfect, just do your best.
Then add an alert to your Observable, for example:
```diff
{
Name: "some_metric_behaviour",
Description: "some behaviour of a metric over 5m",
Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`,
PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds),
+ Warning: Alert{GreaterOrEqual: 20},
}
```
This step is optional - if you opt not to include an alert, you must explicitly set `NoAlert: true` and provide [relevant documentation for this observable](#add-documentation).
### Add documentation
It's best if you also add some Markdown documentation with your best guess of what someone _might consider doing_ if they observe the alert firing (again, just your best guess is good enough here):
```diff
{
Name: "some_metric_behaviour",
Description: "some behaviour of a metric over 5m",
Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds),
+ PossibleSolutions: `
+ - Look at 'SERVICE' logs for details on the slow search queries.
+ `,
}
```
```diff
{
Name: "some_metric_behaviour",
Description: "some behaviour of a metric over 5m",
Query: `histogram_quantile(0.99, sum by (le)(rate(search_request_duration{status="success"}[5m])))`,
NoAlert: true,
PanelOptions: PanelOptions().LegendFormat("duration").Unit(Seconds),
+ Interpretation: `
+ This value might be high under X, Y, and Z conditions.
+ `,
}
```
> NOTE: In both `PossibleSolutions` and `Interpretation`, you can write plain Markdown with some slight modifications: single quotes are used instead of backticks for code formatting, and indentation will automatically be removed for you.
### Validate your observable
Run the monitoring generator from the root Sourcegraph directory:
```sh
go generate ./monitoring/...
```
This will validate your Observable configuration and let you know of any changes you need to make if required.
If the generator runs successfully, you should now [run the monitoring stack locally](./monitoring_local_dev.md) to validate the output and results of your observable by hand.
Once everything looks good, open a pull request with your observable to the main Sourcegraph codebase!

View File

@ -26,7 +26,7 @@ You can preview the documentation site at http://localhost:5080 when running Sou
You can also run the docsite on its own with the following command:
```sh
./dev/docsite.sh -config doc/docsite.json serve -http=localhost:5080
yarn docsite:serve
```
## Linking to documentation in-product

View File

@ -0,0 +1,24 @@
# How to find monitoring
This guide documents how to find monitoring within Sourcegraph's source code.
Sourcegraph employees should also refer to the [handbook's monitoring section](https://about.sourcegraph.com/handbook/engineering/observability/monitoring) for Sourcegraph-specific documentation.
The [developing observability page](../background-information/observability/index.md) contains relevant documentation as well.
> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../admin/observability/index.md).
## Alerts
Alerts are defined in the [`monitoring/definitions` package](https://k8s.sgdev.org/github.com/sourcegraph/sourcegraph/-/tree/monitoring/definitions) - for example, [querying for definitions of `Warning` or `Critical`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:monitoring/definitions+Warning:+:%5B_%5Cn%5D+OR+Critical:+:%5B_%5Cn%5D&patternType=structural) will surface all Sourcegraph alerts.
## Metrics
You can use Sourcegraph itself to search for metrics definitions - for example, by [querying for usages of `prometheus.HistogramOpts`](https://sourcegraph.com/search?q=repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+prometheus.HistogramOpts%7B+:%5B_%5D+%7D+&patternType=structural).
Sometimes the metrics are hard to find because their name declarations are not literal strings, but are concatenated in code from variables.
In these cases you can try a specialized tool called [`promgrep`](https://github.com/sourcegraph/promgrep) to find them.
```sh
go get github.com/sourcegraph/promgrep
# in the root `sourcegraph/sourcegraph` source directory
promgrep <some_partial_metric_name> # no arguments lists all declared metrics
```

View File

@ -24,6 +24,10 @@
## Implementing Sourcegraph
- [Developing the product documentation](documentation_implementation.md)
- [Observability](../background-information/observability/index.md)
- [How to find monitoring](find_monitoring.md)
- [How to add monitoring](add_monitoring.md)
- [Set up local Sourcegraph monitoring development](monitoring_local_dev.md)
## Testing Sourcegraph

View File

@ -0,0 +1,114 @@
# Set up local Sourcegraph monitoring development
This guide documents how to spin up and develop Sourcegraph's monitoring stack locally.
Sourcegraph employees should also refer to the [handbook's monitoring section](https://about.sourcegraph.com/handbook/engineering/observability/monitoring) for Sourcegraph-specific documentation.
The [developing observability page](../background-information/observability/index.md) contains relevant documentation as well, including background about the components listed here.
> NOTE: For how to *use* Sourcegraph's observability and an overview of our observability features, refer to the [observability for site administrators documentation](../../admin/observability/index.md).
## Running monitoring components
### With all services
The monitoring stack is included in the `./dev/start.sh` and `./enterprise/dev/start.sh` scripts.
Learn more about these in the [general development getting started guide](../getting-started/index.md).
### Without all services
For convenience, there are a number of ways to spin up Sourcegraph's monitoring services *without* having to start up every other service as well.
#### Grafana
Running just Grafana is a convenient way to validate dashboards.
When doing so, you may wish to connect Grafana to a remote Prometheus instance that you have administrator access to (such as [Sourcegraph's instances](https://about.sourcegraph.com/handbook/engineering/deployments/instances)), to show more real data than is available on your dev server.
For Kubernetes deployments, you can do this by getting `kubectl` connected to a Sourcegraph cluster and then port-forwarding Prometheus via:
```sh
kubectl port-forward svc/prometheus 9090:30090
```
Then, you can start up a standalone Grafana using:
```sh
./dev/grafana.sh
```
Dashboards will be available at `localhost:3370`.
Note that instead of `kubectl`, you can use whichever port-forwarding mechanism you wish to connect to a remote Prometheus instance as well, as long as Prometheus is available on port `9090` locally.
The dev targets for Grafana are defined in the following files:
* Non-Linux: [`dev/grafana/all/datasources.yaml`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/dev/grafana/all/datasources.yaml)
* Linux: [`dev/grafana/linux/datasources.yaml`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/dev/grafana/linux/datasources.yaml)
#### Prometheus
Running just Prometheus is a convenient way to validate the generated recording and alert rules.
You can start up a standalone Prometheus using:
```sh
./dev/prometheus.sh
```
The loaded generated recording and alert rules are available at `http://localhost:9090/rules`.
The bundled Alertmanager is available at `http://localhost:9090/alertmanager`.
Some configuration options are available:
* `DISABLE_SOURCEGRAPH_CONFIG`: when `true`, disables the prom-wrapper's [integration with the Sourcegraph frontend](#frontend-integration).
* `DISABLE_ALERTMANAGER`: when `true`, disables the bundled Alertmanager entirely.
This includes the behaviour of `DISABLE_SOURCEGRAPH_CONFIG=true`.
Note that without services to scrape, running a standalone Prometheus will not provide any metrics - if you need to test metrics as well, you should [start all services](#with-all-services) instead.
The dev targets for Prometheus are defined in the following files:
* Non-Linux: [`dev/prometheus/all/prometheus_targets.yml`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/dev/prometheus/all/prometheus_targets.yml)
* Linux: [`dev/prometheus/linux/prometheus_targets.yml`](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/dev/prometheus/linux/prometheus_targets.yml)
##### Frontend integration
The Sourcegraph Prometheus service features an integration with the Sourcegraph frontend that requires a frontend instance to be running to develop or test these features.
Note that the Prometheus service will still run without additional configuration even if no frontend is accessible.
One way to get a frontend running for this is to [start up Prometheus alongside all Sourcegraph services](#with-all-services).
You can alternatively spin up just the frontend separately:
```sh
./dev/start.sh --only frontend
```
This should be sufficient to access the frontend API and the admin console (`/site-admin`), which is where most of the integration is.
#### Docsite
The docsite is used to serve generated monitoring documentation, such as the [alert solutions reference](../../admin/observability/alert_solutions.md).
You can spin it up by running:
```sh
yarn docsite:serve
```
Learn more about docsite development in the [product documentation implementation guide](./documentation_implementation.md).
## Using the monitoring generator
> NOTE: Looking to add monitoring first? Refer to the [how to add monitoring](./add_monitoring.md) guide!
The dev startup scripts used in this guide all mount relevant configuration directories into each monitoring service.
This means that you can:
* Update your monitoring definitions
* Run the generator to regenerate and reload monitoring services
* Validate the result of your changes immediately (for example, by checking Prometheus rules in `/rules` or Grafana dashboards in `/-/debug/grafana`)
To run the generator and trigger a reload:
```sh
RELOAD=true go generate ./monitoring
```
Make sure to provide the following parameters as well, where relevant:
* `GRAFANA_DIR=''`, if you are *not* running Grafana
* `PROMETHEUS_DIR=''`, if you are *not* running Prometheus
* `SRC_LOG_LEVEL=dbug` to enable potentially helpful output for debugging issues

View File

@ -1,6 +1,8 @@
# Grafana image
# Sourcegraph Grafana
Vanilla Grafana image with provisioned Sourcegraph dashboards and config. For more details, refer to [the handbook](https://about.sourcegraph.com/handbook/engineering/distribution/observability/monitoring#grafana).
Vanilla Grafana image with provisioned Sourcegraph dashboards and config.
To learn more, refer to the [Sourcegraph observability developer guide](https://docs.sourcegraph.com/dev/background-information/observability) and [monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-grafana).
## Image API

View File

@ -1,11 +1,11 @@
# Prometheus image
# Sourcegraph Prometheus
The `sourcegraph/prometheus` image provides an all-in-one image through `prom-wrapper` with:
- Vanilla Prometheus with embedded Sourcegraph configuration
- Bundled Alertmanager with a `siteConfigSubscriber` sidecar service to automatically apply relevant configuration changes to Alertmanager
To learn more, refer to the [Sourcegraph monitoring developer guide](https://about.sourcegraph.com/handbook/engineering/distribution/observability/monitoring) and the [alerting documentation](https://docs.sourcegraph.com/admin/observability/alerting).
To learn more, refer to the [Sourcegraph observability developer guide](https://docs.sourcegraph.com/dev/background-information/observability) and [monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-prometheus).
## Image API

View File

@ -15,8 +15,8 @@ import (
"github.com/gorilla/mux"
"github.com/inconshreveable/log15"
amclient "github.com/prometheus/alertmanager/api/v2/client"
"github.com/sourcegraph/sourcegraph/internal/env"
)

View File

@ -1,19 +1,8 @@
# Sourcegraph monitoring generator
The Sourcegraph monitoring generator uses [`Container` definitions](./monitoring/README.md#type-container) to generate integrations with [Sourcegraph's monitoring architecture](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture).
It also aims to help codify guidelines defined in the [Sourcegraph monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars).
This page primarily documents the [generator's current capabilities](#features) - in other words, what you get for free by declaring Sourcegraph service monitoring in this package - as well as [how to make changes to the generator itself](#development).
To learn about how to find, add, and use monitoring, see the [Sourcegraph monitoring developer guide](https://about.sourcegraph.com/handbook/engineering/observability/monitoring).
- [Usage](#usage)
- [Features](#features)
- [Documentation generation](#documentation-generation)
- [Grafana integration](#grafana-integration)
- [Prometheus integration](#prometheus-integration)
- [Alertmanager integration](#alertmanager-integration)
- [Development](#development)
This page documents usage (running the generator) and development (of the generator itself).
For background and feature documentation, see [the generator overview](https://docs.sourcegraph.com/dev/background-information/observability/monitoring-generator).
To learn about how to find, add, and use monitoring, see the [Sourcegraph observability developer guide](https://docs.sourcegraph.com/dev/background-information/observability).
## Usage
@ -26,52 +15,6 @@ go generate ./...
Logging output supports the [Sourcegraph log level flags](https://docs.sourcegraph.com/admin/observability#logs).
Other configuration options can be customized via flags declared in [`main.go`](./main.go).
## Features
### Documentation generation
The generator automatically creates documentation from monitoring definitions that customers and engineers can reference.
These include:
- [Alert solutions reference](https://docs.sourcegraph.com/admin/observability/alert_solutions)
- [Dashboards reference](https://docs.sourcegraph.com/admin/observability/dashboards)
Links to generated documentation can be provided in our other generated integrations - for example, [Slack alerts](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) will provide a link to the appropriate alert solutions entry, and [Grafana panels](#grafana-integration) will link to the appropriate dashboards reference entry.
### Grafana integration
The generator automatically generates and ships dashboards from monitoring definitions within the [Sourcegraph Grafana distribution](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-grafana).
It also takes care of the following:
- Graphs within rows are sized appropriately
- Alerts visualization through the [`ObservableAlertDefinition` API](./monitoring/README.md#type-observablealertdefinition):
- Overview graphs for alerts (both Sourcegraph-wide and per-service)
- Threshold lines for alerts of all levels are rendered in graphs
- Formatting of units, labels, and more (using either the defaults, or the [`ObservablePanelOptions` API](./monitoring/README.md#type-observablepaneloptions))
- Maintaining a uniform look and feel across all dashboards
- Providing links to [generated documentation](#documentation-generation)
Links to generated documentation can be provided in our other generated integrations - for example, [Slack alerts](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) will provide a link to the appropriate service's dashboard.
### Prometheus integration
The generator automatically generates and ships Prometheus recording rules and alerts within the [Sourcegraph Prometheus distribution](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#sourcegraph-prometheus).
This includes the following, all with appropriate and consistent labels:
- [`alert_count` recording rules](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#alert-count-metrics)
- Native Prometheus alerts, leveraged by our [Alertmanager integration](#alertmanager-integration)
Generated Prometheus recording rules are leveraged by the [Grafana integration](#grafana-integration).
### Alertmanager integration
The generator's [Prometheus integration](#prometheus-integration) is a critical part of the [Sourcegraph's alerting capabilities](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_architecture#alert-notifications), which handles alert routing by level and formatting of alert messages to include links to [documentation](#documentation-generation) and [dashboards](#grafana-integration).
Learn more about using Sourcegraph alerting in the [alerting documentation](https://docs.sourcegraph.com/admin/observability/alerting).
This is possible due to the labels generated by the [Prometheus integration](#prometheus-integration).
At Sourcegraph, extended routing based on team ownership (as defined by [`ObservableOwner`](./monitoring/README.md#type-observableowner)) is also used to route customer support requests and [on-call events through OpsGenie](https://about.sourcegraph.com/handbook/engineering/incidents/on_call).
## Development
The Sourcegraph monitoring generator consists of three components:
@ -79,8 +22,8 @@ The Sourcegraph monitoring generator consists of three components:
- The [main program](./main.go) - this is the primary entrypoint to the generator.
- _Definitions_, defined in the top-level [`monitoring/definitions` package](./definitions/).
This is where all the service monitoring definitions live.
If you are editing monitoring, this is probably where you want to look - see the [Sourcegraph monitoring developer guide](https://about.sourcegraph.com/handbook/engineering/observability/monitoring).
If you are editing monitoring, this is probably where you want to look - see the [Sourcegraph observability developer guide](https://docs.sourcegraph.com/dev/background-information/observability).
- _Generator_, defined in the nested [`monitoring/monitoring` package](./monitoring/README.md).
This is where the API for service monitoring definitions is defined, as well as the generator code that provides the [above features](#features).
This is where the API for service monitoring definitions is defined, as well as the generator code that provides [its features](https://docs.sourcegraph.com/dev/background-information/observability/monitoring-generator#features).
All features and capabilities developed for the generator should align with the [Sourcegraph monitoring pillars](https://about.sourcegraph.com/handbook/engineering/observability/monitoring_pillars).

View File

@ -73,13 +73,15 @@ func Generate(logger log15.Logger, opts GenerateOptions, containers ...*Containe
// Reload specific dashboard
if opts.Reload {
clog.Debug("Reloading Grafana instance", "instance", localGrafanaURL)
crlog := clog.New("instance", localGrafanaURL)
crlog.Debug("Reloading Grafana instance")
client := sdk.NewClient(localGrafanaURL, localGrafanaCredentials, sdk.DefaultHTTPClient)
_, err := client.SetDashboard(context.Background(), *board, sdk.SetDashboardParams{Overwrite: true})
if err != nil {
clog.Crit("Could not reload Grafana instance", "error", err)
crlog.Crit("Could not reload Grafana instance", "error", err)
return err
}
crlog.Info("Reloaded Grafana instance")
}
}
@ -108,18 +110,20 @@ func Generate(logger log15.Logger, opts GenerateOptions, containers ...*Containe
// Reload all Prometheus rules
if opts.PrometheusDir != "" && opts.Reload {
rlog := logger.New("instance", localPrometheusURL)
// Reload all Prometheus rules
logger.Debug("Reloading Prometheus instance", "instance", localPrometheusURL)
rlog.Debug("Reloading Prometheus instance", "instance", localPrometheusURL)
resp, err := http.Post(localPrometheusURL+"/-/reload", "", nil)
if err != nil {
logger.Crit("Could not reload Prometheus", "error", err)
rlog.Crit("Could not reload Prometheus", "error", err)
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
logger.Crit("Unexpected status code while reloading Prometheus rules", "code", resp.StatusCode)
rlog.Crit("Unexpected status code while reloading Prometheus rules", "code", resp.StatusCode)
return err
}
rlog.Info("Reloaded Prometheus instance")
}
// Generate documentation

View File

@ -31,7 +31,8 @@
"build-storybook": "build-storybook -c .storybook -s ui/assets",
"cover-storybook": "nyc --hook-require=false yarn jest .storybook/coverage",
"deduplicate": "yarn-deduplicate -s fewer",
"release": "cd dev/release && yarn run release"
"release": "cd dev/release && yarn run release",
"docsite:serve": "./dev/docsite.sh -config doc/docsite.json serve -http=localhost:5080"
},
"browserslist": [
"last 1 version",