Add alerts for mean db blocked seconds (#22822)

Warn if the mean database blocked time per connection request is at least 50ms (0.05s) for 5 minutes; raise a critical alert if it is at least 100ms (0.10s) for 10 minutes.
This commit is contained in:
Dax McDonald 2021-07-15 20:21:24 -05:00 committed by GitHub
parent 5905c39847
commit 7eb956e968
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 163 additions and 5 deletions

View File

@ -842,6 +842,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## frontend: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> frontend: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> frontend: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_frontend_mean_blocked_seconds_per_conn_request",
"critical_frontend_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## frontend: internal_indexed_search_error_responses
<p class="subtitle">internal indexed search error responses every 5m</p>
@ -1453,6 +1477,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## gitserver: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> gitserver: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> gitserver: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_gitserver_mean_blocked_seconds_per_conn_request",
"critical_gitserver_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## gitserver: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -2402,6 +2450,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## precise-code-intel-worker: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> precise-code-intel-worker: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> precise-code-intel-worker: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_precise-code-intel-worker_mean_blocked_seconds_per_conn_request",
"critical_precise-code-intel-worker_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## precise-code-intel-worker: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
@ -3210,6 +3282,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## worker: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> worker: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> worker: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_worker_mean_blocked_seconds_per_conn_request",
"critical_worker_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## worker: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
@ -4163,6 +4259,30 @@ with your code hosts connections or networking issues affecting communication wi
<br />
## repo-updater: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> repo-updater: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> repo-updater: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_repo-updater_mean_blocked_seconds_per_conn_request",
"critical_repo-updater_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## repo-updater: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -5966,6 +6086,30 @@ with your code hosts connections or networking issues affecting communication wi
<br />
## executor-queue: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> executor-queue: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> executor-queue: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_executor-queue_mean_blocked_seconds_per_conn_request",
"critical_executor-queue_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## executor-queue: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>

View File

@ -436,6 +436,8 @@ This panel indicates idle.
This panel indicates mean blocked seconds per conn request.
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#frontend-mean-blocked-seconds-per-conn-request).
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
@ -978,6 +980,8 @@ This panel indicates idle.
This panel indicates mean blocked seconds per conn request.
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#gitserver-mean-blocked-seconds-per-conn-request).
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
@ -1641,6 +1645,8 @@ This panel indicates idle.
This panel indicates mean blocked seconds per conn request.
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#precise-code-intel-worker-mean-blocked-seconds-per-conn-request).
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
@ -2171,6 +2177,8 @@ This panel indicates idle.
This panel indicates mean blocked seconds per conn request.
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#worker-mean-blocked-seconds-per-conn-request).
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
@ -2739,6 +2747,8 @@ This panel indicates idle.
This panel indicates mean blocked seconds per conn request.
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#repo-updater-mean-blocked-seconds-per-conn-request).
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
@ -3912,6 +3922,8 @@ This panel indicates idle.
This panel indicates mean blocked seconds per conn request.
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#executor-queue-mean-blocked-seconds-per-conn-request).
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />

View File

@ -2,6 +2,7 @@ package shared
import (
"fmt"
"time"
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
)
@ -61,10 +62,11 @@ func DatabaseConnectionsMonitoring(app string) []monitoring.Row {
Description: "mean blocked seconds per conn request",
Query: fmt.Sprintf(`sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name=%q}[5m])) / `+
`sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name=%q}[5m]))`, app, app),
Panel: monitoring.Panel().LegendFormat("dbname={{db_name}}").Unit(monitoring.Seconds),
NoAlert: true,
Owner: monitoring.ObservableOwnerCoreApplication,
Interpretation: "none",
Panel: monitoring.Panel().LegendFormat("dbname={{db_name}}").Unit(monitoring.Seconds),
Warning: monitoring.Alert().GreaterOrEqual(0.05, nil).For(5 * time.Minute),
Critical: monitoring.Alert().GreaterOrEqual(0.10, nil).For(10 * time.Minute),
Owner: monitoring.ObservableOwnerCoreApplication,
PossibleSolutions: "none",
},
},
{

View File

@ -50,7 +50,7 @@ func (c *Container) validate() error {
return errors.Errorf("Title must be in Title Case; found \"%s\" want \"%s\"", c.Title, strings.Title(c.Title))
}
if c.Description != withPeriod(c.Description) || c.Description != upperFirst(c.Description) {
return errors.Errorf("Description must be sentence starting with an uppercas eletter and ending with period; found \"%s\"", c.Description)
return errors.Errorf("Description must be sentence starting with an uppercase letter and ending with period; found \"%s\"", c.Description)
}
for i, g := range c.Groups {
if err := g.validate(); err != nil {