mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:51:57 +00:00
Add alerts for mean db blocked seconds (#22822)
Warn if the average database block time is 50ms (0.05s) or more for 5 minutes; raise a critical alert if it is 100ms (0.1s) or more for 10 minutes
This commit is contained in:
parent
5905c39847
commit
7eb956e968
@ -842,6 +842,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
|
||||
|
||||
<br />
|
||||
|
||||
## frontend: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-warning">warning</span> frontend: 0.05s+ mean blocked seconds per conn request for 5m0s
|
||||
- <span class="badge badge-critical">critical</span> frontend: 0.1s+ mean blocked seconds per conn request for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_frontend_mean_blocked_seconds_per_conn_request",
|
||||
"critical_frontend_mean_blocked_seconds_per_conn_request"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## frontend: internal_indexed_search_error_responses
|
||||
|
||||
<p class="subtitle">internal indexed search error responses every 5m</p>
|
||||
@ -1453,6 +1477,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
|
||||
|
||||
<br />
|
||||
|
||||
## gitserver: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-warning">warning</span> gitserver: 0.05s+ mean blocked seconds per conn request for 5m0s
|
||||
- <span class="badge badge-critical">critical</span> gitserver: 0.1s+ mean blocked seconds per conn request for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_gitserver_mean_blocked_seconds_per_conn_request",
|
||||
"critical_gitserver_mean_blocked_seconds_per_conn_request"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## gitserver: container_cpu_usage
|
||||
|
||||
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
|
||||
@ -2402,6 +2450,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
|
||||
|
||||
<br />
|
||||
|
||||
## precise-code-intel-worker: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-warning">warning</span> precise-code-intel-worker: 0.05s+ mean blocked seconds per conn request for 5m0s
|
||||
- <span class="badge badge-critical">critical</span> precise-code-intel-worker: 0.1s+ mean blocked seconds per conn request for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_precise-code-intel-worker_mean_blocked_seconds_per_conn_request",
|
||||
"critical_precise-code-intel-worker_mean_blocked_seconds_per_conn_request"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## precise-code-intel-worker: frontend_internal_api_error_responses
|
||||
|
||||
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
|
||||
@ -3210,6 +3282,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
|
||||
|
||||
<br />
|
||||
|
||||
## worker: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-warning">warning</span> worker: 0.05s+ mean blocked seconds per conn request for 5m0s
|
||||
- <span class="badge badge-critical">critical</span> worker: 0.1s+ mean blocked seconds per conn request for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_worker_mean_blocked_seconds_per_conn_request",
|
||||
"critical_worker_mean_blocked_seconds_per_conn_request"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## worker: frontend_internal_api_error_responses
|
||||
|
||||
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
|
||||
@ -4163,6 +4259,30 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
|
||||
<br />
|
||||
|
||||
## repo-updater: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-warning">warning</span> repo-updater: 0.05s+ mean blocked seconds per conn request for 5m0s
|
||||
- <span class="badge badge-critical">critical</span> repo-updater: 0.1s+ mean blocked seconds per conn request for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_repo-updater_mean_blocked_seconds_per_conn_request",
|
||||
"critical_repo-updater_mean_blocked_seconds_per_conn_request"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## repo-updater: container_cpu_usage
|
||||
|
||||
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
|
||||
@ -5966,6 +6086,30 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
|
||||
<br />
|
||||
|
||||
## executor-queue: mean_blocked_seconds_per_conn_request
|
||||
|
||||
<p class="subtitle">mean blocked seconds per conn request</p>
|
||||
|
||||
**Descriptions**
|
||||
|
||||
- <span class="badge badge-warning">warning</span> executor-queue: 0.05s+ mean blocked seconds per conn request for 5m0s
|
||||
- <span class="badge badge-critical">critical</span> executor-queue: 0.1s+ mean blocked seconds per conn request for 10m0s
|
||||
|
||||
**Possible solutions**
|
||||
|
||||
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
|
||||
|
||||
```json
|
||||
"observability.silenceAlerts": [
|
||||
"warning_executor-queue_mean_blocked_seconds_per_conn_request",
|
||||
"critical_executor-queue_mean_blocked_seconds_per_conn_request"
|
||||
]
|
||||
```
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
## executor-queue: frontend_internal_api_error_responses
|
||||
|
||||
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
|
||||
|
||||
@ -436,6 +436,8 @@ This panel indicates idle.
|
||||
|
||||
This panel indicates mean blocked seconds per conn request.
|
||||
|
||||
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#frontend-mean-blocked-seconds-per-conn-request).
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
@ -978,6 +980,8 @@ This panel indicates idle.
|
||||
|
||||
This panel indicates mean blocked seconds per conn request.
|
||||
|
||||
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#gitserver-mean-blocked-seconds-per-conn-request).
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
@ -1641,6 +1645,8 @@ This panel indicates idle.
|
||||
|
||||
This panel indicates mean blocked seconds per conn request.
|
||||
|
||||
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#precise-code-intel-worker-mean-blocked-seconds-per-conn-request).
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
@ -2171,6 +2177,8 @@ This panel indicates idle.
|
||||
|
||||
This panel indicates mean blocked seconds per conn request.
|
||||
|
||||
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#worker-mean-blocked-seconds-per-conn-request).
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
@ -2739,6 +2747,8 @@ This panel indicates idle.
|
||||
|
||||
This panel indicates mean blocked seconds per conn request.
|
||||
|
||||
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#repo-updater-mean-blocked-seconds-per-conn-request).
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
@ -3912,6 +3922,8 @@ This panel indicates idle.
|
||||
|
||||
This panel indicates mean blocked seconds per conn request.
|
||||
|
||||
> NOTE: Alerts related to this panel are documented in the [alert solutions reference](./alert_solutions.md#executor-queue-mean-blocked-seconds-per-conn-request).
|
||||
|
||||
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
|
||||
|
||||
<br />
|
||||
|
||||
@ -2,6 +2,7 @@ package shared
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
|
||||
)
|
||||
@ -61,10 +62,11 @@ func DatabaseConnectionsMonitoring(app string) []monitoring.Row {
|
||||
Description: "mean blocked seconds per conn request",
|
||||
Query: fmt.Sprintf(`sum by (app_name, db_name) (increase(src_pgsql_conns_blocked_seconds{app_name=%q}[5m])) / `+
|
||||
`sum by (app_name, db_name) (increase(src_pgsql_conns_waited_for{app_name=%q}[5m]))`, app, app),
|
||||
Panel: monitoring.Panel().LegendFormat("dbname={{db_name}}").Unit(monitoring.Seconds),
|
||||
NoAlert: true,
|
||||
Owner: monitoring.ObservableOwnerCoreApplication,
|
||||
Interpretation: "none",
|
||||
Panel: monitoring.Panel().LegendFormat("dbname={{db_name}}").Unit(monitoring.Seconds),
|
||||
Warning: monitoring.Alert().GreaterOrEqual(0.05, nil).For(5 * time.Minute),
|
||||
Critical: monitoring.Alert().GreaterOrEqual(0.10, nil).For(10 * time.Minute),
|
||||
Owner: monitoring.ObservableOwnerCoreApplication,
|
||||
PossibleSolutions: "none",
|
||||
},
|
||||
},
|
||||
{
|
||||
|
||||
@ -50,7 +50,7 @@ func (c *Container) validate() error {
|
||||
return errors.Errorf("Title must be in Title Case; found \"%s\" want \"%s\"", c.Title, strings.Title(c.Title))
|
||||
}
|
||||
if c.Description != withPeriod(c.Description) || c.Description != upperFirst(c.Description) {
|
||||
return errors.Errorf("Description must be sentence starting with an uppercas eletter and ending with period; found \"%s\"", c.Description)
|
||||
return errors.Errorf("Description must be sentence starting with an uppercase letter and ending with period; found \"%s\"", c.Description)
|
||||
}
|
||||
for i, g := range c.Groups {
|
||||
if err := g.validate(); err != nil {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user