monitoring: Refactor provisioning dashboards (#23006)

This commit is contained in:
Eric Fritz 2021-07-20 08:59:13 -05:00 committed by GitHub
parent 6e51f45463
commit cf4435bb64
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 948 additions and 1254 deletions

View File

@ -842,30 +842,6 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## frontend: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> frontend: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> frontend: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_frontend_mean_blocked_seconds_per_conn_request",
"critical_frontend_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## frontend: internal_indexed_search_error_responses
<p class="subtitle">internal indexed search error responses every 5m</p>
@ -1025,6 +1001,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## frontend: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> frontend: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> frontend: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_frontend_mean_blocked_seconds_per_conn_request",
"critical_frontend_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## frontend: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -2450,30 +2450,6 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## precise-code-intel-worker: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> precise-code-intel-worker: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> precise-code-intel-worker: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_precise-code-intel-worker_mean_blocked_seconds_per_conn_request",
"critical_precise-code-intel-worker_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## precise-code-intel-worker: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
@ -2503,6 +2479,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## precise-code-intel-worker: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> precise-code-intel-worker: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> precise-code-intel-worker: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_precise-code-intel-worker_mean_blocked_seconds_per_conn_request",
"critical_precise-code-intel-worker_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## precise-code-intel-worker: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -2744,30 +2744,6 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## query-runner: container_memory_usage
<p class="subtitle">container memory usage by instance</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> query-runner: 99%+ container memory usage by instance
**Possible solutions**
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of query-runner container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_query-runner_container_memory_usage"
]
```
<sub>*Managed by the [Sourcegraph Search team](https://about.sourcegraph.com/handbook/engineering/search).*</sub>
<br />
## query-runner: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -2792,6 +2768,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## query-runner: container_memory_usage
<p class="subtitle">container memory usage by instance</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> query-runner: 99%+ container memory usage by instance
**Possible solutions**
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of query-runner container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_query-runner_container_memory_usage"
]
```
<sub>*Managed by the [Sourcegraph Search team](https://about.sourcegraph.com/handbook/engineering/search).*</sub>
<br />
## query-runner: provisioning_container_cpu_usage_long_term
<p class="subtitle">container cpu usage total (90th percentile over 1d) across all cores by instance</p>
@ -3282,30 +3282,6 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## worker: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> worker: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> worker: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_worker_mean_blocked_seconds_per_conn_request",
"critical_worker_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## worker: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
@ -3335,6 +3311,30 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## worker: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> worker: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> worker: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_worker_mean_blocked_seconds_per_conn_request",
"critical_worker_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## worker: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -3547,35 +3547,6 @@ To learn more about Sourcegraph's alerting and how to set up alerts, see [our al
<br />
## repo-updater: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> repo-updater: 2%+ frontend-internal API error responses every 5m by route for 5m0s
**Possible solutions**
- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `repo-updater` that indicate requests to the frontend service are failing.
- **Kubernetes:**
- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
- Check `kubectl logs repo-updater` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Docker Compose:**
- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
- Check `docker logs repo-updater` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_repo-updater_frontend_internal_api_error_responses"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## repo-updater: src_repoupdater_max_sync_backoff
<p class="subtitle">time since oldest sync</p>
@ -4259,6 +4230,35 @@ with your code hosts connections or networking issues affecting communication wi
<br />
## repo-updater: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> repo-updater: 2%+ frontend-internal API error responses every 5m by route for 5m0s
**Possible solutions**
- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `repo-updater` that indicate requests to the frontend service are failing.
- **Kubernetes:**
- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
- Check `kubectl logs repo-updater` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Docker Compose:**
- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
- Check `docker logs repo-updater` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_repo-updater_frontend_internal_api_error_responses"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## repo-updater: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
@ -6086,30 +6086,6 @@ with your code hosts connections or networking issues affecting communication wi
<br />
## executor-queue: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> executor-queue: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> executor-queue: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_executor-queue_mean_blocked_seconds_per_conn_request",
"critical_executor-queue_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## executor-queue: frontend_internal_api_error_responses
<p class="subtitle">frontend-internal API error responses every 5m by route</p>
@ -6139,6 +6115,30 @@ with your code hosts connections or networking issues affecting communication wi
<br />
## executor-queue: mean_blocked_seconds_per_conn_request
<p class="subtitle">mean blocked seconds per conn request</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> executor-queue: 0.05s+ mean blocked seconds per conn request for 5m0s
- <span class="badge badge-critical">critical</span> executor-queue: 0.1s+ mean blocked seconds per conn request for 10m0s
**Possible solutions**
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_executor-queue_mean_blocked_seconds_per_conn_request",
"critical_executor-queue_mean_blocked_seconds_per_conn_request"
]
```
<sub>*Managed by the [Sourcegraph Core application team](https://about.sourcegraph.com/handbook/engineering/core-application).*</sub>
<br />
## executor-queue: container_cpu_usage
<p class="subtitle">container cpu usage total (1m average) across all cores by instance</p>
@ -6494,7 +6494,7 @@ with your code hosts connections or networking issues affecting communication wi
**Possible solutions**
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-worker container in `docker-compose.yml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-indexer container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -6518,7 +6518,7 @@ with your code hosts connections or networking issues affecting communication wi
**Possible solutions**
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-worker container in `docker-compose.yml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-indexer container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -6541,8 +6541,8 @@ with your code hosts connections or networking issues affecting communication wi
**Possible solutions**
- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the precise-code-intel-worker service.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-worker container in `docker-compose.yml`.
- **Kubernetes:** Consider increasing CPU limits in the `Deployment.yaml` for the precise-code-intel-indexer service.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-indexer container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -6565,8 +6565,8 @@ with your code hosts connections or networking issues affecting communication wi
**Possible solutions**
- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the precise-code-intel-worker service.
- **Docker Compose:** Consider increasing `memory:` of the precise-code-intel-worker container in `docker-compose.yml`.
- **Kubernetes:** Consider increasing memory limits in the `Deployment.yaml` for the precise-code-intel-indexer service.
- **Docker Compose:** Consider increasing `memory:` of the precise-code-intel-indexer container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -6590,7 +6590,7 @@ with your code hosts connections or networking issues affecting communication wi
**Possible solutions**
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-worker container in `docker-compose.yml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-indexer container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
@ -6614,7 +6614,7 @@ with your code hosts connections or networking issues affecting communication wi
**Possible solutions**
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-worker container in `docker-compose.yml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-indexer container in `docker-compose.yml`.
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,8 @@ import (
)
func ExecutorQueue() *monitoring.Container {
const containerName = "executor-queue"
return &monitoring.Container{
Name: "executor-queue",
Title: "Executor Queue",
@ -90,66 +92,13 @@ func ExecutorQueue() *monitoring.Container {
},
},
},
{
Title: shared.TitleDatabaseConnectionsMonitoring,
Hidden: true,
Rows: shared.DatabaseConnectionsMonitoring("executor-queue"),
},
{
Title: "Internal service requests",
Hidden: true,
Rows: []monitoring.Row{
{
shared.FrontendInternalAPIErrorResponses("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ContainerMemoryUsage("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ContainerMissing("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageLongTerm("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageShortTerm("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.GoGcDuration("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("executor-queue", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
},
}
}

View File

@ -490,11 +490,7 @@ func Frontend() *monitoring.Container {
},
},
},
{
Title: shared.TitleDatabaseConnectionsMonitoring,
Hidden: true,
Rows: shared.DatabaseConnectionsMonitoring("frontend"),
},
{
Title: "Internal service requests",
Hidden: true,
@ -576,52 +572,13 @@ func Frontend() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ContainerMemoryUsage(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ContainerMissing(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageLongTerm(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageShortTerm(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
shared.GoGcDuration(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable(containerName, monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
shared.NewDatabaseConnectionsMonitoringGroup("frontend"),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
{
Title: "Sentinel queries (only on sourcegraph.com)",
Hidden: true,

View File

@ -10,6 +10,17 @@ import (
)
func GitServer() *monitoring.Container {
const containerName = "gitserver"
gitserverHighMemoryNoAlertTransformer := func(observable shared.Observable) shared.Observable {
return observable.WithNoAlerts(`Git Server is expected to use up all the memory it is provided.`)
}
var provisioningIndicatorsOptions = &shared.ContainerProvisioningIndicatorsGroupOptions{
LongTermMemoryUsage: gitserverHighMemoryNoAlertTransformer,
ShortTermMemoryUsage: gitserverHighMemoryNoAlertTransformer,
}
return &monitoring.Container{
Name: "gitserver",
Title: "Git Server",
@ -299,62 +310,12 @@ func GitServer() *monitoring.Container {
},
},
},
{
Title: shared.TitleDatabaseConnectionsMonitoring,
Hidden: true,
Rows: shared.DatabaseConnectionsMonitoring("gitserver"),
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ContainerMemoryUsage("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ContainerMissing("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ContainerIOUsage("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageLongTerm("gitserver", monitoring.ObservableOwnerCoreApplication).
WithNoAlerts(`Git Server is expected to use up all the memory it is provided.`).
Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageShortTerm("gitserver", monitoring.ObservableOwnerCoreApplication).
WithNoAlerts(`Git Server is expected to use up all the memory it is provided.`).
Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.GoGcDuration("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: "Kubernetes monitoring (ignore if using Docker Compose or server)",
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("gitserver", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCoreApplication, provisioningIndicatorsOptions),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
},
}
}

View File

@ -8,6 +8,8 @@ import (
)
func GitHubProxy() *monitoring.Container {
const containerName = "github-proxy"
return &monitoring.Container{
Name: "github-proxy",
Title: "GitHub Proxy",
@ -31,52 +33,11 @@ func GitHubProxy() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ContainerMemoryUsage("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ContainerMissing("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageLongTerm("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageShortTerm("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.GoGcDuration("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("github-proxy", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
},
}
}

View File

@ -8,18 +8,21 @@ import (
)
func Postgres() *monitoring.Container {
sum := "sum"
const (
// In docker-compose, codeintel-db container is called pgsql. In Kubernetes,
// codeintel-db container is called codeintel-db. Because of this, we track
// all database cAdvisor metrics in a single panel using this container
// name regex to ensure we have observability on all platforms.
containerName = "(pgsql|codeintel-db)"
)
// In docker-compose, codeintel-db container is called pgsql
// In Kubernetes, codeintel-db container is called codeintel-db
// Because of this, we track all database cAdvisor metrics in a single panel using this
// container name regex to ensure we have observability on all platforms.
const databaseContainerNames = "(pgsql|codeintel-db)"
var sumAggregator = "sum"
return &monitoring.Container{
Name: "postgres",
Title: "Postgres",
Description: "Postgres metrics, exported from postgres_exporter (only available on Kubernetes).",
Name: "postgres",
Title: "Postgres",
Description: "Postgres metrics, exported from postgres_exporter (only available on Kubernetes).",
NoSourcegraphDebugServer: true, // This is third-party service
Groups: []monitoring.Group{
{
Title: "General",
@ -67,7 +70,7 @@ func Postgres() *monitoring.Container {
Owner: monitoring.ObservableOwnerCoreApplication,
Query: "max by (relname)(pg_invalid_index_count)",
Panel: monitoring.Panel().LegendFormat("{{relname}}"),
Critical: monitoring.Alert().GreaterOrEqual(1, &sum).For(0),
Critical: monitoring.Alert().GreaterOrEqual(1, &sumAggregator).For(0),
PossibleSolutions: `
- Drop and re-create the invalid trigger - please contact Sourcegraph to supply the trigger definition.
`,
@ -160,33 +163,9 @@ func Postgres() *monitoring.Container {
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
// See docstring for databaseContainerNames
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm(databaseContainerNames, monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageLongTerm(databaseContainerNames, monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm(databaseContainerNames, monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageShortTerm(databaseContainerNames, monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable(databaseContainerNames, monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
},
// This is a third-party service
NoSourcegraphDebugServer: true,
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
},
}
}

View File

@ -6,6 +6,8 @@ import (
)
func PreciseCodeIntelIndexer() *monitoring.Container {
const containerName = "precise-code-intel-indexer"
return &monitoring.Container{
Name: "precise-code-intel-indexer",
Title: "Precise Code Intel Indexer",
@ -135,52 +137,11 @@ func PreciseCodeIntelIndexer() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ContainerMemoryUsage("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ContainerMissing("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageLongTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageShortTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.GoGcDuration("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
},
}
}

View File

@ -6,6 +6,8 @@ import (
)
func PreciseCodeIntelWorker() *monitoring.Container {
const containerName = "precise-code-intel-worker"
return &monitoring.Container{
Name: "precise-code-intel-worker",
Title: "Precise Code Intel Worker",
@ -186,66 +188,13 @@ func PreciseCodeIntelWorker() *monitoring.Container {
},
},
},
{
Title: shared.TitleDatabaseConnectionsMonitoring,
Hidden: true,
Rows: shared.DatabaseConnectionsMonitoring("precise-code-intel-worker"),
},
{
Title: "Internal service requests",
Hidden: true,
Rows: []monitoring.Row{
{
shared.FrontendInternalAPIErrorResponses("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ContainerMemoryUsage("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ContainerMissing("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageLongTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageShortTerm("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.GoGcDuration("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("precise-code-intel-worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
},
}
}

View File

@ -8,13 +8,18 @@ import (
)
func Prometheus() *monitoring.Container {
// ruleGroupInterpretation provides interpretation documentation for observables that are per prometheus rule_group.
const ruleGroupInterpretation = `Rules that Sourcegraph ships with are grouped under '/sg_config_prometheus'. [Custom rules are grouped under '/sg_prometheus_addons'](https://docs.sourcegraph.com/admin/observability/metrics#prometheus-configuration).`
const (
containerName = "prometheus"
// ruleGroupInterpretation provides interpretation documentation for observables that are per prometheus rule_group.
ruleGroupInterpretation = `Rules that Sourcegraph ships with are grouped under '/sg_config_prometheus'. [Custom rules are grouped under '/sg_prometheus_addons'](https://docs.sourcegraph.com/admin/observability/metrics#prometheus-configuration).`
)
return &monitoring.Container{
Name: "prometheus",
Title: "Prometheus",
Description: "Sourcegraph's all-in-one Prometheus and Alertmanager service.",
Name: "prometheus",
Title: "Prometheus",
Description: "Sourcegraph's all-in-one Prometheus and Alertmanager service.",
NoSourcegraphDebugServer: true, // This is third-party service
Groups: []monitoring.Group{
{
Title: "Metrics",
@ -148,45 +153,10 @@ func Prometheus() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
shared.ContainerMemoryUsage("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
},
{
shared.ContainerMissing("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
shared.ProvisioningMemoryUsageLongTerm("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
shared.ProvisioningMemoryUsageShortTerm("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("prometheus", monitoring.ObservableOwnerDistribution).Observable(),
},
},
},
},
// This is third-party service
NoSourcegraphDebugServer: true,
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerDistribution, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerDistribution, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerDistribution, nil),
},
}
}

View File

@ -6,65 +6,18 @@ import (
)
func QueryRunner() *monitoring.Container {
const containerName = "query-runner"
return &monitoring.Container{
Name: "query-runner",
Title: "Query Runner",
Description: "Periodically runs saved searches and instructs the frontend to send out notifications.",
Groups: []monitoring.Group{
{
Title: "General",
Rows: []monitoring.Row{
{
shared.FrontendInternalAPIErrorResponses("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerMemoryUsage("query-runner", monitoring.ObservableOwnerSearch).Observable(),
shared.ContainerCPUUsage("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ContainerMissing("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("query-runner", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageLongTerm("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("query-runner", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageShortTerm("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("query-runner", monitoring.ObservableOwnerSearch).Observable(),
shared.GoGcDuration("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("query-runner", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
},
}
}

View File

@ -9,22 +9,24 @@ import (
)
func RepoUpdater() *monitoring.Container {
// This is set a bit longer than maxSyncInterval in internal/repos/syncer.go
const syncDurationThreshold = 9 * time.Hour
const (
containerName = "repo-updater"
// This is set a bit longer than maxSyncInterval in internal/repos/syncer.go
syncDurationThreshold = 9 * time.Hour
)
var containerMonitoringOptions = &shared.ContainerMonitoringGroupOptions{
MemoryUsage: func(observable shared.Observable) shared.Observable {
return observable.WithWarning(nil).WithCritical(monitoring.Alert().GreaterOrEqual(90, nil).For(10 * time.Minute))
},
}
return &monitoring.Container{
Name: "repo-updater",
Title: "Repo Updater",
Description: "Manages interaction with code hosts, instructs Gitserver to update repositories.",
Groups: []monitoring.Group{
{
Title: "General",
Rows: []monitoring.Row{
{
shared.FrontendInternalAPIErrorResponses("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: "Repositories",
Rows: []monitoring.Row{
@ -426,60 +428,13 @@ func RepoUpdater() *monitoring.Container {
},
},
},
{
Title: shared.TitleDatabaseConnectionsMonitoring,
Hidden: true,
Rows: shared.DatabaseConnectionsMonitoring("repo-updater"),
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ContainerMemoryUsage("repo-updater", monitoring.ObservableOwnerCoreApplication).
WithWarning(nil).
WithCritical(monitoring.Alert().GreaterOrEqual(90, nil).For(10 * time.Minute)).
Observable(),
},
{
shared.ContainerMissing("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageLongTerm("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageShortTerm("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.GoGcDuration("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("repo-updater", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, containerMonitoringOptions),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
},
}
}

View File

@ -8,6 +8,8 @@ import (
)
func Searcher() *monitoring.Container {
const containerName = "searcher"
return &monitoring.Container{
Name: "searcher",
Title: "Searcher",
@ -35,56 +37,15 @@ func Searcher() *monitoring.Container {
Owner: monitoring.ObservableOwnerSearch,
PossibleSolutions: "none",
},
shared.FrontendInternalAPIErrorResponses("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("searcher", monitoring.ObservableOwnerSearch).Observable(),
shared.ContainerMemoryUsage("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ContainerMissing("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("searcher", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageLongTerm("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("searcher", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageShortTerm("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("searcher", monitoring.ObservableOwnerSearch).Observable(),
shared.GoGcDuration("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("searcher", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
},
}
}

View File

@ -87,3 +87,39 @@ var (
}
}
)
// ContainerMonitoringGroupOptions holds optional per-panel transformations for
// the group built by NewContainerMonitoringGroup. Every field may be left nil,
// in which case the corresponding default observable is used unchanged.
type ContainerMonitoringGroupOptions struct {
// ContainerMissing transforms the default observable used to construct the container missing panel.
ContainerMissing ObservableOption
// CPUUsage transforms the default observable used to construct the CPU usage panel.
CPUUsage ObservableOption
// MemoryUsage transforms the default observable used to construct the memory usage panel.
MemoryUsage ObservableOption
// IOUsage transforms the default observable used to construct the IO usage panel.
IOUsage ObservableOption
}
// NewContainerMonitoringGroup builds the hidden container monitoring group for
// the given container. The group has a single row with four panels, in order:
// container missing, CPU usage, memory usage and IO usage.
//
// options may be nil. Any transformation that is left unset keeps the default
// observable for that panel.
func NewContainerMonitoringGroup(containerName string, owner monitoring.ObservableOwner, options *ContainerMonitoringGroupOptions) monitoring.Group {
	// Work on a value copy so that a nil pointer simply yields "no overrides".
	var opts ContainerMonitoringGroupOptions
	if options != nil {
		opts = *options
	}

	missing := opts.ContainerMissing.safeApply(ContainerMissing(containerName, owner)).Observable()
	cpuUsage := opts.CPUUsage.safeApply(ContainerCPUUsage(containerName, owner)).Observable()
	memoryUsage := opts.MemoryUsage.safeApply(ContainerMemoryUsage(containerName, owner)).Observable()
	ioUsage := opts.IOUsage.safeApply(ContainerIOUsage(containerName, owner)).Observable()

	return monitoring.Group{
		Title:  TitleContainerMonitoring,
		Hidden: true,
		Rows: []monitoring.Row{
			{missing, cpuUsage, memoryUsage, ioUsage},
		},
	}
}

View File

@ -100,3 +100,13 @@ func DatabaseConnectionsMonitoring(app string) []monitoring.Row {
},
}
}
// NewDatabaseConnectionsMonitoringGroup wraps the rows produced by
// DatabaseConnectionsMonitoring for the given container in a hidden group
// titled TitleDatabaseConnectionsMonitoring.
func NewDatabaseConnectionsMonitoringGroup(containerName string) monitoring.Group {
	rows := DatabaseConnectionsMonitoring(containerName)

	return monitoring.Group{
		Title:  TitleDatabaseConnectionsMonitoring,
		Hidden: true,
		Rows:   rows,
	}
}

View File

@ -27,3 +27,26 @@ var FrontendInternalAPIErrorResponses sharedObservable = func(containerName stri
`, "{{CONTAINER_NAME}}", containerName),
}
}
// FrontendInternalAPIErrorResponseMonitoringOptions holds the optional
// transformation for the panel built by
// NewFrontendInternalAPIErrorResponseMonitoringGroup. A nil field keeps the
// default observable unchanged.
type FrontendInternalAPIErrorResponseMonitoringOptions struct {
	// ErrorResponses transforms the default observable used to construct the error responses panel.
	ErrorResponses ObservableOption
}

// FrontendInternalAPIERrorResponseMonitoringOptions is the original, misspelled
// name of FrontendInternalAPIErrorResponseMonitoringOptions. It is kept as an
// alias so that existing references continue to compile.
//
// Deprecated: use FrontendInternalAPIErrorResponseMonitoringOptions instead.
type FrontendInternalAPIERrorResponseMonitoringOptions = FrontendInternalAPIErrorResponseMonitoringOptions
// NewFrontendInternalAPIErrorResponseMonitoringGroup creates a hidden group
// titled "Internal service requests" containing a single panel displaying
// frontend internal API error response metrics for the given container.
//
// options may be nil, in which case the default observable is used unchanged.
func NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName string, owner monitoring.ObservableOwner, options *FrontendInternalAPIERrorResponseMonitoringOptions) monitoring.Group {
	// Substitute an empty options value so the field access below is nil-safe.
	if options == nil {
		options = &FrontendInternalAPIERrorResponseMonitoringOptions{}
	}

	return monitoring.Group{
		Title:  "Internal service requests",
		Hidden: true,
		Rows: []monitoring.Row{
			{
				options.ErrorResponses.safeApply(FrontendInternalAPIErrorResponses(containerName, owner)).Observable(),
			},
		},
	}
}

View File

@ -39,3 +39,30 @@ var (
}
}
)
// GolangMonitoringOptions holds optional per-panel transformations for the
// group built by NewGolangMonitoringGroup. Every field may be left nil, in
// which case the corresponding default observable is used unchanged.
type GolangMonitoringOptions struct {
// Goroutines transforms the default observable used to construct the Go goroutines count panel.
Goroutines ObservableOption
// GCDuration transforms the default observable used to construct the Go GC duration panel.
GCDuration ObservableOption
}
// NewGolangMonitoringGroup builds the hidden Go monitoring group for the given
// container: one row holding the goroutine count panel followed by the GC
// duration panel.
//
// options may be nil. Any transformation that is left unset keeps the default
// observable for that panel.
func NewGolangMonitoringGroup(containerName string, owner monitoring.ObservableOwner, options *GolangMonitoringOptions) monitoring.Group {
	// Work on a value copy so that a nil pointer simply yields "no overrides".
	var opts GolangMonitoringOptions
	if options != nil {
		opts = *options
	}

	goroutines := opts.Goroutines.safeApply(GoGoroutines(containerName, owner)).Observable()
	gcDuration := opts.GCDuration.safeApply(GoGcDuration(containerName, owner)).Observable()

	return monitoring.Group{
		Title:  TitleGolangMonitoring,
		Hidden: true,
		Rows: []monitoring.Row{
			{goroutines, gcDuration},
		},
	}
}

View File

@ -27,3 +27,27 @@ var (
}
}
)
// KubernetesMonitoringOptions holds the optional transformation for the panel
// built by NewKubernetesMonitoringGroup. A nil field keeps the default
// observable unchanged.
type KubernetesMonitoringOptions struct {
// PodsAvailable transforms the default observable used to construct the pods available panel.
PodsAvailable ObservableOption
}
// NewKubernetesMonitoringGroup creates a hidden group containing a single
// panel displaying Kubernetes pod availability for the given container.
//
// options may be nil, in which case the default observable is used unchanged.
func NewKubernetesMonitoringGroup(containerName string, owner monitoring.ObservableOwner, options *KubernetesMonitoringOptions) monitoring.Group {
	// Substitute an empty options value so the field access below is nil-safe.
	if options == nil {
		options = &KubernetesMonitoringOptions{}
	}

	return monitoring.Group{
		Title:  TitleKubernetesMonitoring,
		Hidden: true,
		Rows: []monitoring.Row{
			{
				options.PodsAvailable.safeApply(KubernetesPodsAvailable(containerName, owner)).Observable(),
			},
		},
	}
}

View File

@ -77,3 +77,41 @@ var (
}
}
)
// ContainerProvisioningIndicatorsGroupOptions holds optional per-panel
// transformations for the group built by NewProvisioningIndicatorsGroup. Every
// field may be left nil, in which case the corresponding default observable is
// used unchanged.
type ContainerProvisioningIndicatorsGroupOptions struct {
// LongTermCPUUsage transforms the default observable used to construct the long-term CPU usage panel.
LongTermCPUUsage ObservableOption
// LongTermMemoryUsage transforms the default observable used to construct the long-term memory usage panel.
LongTermMemoryUsage ObservableOption
// ShortTermCPUUsage transforms the default observable used to construct the short-term CPU usage panel.
ShortTermCPUUsage ObservableOption
// ShortTermMemoryUsage transforms the default observable used to construct the short-term memory usage panel.
ShortTermMemoryUsage ObservableOption
}
// NewProvisioningIndicatorsGroup builds the hidden provisioning indicators
// group for the given container. The first row holds the long-term CPU and
// memory usage panels, the second row the short-term CPU and memory usage
// panels.
//
// options may be nil. Any transformation that is left unset keeps the default
// observable for that panel.
func NewProvisioningIndicatorsGroup(containerName string, owner monitoring.ObservableOwner, options *ContainerProvisioningIndicatorsGroupOptions) monitoring.Group {
	// Work on a value copy so that a nil pointer simply yields "no overrides".
	var opts ContainerProvisioningIndicatorsGroupOptions
	if options != nil {
		opts = *options
	}

	longTerm := monitoring.Row{
		opts.LongTermCPUUsage.safeApply(ProvisioningCPUUsageLongTerm(containerName, owner)).Observable(),
		opts.LongTermMemoryUsage.safeApply(ProvisioningMemoryUsageLongTerm(containerName, owner)).Observable(),
	}
	shortTerm := monitoring.Row{
		opts.ShortTermCPUUsage.safeApply(ProvisioningCPUUsageShortTerm(containerName, owner)).Observable(),
		opts.ShortTermMemoryUsage.safeApply(ProvisioningMemoryUsageShortTerm(containerName, owner)).Observable(),
	}

	return monitoring.Group{
		Title:  TitleProvisioningIndicators,
		Hidden: true,
		Rows:   []monitoring.Row{longTerm, shortTerm},
	}
}

View File

@ -63,6 +63,17 @@ func (o Observable) WithNoAlerts(interpretation string) Observable {
return o
}
// ObservableOption is a function that transforms an observable.
type ObservableOption func(observable Observable) Observable

// safeApply runs the option on the given observable and returns the result.
// A nil option is treated as the identity: the observable is returned as-is.
func (f ObservableOption) safeApply(observable Observable) Observable {
	if f != nil {
		return f(observable)
	}

	return observable
}
// sharedObservable defines the type all shared observable variables should have in this package.
//
// A sharedObservable takes a container name and an owner and returns the
// Observable built for them.
type sharedObservable func(containerName string, owner monitoring.ObservableOwner) Observable

View File

@ -6,6 +6,8 @@ import (
)
func Symbols() *monitoring.Container {
const containerName = "symbols"
return &monitoring.Container{
Name: "symbols",
Title: "Symbols",
@ -34,57 +36,14 @@ func Symbols() *monitoring.Container {
PossibleSolutions: "none",
},
},
{
shared.FrontendInternalAPIErrorResponses("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ContainerMemoryUsage("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ContainerMissing("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageLongTerm("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageShortTerm("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.GoGcDuration("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("symbols", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
},
}
}

View File

@ -6,10 +6,13 @@ import (
)
func SyntectServer() *monitoring.Container {
const containerName = "syntect-server"
return &monitoring.Container{
Name: "syntect-server",
Title: "Syntect Server",
Description: "Handles syntax highlighting for code files.",
Name: "syntect-server",
Title: "Syntect Server",
Description: "Handles syntax highlighting for code files.",
NoSourcegraphDebugServer: true, // This is third-party service
Groups: []monitoring.Group{
{
Title: "General",
@ -56,43 +59,10 @@ func SyntectServer() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ContainerMemoryUsage("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ContainerMissing("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageLongTerm("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
shared.ProvisioningMemoryUsageShortTerm("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("syntect-server", monitoring.ObservableOwnerCoreApplication).Observable(),
},
},
},
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCoreApplication, nil),
},
NoSourcegraphDebugServer: true,
}
}

View File

@ -9,6 +9,8 @@ import (
)
func Worker() *monitoring.Container {
const containerName = "worker"
return &monitoring.Container{
Name: "worker",
Title: "Worker",
@ -202,66 +204,13 @@ func Worker() *monitoring.Container {
},
},
},
{
Title: shared.TitleDatabaseConnectionsMonitoring,
Hidden: true,
Rows: shared.DatabaseConnectionsMonitoring("worker"),
},
{
Title: "Internal service requests",
Hidden: true,
Rows: []monitoring.Row{
{
shared.FrontendInternalAPIErrorResponses("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ContainerMemoryUsage("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ContainerMissing("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageLongTerm("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.ProvisioningMemoryUsageShortTerm("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleGolangMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.GoGoroutines("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
shared.GoGcDuration("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.KubernetesPodsAvailable("worker", monitoring.ObservableOwnerCodeIntel).Observable(),
},
},
},
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
},
}
}

View File

@ -8,10 +8,17 @@ import (
)
func ZoektIndexServer() *monitoring.Container {
const (
containerName = "zoekt-indexserver"
bundledContainerName = "indexed-search"
)
return &monitoring.Container{
Name: "zoekt-indexserver",
Title: "Zoekt Index Server",
Description: "Indexes repositories and populates the search index.",
Name: "zoekt-indexserver",
Title: "Zoekt Index Server",
Description: "Indexes repositories and populates the search index.",
NoSourcegraphDebugServer: true,
Groups: []monitoring.Group{
{
Title: "General",
@ -96,48 +103,14 @@ func ZoektIndexServer() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ContainerMemoryUsage("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ContainerMissing("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ContainerIOUsage("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageLongTerm("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageShortTerm("zoekt-indexserver", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleKubernetesMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
// zoekt_index_server, zoekt_web_server are deployed together
// as part of the indexed-search service, so only show pod
// availability here.
shared.KubernetesPodsAvailable("indexed-search", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
},
NoSourcegraphDebugServer: true,
// Note:
// zoekt_indexserver and zoekt_webserver are deployed together as part of the indexed-search service
// We show pod availability here for both the webserver and indexserver as they are bundled together.
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewKubernetesMonitoringGroup(bundledContainerName, monitoring.ObservableOwnerSearch, nil),
},
}
}

View File

@ -8,10 +8,13 @@ import (
)
func ZoektWebServer() *monitoring.Container {
const containerName = "zoekt-webserver"
return &monitoring.Container{
Name: "zoekt-webserver",
Title: "Zoekt Web Server",
Description: "Serves indexed search requests using the search index.",
Name: "zoekt-webserver",
Title: "Zoekt Web Server",
Description: "Serves indexed search requests using the search index.",
NoSourcegraphDebugServer: true,
Groups: []monitoring.Group{
{
Title: "General",
@ -29,42 +32,17 @@ func ZoektWebServer() *monitoring.Container {
},
},
},
{
Title: shared.TitleContainerMonitoring,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ContainerCPUUsage("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ContainerMemoryUsage("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
},
{
// indexed-search does not have 0-downtime deploy, so deploys can
// cause extended container restarts. still set a warning alert for
// extended periods of container restarts, since this might still
// indicate a problem.
shared.ContainerMissing("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ContainerIOUsage("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
{
Title: shared.TitleProvisioningIndicators,
Hidden: true,
Rows: []monitoring.Row{
{
shared.ProvisioningCPUUsageLongTerm("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageLongTerm("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
},
{
shared.ProvisioningCPUUsageShortTerm("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
shared.ProvisioningMemoryUsageShortTerm("zoekt-webserver", monitoring.ObservableOwnerSearch).Observable(),
},
},
},
// kubernetes monitoring for zoekt-web-server is provided by zoekt-index-server,
// since both services are deployed together
},
NoSourcegraphDebugServer: true,
// Note 1:
// indexed-search does not have zero-downtime deploy, so deploys can cause extended container restarts.
// We set the default warning alert for extended periods of container restarts as it may still indicate
// a real problem.
//
// Note 2:
// Kubernetes monitoring for zoekt-webserver is provided by zoekt-indexserver as they are bundled together.
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearch, nil),
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSearch, nil),
},
}
}