codeintel: Fix worker task alerts (#36405)

This commit is contained in:
Eric Fritz 2022-06-01 14:08:01 -05:00 committed by GitHub
parent c4cf4acf83
commit 789b06e71f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 46 additions and 44 deletions

View File

@ -2677,28 +2677,28 @@ count being required for the volume of uploads.
<br />
## worker: worker_job_codeintel-janitor_count
## worker: worker_job_codeintel-upload-janitor_count
<p class="subtitle">number of worker instances running the codeintel-janitor job</p>
<p class="subtitle">number of worker instances running the codeintel-upload-janitor job</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> worker: fewer than 1 worker instance running the codeintel-janitor job for 1m0s
- <span class="badge badge-critical">critical</span> worker: fewer than 1 worker instance running the codeintel-janitor job for 5m0s
- <span class="badge badge-warning">warning</span> worker: fewer than 1 worker instance running the codeintel-upload-janitor job for 1m0s
- <span class="badge badge-critical">critical</span> worker: fewer than 1 worker instance running the codeintel-upload-janitor job for 5m0s
**Possible solutions**
- Ensure your instance defines a worker container such that:
- `WORKER_JOB_ALLOWLIST` contains "codeintel-janitor" (or "all"), and
- `WORKER_JOB_BLOCKLIST` does not contain "codeintel-janitor"
- `WORKER_JOB_ALLOWLIST` contains "codeintel-upload-janitor" (or "all"), and
- `WORKER_JOB_BLOCKLIST` does not contain "codeintel-upload-janitor"
- Ensure that such a container is not failing to start or stay active
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-job-codeintel-janitor-count).
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-job-codeintel-upload-janitor-count).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_worker_worker_job_codeintel-janitor_count",
"critical_worker_worker_job_codeintel-janitor_count"
"warning_worker_worker_job_codeintel-upload-janitor_count",
"critical_worker_worker_job_codeintel-upload-janitor_count"
]
```
@ -2706,28 +2706,28 @@ count being required for the volume of uploads.
<br />
## worker: worker_job_codeintel-commitgraph_count
## worker: worker_job_codeintel-commitgraph-updater_count
<p class="subtitle">number of worker instances running the codeintel-commitgraph job</p>
<p class="subtitle">number of worker instances running the codeintel-commitgraph-updater job</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> worker: fewer than 1 worker instance running the codeintel-commitgraph job for 1m0s
- <span class="badge badge-critical">critical</span> worker: fewer than 1 worker instance running the codeintel-commitgraph job for 5m0s
- <span class="badge badge-warning">warning</span> worker: fewer than 1 worker instance running the codeintel-commitgraph-updater job for 1m0s
- <span class="badge badge-critical">critical</span> worker: fewer than 1 worker instance running the codeintel-commitgraph-updater job for 5m0s
**Possible solutions**
- Ensure your instance defines a worker container such that:
- `WORKER_JOB_ALLOWLIST` contains "codeintel-commitgraph" (or "all"), and
- `WORKER_JOB_BLOCKLIST` does not contain "codeintel-commitgraph"
- `WORKER_JOB_ALLOWLIST` contains "codeintel-commitgraph-updater" (or "all"), and
- `WORKER_JOB_BLOCKLIST` does not contain "codeintel-commitgraph-updater"
- Ensure that such a container is not failing to start or stay active
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-job-codeintel-commitgraph-count).
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-job-codeintel-commitgraph-updater-count).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_worker_worker_job_codeintel-commitgraph_count",
"critical_worker_worker_job_codeintel-commitgraph_count"
"warning_worker_worker_job_codeintel-commitgraph-updater_count",
"critical_worker_worker_job_codeintel-commitgraph-updater_count"
]
```
@ -2735,28 +2735,28 @@ count being required for the volume of uploads.
<br />
## worker: worker_job_codeintel-auto-indexing_count
## worker: worker_job_codeintel-autoindexing-scheduler_count
<p class="subtitle">number of worker instances running the codeintel-auto-indexing job</p>
<p class="subtitle">number of worker instances running the codeintel-autoindexing-scheduler job</p>
**Descriptions**
- <span class="badge badge-warning">warning</span> worker: fewer than 1 worker instance running the codeintel-auto-indexing job for 1m0s
- <span class="badge badge-critical">critical</span> worker: fewer than 1 worker instance running the codeintel-auto-indexing job for 5m0s
- <span class="badge badge-warning">warning</span> worker: fewer than 1 worker instance running the codeintel-autoindexing-scheduler job for 1m0s
- <span class="badge badge-critical">critical</span> worker: fewer than 1 worker instance running the codeintel-autoindexing-scheduler job for 5m0s
**Possible solutions**
- Ensure your instance defines a worker container such that:
- `WORKER_JOB_ALLOWLIST` contains "codeintel-auto-indexing" (or "all"), and
- `WORKER_JOB_BLOCKLIST` does not contain "codeintel-auto-indexing"
- `WORKER_JOB_ALLOWLIST` contains "codeintel-autoindexing-scheduler" (or "all"), and
- `WORKER_JOB_BLOCKLIST` does not contain "codeintel-autoindexing-scheduler"
- Ensure that such a container is not failing to start or stay active
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-job-codeintel-auto-indexing-count).
- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#worker-worker-job-codeintel-autoindexing-scheduler-count).
- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
```json
"observability.silenceAlerts": [
"warning_worker_worker_job_codeintel-auto-indexing_count",
"critical_worker_worker_job_codeintel-auto-indexing_count"
"warning_worker_worker_job_codeintel-autoindexing-scheduler_count",
"critical_worker_worker_job_codeintel-autoindexing-scheduler_count"
]
```

View File

@ -7934,11 +7934,11 @@ Query: `sum by (job_name) (src_worker_jobs{job="worker"})`
<br />
#### worker: worker_job_codeintel-janitor_count
#### worker: worker_job_codeintel-upload-janitor_count
<p class="subtitle">Number of worker instances running the codeintel-janitor job</p>
<p class="subtitle">Number of worker instances running the codeintel-upload-janitor job</p>
Refer to the [alert solutions reference](./alert_solutions.md#worker-worker-job-codeintel-janitor-count) for 2 alerts related to this panel.
Refer to the [alert solutions reference](./alert_solutions.md#worker-worker-job-codeintel-upload-janitor-count) for 2 alerts related to this panel.
To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100010` on your Sourcegraph instance.
@ -7947,17 +7947,17 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100010` on
<details>
<summary>Technical details</summary>
Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-janitor"})`
Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-upload-janitor"})`
</details>
<br />
#### worker: worker_job_codeintel-commitgraph_count
#### worker: worker_job_codeintel-commitgraph-updater_count
<p class="subtitle">Number of worker instances running the codeintel-commitgraph job</p>
<p class="subtitle">Number of worker instances running the codeintel-commitgraph-updater job</p>
Refer to the [alert solutions reference](./alert_solutions.md#worker-worker-job-codeintel-commitgraph-count) for 2 alerts related to this panel.
Refer to the [alert solutions reference](./alert_solutions.md#worker-worker-job-codeintel-commitgraph-updater-count) for 2 alerts related to this panel.
To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100011` on your Sourcegraph instance.
@ -7966,17 +7966,17 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100011` on
<details>
<summary>Technical details</summary>
Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-commitgraph"})`
Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-commitgraph-updater"})`
</details>
<br />
#### worker: worker_job_codeintel-auto-indexing_count
#### worker: worker_job_codeintel-autoindexing-scheduler_count
<p class="subtitle">Number of worker instances running the codeintel-auto-indexing job</p>
<p class="subtitle">Number of worker instances running the codeintel-autoindexing-scheduler job</p>
Refer to the [alert solutions reference](./alert_solutions.md#worker-worker-job-codeintel-auto-indexing-count) for 2 alerts related to this panel.
Refer to the [alert solutions reference](./alert_solutions.md#worker-worker-job-codeintel-autoindexing-scheduler-count) for 2 alerts related to this panel.
To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100012` on your Sourcegraph instance.
@ -7985,7 +7985,7 @@ To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=100012` on
<details>
<summary>Technical details</summary>
Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-auto-indexing"})`
Query: `sum (src_worker_jobs{job="worker", job_name="codeintel-autoindexing-scheduler"})`
</details>

View File

@ -39,8 +39,6 @@ func main() {
go setAuthzProviders()
additionalJobs := map[string]job.Job{
"codeintel-janitor": codeintel.NewJanitorJob(),
"codeintel-auto-indexing": codeintel.NewIndexingJob(),
"codehost-version-syncing": versions.NewSyncingJob(),
"insights-job": workerinsights.NewInsightsJob(),
"insights-query-runner-job": workerinsights.NewInsightsQueryRunnerJob(),
@ -57,6 +55,10 @@ func main() {
"codeintel-upload-expirer": freshcodeintel.NewUploadExpirerJob(),
"codeintel-commitgraph-updater": freshcodeintel.NewCommitGraphUpdaterJob(),
"codeintel-autoindexing-scheduler": freshcodeintel.NewAutoindexingSchedulerJob(),
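// Temporary: the legacy codeintel jobs below stay registered alongside the
// freshcodeintel jobs above (presumably until the legacy ones can be removed — confirm).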
"codeintel-janitor": codeintel.NewJanitorJob(),
"codeintel-auto-indexing": codeintel.NewIndexingJob(),
}
if err := shared.Start(logger, additionalJobs, registerEnterpriseMigrations); err != nil {

View File

@ -15,9 +15,9 @@ func Worker() *monitoring.Dashboard {
Name string
Owner monitoring.ObservableOwner
}{
{Name: "codeintel-janitor", Owner: monitoring.ObservableOwnerCodeIntel},
{Name: "codeintel-commitgraph", Owner: monitoring.ObservableOwnerCodeIntel},
{Name: "codeintel-auto-indexing", Owner: monitoring.ObservableOwnerCodeIntel},
{Name: "codeintel-upload-janitor", Owner: monitoring.ObservableOwnerCodeIntel},
{Name: "codeintel-commitgraph-updater", Owner: monitoring.ObservableOwnerCodeIntel},
{Name: "codeintel-autoindexing-scheduler", Owner: monitoring.ObservableOwnerCodeIntel},
}
var activeJobObservables []monitoring.Observable