From cdd1b7715f8c126e717cb361bda6fa05bbfa34c3 Mon Sep 17 00:00:00 2001 From: Robert Lin Date: Thu, 6 Aug 2020 16:20:47 +0800 Subject: [PATCH] monitoring: encourage silencing, render entry for alerts w/o solutions (#12731) --- doc/admin/observability/alert_solutions.md | 3420 ++++++++++++++++++-- monitoring/generator.go | 33 +- monitoring/shared.go | 3 +- 3 files changed, 3097 insertions(+), 359 deletions(-) diff --git a/doc/admin/observability/alert_solutions.md b/doc/admin/observability/alert_solutions.md index 50f5dd6595e..de182ab0511 100644 --- a/doc/admin/observability/alert_solutions.md +++ b/doc/admin/observability/alert_solutions.md @@ -4,13 +4,15 @@ This document contains possible solutions for when you find alerts are firing in If your alert isn't mentioned here, or if the solution doesn't help, [contact us](mailto:support@sourcegraph.com) for assistance. +To learn more about Sourcegraph's alerting, see [our alerting documentation](https://docs.sourcegraph.com/admin/observability/alerting). + -# frontend: 99th_percentile_search_request_duration +## frontend: 99th_percentile_search_request_duration **Descriptions:** -- _frontend: 20s+ 99th percentile successful search request duration over 5m_ (`warning_frontend_99th_percentile_search_request_duration`) +- _frontend: 20s+ 99th percentile successful search request duration over 5m_ **Possible solutions:** @@ -18,12 +20,19 @@ for assistance. - **Check that most repositories are indexed** by visiting https://sourcegraph.example.com/site-admin/repositories?filter=needs-index (it should show few or no results.) - **Kubernetes:** Check CPU usage of zoekt-webserver in the indexed-search pod, consider increasing CPU limits in the `indexed-search.Deployment.yaml` if regularly hitting max CPU utilization. - **Docker Compose:** Check CPU usage on the Zoekt Web Server dashboard, consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml` if regularly hitting max CPU utilization. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: 90th_percentile_search_request_duration +```json +"observability.silenceAlerts": [ + "warning_frontend_99th_percentile_search_request_duration" +] +``` + +## frontend: 90th_percentile_search_request_duration **Descriptions:** -- _frontend: 15s+ 90th percentile successful search request duration over 5m_ (`warning_frontend_90th_percentile_search_request_duration`) +- _frontend: 15s+ 90th percentile successful search request duration over 5m_ **Possible solutions:** @@ -31,44 +40,126 @@ for assistance. - **Check that most repositories are indexed** by visiting https://sourcegraph.example.com/site-admin/repositories?filter=needs-index (it should show few or no results.) - **Kubernetes:** Check CPU usage of zoekt-webserver in the indexed-search pod, consider increasing CPU limits in the `indexed-search.Deployment.yaml` if regularly hitting max CPU utilization. - **Docker Compose:** Check CPU usage on the Zoekt Web Server dashboard, consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml` if regularly hitting max CPU utilization. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: search_alert_user_suggestions

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_90th_percentile_search_request_duration"
+]
+```
+
+## frontend: hard_timeout_search_responses

**Descriptions:**

-- _frontend: 50+ search alert user suggestions shown every 5m_ (`warning_frontend_search_alert_user_suggestions`)
+- _frontend: 5+ hard timeout search responses every 5m_
+
+- _frontend: 20+ hard timeout search responses every 5m_

**Possible solutions:**

+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_hard_timeout_search_responses",
+  "critical_frontend_hard_timeout_search_responses"
+]
+```
+
+## frontend: hard_error_search_responses
+
+**Descriptions:**
+
+- _frontend: 5+ hard error search responses every 5m_
+
+- _frontend: 20+ hard error search responses every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_hard_error_search_responses",
+  "critical_frontend_hard_error_search_responses"
+]
+```
+
+## frontend: partial_timeout_search_responses
+
+**Descriptions:**
+
+- _frontend: 5+ partial timeout search responses every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_partial_timeout_search_responses"
+]
+```
+
+## frontend: search_alert_user_suggestions
+
+**Descriptions:**
+
+- _frontend: 50+ search alert user suggestions shown every 5m_

**Possible solutions:**

- This indicates your users are making syntax errors or similar user errors.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: page_load_latency

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_search_alert_user_suggestions"
+]
+```
+
+## frontend: page_load_latency

**Descriptions:**

-- _frontend: 2s+ 90th percentile page load latency over all routes over 10m_ (`critical_frontend_page_load_latency`)
+- _frontend: 2s+ 90th percentile page load latency over all routes over 10m_

**Possible solutions:**

- Confirm that the Sourcegraph frontend has enough CPU/memory using the provisioning panels.
- Trace a request to see what the slowest part is: https://docs.sourcegraph.com/admin/observability/tracing
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert (a complete site configuration sketch follows below):

-# frontend: blob_load_latency

+```json
+"observability.silenceAlerts": [
+  "critical_frontend_page_load_latency"
+]
+```
+
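+For reference, `"observability.silenceAlerts"` is a top-level site configuration property, so each snippet in this document merges into your existing configuration. A minimal sketch of a complete site configuration that silences this alert (the `externalURL` value is illustrative, and a real configuration will typically contain other properties):
+
+```json
+{
+  "externalURL": "https://sourcegraph.example.com",
+  "observability.silenceAlerts": [
+    "critical_frontend_page_load_latency"
+  ]
+}
+```
+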
+## frontend: blob_load_latency

**Descriptions:**

-- _frontend: 2s+ 90th percentile blob load latency over 10m_ (`critical_frontend_blob_load_latency`)
+- _frontend: 2s+ 90th percentile blob load latency over 10m_

**Possible solutions:**

- Confirm that the Sourcegraph frontend has enough CPU/memory using the provisioning panels.
- Trace a request to see what the slowest part is: https://docs.sourcegraph.com/admin/observability/tracing
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: 99th_percentile_search_codeintel_request_duration

+```json
+"observability.silenceAlerts": [
+  "critical_frontend_blob_load_latency"
+]
+```
+
+## frontend: 99th_percentile_search_codeintel_request_duration

**Descriptions:**

-- _frontend: 20s+ 99th percentile code-intel successful search request duration over 5m_ (`warning_frontend_99th_percentile_search_codeintel_request_duration`)
+- _frontend: 20s+ 99th percentile code-intel successful search request duration over 5m_

**Possible solutions:**

@@ -76,12 +167,19 @@
 - **Check that most repositories are indexed** by visiting https://sourcegraph.example.com/site-admin/repositories?filter=needs-index (it should show few or no results.)
 - **Kubernetes:** Check CPU usage of zoekt-webserver in the indexed-search pod, consider increasing CPU limits in the `indexed-search.Deployment.yaml` if regularly hitting max CPU utilization.
 - **Docker Compose:** Check CPU usage on the Zoekt Web Server dashboard, consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml` if regularly hitting max CPU utilization.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: 90th_percentile_search_codeintel_request_duration

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_99th_percentile_search_codeintel_request_duration"
+]
+```
+
+## frontend: 90th_percentile_search_codeintel_request_duration

**Descriptions:**

-- _frontend: 15s+ 90th percentile code-intel successful search request duration over 5m_ (`warning_frontend_90th_percentile_search_codeintel_request_duration`)
+- _frontend: 15s+ 90th percentile code-intel successful search request duration over 5m_

**Possible solutions:**

@@ -89,22 +187,90 @@
 - **Check that most repositories are indexed** by visiting https://sourcegraph.example.com/site-admin/repositories?filter=needs-index (it should show few or no results.)
 - **Kubernetes:** Check CPU usage of zoekt-webserver in the indexed-search pod, consider increasing CPU limits in the `indexed-search.Deployment.yaml` if regularly hitting max CPU utilization.
 - **Docker Compose:** Check CPU usage on the Zoekt Web Server dashboard, consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml` if regularly hitting max CPU utilization.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: search_codeintel_alert_user_suggestions +```json +"observability.silenceAlerts": [ + "warning_frontend_90th_percentile_search_codeintel_request_duration" +] +``` + +## frontend: hard_timeout_search_codeintel_responses **Descriptions:** -- _frontend: 50+ search code-intel alert user suggestions shown every 5m_ (`warning_frontend_search_codeintel_alert_user_suggestions`) +- _frontend: 5+ hard timeout search code-intel responses every 5m_ + +- _frontend: 20+ hard timeout search code-intel responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_hard_timeout_search_codeintel_responses", + "critical_frontend_hard_timeout_search_codeintel_responses" +] +``` + +## frontend: hard_error_search_codeintel_responses + +**Descriptions:** + +- _frontend: 5+ hard error search code-intel responses every 5m_ + +- _frontend: 20+ hard error search code-intel responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_hard_error_search_codeintel_responses", + "critical_frontend_hard_error_search_codeintel_responses" +] +``` + +## frontend: partial_timeout_search_codeintel_responses + +**Descriptions:** + +- _frontend: 5+ partial timeout search code-intel responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_partial_timeout_search_codeintel_responses" +] +``` + +## frontend: search_codeintel_alert_user_suggestions + +**Descriptions:** + +- _frontend: 50+ search code-intel alert user suggestions shown every 5m_ **Possible solutions:** - This indicates a bug in Sourcegraph, please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new/choose). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: 99th_percentile_search_api_request_duration +```json +"observability.silenceAlerts": [ + "warning_frontend_search_codeintel_alert_user_suggestions" +] +``` + +## frontend: 99th_percentile_search_api_request_duration **Descriptions:** -- _frontend: 50s+ 99th percentile successful search API request duration over 5m_ (`warning_frontend_99th_percentile_search_api_request_duration`) +- _frontend: 50s+ 99th percentile successful search API request duration over 5m_ **Possible solutions:** @@ -113,12 +279,19 @@ for assistance. - **Check that most repositories are indexed** by visiting https://sourcegraph.example.com/site-admin/repositories?filter=needs-index (it should show few or no results.) 
- **Kubernetes:** Check CPU usage of zoekt-webserver in the indexed-search pod, consider increasing CPU limits in the `indexed-search.Deployment.yaml` if regularly hitting max CPU utilization. - **Docker Compose:** Check CPU usage on the Zoekt Web Server dashboard, consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml` if regularly hitting max CPU utilization. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: 90th_percentile_search_api_request_duration +```json +"observability.silenceAlerts": [ + "warning_frontend_99th_percentile_search_api_request_duration" +] +``` + +## frontend: 90th_percentile_search_api_request_duration **Descriptions:** -- _frontend: 40s+ 90th percentile successful search API request duration over 5m_ (`warning_frontend_90th_percentile_search_api_request_duration`) +- _frontend: 40s+ 90th percentile successful search API request duration over 5m_ **Possible solutions:** @@ -127,94 +300,371 @@ for assistance. - **Check that most repositories are indexed** by visiting https://sourcegraph.example.com/site-admin/repositories?filter=needs-index (it should show few or no results.) - **Kubernetes:** Check CPU usage of zoekt-webserver in the indexed-search pod, consider increasing CPU limits in the `indexed-search.Deployment.yaml` if regularly hitting max CPU utilization. - **Docker Compose:** Check CPU usage on the Zoekt Web Server dashboard, consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml` if regularly hitting max CPU utilization. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: search_api_alert_user_suggestions +```json +"observability.silenceAlerts": [ + "warning_frontend_90th_percentile_search_api_request_duration" +] +``` + +## frontend: hard_timeout_search_api_responses **Descriptions:** -- _frontend: 50+ search API alert user suggestions shown every 5m_ (`warning_frontend_search_api_alert_user_suggestions`) +- _frontend: 5+ hard timeout search API responses every 5m_ + +- _frontend: 20+ hard timeout search API responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_hard_timeout_search_api_responses", + "critical_frontend_hard_timeout_search_api_responses" +] +``` + +## frontend: hard_error_search_api_responses + +**Descriptions:** + +- _frontend: 5+ hard error search API responses every 5m_ + +- _frontend: 20+ hard error search API responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_hard_error_search_api_responses", + "critical_frontend_hard_error_search_api_responses" +] +``` + +## frontend: partial_timeout_search_api_responses + +**Descriptions:** + +- _frontend: 5+ partial timeout search API responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to 
silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_partial_timeout_search_api_responses"
+]
+```
+
+## frontend: search_api_alert_user_suggestions
+
+**Descriptions:**
+
+- _frontend: 50+ search API alert user suggestions shown every 5m_

**Possible solutions:**

- This indicates your users' search API requests have syntax errors or a similar user error. Check the responses the API sends back for an explanation.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: internal_indexed_search_error_responses

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_search_api_alert_user_suggestions"
+]
+```
+
+## frontend: 99th_percentile_precise_code_intel_api_duration

**Descriptions:**

-- _frontend: 5+ internal indexed search error responses every 5m_ (`warning_frontend_internal_indexed_search_error_responses`)
+- _frontend: 20s+ 99th percentile successful precise code intel api query duration over 5m_

**Possible solutions:**

+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_99th_percentile_precise_code_intel_api_duration"
+]
+```
+
+## frontend: precise_code_intel_api_errors
+
+**Descriptions:**
+
+- _frontend: 20+ precise code intel api errors every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_precise_code_intel_api_errors"
+]
+```
+
+## frontend: 99th_percentile_precise_code_intel_store_duration
+
+**Descriptions:**
+
+- _frontend: 20s+ 99th percentile successful precise code intel database query duration over 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_99th_percentile_precise_code_intel_store_duration"
+]
+```
+
+## frontend: precise_code_intel_store_errors
+
+**Descriptions:**
+
+- _frontend: 20+ precise code intel database errors every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_precise_code_intel_store_errors"
+]
+```
+
+## frontend: internal_indexed_search_error_responses

**Descriptions:**

+- _frontend: 5+ internal indexed search error responses every 5m_

**Possible solutions:**

- Check the Zoekt Web Server dashboard for indications it might be unhealthy.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: internal_unindexed_search_error_responses +```json +"observability.silenceAlerts": [ + "warning_frontend_internal_indexed_search_error_responses" +] +``` + +## frontend: internal_unindexed_search_error_responses **Descriptions:** -- _frontend: 5+ internal unindexed search error responses every 5m_ (`warning_frontend_internal_unindexed_search_error_responses`) +- _frontend: 5+ internal unindexed search error responses every 5m_ **Possible solutions:** - Check the Searcher dashboard for indications it might be unhealthy. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_frontend_internal_unindexed_search_error_responses" +] +``` + +## frontend: internal_api_error_responses **Descriptions:** -- _frontend: 25+ internal API error responses every 5m by route_ (`warning_frontend_internal_api_error_responses`) +- _frontend: 25+ internal API error responses every 5m by route_ **Possible solutions:** - May not be a substantial issue, check the `frontend` logs for potential causes. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: observability_test_alert_warning +```json +"observability.silenceAlerts": [ + "warning_frontend_internal_api_error_responses" +] +``` + +## frontend: 99th_percentile_precise_code_intel_bundle_manager_query_duration **Descriptions:** -- _frontend: 1+ warning test alert metric_ (`warning_frontend_observability_test_alert_warning`) +- _frontend: 20s+ 99th percentile successful precise-code-intel-bundle-manager query duration over 5m_ **Possible solutions:** -This alert is triggered via the `triggerObservabilityTestAlert` GraphQL endpoint, and will automatically resolve itself. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: observability_test_alert_critical +```json +"observability.silenceAlerts": [ + "warning_frontend_99th_percentile_precise_code_intel_bundle_manager_query_duration" +] +``` + +## frontend: 99th_percentile_precise_code_intel_bundle_manager_transfer_duration **Descriptions:** -- _frontend: 1+ critical test alert metric_ (`critical_frontend_observability_test_alert_critical`) +- _frontend: 300s+ 99th percentile successful precise-code-intel-bundle-manager data transfer duration over 5m_ **Possible solutions:** -This alert is triggered via the `triggerObservabilityTestAlert` GraphQL endpoint, and will automatically resolve itself. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_frontend_99th_percentile_precise_code_intel_bundle_manager_transfer_duration" +] +``` + +## frontend: precise_code_intel_bundle_manager_error_responses **Descriptions:** -- _frontend: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_frontend_container_cpu_usage`) +- _frontend: 5+ precise-code-intel-bundle-manager error responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_precise_code_intel_bundle_manager_error_responses" +] +``` + +## frontend: 99th_percentile_gitserver_duration + +**Descriptions:** + +- _frontend: 20s+ 99th percentile successful gitserver query duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_99th_percentile_gitserver_duration" +] +``` + +## frontend: gitserver_error_responses + +**Descriptions:** + +- _frontend: 5+ gitserver error responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_gitserver_error_responses" +] +``` + +## frontend: 90th_percentile_updatecheck_requests + +**Descriptions:** + +- _frontend: 0.1s+ 90th percentile successful update-check requests (sourcegraph.com only)_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_90th_percentile_updatecheck_requests" +] +``` + +## frontend: observability_test_alert_warning + +**Descriptions:** + +- _frontend: 1+ warning test alert metric_ + +**Possible solutions:** + +- This alert is triggered via the `triggerObservabilityTestAlert` GraphQL endpoint, and will automatically resolve itself. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_observability_test_alert_warning" +] +``` + +## frontend: observability_test_alert_critical + +**Descriptions:** + +- _frontend: 1+ critical test alert metric_ + +**Possible solutions:** + +- This alert is triggered via the `triggerObservabilityTestAlert` GraphQL endpoint, and will automatically resolve itself. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_frontend_observability_test_alert_critical"
+]
+```
+
+## frontend: container_cpu_usage

**Descriptions:**

-- _frontend: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_frontend_container_cpu_usage`)
+- _frontend: 99%+ container cpu usage total (1m average) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml` (see the sketch below).
- **Docker Compose:** Consider increasing `cpus:` of the frontend container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: container_memory_usage

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_container_cpu_usage"
+]
+```
+
+## frontend: container_memory_usage

**Descriptions:**

-- _frontend: 99%+ container memory usage by instance_ (`warning_frontend_container_memory_usage`)
+- _frontend: 99%+ container memory usage by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of frontend container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: container_restarts

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_container_memory_usage"
+]
+```
+
+## frontend: container_restarts

**Descriptions:**

-- _frontend: 1+ container restarts every 5m by instance_ (`warning_frontend_container_restarts`)
+- _frontend: 1+ container restarts every 5m by instance_

**Possible solutions:**

@@ -224,23 +674,37 @@
 - **Kubernetes:**
 - Determine if the pod was OOM killed using `kubectl describe pod frontend` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
 - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p frontend`.
 - **Docker Compose:**
 - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' frontend` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the frontend container in `docker-compose.yml`.
 - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs frontend` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: fs_inodes_used

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_container_restarts"
+]
+```
+
+## frontend: fs_inodes_used

**Descriptions:**

-- _frontend: 3e+06+ fs inodes in use by instance_ (`warning_frontend_fs_inodes_used`)
+- _frontend: 3e+06+ fs inodes in use by instance_

**Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
+ - **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: provisioning_container_cpu_usage_7d

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_fs_inodes_used"
+]
+```
+
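+Several of the solutions above suggest raising or lowering CPU and memory limits in the relevant `Deployment.yaml`. As a rough sketch only (the values here are illustrative, not recommendations, and the exact file layout in your deployment may differ), the change is made in the container's `resources` block:
+
+```yaml
+# Illustrative fragment of a container spec in the relevant Deployment.yaml;
+# adjust limits gradually and re-check the provisioning panels afterwards.
+resources:
+  limits:
+    cpu: "4"
+    memory: "8Gi"
+  requests:
+    cpu: "2"
+    memory: "4Gi"
+```
+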
+## frontend: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _frontend: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_frontend_provisioning_container_cpu_usage_7d`)
+- _frontend: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -248,12 +712,19 @@
 - **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider decreasing `cpus:` of the frontend container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: provisioning_container_memory_usage_7d

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_provisioning_container_cpu_usage_7d"
+]
+```
+
+## frontend: provisioning_container_memory_usage_7d

**Descriptions:**

-- _frontend: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_frontend_provisioning_container_memory_usage_7d`)
+- _frontend: 80%+ or less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

@@ -261,100 +732,204 @@
 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of frontend container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# frontend: provisioning_container_cpu_usage_5m

+```json
+"observability.silenceAlerts": [
+  "warning_frontend_provisioning_container_memory_usage_7d"
+]
+```
+
+## frontend: provisioning_container_cpu_usage_5m

**Descriptions:**

-- _frontend: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_frontend_provisioning_container_cpu_usage_5m`)
+- _frontend: 90%+ container cpu usage total (5m maximum) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the frontend container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# frontend: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_frontend_provisioning_container_cpu_usage_5m" +] +``` + +## frontend: provisioning_container_memory_usage_5m **Descriptions:** -- _frontend: 90%+ container memory usage (5m maximum) by instance_ (`warning_frontend_provisioning_container_memory_usage_5m`) +- _frontend: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of frontend container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# gitserver: disk_space_remaining +```json +"observability.silenceAlerts": [ + "warning_frontend_provisioning_container_memory_usage_5m" +] +``` + +## frontend: go_goroutines **Descriptions:** -- _gitserver: less than 25% disk space remaining by instance_ (`warning_gitserver_disk_space_remaining`) +- _frontend: 10000+ maximum active goroutines for 10m_ +**Possible solutions:** -- _gitserver: less than 15% disk space remaining by instance_ (`critical_gitserver_disk_space_remaining`) +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_go_goroutines" +] +``` + +## frontend: go_gc_duration_seconds + +**Descriptions:** + +- _frontend: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_go_gc_duration_seconds" +] +``` + +## frontend: pods_available_percentage + +**Descriptions:** + +- _frontend: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_frontend_pods_available_percentage" +] +``` + +## gitserver: disk_space_remaining + +**Descriptions:** + +- _gitserver: less than 25% disk space remaining by instance_ + +- _gitserver: less than 15% disk space remaining by instance_ **Possible solutions:** - **Provision more disk space:** Sourcegraph will begin deleting least-used repository clones at 10% disk space remaining which may result in decreased performance, users having to wait for repositories to clone, etc. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# gitserver: running_git_commands +```json +"observability.silenceAlerts": [ + "warning_gitserver_disk_space_remaining", + "critical_gitserver_disk_space_remaining" +] +``` + +## gitserver: running_git_commands **Descriptions:** -- _gitserver: 50+ running git commands (signals load)_ (`warning_gitserver_running_git_commands`) +- _gitserver: 50+ running git commands (signals load)_ - -- _gitserver: 100+ running git commands (signals load)_ (`critical_gitserver_running_git_commands`) +- _gitserver: 100+ running git commands (signals load)_ **Possible solutions:** - **Check if the problem may be an intermittent and temporary peak** using the "Container monitoring" section at the bottom of the Git Server dashboard. - **Single container deployments:** Consider upgrading to a [Docker Compose deployment](../install/docker-compose/migrate.md) which offers better scalability and resource isolation. - **Kubernetes and Docker Compose:** Check that you are running a similar number of git server replicas and that their CPU/memory limits are allocated according to what is shown in the [Sourcegraph resource estimator](../install/resource_estimator.md). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# gitserver: repository_clone_queue_size +```json +"observability.silenceAlerts": [ + "warning_gitserver_running_git_commands", + "critical_gitserver_running_git_commands" +] +``` + +## gitserver: repository_clone_queue_size **Descriptions:** -- _gitserver: 25+ repository clone queue size_ (`warning_gitserver_repository_clone_queue_size`) +- _gitserver: 25+ repository clone queue size_ **Possible solutions:** - **If you just added several repositories**, the warning may be expected. - **Check which repositories need cloning**, by visiting e.g. https://sourcegraph.example.com/site-admin/repositories?filter=not-cloned +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# gitserver: repository_existence_check_queue_size +```json +"observability.silenceAlerts": [ + "warning_gitserver_repository_clone_queue_size" +] +``` + +## gitserver: repository_existence_check_queue_size **Descriptions:** -- _gitserver: 25+ repository existence check queue size_ (`warning_gitserver_repository_existence_check_queue_size`) +- _gitserver: 25+ repository existence check queue size_ **Possible solutions:** - **Check the code host status indicator for errors:** on the Sourcegraph app homepage, when signed in as an admin click the cloud icon in the top right corner of the page. - **Check if the issue continues to happen after 30 minutes**, it may be temporary. 
- **Check the gitserver logs for more information.**
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: echo_command_duration_test

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_repository_existence_check_queue_size"
+]
+```
+
+## gitserver: echo_command_duration_test

**Descriptions:**

-- _gitserver: 1s+ echo command duration test_ (`warning_gitserver_echo_command_duration_test`)
+- _gitserver: 1s+ echo command duration test_
+
-- _gitserver: 2s+ echo command duration test_ (`critical_gitserver_echo_command_duration_test`)
+- _gitserver: 2s+ echo command duration test_

**Possible solutions:**

- **Check if the problem may be an intermittent and temporary peak** using the "Container monitoring" section at the bottom of the Git Server dashboard.
- **Single container deployments:** Consider upgrading to a [Docker Compose deployment](../install/docker-compose/migrate.md) which offers better scalability and resource isolation.
- **Kubernetes and Docker Compose:** Check that you are running a similar number of git server replicas and that their CPU/memory limits are allocated according to what is shown in the [Sourcegraph resource estimator](../install/resource_estimator.md).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: frontend_internal_api_error_responses

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_echo_command_duration_test",
+  "critical_gitserver_echo_command_duration_test"
+]
+```
+
+## gitserver: frontend_internal_api_error_responses

**Descriptions:**

-- _gitserver: 5+ frontend-internal API error responses every 5m by route_ (`warning_gitserver_frontend_internal_api_error_responses`)
+- _gitserver: 5+ frontend-internal API error responses every 5m by route_

**Possible solutions:**

@@ -365,34 +940,55 @@
 - **Docker Compose:**
 - Confirm that `docker ps` shows the `frontend-internal` container is healthy.
 - Check `docker logs gitserver` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: container_cpu_usage

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_frontend_internal_api_error_responses"
+]
+```
+
+## gitserver: container_cpu_usage

**Descriptions:**

-- _gitserver: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_gitserver_container_cpu_usage`)
+- _gitserver: 99%+ container cpu usage total (1m average) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml` (see the sketch below).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: container_memory_usage

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_container_cpu_usage"
+]
+```
+
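+For the Docker Compose suggestions above, `cpus:` and `memory:` are set per service in `docker-compose.yml`. A rough sketch for gitserver (the service name `gitserver-0` and the values are illustrative, and the exact keys depend on your Compose file version, e.g. `mem_limit:` in older formats):
+
+```yaml
+# Illustrative fragment of docker-compose.yml for the gitserver service;
+# raise cpus:/memory: if the container regularly hits its limits.
+gitserver-0:
+  deploy:
+    resources:
+      limits:
+        cpus: '4'
+        memory: 8G
+```
+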
+## gitserver: container_memory_usage

**Descriptions:**

-- _gitserver: 99%+ container memory usage by instance_ (`warning_gitserver_container_memory_usage`)
+- _gitserver: 99%+ container memory usage by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of gitserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: container_restarts

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_container_memory_usage"
+]
+```
+
+## gitserver: container_restarts

**Descriptions:**

-- _gitserver: 1+ container restarts every 5m by instance_ (`warning_gitserver_container_restarts`)
+- _gitserver: 1+ container restarts every 5m by instance_

**Possible solutions:**

@@ -402,23 +998,37 @@
 - **Docker Compose:**
 - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' gitserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the gitserver container in `docker-compose.yml`.
 - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs gitserver` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: fs_inodes_used

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_container_restarts"
+]
+```
+
+## gitserver: fs_inodes_used

**Descriptions:**

-- _gitserver: 3e+06+ fs inodes in use by instance_ (`warning_gitserver_fs_inodes_used`)
+- _gitserver: 3e+06+ fs inodes in use by instance_

**Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
+ - **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: provisioning_container_cpu_usage_7d

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_fs_inodes_used"
+]
+```
+
+## gitserver: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _gitserver: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_gitserver_provisioning_container_cpu_usage_7d`)
+- _gitserver: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -426,76 +1036,173 @@
 - **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider decreasing `cpus:` of the gitserver container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: provisioning_container_memory_usage_7d

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_provisioning_container_cpu_usage_7d"
+]
+```
+
+## gitserver: provisioning_container_memory_usage_7d

**Descriptions:**

-- _gitserver: less than 30% container memory usage (7d maximum) by instance_ (`warning_gitserver_provisioning_container_memory_usage_7d`)
+- _gitserver: less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

- **Kubernetes:** Consider decreasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `memory:` of gitserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# gitserver: provisioning_container_cpu_usage_5m

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_provisioning_container_memory_usage_7d"
+]
+```
+
+## gitserver: provisioning_container_cpu_usage_5m

**Descriptions:**

-- _gitserver: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_gitserver_provisioning_container_cpu_usage_5m`)
+- _gitserver: 90%+ container cpu usage total (5m maximum) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the gitserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: github_core_rate_limit_remaining

+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_provisioning_container_cpu_usage_5m"
+]
+```
+
+## gitserver: go_goroutines

**Descriptions:**

-- _github-proxy: less than 1000 remaining calls to GitHub before hitting the rate limit_ (`critical_github-proxy_github_core_rate_limit_remaining`)
+- _gitserver: 10000+ maximum active goroutines for 10m_

**Possible solutions:**

-Try restarting the pod to get a different public IP.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_go_goroutines"
+]
+```
+
+## gitserver: go_gc_duration_seconds

**Descriptions:**

+- _gitserver: 2s+ maximum go garbage collection duration_

**Possible solutions:**

+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_go_gc_duration_seconds"
+]
+```
+
+## gitserver: pods_available_percentage
+
+**Descriptions:**
+
+- _gitserver: less than 90% percentage pods available for 10m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_gitserver_pods_available_percentage"
+]
+```
+
+## github-proxy: github_core_rate_limit_remaining
+
+**Descriptions:**
+
+- _github-proxy: less than 1000 remaining calls to GitHub before hitting the rate limit_
+
+**Possible solutions:**
+
+- Try restarting the pod to get a different public IP.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_github-proxy_github_core_rate_limit_remaining"
+]
+```
+
+## github-proxy: github_search_rate_limit_remaining

**Descriptions:**

-- _github-proxy: less than 5 remaining calls to GitHub search before hitting the rate limit_ (`warning_github-proxy_github_search_rate_limit_remaining`)
+- _github-proxy: less than 5 remaining calls to GitHub search before hitting the rate limit_

**Possible solutions:**

-Try restarting the pod to get a different public IP.
+- Try restarting the pod to get a different public IP.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: container_cpu_usage

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_github_search_rate_limit_remaining"
+]
+```
+
+## github-proxy: container_cpu_usage

**Descriptions:**

-- _github-proxy: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_github-proxy_container_cpu_usage`)
+- _github-proxy: 99%+ container cpu usage total (1m average) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the github-proxy container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: container_memory_usage

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_container_cpu_usage"
+]
+```
+
+## github-proxy: container_memory_usage

**Descriptions:**

-- _github-proxy: 99%+ container memory usage by instance_ (`warning_github-proxy_container_memory_usage`)
+- _github-proxy: 99%+ container memory usage by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of github-proxy container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: container_restarts

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_container_memory_usage"
+]
+```
+
+## github-proxy: container_restarts

**Descriptions:**

-- _github-proxy: 1+ container restarts every 5m by instance_ (`warning_github-proxy_container_restarts`)
+- _github-proxy: 1+ container restarts every 5m by instance_

**Possible solutions:**

@@ -505,23 +1212,37 @@
 - **Docker Compose:**
 - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' github-proxy` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the github-proxy container in `docker-compose.yml`.
 - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs github-proxy` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: fs_inodes_used

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_container_restarts"
+]
+```
+
+## github-proxy: fs_inodes_used

**Descriptions:**

-- _github-proxy: 3e+06+ fs inodes in use by instance_ (`warning_github-proxy_fs_inodes_used`)
+- _github-proxy: 3e+06+ fs inodes in use by instance_

**Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
+ - **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: provisioning_container_cpu_usage_7d

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_fs_inodes_used"
+]
+```
+
+## github-proxy: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _github-proxy: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_github-proxy_provisioning_container_cpu_usage_7d`)
+- _github-proxy: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -529,12 +1250,19 @@
 - **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider decreasing `cpus:` of the github-proxy container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: provisioning_container_memory_usage_7d

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_provisioning_container_cpu_usage_7d"
+]
+```
+
+## github-proxy: provisioning_container_memory_usage_7d

**Descriptions:**

-- _github-proxy: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_github-proxy_provisioning_container_memory_usage_7d`)
+- _github-proxy: 80%+ or less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

@@ -542,47 +1270,283 @@
 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of github-proxy container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: provisioning_container_cpu_usage_5m

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_provisioning_container_memory_usage_7d"
+]
+```
+
+## github-proxy: provisioning_container_cpu_usage_5m

**Descriptions:**

-- _github-proxy: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_github-proxy_provisioning_container_cpu_usage_5m`)
+- _github-proxy: 90%+ container cpu usage total (5m maximum) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the github-proxy container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# github-proxy: provisioning_container_memory_usage_5m

+```json
+"observability.silenceAlerts": [
+  "warning_github-proxy_provisioning_container_cpu_usage_5m"
+]
+```
+
+## github-proxy: provisioning_container_memory_usage_5m

**Descriptions:**

-- _github-proxy: 90%+ container memory usage (5m maximum) by instance_ (`warning_github-proxy_provisioning_container_memory_usage_5m`)
+- _github-proxy: 90%+ container memory usage (5m maximum) by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of github-proxy container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# precise-code-intel-bundle-manager: disk_space_remaining +```json +"observability.silenceAlerts": [ + "warning_github-proxy_provisioning_container_memory_usage_5m" +] +``` + +## github-proxy: go_goroutines **Descriptions:** -- _precise-code-intel-bundle-manager: less than 25% disk space remaining by instance_ (`warning_precise-code-intel-bundle-manager_disk_space_remaining`) +- _github-proxy: 10000+ maximum active goroutines for 10m_ +**Possible solutions:** -- _precise-code-intel-bundle-manager: less than 15% disk space remaining by instance_ (`critical_precise-code-intel-bundle-manager_disk_space_remaining`) +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_github-proxy_go_goroutines" +] +``` + +## github-proxy: go_gc_duration_seconds + +**Descriptions:** + +- _github-proxy: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_github-proxy_go_gc_duration_seconds" +] +``` + +## github-proxy: pods_available_percentage + +**Descriptions:** + +- _github-proxy: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_github-proxy_pods_available_percentage" +] +``` + +## precise-code-intel-bundle-manager: 99th_percentile_bundle_database_duration + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20s+ 99th percentile successful bundle database query duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_99th_percentile_bundle_database_duration" +] +``` + +## precise-code-intel-bundle-manager: bundle_database_errors + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20+ bundle database errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_bundle_database_errors" +] +``` + +## precise-code-intel-bundle-manager: 99th_percentile_bundle_reader_duration + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20s+ 99th percentile successful bundle reader query duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ 
+ "warning_precise-code-intel-bundle-manager_99th_percentile_bundle_reader_duration" +] +``` + +## precise-code-intel-bundle-manager: bundle_reader_errors + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20+ bundle reader errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_bundle_reader_errors" +] +``` + +## precise-code-intel-bundle-manager: disk_space_remaining + +**Descriptions:** + +- _precise-code-intel-bundle-manager: less than 25% disk space remaining by instance_ + +- _precise-code-intel-bundle-manager: less than 15% disk space remaining by instance_ **Possible solutions:** - **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# precise-code-intel-bundle-manager: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_disk_space_remaining", + "critical_precise-code-intel-bundle-manager_disk_space_remaining" +] +``` + +## precise-code-intel-bundle-manager: janitor_errors **Descriptions:** -- _precise-code-intel-bundle-manager: 5+ frontend-internal API error responses every 5m by route_ (`warning_precise-code-intel-bundle-manager_frontend_internal_api_error_responses`) +- _precise-code-intel-bundle-manager: 20+ janitor errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_janitor_errors" +] +``` + +## precise-code-intel-bundle-manager: janitor_old_uploads_removed + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20+ upload files removed (due to age) every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_janitor_old_uploads_removed" +] +``` + +## precise-code-intel-bundle-manager: janitor_old_parts_removed + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20+ upload and database part files removed (due to age) every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_janitor_old_parts_removed" +] +``` + +## precise-code-intel-bundle-manager: janitor_old_dumps_removed + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 20+ bundle files removed (due to low disk space) every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a 
reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_janitor_old_dumps_removed"
+]
+```
+
+## precise-code-intel-bundle-manager: janitor_orphans
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: 20+ bundle and upload files removed (with no corresponding database entry) every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_janitor_orphans"
+]
+```
+
+## precise-code-intel-bundle-manager: janitor_uploads_removed
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: 20+ upload records removed every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_janitor_uploads_removed"
+]
+```
+
+## precise-code-intel-bundle-manager: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: 5+ frontend-internal API error responses every 5m by route_
 
 **Possible solutions:**
 
@@ -593,34 +1557,55 @@ Try restarting the pod to get a different public IP.
- **Docker Compose:**
	- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
	- Check `docker logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: container_cpu_usage
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_frontend_internal_api_error_responses"
+]
+```
+
+## precise-code-intel-bundle-manager: container_cpu_usage
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_precise-code-intel-bundle-manager_container_cpu_usage`)
+- _precise-code-intel-bundle-manager: 99%+ container cpu usage total (1m average) across all cores by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-bundle-manager container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: container_memory_usage
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_container_cpu_usage"
+]
+```
+
+## precise-code-intel-bundle-manager: container_memory_usage
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 99%+ container memory usage by instance_ (`warning_precise-code-intel-bundle-manager_container_memory_usage`)
+- _precise-code-intel-bundle-manager: 99%+ container memory usage by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-bundle-manager container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: container_restarts
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_container_memory_usage"
+]
+```
+
+## precise-code-intel-bundle-manager: container_restarts
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 1+ container restarts every 5m by instance_ (`warning_precise-code-intel-bundle-manager_container_restarts`)
+- _precise-code-intel-bundle-manager: 1+ container restarts every 5m by instance_
 
 **Possible solutions:**
 
@@ -630,23 +1615,37 @@ Try restarting the pod to get a different public IP.
- **Docker Compose:**
	- Determine if the container was OOM killed using `docker inspect -f '{{json .State}}' precise-code-intel-bundle-manager` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the precise-code-intel-bundle-manager container in `docker-compose.yml`.
	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs precise-code-intel-bundle-manager` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_container_restarts"
+]
+```
+
+## precise-code-intel-bundle-manager: fs_inodes_used
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 3e+06+ fs inodes in use by instance_ (`warning_precise-code-intel-bundle-manager_fs_inodes_used`)
+- _precise-code-intel-bundle-manager: 3e+06+ fs inodes in use by instance_
 
 **Possible solutions:**
 
- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_fs_inodes_used"
+]
+```
+
+## precise-code-intel-bundle-manager: provisioning_container_cpu_usage_7d
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_precise-code-intel-bundle-manager_provisioning_container_cpu_usage_7d`)
+- _precise-code-intel-bundle-manager: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_
 
 **Possible solutions:**
 
@@ -654,12 +1653,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the precise-code-intel-bundle-manager container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_provisioning_container_cpu_usage_7d"
+]
+```
+
+## precise-code-intel-bundle-manager: provisioning_container_memory_usage_7d
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_precise-code-intel-bundle-manager_provisioning_container_memory_usage_7d`)
+- _precise-code-intel-bundle-manager: 80%+ or less than 30% container memory usage (7d maximum) by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-bundle-manager container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_provisioning_container_memory_usage_7d"
+]
+```
+
+## precise-code-intel-bundle-manager: provisioning_container_cpu_usage_5m
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_precise-code-intel-bundle-manager_provisioning_container_cpu_usage_5m`)
+- _precise-code-intel-bundle-manager: 90%+ container cpu usage total (5m maximum) across all cores by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-bundle-manager container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-bundle-manager: provisioning_container_memory_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-bundle-manager_provisioning_container_cpu_usage_5m"
+]
+```
+
+## precise-code-intel-bundle-manager: provisioning_container_memory_usage_5m
 
 **Descriptions:**
 
-- _precise-code-intel-bundle-manager: 90%+ container memory usage (5m maximum) by instance_ (`warning_precise-code-intel-bundle-manager_provisioning_container_memory_usage_5m`)
+- _precise-code-intel-bundle-manager: 90%+ container memory usage (5m maximum) by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-bundle-manager container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# precise-code-intel-worker: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_provisioning_container_memory_usage_5m" +] +``` + +## precise-code-intel-bundle-manager: go_goroutines **Descriptions:** -- _precise-code-intel-worker: 5+ frontend-internal API error responses every 5m by route_ (`warning_precise-code-intel-worker_frontend_internal_api_error_responses`) +- _precise-code-intel-bundle-manager: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_go_goroutines" +] +``` + +## precise-code-intel-bundle-manager: go_gc_duration_seconds + +**Descriptions:** + +- _precise-code-intel-bundle-manager: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-bundle-manager_go_gc_duration_seconds" +] +``` + +## precise-code-intel-bundle-manager: pods_available_percentage + +**Descriptions:** + +- _precise-code-intel-bundle-manager: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_precise-code-intel-bundle-manager_pods_available_percentage" +] +``` + +## precise-code-intel-worker: upload_queue_size + +**Descriptions:** + +- _precise-code-intel-worker: 100+ upload queue size_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_upload_queue_size" +] +``` + +## precise-code-intel-worker: upload_queue_growth_rate + +**Descriptions:** + +- _precise-code-intel-worker: 5+ upload queue growth rate every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_upload_queue_growth_rate" +] +``` + +## precise-code-intel-worker: upload_process_errors + +**Descriptions:** + +- _precise-code-intel-worker: 20+ upload process errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_upload_process_errors" +] +``` + +## precise-code-intel-worker: 
99th_percentile_store_duration + +**Descriptions:** + +- _precise-code-intel-worker: 20s+ 99th percentile successful database query duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_99th_percentile_store_duration" +] +``` + +## precise-code-intel-worker: store_errors + +**Descriptions:** + +- _precise-code-intel-worker: 20+ database errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_store_errors" +] +``` + +## precise-code-intel-worker: processing_uploads_reset + +**Descriptions:** + +- _precise-code-intel-worker: 20+ uploads reset to queued state every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_processing_uploads_reset" +] +``` + +## precise-code-intel-worker: processing_uploads_reset_failures + +**Descriptions:** + +- _precise-code-intel-worker: 20+ uploads errored after repeated resets every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_processing_uploads_reset_failures" +] +``` + +## precise-code-intel-worker: upload_resetter_errors + +**Descriptions:** + +- _precise-code-intel-worker: 20+ upload resetter errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_upload_resetter_errors" +] +``` + +## precise-code-intel-worker: 99th_percentile_bundle_manager_transfer_duration + +**Descriptions:** + +- _precise-code-intel-worker: 300s+ 99th percentile successful bundle manager data transfer duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_99th_percentile_bundle_manager_transfer_duration" +] +``` + +## precise-code-intel-worker: bundle_manager_error_responses + +**Descriptions:** + +- _precise-code-intel-worker: 5+ bundle manager error responses every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_bundle_manager_error_responses" +] +``` + +## precise-code-intel-worker: 
99th_percentile_gitserver_duration
+
+**Descriptions:**
+
+- _precise-code-intel-worker: 20s+ 99th percentile successful gitserver query duration over 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_99th_percentile_gitserver_duration"
+]
+```
+
+## precise-code-intel-worker: gitserver_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-worker: 5+ gitserver error responses every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_gitserver_error_responses"
+]
+```
+
+## precise-code-intel-worker: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-worker: 5+ frontend-internal API error responses every 5m by route_
 
 **Possible solutions:**
 
@@ -705,34 +1972,55 @@ Try restarting the pod to get a different public IP.
- **Docker Compose:**
	- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
	- Check `docker logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: container_cpu_usage
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_frontend_internal_api_error_responses"
+]
+```
+
+## precise-code-intel-worker: container_cpu_usage
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_precise-code-intel-worker_container_cpu_usage`)
+- _precise-code-intel-worker: 99%+ container cpu usage total (1m average) across all cores by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-worker container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: container_memory_usage
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_container_cpu_usage"
+]
+```
+
+## precise-code-intel-worker: container_memory_usage
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 99%+ container memory usage by instance_ (`warning_precise-code-intel-worker_container_memory_usage`)
+- _precise-code-intel-worker: 99%+ container memory usage by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-worker container in `docker-compose.yml` (see the sketch below).
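+- **Example:** As a rough illustration of the Docker Compose advice above, the resource settings for this container could look like the sketch below. Field names depend on your Compose file version (`cpus:` and `mem_limit:` in v2.x, `deploy.resources.limits` in v3.x), and the image tag and values here are assumptions, not recommendations:
+
+```yaml
+# Sketch only (Compose v2.4 syntax): size these values to your own instance.
+version: '2.4'
+services:
+  precise-code-intel-worker:
+    image: sourcegraph/precise-code-intel-worker:insiders # assumed tag
+    cpus: 2         # raise this if CPU-based alerts keep firing
+    mem_limit: '4g' # raise this if memory-based alerts keep firing
+```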
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: container_restarts
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_container_memory_usage"
+]
+```
+
+## precise-code-intel-worker: container_restarts
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 1+ container restarts every 5m by instance_ (`warning_precise-code-intel-worker_container_restarts`)
+- _precise-code-intel-worker: 1+ container restarts every 5m by instance_
 
 **Possible solutions:**
 
@@ -742,23 +2030,37 @@ Try restarting the pod to get a different public IP.
- **Docker Compose:**
	- Determine if the container was OOM killed using `docker inspect -f '{{json .State}}' precise-code-intel-worker` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the precise-code-intel-worker container in `docker-compose.yml`.
	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs precise-code-intel-worker` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_container_restarts"
+]
+```
+
+## precise-code-intel-worker: fs_inodes_used
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 3e+06+ fs inodes in use by instance_ (`warning_precise-code-intel-worker_fs_inodes_used`)
+- _precise-code-intel-worker: 3e+06+ fs inodes in use by instance_
 
 **Possible solutions:**
 
- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_fs_inodes_used"
+]
+```
+
+## precise-code-intel-worker: provisioning_container_cpu_usage_7d
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_precise-code-intel-worker_provisioning_container_cpu_usage_7d`)
+- _precise-code-intel-worker: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_
 
 **Possible solutions:**
 
@@ -766,12 +2068,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the precise-code-intel-worker container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_provisioning_container_cpu_usage_7d"
+]
+```
+
+## precise-code-intel-worker: provisioning_container_memory_usage_7d
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_precise-code-intel-worker_provisioning_container_memory_usage_7d`)
+- _precise-code-intel-worker: 80%+ or less than 30% container memory usage (7d maximum) by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-worker container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_provisioning_container_memory_usage_7d"
+]
+```
+
+## precise-code-intel-worker: provisioning_container_cpu_usage_5m
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_precise-code-intel-worker_provisioning_container_cpu_usage_5m`)
+- _precise-code-intel-worker: 90%+ container cpu usage total (5m maximum) across all cores by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-worker container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-worker: provisioning_container_memory_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-worker_provisioning_container_cpu_usage_5m"
+]
+```
+
+## precise-code-intel-worker: provisioning_container_memory_usage_5m
 
 **Descriptions:**
 
-- _precise-code-intel-worker: 90%+ container memory usage (5m maximum) by instance_ (`warning_precise-code-intel-worker_provisioning_container_memory_usage_5m`)
+- _precise-code-intel-worker: 90%+ container memory usage (5m maximum) by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-worker container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# precise-code-intel-indexer: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_provisioning_container_memory_usage_5m" +] +``` + +## precise-code-intel-worker: go_goroutines **Descriptions:** -- _precise-code-intel-indexer: 5+ frontend-internal API error responses every 5m by route_ (`warning_precise-code-intel-indexer_frontend_internal_api_error_responses`) +- _precise-code-intel-worker: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_go_goroutines" +] +``` + +## precise-code-intel-worker: go_gc_duration_seconds + +**Descriptions:** + +- _precise-code-intel-worker: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-worker_go_gc_duration_seconds" +] +``` + +## precise-code-intel-worker: pods_available_percentage + +**Descriptions:** + +- _precise-code-intel-worker: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_precise-code-intel-worker_pods_available_percentage" +] +``` + +## precise-code-intel-indexer: index_queue_size + +**Descriptions:** + +- _precise-code-intel-indexer: 100+ index queue size_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_index_queue_size" +] +``` + +## precise-code-intel-indexer: index_queue_growth_rate + +**Descriptions:** + +- _precise-code-intel-indexer: 5+ index queue growth rate every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_index_queue_growth_rate" +] +``` + +## precise-code-intel-indexer: index_process_errors + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ index process errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_index_process_errors" +] +``` + +## precise-code-intel-indexer: 99th_percentile_store_duration + +**Descriptions:** + +- _precise-code-intel-indexer: 20s+ 99th 
percentile successful database query duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_99th_percentile_store_duration" +] +``` + +## precise-code-intel-indexer: store_errors + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ database errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_store_errors" +] +``` + +## precise-code-intel-indexer: indexability_updater_errors + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ indexability updater errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_indexability_updater_errors" +] +``` + +## precise-code-intel-indexer: index_scheduler_errors + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ index scheduler errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_index_scheduler_errors" +] +``` + +## precise-code-intel-indexer: processing_indexes_reset + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ indexes reset to queued state every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_processing_indexes_reset" +] +``` + +## precise-code-intel-indexer: processing_indexes_reset_failures + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ indexes errored after repeated resets every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_processing_indexes_reset_failures" +] +``` + +## precise-code-intel-indexer: index_resetter_errors + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ index resetter errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_index_resetter_errors" +] +``` + +## precise-code-intel-indexer: janitor_errors + +**Descriptions:** + +- _precise-code-intel-indexer: 20+ janitor errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and 
want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_janitor_errors"
+]
+```
+
+## precise-code-intel-indexer: janitor_indexes_removed
+
+**Descriptions:**
+
+- _precise-code-intel-indexer: 20+ index records removed every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_janitor_indexes_removed"
+]
+```
+
+## precise-code-intel-indexer: 99th_percentile_gitserver_duration
+
+**Descriptions:**
+
+- _precise-code-intel-indexer: 20s+ 99th percentile successful gitserver query duration over 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_99th_percentile_gitserver_duration"
+]
+```
+
+## precise-code-intel-indexer: gitserver_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-indexer: 5+ gitserver error responses every 5m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_gitserver_error_responses"
+]
+```
+
+## precise-code-intel-indexer: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-indexer: 5+ frontend-internal API error responses every 5m by route_
 
 **Possible solutions:**
 
@@ -817,34 +2419,55 @@ Try restarting the pod to get a different public IP.
- **Docker Compose:**
	- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
	- Check `docker logs precise-code-intel-indexer` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: container_cpu_usage
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_frontend_internal_api_error_responses"
+]
+```
+
+## precise-code-intel-indexer: container_cpu_usage
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_precise-code-intel-indexer_container_cpu_usage`)
+- _precise-code-intel-indexer: 99%+ container cpu usage total (1m average) across all cores by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-indexer container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: container_memory_usage
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_container_cpu_usage"
+]
+```
+
+## precise-code-intel-indexer: container_memory_usage
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 99%+ container memory usage by instance_ (`warning_precise-code-intel-indexer_container_memory_usage`)
+- _precise-code-intel-indexer: 99%+ container memory usage by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-indexer container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: container_restarts
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_container_memory_usage"
+]
+```
+
+## precise-code-intel-indexer: container_restarts
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 1+ container restarts every 5m by instance_ (`warning_precise-code-intel-indexer_container_restarts`)
+- _precise-code-intel-indexer: 1+ container restarts every 5m by instance_
 
 **Possible solutions:**
 
@@ -854,23 +2477,37 @@ Try restarting the pod to get a different public IP.
- **Docker Compose:**
	- Determine if the container was OOM killed using `docker inspect -f '{{json .State}}' precise-code-intel-indexer` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the precise-code-intel-indexer container in `docker-compose.yml`.
	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs precise-code-intel-indexer` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_container_restarts"
+]
+```
+
+## precise-code-intel-indexer: fs_inodes_used
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 3e+06+ fs inodes in use by instance_ (`warning_precise-code-intel-indexer_fs_inodes_used`)
+- _precise-code-intel-indexer: 3e+06+ fs inodes in use by instance_
 
 **Possible solutions:**
 
- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
 - **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_fs_inodes_used"
+]
+```
+
+## precise-code-intel-indexer: provisioning_container_cpu_usage_7d
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_precise-code-intel-indexer_provisioning_container_cpu_usage_7d`)
+- _precise-code-intel-indexer: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_
 
 **Possible solutions:**
 
@@ -878,12 +2515,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the precise-code-intel-indexer container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_provisioning_container_cpu_usage_7d"
+]
+```
+
+## precise-code-intel-indexer: provisioning_container_memory_usage_7d
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_precise-code-intel-indexer_provisioning_container_memory_usage_7d`)
+- _precise-code-intel-indexer: 80%+ or less than 30% container memory usage (7d maximum) by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of precise-code-intel-indexer container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
 
-# precise-code-intel-indexer: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_precise-code-intel-indexer_provisioning_container_memory_usage_7d"
+]
+```
+
+## precise-code-intel-indexer: provisioning_container_cpu_usage_5m
 
 **Descriptions:**
 
-- _precise-code-intel-indexer: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_precise-code-intel-indexer_provisioning_container_cpu_usage_5m`)
+- _precise-code-intel-indexer: 90%+ container cpu usage total (5m maximum) across all cores by instance_
 
 **Possible solutions:**
 
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-indexer container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# precise-code-intel-indexer: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_provisioning_container_cpu_usage_5m" +] +``` + +## precise-code-intel-indexer: provisioning_container_memory_usage_5m **Descriptions:** -- _precise-code-intel-indexer: 90%+ container memory usage (5m maximum) by instance_ (`warning_precise-code-intel-indexer_provisioning_container_memory_usage_5m`) +- _precise-code-intel-indexer: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of precise-code-intel-indexer container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# query-runner: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_provisioning_container_memory_usage_5m" +] +``` + +## precise-code-intel-indexer: go_goroutines **Descriptions:** -- _query-runner: 5+ frontend-internal API error responses every 5m by route_ (`warning_query-runner_frontend_internal_api_error_responses`) +- _precise-code-intel-indexer: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_go_goroutines" +] +``` + +## precise-code-intel-indexer: go_gc_duration_seconds + +**Descriptions:** + +- _precise-code-intel-indexer: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_precise-code-intel-indexer_go_gc_duration_seconds" +] +``` + +## precise-code-intel-indexer: pods_available_percentage + +**Descriptions:** + +- _precise-code-intel-indexer: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_precise-code-intel-indexer_pods_available_percentage" +] +``` + +## query-runner: frontend_internal_api_error_responses + +**Descriptions:** + +- _query-runner: 5+ frontend-internal API error responses every 5m by route_ **Possible solutions:** @@ -929,34 +2642,55 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Confirm that `docker ps` shows the `frontend-internal` container is healthy. - Check `docker logs query-runner` for logs indicating request failures to `frontend` or `frontend-internal`. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# query-runner: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_query-runner_frontend_internal_api_error_responses" +] +``` + +## query-runner: container_memory_usage **Descriptions:** -- _query-runner: 99%+ container memory usage by instance_ (`warning_query-runner_container_memory_usage`) +- _query-runner: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of query-runner container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# query-runner: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_query-runner_container_memory_usage" +] +``` + +## query-runner: container_cpu_usage **Descriptions:** -- _query-runner: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_query-runner_container_cpu_usage`) +- _query-runner: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the query-runner container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# query-runner: container_restarts +```json +"observability.silenceAlerts": [ + "warning_query-runner_container_cpu_usage" +] +``` + +## query-runner: container_restarts **Descriptions:** -- _query-runner: 1+ container restarts every 5m by instance_ (`warning_query-runner_container_restarts`) +- _query-runner: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -966,23 +2700,37 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' query-runner` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the query-runner container in `docker-compose.yml`. - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs query-runner` (note this will include logs from the previous and currently running container). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# query-runner: fs_inodes_used +```json +"observability.silenceAlerts": [ + "warning_query-runner_container_restarts" +] +``` + +## query-runner: fs_inodes_used **Descriptions:** -- _query-runner: 3e+06+ fs inodes in use by instance_ (`warning_query-runner_fs_inodes_used`) +- _query-runner: 3e+06+ fs inodes in use by instance_ **Possible solutions:** - - Refer to your OS or cloud provider`s documentation for how to increase inodes. - - **Kubernetes:** consider provisioning more machines with less resources. +- - Refer to your OS or cloud provider`s documentation for how to increase inodes. 
+ - **Kubernetes:** consider provisioning more machines with less resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# query-runner: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_query-runner_fs_inodes_used"
+]
+```
+
+## query-runner: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _query-runner: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_query-runner_provisioning_container_cpu_usage_7d`)
+- _query-runner: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -990,12 +2738,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the query-runner container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# query-runner: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_query-runner_provisioning_container_cpu_usage_7d"
+]
+```
+
+## query-runner: provisioning_container_memory_usage_7d

**Descriptions:**

-- _query-runner: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_query-runner_provisioning_container_memory_usage_7d`)
+- _query-runner: 80%+ or less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

@@ -1003,34 +2758,103 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of query-runner container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# query-runner: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_query-runner_provisioning_container_memory_usage_7d"
+]
+```
+
+## query-runner: provisioning_container_cpu_usage_5m

**Descriptions:**

-- _query-runner: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_query-runner_provisioning_container_cpu_usage_5m`)
+- _query-runner: 90%+ container cpu usage total (5m maximum) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the query-runner container in `docker-compose.yml`.
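+- **Example (sketch only):** a CPU increase in `docker-compose.yml` might look like the following. The service name matches this alert, but the values are illustrative and the exact schema should be confirmed against your deployment's actual compose file:
+
+```yaml
+# docker-compose.yml excerpt (hypothetical values).
+# Raise `cpus:` for the service that is regularly hitting max CPU utilization.
+query-runner:
+  cpus: 2  # e.g. doubled from a previous value of 1
+```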
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# query-runner: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_query-runner_provisioning_container_cpu_usage_5m" +] +``` + +## query-runner: provisioning_container_memory_usage_5m **Descriptions:** -- _query-runner: 90%+ container memory usage (5m maximum) by instance_ (`warning_query-runner_provisioning_container_memory_usage_5m`) +- _query-runner: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of query-runner container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# replacer: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_query-runner_provisioning_container_memory_usage_5m" +] +``` + +## query-runner: go_goroutines **Descriptions:** -- _replacer: 5+ frontend-internal API error responses every 5m by route_ (`warning_replacer_frontend_internal_api_error_responses`) +- _query-runner: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_query-runner_go_goroutines" +] +``` + +## query-runner: go_gc_duration_seconds + +**Descriptions:** + +- _query-runner: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_query-runner_go_gc_duration_seconds" +] +``` + +## query-runner: pods_available_percentage + +**Descriptions:** + +- _query-runner: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_query-runner_pods_available_percentage" +] +``` + +## replacer: frontend_internal_api_error_responses + +**Descriptions:** + +- _replacer: 5+ frontend-internal API error responses every 5m by route_ **Possible solutions:** @@ -1041,34 +2865,55 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Confirm that `docker ps` shows the `frontend-internal` container is healthy. - Check `docker logs replacer` for logs indicating request failures to `frontend` or `frontend-internal`. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# replacer: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_replacer_frontend_internal_api_error_responses" +] +``` + +## replacer: container_cpu_usage **Descriptions:** -- _replacer: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_replacer_container_cpu_usage`) +- _replacer: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the replacer container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# replacer: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_replacer_container_cpu_usage" +] +``` + +## replacer: container_memory_usage **Descriptions:** -- _replacer: 99%+ container memory usage by instance_ (`warning_replacer_container_memory_usage`) +- _replacer: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of replacer container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# replacer: container_restarts +```json +"observability.silenceAlerts": [ + "warning_replacer_container_memory_usage" +] +``` + +## replacer: container_restarts **Descriptions:** -- _replacer: 1+ container restarts every 5m by instance_ (`warning_replacer_container_restarts`) +- _replacer: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -1078,23 +2923,37 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' replacer` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the replacer container in `docker-compose.yml`. - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs replacer` (note this will include logs from the previous and currently running container). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# replacer: fs_inodes_used +```json +"observability.silenceAlerts": [ + "warning_replacer_container_restarts" +] +``` + +## replacer: fs_inodes_used **Descriptions:** -- _replacer: 3e+06+ fs inodes in use by instance_ (`warning_replacer_fs_inodes_used`) +- _replacer: 3e+06+ fs inodes in use by instance_ **Possible solutions:** - - Refer to your OS or cloud provider`s documentation for how to increase inodes. - - **Kubernetes:** consider provisioning more machines with less resources. +- - Refer to your OS or cloud provider`s documentation for how to increase inodes. + - **Kubernetes:** consider provisioning more machines with less resources. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# replacer: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_replacer_fs_inodes_used"
+]
+```
+
+## replacer: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _replacer: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_replacer_provisioning_container_cpu_usage_7d`)
+- _replacer: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -1102,12 +2961,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the replacer container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# replacer: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_replacer_provisioning_container_cpu_usage_7d"
+]
+```
+
+## replacer: provisioning_container_memory_usage_7d

**Descriptions:**

-- _replacer: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_replacer_provisioning_container_memory_usage_7d`)
+- _replacer: 80%+ or less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

@@ -1115,34 +2981,103 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of replacer container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# replacer: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_replacer_provisioning_container_memory_usage_7d"
+]
+```
+
+## replacer: provisioning_container_cpu_usage_5m

**Descriptions:**

-- _replacer: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_replacer_provisioning_container_cpu_usage_5m`)
+- _replacer: 90%+ container cpu usage total (5m maximum) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the replacer container in `docker-compose.yml`.
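+- **Example (sketch only):** on Kubernetes, CPU and memory limits live under the container's `resources:` stanza in the relevant `Deployment.yaml`. The excerpt below shows where such a change goes; the values are placeholders, not tuned recommendations:
+
+```yaml
+# Deployment.yaml excerpt (hypothetical values).
+spec:
+  template:
+    spec:
+      containers:
+        - name: replacer
+          resources:
+            limits:
+              cpu: "2"       # raise when regularly hitting max CPU utilization
+              memory: "512M" # raise when regularly hitting the memory limit
+```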
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# replacer: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_replacer_provisioning_container_cpu_usage_5m" +] +``` + +## replacer: provisioning_container_memory_usage_5m **Descriptions:** -- _replacer: 90%+ container memory usage (5m maximum) by instance_ (`warning_replacer_provisioning_container_memory_usage_5m`) +- _replacer: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of replacer container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# repo-updater: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_replacer_provisioning_container_memory_usage_5m" +] +``` + +## replacer: go_goroutines **Descriptions:** -- _repo-updater: 5+ frontend-internal API error responses every 5m by route_ (`warning_repo-updater_frontend_internal_api_error_responses`) +- _replacer: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_replacer_go_goroutines" +] +``` + +## replacer: go_gc_duration_seconds + +**Descriptions:** + +- _replacer: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_replacer_go_gc_duration_seconds" +] +``` + +## replacer: pods_available_percentage + +**Descriptions:** + +- _replacer: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_replacer_pods_available_percentage" +] +``` + +## repo-updater: frontend_internal_api_error_responses + +**Descriptions:** + +- _repo-updater: 5+ frontend-internal API error responses every 5m by route_ **Possible solutions:** @@ -1153,34 +3088,55 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Confirm that `docker ps` shows the `frontend-internal` container is healthy. - Check `docker logs repo-updater` for logs indicating request failures to `frontend` or `frontend-internal`. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# repo-updater: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_repo-updater_frontend_internal_api_error_responses" +] +``` + +## repo-updater: container_cpu_usage **Descriptions:** -- _repo-updater: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_repo-updater_container_cpu_usage`) +- _repo-updater: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the repo-updater container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# repo-updater: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_repo-updater_container_cpu_usage" +] +``` + +## repo-updater: container_memory_usage **Descriptions:** -- _repo-updater: 99%+ container memory usage by instance_ (`warning_repo-updater_container_memory_usage`) +- _repo-updater: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# repo-updater: container_restarts +```json +"observability.silenceAlerts": [ + "warning_repo-updater_container_memory_usage" +] +``` + +## repo-updater: container_restarts **Descriptions:** -- _repo-updater: 1+ container restarts every 5m by instance_ (`warning_repo-updater_container_restarts`) +- _repo-updater: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -1190,23 +3146,37 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' repo-updater` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the repo-updater container in `docker-compose.yml`. - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs repo-updater` (note this will include logs from the previous and currently running container). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# repo-updater: fs_inodes_used +```json +"observability.silenceAlerts": [ + "warning_repo-updater_container_restarts" +] +``` + +## repo-updater: fs_inodes_used **Descriptions:** -- _repo-updater: 3e+06+ fs inodes in use by instance_ (`warning_repo-updater_fs_inodes_used`) +- _repo-updater: 3e+06+ fs inodes in use by instance_ **Possible solutions:** - - Refer to your OS or cloud provider`s documentation for how to increase inodes. - - **Kubernetes:** consider provisioning more machines with less resources. +- - Refer to your OS or cloud provider`s documentation for how to increase inodes. 
+ - **Kubernetes:** consider provisioning more machines with less resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# repo-updater: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_repo-updater_fs_inodes_used"
+]
+```
+
+## repo-updater: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _repo-updater: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_repo-updater_provisioning_container_cpu_usage_7d`)
+- _repo-updater: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -1214,12 +3184,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the repo-updater container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# repo-updater: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_repo-updater_provisioning_container_cpu_usage_7d"
+]
+```
+
+## repo-updater: provisioning_container_memory_usage_7d

**Descriptions:**

-- _repo-updater: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_repo-updater_provisioning_container_memory_usage_7d`)
+- _repo-updater: 80%+ or less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

@@ -1227,34 +3204,135 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# repo-updater: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_repo-updater_provisioning_container_memory_usage_7d"
+]
+```
+
+## repo-updater: provisioning_container_cpu_usage_5m

**Descriptions:**

-- _repo-updater: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_repo-updater_provisioning_container_cpu_usage_5m`)
+- _repo-updater: 90%+ container cpu usage total (5m maximum) across all cores by instance_

**Possible solutions:**

- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the repo-updater container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# repo-updater: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_repo-updater_provisioning_container_cpu_usage_5m" +] +``` + +## repo-updater: provisioning_container_memory_usage_5m **Descriptions:** -- _repo-updater: 90%+ container memory usage (5m maximum) by instance_ (`warning_repo-updater_provisioning_container_memory_usage_5m`) +- _repo-updater: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of repo-updater container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# searcher: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_repo-updater_provisioning_container_memory_usage_5m" +] +``` + +## repo-updater: go_goroutines **Descriptions:** -- _searcher: 5+ frontend-internal API error responses every 5m by route_ (`warning_searcher_frontend_internal_api_error_responses`) +- _repo-updater: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_repo-updater_go_goroutines" +] +``` + +## repo-updater: go_gc_duration_seconds + +**Descriptions:** + +- _repo-updater: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_repo-updater_go_gc_duration_seconds" +] +``` + +## repo-updater: pods_available_percentage + +**Descriptions:** + +- _repo-updater: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_repo-updater_pods_available_percentage" +] +``` + +## searcher: unindexed_search_request_errors + +**Descriptions:** + +- _searcher: 5+ unindexed search request errors every 5m by code_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_searcher_unindexed_search_request_errors" +] +``` + +## searcher: replica_traffic + +**Descriptions:** + +- _searcher: 5+ requests per second over 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_searcher_replica_traffic" +] +``` + +## 
searcher: frontend_internal_api_error_responses + +**Descriptions:** + +- _searcher: 5+ frontend-internal API error responses every 5m by route_ **Possible solutions:** @@ -1265,34 +3343,55 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Confirm that `docker ps` shows the `frontend-internal` container is healthy. - Check `docker logs searcher` for logs indicating request failures to `frontend` or `frontend-internal`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# searcher: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_searcher_frontend_internal_api_error_responses" +] +``` + +## searcher: container_cpu_usage **Descriptions:** -- _searcher: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_searcher_container_cpu_usage`) +- _searcher: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# searcher: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_searcher_container_cpu_usage" +] +``` + +## searcher: container_memory_usage **Descriptions:** -- _searcher: 99%+ container memory usage by instance_ (`warning_searcher_container_memory_usage`) +- _searcher: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# searcher: container_restarts +```json +"observability.silenceAlerts": [ + "warning_searcher_container_memory_usage" +] +``` + +## searcher: container_restarts **Descriptions:** -- _searcher: 1+ container restarts every 5m by instance_ (`warning_searcher_container_restarts`) +- _searcher: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -1302,23 +3401,37 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' searcher` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the searcher container in `docker-compose.yml`. - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs searcher` (note this will include logs from the previous and currently running container). 
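+- **Example (sketch only):** if `docker inspect` confirms the container was OOM killed, raising its memory allocation in `docker-compose.yml` could look like the following. The `memory:` key mirrors the wording above — some compose schemas call it `mem_limit` — and the value is a placeholder:
+
+```yaml
+# docker-compose.yml excerpt (hypothetical values).
+searcher:
+  memory: 4g  # give the container enough headroom to avoid repeated OOM kills
+```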
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# searcher: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_searcher_container_restarts"
+]
+```
+
+## searcher: fs_inodes_used

**Descriptions:**

-- _searcher: 3e+06+ fs inodes in use by instance_ (`warning_searcher_fs_inodes_used`)
+- _searcher: 3e+06+ fs inodes in use by instance_

**Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
- - **Kubernetes:** consider provisioning more machines with less resources.
+- - Refer to your OS or cloud provider's documentation for how to increase inodes.
+ - **Kubernetes:** consider provisioning more machines with less resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# searcher: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_searcher_fs_inodes_used"
+]
+```
+
+## searcher: provisioning_container_cpu_usage_7d

**Descriptions:**

-- _searcher: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_searcher_provisioning_container_cpu_usage_7d`)
+- _searcher: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

**Possible solutions:**

@@ -1326,12 +3439,19 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider decreasing `cpus:` of the searcher container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# searcher: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_searcher_provisioning_container_cpu_usage_7d"
+]
+```
+
+## searcher: provisioning_container_memory_usage_7d

**Descriptions:**

-- _searcher: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_searcher_provisioning_container_memory_usage_7d`)
+- _searcher: 80%+ or less than 30% container memory usage (7d maximum) by instance_

**Possible solutions:**

@@ -1339,34 +3459,135 @@ Try restarting the pod to get a different public IP.
- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`.
- If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# searcher: provisioning_container_cpu_usage_5m +```json +"observability.silenceAlerts": [ + "warning_searcher_provisioning_container_memory_usage_7d" +] +``` + +## searcher: provisioning_container_cpu_usage_5m **Descriptions:** -- _searcher: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_searcher_provisioning_container_cpu_usage_5m`) +- _searcher: 90%+ container cpu usage total (5m maximum) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the searcher container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# searcher: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_searcher_provisioning_container_cpu_usage_5m" +] +``` + +## searcher: provisioning_container_memory_usage_5m **Descriptions:** -- _searcher: 90%+ container memory usage (5m maximum) by instance_ (`warning_searcher_provisioning_container_memory_usage_5m`) +- _searcher: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of searcher container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: frontend_internal_api_error_responses +```json +"observability.silenceAlerts": [ + "warning_searcher_provisioning_container_memory_usage_5m" +] +``` + +## searcher: go_goroutines **Descriptions:** -- _symbols: 5+ frontend-internal API error responses every 5m by route_ (`warning_symbols_frontend_internal_api_error_responses`) +- _searcher: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_searcher_go_goroutines" +] +``` + +## searcher: go_gc_duration_seconds + +**Descriptions:** + +- _searcher: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_searcher_go_gc_duration_seconds" +] +``` + +## searcher: pods_available_percentage + +**Descriptions:** + +- _searcher: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_searcher_pods_available_percentage" +] +``` + +## symbols: store_fetch_failures + +**Descriptions:** + +- _symbols: 5+ store fetch 
failures every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_symbols_store_fetch_failures" +] +``` + +## symbols: current_fetch_queue_size + +**Descriptions:** + +- _symbols: 25+ current fetch queue size_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_symbols_current_fetch_queue_size" +] +``` + +## symbols: frontend_internal_api_error_responses + +**Descriptions:** + +- _symbols: 5+ frontend-internal API error responses every 5m by route_ **Possible solutions:** @@ -1377,34 +3598,55 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Confirm that `docker ps` shows the `frontend-internal` container is healthy. - Check `docker logs symbols` for logs indicating request failures to `frontend` or `frontend-internal`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_symbols_frontend_internal_api_error_responses" +] +``` + +## symbols: container_cpu_usage **Descriptions:** -- _symbols: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_symbols_container_cpu_usage`) +- _symbols: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the symbols container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_symbols_container_cpu_usage" +] +``` + +## symbols: container_memory_usage **Descriptions:** -- _symbols: 99%+ container memory usage by instance_ (`warning_symbols_container_memory_usage`) +- _symbols: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of symbols container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: container_restarts +```json +"observability.silenceAlerts": [ + "warning_symbols_container_memory_usage" +] +``` + +## symbols: container_restarts **Descriptions:** -- _symbols: 1+ container restarts every 5m by instance_ (`warning_symbols_container_restarts`) +- _symbols: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -1414,23 +3656,37 @@ Try restarting the pod to get a different public IP. 
- **Docker Compose:** - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' symbols` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the symbols container in `docker-compose.yml`. - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs symbols` (note this will include logs from the previous and currently running container). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: fs_inodes_used +```json +"observability.silenceAlerts": [ + "warning_symbols_container_restarts" +] +``` + +## symbols: fs_inodes_used **Descriptions:** -- _symbols: 3e+06+ fs inodes in use by instance_ (`warning_symbols_fs_inodes_used`) +- _symbols: 3e+06+ fs inodes in use by instance_ **Possible solutions:** - - Refer to your OS or cloud provider`s documentation for how to increase inodes. - - **Kubernetes:** consider provisioning more machines with less resources. +- - Refer to your OS or cloud provider`s documentation for how to increase inodes. + - **Kubernetes:** consider provisioning more machines with less resources. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: provisioning_container_cpu_usage_7d +```json +"observability.silenceAlerts": [ + "warning_symbols_fs_inodes_used" +] +``` + +## symbols: provisioning_container_cpu_usage_7d **Descriptions:** -- _symbols: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_symbols_provisioning_container_cpu_usage_7d`) +- _symbols: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ **Possible solutions:** @@ -1438,12 +3694,19 @@ Try restarting the pod to get a different public IP. - **Kubernetes:** Consider decreasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider descreasing `cpus:` of the symbols container in `docker-compose.yml`. - If usage is low, consider decreasing the above values. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: provisioning_container_memory_usage_7d +```json +"observability.silenceAlerts": [ + "warning_symbols_provisioning_container_cpu_usage_7d" +] +``` + +## symbols: provisioning_container_memory_usage_7d **Descriptions:** -- _symbols: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_symbols_provisioning_container_memory_usage_7d`) +- _symbols: 80%+ or less than 30% container memory usage (7d maximum) by instance_ **Possible solutions:** @@ -1451,56 +3714,203 @@ Try restarting the pod to get a different public IP. - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of symbols container in `docker-compose.yml`. - If usage is low, consider decreasing the above values. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: provisioning_container_cpu_usage_5m +```json +"observability.silenceAlerts": [ + "warning_symbols_provisioning_container_memory_usage_7d" +] +``` + +## symbols: provisioning_container_cpu_usage_5m **Descriptions:** -- _symbols: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_symbols_provisioning_container_cpu_usage_5m`) +- _symbols: 90%+ container cpu usage total (5m maximum) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the symbols container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# symbols: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_symbols_provisioning_container_cpu_usage_5m" +] +``` + +## symbols: provisioning_container_memory_usage_5m **Descriptions:** -- _symbols: 90%+ container memory usage (5m maximum) by instance_ (`warning_symbols_provisioning_container_memory_usage_5m`) +- _symbols: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of symbols container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_symbols_provisioning_container_memory_usage_5m" +] +``` + +## symbols: go_goroutines **Descriptions:** -- _syntect-server: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_syntect-server_container_cpu_usage`) +- _symbols: 10000+ maximum active goroutines for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_symbols_go_goroutines" +] +``` + +## symbols: go_gc_duration_seconds + +**Descriptions:** + +- _symbols: 2s+ maximum go garbage collection duration_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_symbols_go_gc_duration_seconds" +] +``` + +## symbols: pods_available_percentage + +**Descriptions:** + +- _symbols: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_symbols_pods_available_percentage" +] +``` + +## syntect-server: syntax_highlighting_errors + +**Descriptions:** + +- _syntect-server: 5+ syntax 
highlighting errors every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_syntect-server_syntax_highlighting_errors" +] +``` + +## syntect-server: syntax_highlighting_panics + +**Descriptions:** + +- _syntect-server: 5+ syntax highlighting panics every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_syntect-server_syntax_highlighting_panics" +] +``` + +## syntect-server: syntax_highlighting_timeouts + +**Descriptions:** + +- _syntect-server: 5+ syntax highlighting timeouts every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_syntect-server_syntax_highlighting_timeouts" +] +``` + +## syntect-server: syntax_highlighting_worker_deaths + +**Descriptions:** + +- _syntect-server: 1+ syntax highlighter worker deaths every 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_syntect-server_syntax_highlighting_worker_deaths" +] +``` + +## syntect-server: container_cpu_usage + +**Descriptions:** + +- _syntect-server: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_syntect-server_container_cpu_usage" +] +``` + +## syntect-server: container_memory_usage **Descriptions:** -- _syntect-server: 99%+ container memory usage by instance_ (`warning_syntect-server_container_memory_usage`) +- _syntect-server: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: container_restarts +```json +"observability.silenceAlerts": [ + "warning_syntect-server_container_memory_usage" +] +``` + +## syntect-server: container_restarts **Descriptions:** -- _syntect-server: 1+ container restarts every 5m by instance_ (`warning_syntect-server_container_restarts`) +- _syntect-server: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -1510,23 +3920,37 @@ Try restarting the pod to get a different public IP. - **Docker Compose:** - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' syntect-server` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the syntect-server container in `docker-compose.yml`. - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs syntect-server` (note this will include logs from the previous and currently running container). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: fs_inodes_used +```json +"observability.silenceAlerts": [ + "warning_syntect-server_container_restarts" +] +``` + +## syntect-server: fs_inodes_used **Descriptions:** -- _syntect-server: 3e+06+ fs inodes in use by instance_ (`warning_syntect-server_fs_inodes_used`) +- _syntect-server: 3e+06+ fs inodes in use by instance_ **Possible solutions:** - - Refer to your OS or cloud provider`s documentation for how to increase inodes. - - **Kubernetes:** consider provisioning more machines with less resources. +- - Refer to your OS or cloud provider`s documentation for how to increase inodes. + - **Kubernetes:** consider provisioning more machines with less resources. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: provisioning_container_cpu_usage_7d +```json +"observability.silenceAlerts": [ + "warning_syntect-server_fs_inodes_used" +] +``` + +## syntect-server: provisioning_container_cpu_usage_7d **Descriptions:** -- _syntect-server: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_syntect-server_provisioning_container_cpu_usage_7d`) +- _syntect-server: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ **Possible solutions:** @@ -1534,12 +3958,19 @@ Try restarting the pod to get a different public IP. - **Kubernetes:** Consider decreasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider descreasing `cpus:` of the syntect-server container in `docker-compose.yml`. - If usage is low, consider decreasing the above values. 
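+- **Example (sketch only):** the under-utilized case is the same edit in the opposite direction, e.g. trimming an over-allocated CPU reservation in `docker-compose.yml` (illustrative values):
+
+```yaml
+# docker-compose.yml excerpt (hypothetical values).
+syntect-server:
+  cpus: 1  # reduce only if the 7d maximum stays well below the 30% threshold
+```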
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: provisioning_container_memory_usage_7d +```json +"observability.silenceAlerts": [ + "warning_syntect-server_provisioning_container_cpu_usage_7d" +] +``` + +## syntect-server: provisioning_container_memory_usage_7d **Descriptions:** -- _syntect-server: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_syntect-server_provisioning_container_memory_usage_7d`) +- _syntect-server: 80%+ or less than 30% container memory usage (7d maximum) by instance_ **Possible solutions:** @@ -1547,56 +3978,126 @@ Try restarting the pod to get a different public IP. - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. - If usage is low, consider decreasing the above values. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: provisioning_container_cpu_usage_5m +```json +"observability.silenceAlerts": [ + "warning_syntect-server_provisioning_container_memory_usage_7d" +] +``` + +## syntect-server: provisioning_container_cpu_usage_5m **Descriptions:** -- _syntect-server: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_syntect-server_provisioning_container_cpu_usage_5m`) +- _syntect-server: 90%+ container cpu usage total (5m maximum) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the syntect-server container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# syntect-server: provisioning_container_memory_usage_5m +```json +"observability.silenceAlerts": [ + "warning_syntect-server_provisioning_container_cpu_usage_5m" +] +``` + +## syntect-server: provisioning_container_memory_usage_5m **Descriptions:** -- _syntect-server: 90%+ container memory usage (5m maximum) by instance_ (`warning_syntect-server_provisioning_container_memory_usage_5m`) +- _syntect-server: 90%+ container memory usage (5m maximum) by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of syntect-server container in `docker-compose.yml`. 
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# zoekt-indexserver: container_cpu_usage +```json +"observability.silenceAlerts": [ + "warning_syntect-server_provisioning_container_memory_usage_5m" +] +``` + +## syntect-server: pods_available_percentage **Descriptions:** -- _zoekt-indexserver: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_zoekt-indexserver_container_cpu_usage`) +- _syntect-server: less than 90% percentage pods available for 10m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_syntect-server_pods_available_percentage" +] +``` + +## zoekt-indexserver: average_resolve_revision_duration + +**Descriptions:** + +- _zoekt-indexserver: 15s+ average resolve revision duration over 5m_ + +- _zoekt-indexserver: 30s+ average resolve revision duration over 5m_ + +**Possible solutions:** + +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_zoekt-indexserver_average_resolve_revision_duration", + "critical_zoekt-indexserver_average_resolve_revision_duration" +] +``` + +## zoekt-indexserver: container_cpu_usage + +**Descriptions:** + +- _zoekt-indexserver: 99%+ container cpu usage total (1m average) across all cores by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing CPU limits in the the relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `cpus:` of the zoekt-indexserver container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# zoekt-indexserver: container_memory_usage +```json +"observability.silenceAlerts": [ + "warning_zoekt-indexserver_container_cpu_usage" +] +``` + +## zoekt-indexserver: container_memory_usage **Descriptions:** -- _zoekt-indexserver: 99%+ container memory usage by instance_ (`warning_zoekt-indexserver_container_memory_usage`) +- _zoekt-indexserver: 99%+ container memory usage by instance_ **Possible solutions:** - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. - **Docker Compose:** Consider increasing `memory:` of zoekt-indexserver container in `docker-compose.yml`. +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: -# zoekt-indexserver: container_restarts +```json +"observability.silenceAlerts": [ + "warning_zoekt-indexserver_container_memory_usage" +] +``` + +## zoekt-indexserver: container_restarts **Descriptions:** -- _zoekt-indexserver: 1+ container restarts every 5m by instance_ (`warning_zoekt-indexserver_container_restarts`) +- _zoekt-indexserver: 1+ container restarts every 5m by instance_ **Possible solutions:** @@ -1606,23 +4107,37 @@ Try restarting the pod to get a different public IP. 
 - **Docker Compose:**
 	- Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' zoekt-indexserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the zoekt-indexserver container in `docker-compose.yml`.
 	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs zoekt-indexserver` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-indexserver: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-indexserver_container_restarts"
+]
+```
+
+## zoekt-indexserver: fs_inodes_used

 **Descriptions:**

-- _zoekt-indexserver: 3e+06+ fs inodes in use by instance_ (`warning_zoekt-indexserver_fs_inodes_used`)
+- _zoekt-indexserver: 3e+06+ fs inodes in use by instance_

 **Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
- - **Kubernetes:** consider provisioning more machines with less resources.
+- Refer to your OS or cloud provider's documentation for how to increase inodes.
+- **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-indexserver: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-indexserver_fs_inodes_used"
+]
+```
+
+## zoekt-indexserver: provisioning_container_cpu_usage_7d

 **Descriptions:**

-- _zoekt-indexserver: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_zoekt-indexserver_provisioning_container_cpu_usage_7d`)
+- _zoekt-indexserver: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

 **Possible solutions:**

@@ -1630,12 +4145,19 @@ Try restarting the pod to get a different public IP.
 - **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider decreasing `cpus:` of the zoekt-indexserver container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-indexserver: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-indexserver_provisioning_container_cpu_usage_7d"
+]
+```
+
+## zoekt-indexserver: provisioning_container_memory_usage_7d

 **Descriptions:**

-- _zoekt-indexserver: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_zoekt-indexserver_provisioning_container_memory_usage_7d`)
+- _zoekt-indexserver: 80%+ or less than 30% container memory usage (7d maximum) by instance_

 **Possible solutions:**

@@ -1643,56 +4165,123 @@ Try restarting the pod to get a different public IP.
 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of zoekt-indexserver container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-indexserver: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-indexserver_provisioning_container_memory_usage_7d"
+]
+```
+
+## zoekt-indexserver: provisioning_container_cpu_usage_5m

 **Descriptions:**

-- _zoekt-indexserver: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_zoekt-indexserver_provisioning_container_cpu_usage_5m`)
+- _zoekt-indexserver: 90%+ container cpu usage total (5m maximum) across all cores by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the zoekt-indexserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-indexserver: provisioning_container_memory_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-indexserver_provisioning_container_cpu_usage_5m"
+]
+```
+
+## zoekt-indexserver: provisioning_container_memory_usage_5m

 **Descriptions:**

-- _zoekt-indexserver: 90%+ container memory usage (5m maximum) by instance_ (`warning_zoekt-indexserver_provisioning_container_memory_usage_5m`)
+- _zoekt-indexserver: 90%+ container memory usage (5m maximum) by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of zoekt-indexserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: container_cpu_usage
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-indexserver_provisioning_container_memory_usage_5m"
+]
+```
+
+## zoekt-indexserver: pods_available_percentage

 **Descriptions:**

-- _zoekt-webserver: 99%+ container cpu usage total (1m average) across all cores by instance_ (`warning_zoekt-webserver_container_cpu_usage`)
+- _zoekt-indexserver: less than 90% percentage pods available for 10m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_zoekt-indexserver_pods_available_percentage"
+]
+```
+
+## zoekt-webserver: indexed_search_request_errors
+
+**Descriptions:**
+
+- _zoekt-webserver: 50s+ indexed search request errors every 5m by code_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_indexed_search_request_errors"
+]
+```
+
+## zoekt-webserver: container_cpu_usage
+
+**Descriptions:**
+
+- _zoekt-webserver: 99%+ container cpu usage total (1m average) across all cores by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: container_memory_usage
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_container_cpu_usage"
+]
+```
+
+## zoekt-webserver: container_memory_usage

 **Descriptions:**

-- _zoekt-webserver: 99%+ container memory usage by instance_ (`warning_zoekt-webserver_container_memory_usage`)
+- _zoekt-webserver: 99%+ container memory usage by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of zoekt-webserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: container_restarts
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_container_memory_usage"
+]
+```
+
+## zoekt-webserver: container_restarts

 **Descriptions:**

-- _zoekt-webserver: 1+ container restarts every 5m by instance_ (`warning_zoekt-webserver_container_restarts`)
+- _zoekt-webserver: 1+ container restarts every 5m by instance_

 **Possible solutions:**

@@ -1702,23 +4291,37 @@ Try restarting the pod to get a different public IP.
 - **Docker Compose:**
 	- Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' zoekt-webserver` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the zoekt-webserver container in `docker-compose.yml`.
 	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs zoekt-webserver` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_container_restarts"
+]
+```
+
+## zoekt-webserver: fs_inodes_used

 **Descriptions:**

-- _zoekt-webserver: 3e+06+ fs inodes in use by instance_ (`warning_zoekt-webserver_fs_inodes_used`)
+- _zoekt-webserver: 3e+06+ fs inodes in use by instance_

 **Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
- - **Kubernetes:** consider provisioning more machines with less resources.
+- Refer to your OS or cloud provider's documentation for how to increase inodes.
+- **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_fs_inodes_used"
+]
+```
+
+## zoekt-webserver: provisioning_container_cpu_usage_7d

 **Descriptions:**

-- _zoekt-webserver: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_zoekt-webserver_provisioning_container_cpu_usage_7d`)
+- _zoekt-webserver: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

 **Possible solutions:**

@@ -1726,12 +4329,19 @@ Try restarting the pod to get a different public IP.
 - **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider decreasing `cpus:` of the zoekt-webserver container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_provisioning_container_cpu_usage_7d"
+]
+```
+
+## zoekt-webserver: provisioning_container_memory_usage_7d

 **Descriptions:**

-- _zoekt-webserver: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_zoekt-webserver_provisioning_container_memory_usage_7d`)
+- _zoekt-webserver: 80%+ or less than 30% container memory usage (7d maximum) by instance_

 **Possible solutions:**

@@ -1739,66 +4349,124 @@ Try restarting the pod to get a different public IP.
 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of zoekt-webserver container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_provisioning_container_memory_usage_7d"
+]
+```
+
+## zoekt-webserver: provisioning_container_cpu_usage_5m

 **Descriptions:**

-- _zoekt-webserver: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_zoekt-webserver_provisioning_container_cpu_usage_5m`)
+- _zoekt-webserver: 90%+ container cpu usage total (5m maximum) across all cores by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the zoekt-webserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# zoekt-webserver: provisioning_container_memory_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_provisioning_container_cpu_usage_5m"
+]
+```
+
+## zoekt-webserver: provisioning_container_memory_usage_5m

 **Descriptions:**

-- _zoekt-webserver: 90%+ container memory usage (5m maximum) by instance_ (`warning_zoekt-webserver_provisioning_container_memory_usage_5m`)
+- _zoekt-webserver: 90%+ container memory usage (5m maximum) by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of zoekt-webserver container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: alertmanager_notifications_failed_total
+```json
+"observability.silenceAlerts": [
+  "warning_zoekt-webserver_provisioning_container_memory_usage_5m"
+]
+```
+
+## prometheus: prometheus_metrics_bloat

 **Descriptions:**

-- _prometheus: 1+ failed alertmanager notifications over 1m_ (`warning_prometheus_alertmanager_notifications_failed_total`)
+- _prometheus: 20000B+ prometheus metrics payload size_

 **Possible solutions:**

-Ensure that your [`observability.alerts` configuration](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) (in site configuration) is valid.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: container_cpu_usage
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_prometheus_metrics_bloat"
+]
+```
+
+## prometheus: alertmanager_notifications_failed_total
+
+**Descriptions:**
+
+- _prometheus: 1+ failed alertmanager notifications over 1m_
+
+**Possible solutions:**
+
+- Ensure that your [`observability.alerts` configuration](https://docs.sourcegraph.com/admin/observability/alerting#setting-up-alerting) (in site configuration) is valid.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_alertmanager_notifications_failed_total"
+]
+```
+
+## prometheus: container_cpu_usage
+
+**Descriptions:**
+
+- _prometheus: 99%+ container cpu usage total (1m average) across all cores by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the prometheus container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: container_memory_usage
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_container_cpu_usage"
+]
+```
+
+## prometheus: container_memory_usage

 **Descriptions:**

-- _prometheus: 99%+ container memory usage by instance_ (`warning_prometheus_container_memory_usage`)
+- _prometheus: 99%+ container memory usage by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of prometheus container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: container_restarts
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_container_memory_usage"
+]
+```
+
+## prometheus: container_restarts

 **Descriptions:**

-- _prometheus: 1+ container restarts every 5m by instance_ (`warning_prometheus_container_restarts`)
+- _prometheus: 1+ container restarts every 5m by instance_

 **Possible solutions:**

@@ -1808,23 +4476,37 @@ Ensure that your [`observability.alerts` configuration](https://docs.sourcegraph
 - **Docker Compose:**
 	- Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' prometheus` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the prometheus container in `docker-compose.yml`.
 	- Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs prometheus` (note this will include logs from the previous and currently running container).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: fs_inodes_used
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_container_restarts"
+]
+```
+
+## prometheus: fs_inodes_used

 **Descriptions:**

-- _prometheus: 3e+06+ fs inodes in use by instance_ (`warning_prometheus_fs_inodes_used`)
+- _prometheus: 3e+06+ fs inodes in use by instance_

 **Possible solutions:**

- - Refer to your OS or cloud provider`s documentation for how to increase inodes.
- - **Kubernetes:** consider provisioning more machines with less resources.
+- Refer to your OS or cloud provider's documentation for how to increase inodes.
+- **Kubernetes:** consider provisioning more machines with fewer resources.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: provisioning_container_cpu_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_fs_inodes_used"
+]
+```
+
+## prometheus: provisioning_container_cpu_usage_7d

 **Descriptions:**

-- _prometheus: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_ (`warning_prometheus_provisioning_container_cpu_usage_7d`)
+- _prometheus: 80%+ or less than 30% container cpu usage total (7d maximum) across all cores by instance_

 **Possible solutions:**

@@ -1832,12 +4514,19 @@ Ensure that your [`observability.alerts` configuration](https://docs.sourcegraph
 - **Kubernetes:** Consider decreasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider decreasing `cpus:` of the prometheus container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: provisioning_container_memory_usage_7d
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_provisioning_container_cpu_usage_7d"
+]
+```
+
+## prometheus: provisioning_container_memory_usage_7d

 **Descriptions:**

-- _prometheus: 80%+ or less than 30% container memory usage (7d maximum) by instance_ (`warning_prometheus_provisioning_container_memory_usage_7d`)
+- _prometheus: 80%+ or less than 30% container memory usage (7d maximum) by instance_

 **Possible solutions:**

@@ -1845,26 +4534,63 @@ Ensure that your [`observability.alerts` configuration](https://docs.sourcegraph
 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of prometheus container in `docker-compose.yml`.
 - If usage is low, consider decreasing the above values.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: provisioning_container_cpu_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_provisioning_container_memory_usage_7d"
+]
+```
+
+## prometheus: provisioning_container_cpu_usage_5m

 **Descriptions:**

-- _prometheus: 90%+ container cpu usage total (5m maximum) across all cores by instance_ (`warning_prometheus_provisioning_container_cpu_usage_5m`)
+- _prometheus: 90%+ container cpu usage total (5m maximum) across all cores by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the prometheus container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

-# prometheus: provisioning_container_memory_usage_5m
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_provisioning_container_cpu_usage_5m"
+]
+```
+
+## prometheus: provisioning_container_memory_usage_5m

 **Descriptions:**

-- _prometheus: 90%+ container memory usage (5m maximum) by instance_ (`warning_prometheus_provisioning_container_memory_usage_5m`)
+- _prometheus: 90%+ container memory usage (5m maximum) by instance_

 **Possible solutions:**

 - **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `memory:` of prometheus container in `docker-compose.yml`.
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_prometheus_provisioning_container_memory_usage_5m"
+]
+```
+
+## prometheus: pods_available_percentage
+
+**Descriptions:**
+
+- _prometheus: less than 90% percentage pods available for 10m_
+
+**Possible solutions:**
+
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "critical_prometheus_pods_available_percentage"
+]
+```
diff --git a/monitoring/generator.go b/monitoring/generator.go
index e21aea481fe..4379526d5ce 100644
--- a/monitoring/generator.go
+++ b/monitoring/generator.go
@@ -791,6 +791,8 @@ This document contains possible solutions for when you find alerts are firing in
 If your alert isn't mentioned here, or if the solution doesn't help, [contact us](mailto:support@sourcegraph.com)
 for assistance.

+To learn more about Sourcegraph's alerting, see [our alerting documentation](https://docs.sourcegraph.com/admin/observability/alerting).
+

 `)

@@ -798,13 +800,10 @@ for assistance.
 	for _, g := range c.Groups {
 		for _, r := range g.Rows {
 			for _, o := range r {
-				if o.PossibleSolutions == "none" {
-					continue
-				}
-
-				fmt.Fprintf(&b, "# %s: %s\n\n", c.Name, o.Name)
+				fmt.Fprintf(&b, "## %s: %s\n\n", c.Name, o.Name)

 				fmt.Fprintf(&b, "**Descriptions:**\n")
+				var prometheusAlertNames []string
 				for _, alert := range []struct {
 					level     string
 					threshold Alert
 				}{
 					{level: "warning", threshold: o.Warning},
 					{level: "critical", threshold: o.Critical},
 				} {
 					if alert.threshold.isEmpty() {
 						continue
 					}
-					fmt.Fprintf(&b, "\n- _%s_ (`%s`)\n\n",
-						c.alertDescription(o, alert.threshold),
-						prometheusAlertName(alert.level, c.Name, o.Name))
+					fmt.Fprintf(&b, "\n- _%s_\n", c.alertDescription(o, alert.threshold))
+					prometheusAlertNames = append(prometheusAlertNames,
+						fmt.Sprintf("  \"%s\"", prometheusAlertName(alert.level, c.Name, o.Name)))
 				}
+				fmt.Fprint(&b, "\n")

 				fmt.Fprintf(&b, "**Possible solutions:**\n\n")
-				possibleSolutions, _ := goMarkdown(o.PossibleSolutions)
-				fmt.Fprintf(&b, "%s\n\n", possibleSolutions)
+				if o.PossibleSolutions != "none" {
+					possibleSolutions, _ := goMarkdown(o.PossibleSolutions)
+					fmt.Fprintf(&b, "%s\n", possibleSolutions)
+				}
+				// Add the silencing configuration as another possible solution.
+				fmt.Fprintf(&b, "- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:\n\n")
+				fmt.Fprintf(&b, "```json\n%s\n```\n\n", fmt.Sprintf(`"observability.silenceAlerts": [
+%s
+]`, strings.Join(prometheusAlertNames, ",\n")))
 			}
 		}
 	}
@@ -857,6 +864,12 @@ func goMarkdown(m string) (string, error) {
 		}
 		m = strings.Join(lines[:len(lines)-1], "\n")
 	}
+
+	// If the result is not already a list, make it one so that generated items can be appended.
+	if !strings.HasPrefix(m, "-") && !strings.HasPrefix(m, "*") {
+		m = fmt.Sprintf("- %s", m)
+	}
+
 	return m, nil
 }

diff --git a/monitoring/shared.go b/monitoring/shared.go
index d793325a8aa..e903fbc5745 100644
--- a/monitoring/shared.go
+++ b/monitoring/shared.go
@@ -127,8 +127,7 @@ var sharedContainerFsInodes sharedObservable = func(containerName string) Observ
 		Owner:             ObservableOwnerDistribution,
 		PossibleSolutions: `
 			- Refer to your OS or cloud provider's documentation for how to increase inodes.
-			- **Kubernetes:** consider provisioning more machines with less resources.
-`,
+			- **Kubernetes:** consider provisioning more machines with fewer resources.`,
 	}
 }
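
For reference, each generated `"observability.silenceAlerts"` snippet above is a single property rather than a complete configuration file, so it belongs inside the top-level site configuration object. A minimal sketch of a site configuration that silences one of the alerts documented above (the `externalURL` value is illustrative only):

```json
{
  "externalURL": "https://sourcegraph.example.com",
  "observability.silenceAlerts": [
    "warning_prometheus_container_restarts"
  ]
}
```

Silencing only suppresses notifications for the named alerts; the alerts themselves keep evaluating, which is why the generated text asks admins to set a reminder to re-evaluate.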