From dcbd01f54541ff31091ddfda393209b171b0f9b3 Mon Sep 17 00:00:00 2001 From: Erik Seliger Date: Wed, 3 Aug 2022 12:08:04 +0200 Subject: [PATCH] Push executor metrics (#36969) --- dev/gitserver/sg.config.yaml | 3 +- dev/prometheus/all/prometheus_targets.yml | 11 +- dev/prometheus/linux/prometheus_targets.yml | 11 +- dev/src-prof-services.json | 1 + doc/admin/deploy_executors.md | 233 ---------- doc/admin/observability/alerts.md | 2 +- doc/admin/observability/dashboards.md | 138 +++--- enterprise/cmd/executor/config.go | 45 +- .../cmd/executor/internal/apiclient/client.go | 101 +++-- .../internal/apiclient/client_test.go | 13 +- .../cmd/executor/internal/metrics/metrics.go | 144 ++++++ .../cmd/executor/internal/worker/worker.go | 17 +- enterprise/cmd/executor/vm-image/install.sh | 42 +- .../internal/executorqueue/handler/handler.go | 7 +- .../executorqueue/handler/handler_test.go | 51 ++- .../internal/executorqueue/handler/routes.go | 65 ++- .../internal/executorqueue/queuehandler.go | 4 +- .../executors/metricsserver_config.go | 17 + .../internal/executors/metricsserver_job.go | 50 ++ enterprise/cmd/worker/main.go | 1 + enterprise/dev/src-prof-services.json | 3 +- enterprise/internal/executor/client_types.go | 2 + go.mod | 2 +- internal/metrics/store/mocks_temp.go | 426 ++++++++++++++++++ internal/metrics/store/store.go | 117 +++++ mockgen.temp.yaml | 5 + monitoring/definitions/executor.go | 14 +- monitoring/definitions/frontend.go | 7 +- monitoring/definitions/postgres.go | 69 +-- monitoring/definitions/shared/codeintel.go | 2 +- monitoring/definitions/shared/constructor.go | 11 +- monitoring/definitions/shared/go.go | 57 ++- monitoring/definitions/shared/kubernetes.go | 26 +- .../definitions/shared/node_exporter.go | 68 +-- monitoring/definitions/shared/observation.go | 4 + monitoring/definitions/shared/queues.go | 6 +- monitoring/definitions/shared/standard.go | 10 +- monitoring/definitions/shared/workerutil.go | 2 +- monitoring/definitions/zoekt.go | 1 - sg.config.yaml | 1 + 40 files changed, 1231 insertions(+), 558 deletions(-) create mode 100644 enterprise/cmd/executor/internal/metrics/metrics.go create mode 100644 enterprise/cmd/worker/internal/executors/metricsserver_config.go create mode 100644 enterprise/cmd/worker/internal/executors/metricsserver_job.go create mode 100644 internal/metrics/store/mocks_temp.go create mode 100644 internal/metrics/store/store.go diff --git a/dev/gitserver/sg.config.yaml b/dev/gitserver/sg.config.yaml index 1889e5c95be..360befb6c69 100644 --- a/dev/gitserver/sg.config.yaml +++ b/dev/gitserver/sg.config.yaml @@ -18,8 +18,9 @@ env: { "Name": "precise-code-intel-worker", "Host": "127.0.0.1:6088" }, { "Name": "worker", "Host": "127.0.0.1:6089" }, { "Name": "enterprise-worker", "Host": "127.0.0.1:6089" }, + { "Name": "enterprise-worker-executors", "Host": "127.0.0.1:6996" }, { "Name": "executor-codeintel", "Host": "127.0.0.1:6092" }, - { "Name": "executor-batches", "Host": "127.0.0.1:6093" }, + "Name": "executor-batches", "Host": "127.0.0.1:6093" }, { "Name": "zoekt-indexserver-0", "Host": "127.0.0.1:6072" }, { "Name": "zoekt-indexserver-1", "Host": "127.0.0.1:6073" }, { "Name": "zoekt-webserver-0", "Host": "127.0.0.1:3070", "DefaultPath": "/debug/requests/" }, diff --git a/dev/prometheus/all/prometheus_targets.yml b/dev/prometheus/all/prometheus_targets.yml index 8d512aad0f2..8f2ab7f04a1 100644 --- a/dev/prometheus/all/prometheus_targets.yml +++ b/dev/prometheus/all/prometheus_targets.yml @@ -47,15 +47,10 @@ # worker - host.docker.internal:6089 - labels: 
- job: sourcegraph-code-intel-indexers + job: worker-executors targets: - # sourcegraph-code-intel-indexers - - host.docker.internal:6092 -- labels: - job: executor-batches - targets: - # executor-batches - - host.docker.internal:6093 + # worker + - host.docker.internal:6996 - labels: job: postgres_exporter targets: diff --git a/dev/prometheus/linux/prometheus_targets.yml b/dev/prometheus/linux/prometheus_targets.yml index 5d5f1bb0acb..9869f03c317 100644 --- a/dev/prometheus/linux/prometheus_targets.yml +++ b/dev/prometheus/linux/prometheus_targets.yml @@ -47,15 +47,10 @@ # worker - 127.0.0.1:6089 - labels: - job: sourcegraph-code-intel-indexers + job: worker-executors targets: - # sourcegraph-code-intel-indexers - - 127.0.0.1:6092 -- labels: - job: executor-batches - targets: - # executor-batches - - 127.0.0.1:6093 + # worker + - 127.0.0.1:6996 - labels: job: postgres_exporter targets: diff --git a/dev/src-prof-services.json b/dev/src-prof-services.json index e1d04627b76..0192a2f87ba 100644 --- a/dev/src-prof-services.json +++ b/dev/src-prof-services.json @@ -5,6 +5,7 @@ { "Name": "symbols", "Host": "127.0.0.1:6071" }, { "Name": "repo-updater", "Host": "127.0.0.1:6074" }, { "Name": "worker", "Host": "127.0.0.1:6089" }, + { "Name": "worker-executors", "Host": "127.0.0.1:6969" }, { "Name": "precise-code-intel-worker", "Host": "127.0.0.1:6088" }, { "Name": "executor-codeintel", "Host": "127.0.0.1:6092" }, { "Name": "executor-batches", "Host": "127.0.0.1:6093" }, diff --git a/doc/admin/deploy_executors.md b/doc/admin/deploy_executors.md index ea5c10075ac..1bc833779e0 100644 --- a/doc/admin/deploy_executors.md +++ b/doc/admin/deploy_executors.md @@ -31,7 +31,6 @@ That means, in order to deploy executors that can talk to the Sourcegraph instan - [Using binaries](#binaries) 1. [Confirm executors can reach Sourcegraph instance](#confirm-executors-are-working) 1. Optional: [Configuring auto scaling](#configuring-auto-scaling) -1. Optional: [Configuring observability](#configuring-observability) ### Configure Sourcegraph @@ -368,235 +367,3 @@ To test if the metric is correctly reported into the Cloud provider: - On AWS, this can be found in the CloudWatch metrics section. Under **All metrics**, select the namespace `sourcegraph-executor` and then the metric `environment, queueName`. Make sure there are entries returned. Next, you can test whether the number of executors rises and shrinks as load spikes occur. Keep in mind that auto-scaling is not a real-time operation on most cloud providers and usually takes a short moment and can have some delays between the metric going down and the desired machine count adjusting. - -## Configuring observability - -> NOTE: Observability features are currently not supported when [downloading and running executor binaries yourself](#binaries), and on managed instances since they require deployment adjustments. - -Sourcegraph [ships with dashboards](observability/metrics.md) that can display executor metrics. We highly encourage setting this up to help make informed decisions on scaling and to make debugging easier. - -In order to do that, the Prometheus instance bundled with your Sourcegraph deployment must be able to scrape the executor metrics endpoint. - -That requires two things: - -1. Provide Prometheus with service account credentials that allow it to get a list of active compute instances from the cloud provider. -2. Add additional scrape jobs to Prometheus. 
- -To add service account credentials, you can use the `credentials` submodule in both our [AWS](https://sourcegraph.com/github.com/sourcegraph/terraform-aws-executors/-/tree/modules/credentials) and [GCP](https://sourcegraph.com/github.com/sourcegraph/terraform-google-executors/-/tree/modules/credentials) executor modules. - -```terraform -module "credentials" { - source = "sourcegraph/executors///modules/credentials" - version = "" - - region = - resource_prefix = "" -} - -# For Google: -output "instance_scraper_credentials_file" { - value = module.my-credentials.instance_scraper_credentials_file -} - -# For AWS: -output "instance_scraper_access_key_id" { - value = module.my-credentials.instance_scraper_access_key_id -} - -output "instance_scraper_access_secret_key" { - value = module.my-credentials.instance_scraper_access_secret_key -} -``` - -Just as with [auto scaling](#configuring-auto-scaling), you use the `credentials` submodule to get properly configured credentials in the Terraform outputs. When applied, this will yield something like this: - -``` -# For AWS: -instance_scraper_access_key_id = -instance_scraper_access_secret_key = - -# For Google: -instance_scraper_credentials_file = -``` - -Now we can use these credentials for the different cloud providers. - -### Google - -Credentials need to be added to the Prometheus container of your Sourcegraph deployment and a new scrape job needs to be added. - -In a Kubernetes deployment, credentials can be added by mounting the credentials file obtained from the `credentials` module in the last step, and pointing to it from an environment variable. - -**Step 1:** Create a secret called `prometheus-secrets` containing the credentials file content: - -```yaml -apiVersion: v1 -kind: Secret -type: Opaque -metadata: - name: prometheus-secrets -data: - # The Terraform output for `instance_scraper_credentials_file` - GCP_ACCOUNT_JSON: -``` - -**Step 2:** Modify the Prometheus deployment manifest: - -```yaml -containers: -- name: prometheus - # [...] - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /credentials/google_application_credentials.json - volumeMounts: - - mountPath: /credentials/google_application_credentials.json - name: credentials - subPath: google_application_credentials.json - readOnly: true -volumes: -- name: credentials - secret: - secretName: prometheus-secrets - items: - - key: GCP_ACCOUNT_JSON - path: google_application_credentials.json - -``` - -**Step 3:** Add the following scraping job that uses [GCE service discovery configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) to the Prometheus configuration. To do that, you can edit the Prometheus `ConfigMap` and modify the contents of the `prometheus.yml` file. Under [`scrape_configs:`](https://sourcegraph.com/github.com/sourcegraph/deploy-sourcegraph@0938b6686f0c94d80e8331e36f5ddac4659027b1/-/blob/base/prometheus/prometheus.ConfigMap.yaml?L43:5) add the following and make sure to replace `{GCP_PROJECT}`, `{GCP_ZONE}` and `{INSTANCE_TAG}`. The `{INSTANCE_TAG}` value must be the same as [`instance_tag`](https://sourcegraph.com/search?q=context:global+repo:%5Egithub.com/sourcegraph/terraform-aws-executors%24+variable+%22instance_tag%22&patternType=literal). 
- -```yaml -- job_name: 'sourcegraph-executors' - metrics_path: /proxy - params: - module: [executor] - gce_sd_configs: &executor_gce_config - - project: {GCP_PROJECT} # Change this to the GCP project ID - port: 9999 - zone: {GCP_ZONE} # Change this to the GCP zone - filter: '(labels.executor_tag = {INSTANCE_TAG})' # Change {INSTANCE_TAG} to the `executor_instance_tag` set in the Terraform modules - relabel_configs: &executor_relabel_config - - source_labels: [__meta_gce_public_ip] - target_label: __address__ - replacement: "${1}${2}:9999" - separator: '' - - source_labels: [__meta_gce_zone] - regex: ".+/([^/]+)" - target_label: zone - separator: '' - - source_labels: [__meta_gce_project] - target_label: project - - source_labels: [__meta_gce_instance_name] - target_label: instance - separator: '' - - regex: "__meta_gce_metadata_(image_.+)" - action: labelmap -- job_name: 'sourcegraph-executor-nodes' - metrics_path: /proxy - params: - module: [node] - gce_sd_configs: *executor_gce_config - relabel_configs: *executor_relabel_config -# If you've also used the Terraform modules to provision Docker registry -# mirrors for executors: -- job_name: 'sourcegraph-executors-docker-registry-mirrors' - metrics_path: /proxy - params: - module: [registry] - gce_sd_configs: &gce_executor_mirror_config - - project: {GCP_PROJECT} # Change this to the GCP project ID - port: 9999 - zone: {GCP_ZONE} # Change this to the GCP zone - filter: '(labels.executor_tag = {INSTANCE_TAG}-docker-mirror)' # Change {INSTANCE_TAG} to the `executor_instance_tag` set in the Terraform modules - relabel_configs: *executor_relabel_config -- job_name: 'sourcegraph-executors-docker-registry-mirror-nodes' - metrics_path: /proxy - params: - module: [node] - gce_sd_configs: *gce_executor_mirror_config - relabel_configs: *executor_relabel_config -``` - -**Step 4:** Restart Prometheus. - -If you currently have any executors or Docker registry mirrors running, you should start seeing metrics on the _Executors_ dashboard in Grafana. Alternatively, you can check if the executors can be scraped, by [port-forwarding the Prometheus UI to your local machine and checkin in the UI](./observability/metrics.md#accessing-prometheus-directly). - -### AWS - -Credentials need to be added to the Prometheus container of your Sourcegraph deployment and a new scrape job needs to be added. - -In a Kubernetes deployment, credentials can be added by setting the two secrets obtained from the `credentials` module in the last step as environment variables. - -**Step 1:** Modify the Prometheus deployment manifest: - -```yaml -containers: -- name: prometheus - # [...] - env: - - name: AWS_ACCESS_KEY_ID - # The Terraform output for `instance_scraper_access_key_id` - value: - - name: AWS_SECRET_ACCESS_KEY - # The Terraform output for `instance_scraper_access_secret_key` - value: -``` - -**Step 2:** Add the following scraping job that uses [EC2 service discovery configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) to the Prometheus configuration. To do that, you can edit the Prometheus `ConfigMap` and modify the contents of the `prometheus.yml` file. Under [`scrape_configs:`](https://sourcegraph.com/github.com/sourcegraph/deploy-sourcegraph@master/-/blob/base/prometheus/prometheus.ConfigMap.yaml?L43:5) add the following and make sure to replace `{AWS_REGION}` and `{INSTANCE_TAG}`. 
The `{INSTANCE_TAG}` value must be the same as [`instance_tag`](https://sourcegraph.com/search?q=context:global+repo:%5Egithub.com/sourcegraph/terraform-aws-executors%24+variable+%22instance_tag%22&patternType=literal). - -```yaml -- job_name: 'sourcegraph-executors' - metrics_path: /proxy - params: - module: [executor] - ec2_sd_configs: &executor_ec2_config - - region: {AWS_REGION} # Change this to the AWS region - port: 9999 - filters: - - name: tag:executor_tag - values: [{INSTANCE_TAG}] # Change {INSTANCE_TAG} to the `executor_instance_tag` set in the Terraform modules - relabel_configs: &executor_relabel_config - - source_labels: [__meta_ec2_public_ip] - target_label: __address__ - replacement: "${1}${2}:9999" - separator: '' - - source_labels: [__meta_ec2_availability_zone] - regex: ".+/([^/]+)" - target_label: zone - separator: '' - - source_labels: [__meta_ec2_instance_id] - target_label: instance - separator: '' - - source_labels: [__meta_ec2_ami] - target_label: version -- job_name: 'sourcegraph-executor-nodes' - metrics_path: /proxy - params: - module: [node] - ec2_sd_configs: *executor_ec2_config - relabel_configs: *executor_relabel_config -# If you've also used the Terraform modules to provision Docker registry -# mirrors for executors: -- job_name: 'sourcegraph-executors-docker-registry-mirrors' - metrics_path: /proxy - params: - module: [registry] - ec2_sd_configs: &ec2_executor_mirror_config - - region: {AWS_REGION} - port: 9999 - filters: - - name: tag:executor_tag - values: [{INSTANCE_TAG}-docker-mirror] - relabel_configs: *executor_relabel_config -- job_name: 'sourcegraph-executors-docker-registry-mirror-nodes' - metrics_path: /proxy - params: - module: [node] - ec2_sd_configs: *ec2_executor_mirror_config - relabel_configs: *executor_relabel_config -``` - -**Step 3:** Restart Prometheus. - -If you currently have any executors or Docker registry mirrors running, you should start seeing metrics on the _Executors_ dashboard in Grafana. Alternatively, you can check if the executors can be scraped, by [port-forwarding the Prometheus UI to your local machine and checkin in the UI](./observability/metrics.md#accessing-prometheus-directly). diff --git a/doc/admin/observability/alerts.md b/doc/admin/observability/alerts.md index ce0ca709063..7ec8c026fe1 100644 --- a/doc/admin/observability/alerts.md +++ b/doc/admin/observability/alerts.md @@ -5931,7 +5931,7 @@ with your code hosts connections or networking issues affecting communication wi
Technical details -Custom alert query: `last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))[5h:]) / (last_over_time(sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))[5h:]) + last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))[5h:])) * 100` +Custom alert query: `last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:]) / (last_over_time(sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:]) + last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:])) * 100`
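The custom alert query above can be evaluated by hand before relying on alerting. The following is a minimal sketch, not part of this patch, that runs a simplified form of the expression (the Grafana `${queue:regex}` variable is dropped) against a Prometheus instance assumed to be reachable at `http://localhost:9090`, using the `prometheus/client_golang` API client:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// The address is an assumption; point this at your Prometheus instance
	// (for Kubernetes deployments, e.g. via `kubectl port-forward`).
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	promAPI := v1.NewAPI(client)

	// Simplified executor error-rate expression from the alert above,
	// without the Grafana ${queue:regex} variable.
	query := `sum(increase(src_executor_processor_errors_total{job=~"^sourcegraph-executors.*"}[5m]))
/ (sum(increase(src_executor_processor_total{job=~"^sourcegraph-executors.*"}[5m]))
 + sum(increase(src_executor_processor_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100`

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if len(warnings) > 0 {
		fmt.Fprintln(os.Stderr, "warnings:", warnings)
	}
	fmt.Println(result)
}
```

The result corresponds to the error-rate percentage rendered on the executor dashboard panels further below.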
diff --git a/doc/admin/observability/dashboards.md b/doc/admin/observability/dashboards.md index 9e1ec9e7a34..059cb73ef4a 100644 --- a/doc/admin/observability/dashboards.md +++ b/doc/admin/observability/dashboards.md @@ -17470,7 +17470,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100100`
Technical details -Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"})` +Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"})`
@@ -17489,7 +17489,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110`
Technical details -Query: `sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17508,7 +17508,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100111`
Technical details -Query: `sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17527,7 +17527,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100112`
Technical details -Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17546,7 +17546,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100113`
Technical details -Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17569,7 +17569,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100200`
Technical details

-Query: `sum(increase(src_executor_run_lock_wait_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
+Query: `sum(increase(src_executor_run_lock_wait_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17590,7 +17590,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100201`
Technical details

-Query: `sum(increase(src_executor_run_lock_held_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
+Query: `sum(increase(src_executor_run_lock_held_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17611,7 +17611,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100300`
Technical details -Query: `sum(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17630,7 +17630,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100301`
Technical details -Query: `sum by (le)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17649,7 +17649,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100302`
Technical details -Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17668,7 +17668,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100303`
Technical details -Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17687,7 +17687,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100310`
Technical details -Query: `sum by (op)(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17706,7 +17706,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100311`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^sourcegraph-executors.*"}[5m])))`
@@ -17725,7 +17725,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100312`
Technical details -Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17744,7 +17744,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100313`
Technical details -Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17765,7 +17765,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100400`
Technical details -Query: `sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17784,7 +17784,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100401`
Technical details -Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17803,7 +17803,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100402`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17822,7 +17822,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100403`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17841,7 +17841,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100410`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17860,7 +17860,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100411`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])))`
@@ -17879,7 +17879,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100412`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17898,7 +17898,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100413`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17919,7 +17919,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100500`
Technical details -Query: `sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17938,7 +17938,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100501`
Technical details -Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17957,7 +17957,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100502`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17976,7 +17976,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100503`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17995,7 +17995,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100510`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18014,7 +18014,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100511`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])))`
@@ -18033,7 +18033,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100512`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18052,7 +18052,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100513`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -18073,7 +18073,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100600`
Technical details -Query: `sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18092,7 +18092,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100601`
Technical details -Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18111,7 +18111,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100602`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18130,7 +18130,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100603`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -18149,7 +18149,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100610`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18168,7 +18168,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100611`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])))`
@@ -18187,7 +18187,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100612`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18206,7 +18206,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100613`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -18228,7 +18228,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100700`
Technical details -Query: `sum(rate(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",mode!~"(idle|iowait)",instance=~"$instance"}[$__rate_interval])) by(instance) / count(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",mode="system",instance=~"$instance"}) by (instance) * 100` +Query: `sum(rate(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode!~"(idle|iowait)",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance) / count(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode="system",sg_instance=~"$instance"}) by (sg_instance) * 100`
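Note the label change running through these node-level queries: they now select on `sg_job` and `sg_instance` instead of the Prometheus scrape labels `job` and `instance`, since executor node metrics are pushed through the Sourcegraph instance rather than scraped from each machine. The sketch below shows one way constant labels like these could be attached to proxied text-format metrics; the libraries are real Prometheus packages, but the wiring and label values are assumptions for illustration, not the code added in `internal/metrics/store` by this patch.

```go
package main

import (
	"fmt"
	"os"
	"strings"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
	"google.golang.org/protobuf/proto"
)

func main() {
	// A single node_exporter sample in Prometheus text format, as example input.
	input := "node_cpu_seconds_total{cpu=\"0\",mode=\"idle\"} 123.45\n"

	// Parse the text exposition format into metric families.
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(input))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Attach the constant labels the dashboards filter on. The label values
	// here are assumptions for illustration only.
	for _, family := range families {
		for _, metric := range family.Metric {
			metric.Label = append(metric.Label,
				&dto.LabelPair{Name: proto.String("sg_job"), Value: proto.String("sourcegraph-executors")},
				&dto.LabelPair{Name: proto.String("sg_instance"), Value: proto.String("executor-example-1")},
			)
		}
	}

	// Re-encode so the labelled series can be exposed or forwarded.
	enc := expfmt.NewEncoder(os.Stdout, expfmt.FmtText)
	for _, family := range families {
		if err := enc.Encode(family); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}
}
```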
@@ -18248,7 +18248,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100701`
Technical details -Query: `rate(node_pressure_cpu_waiting_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])` +Query: `rate(node_pressure_cpu_waiting_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])`
@@ -18268,7 +18268,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100710`
Technical details -Query: `(1 - sum(node_memory_MemAvailable_bytes{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}) by (instance) / sum(node_memory_MemTotal_bytes{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}) by (instance)) * 100` +Query: `(1 - sum(node_memory_MemAvailable_bytes{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}) by (sg_instance) / sum(node_memory_MemTotal_bytes{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}) by (sg_instance)) * 100`
@@ -18288,7 +18288,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100711`
Technical details -Query: `(rate(node_vmstat_pgsteal_anon{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) * 100` +Query: `(rate(node_vmstat_pgsteal_anon{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) * 100`
@@ -18308,7 +18308,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100712`
Technical details -Query: `rate(node_pressure_memory_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])` +Query: `rate(node_pressure_memory_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])`
@@ -18328,7 +18328,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100720`
Technical details -Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk) * 100` +Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk) * 100`
@@ -18348,7 +18348,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100721`
Technical details -Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk)` +Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk)`
@@ -18368,7 +18368,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100722`
Technical details -Query: `rate(node_pressure_io_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])` +Query: `rate(node_pressure_io_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])`
@@ -18388,7 +18388,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100730`
Technical details -Query: `sum(rate(node_network_receive_bytes_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance) * 8` +Query: `sum(rate(node_network_receive_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance) * 8`
@@ -18408,7 +18408,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100731`
Technical details -Query: `sum(rate(node_network_receive_drop_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_receive_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
@@ -18428,7 +18428,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100732`
Technical details -Query: `sum(rate(node_network_receive_errs_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_receive_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
@@ -18448,7 +18448,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100740`
Technical details -Query: `sum(rate(node_network_transmit_bytes_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance) * 8` +Query: `sum(rate(node_network_transmit_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance) * 8`
@@ -18468,7 +18468,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100741`
Technical details -Query: `sum(rate(node_network_transmit_drop_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_transmit_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
@@ -18488,7 +18488,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100742`
Technical details -Query: `sum(rate(node_network_transmit_errs_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_transmit_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
@@ -18510,7 +18510,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100800`
Technical details -Query: `sum(rate(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",mode!~"(idle|iowait)",instance=~".*"}[$__rate_interval])) by(instance) / count(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",mode="system",instance=~".*"}) by (instance) * 100` +Query: `sum(rate(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode!~"(idle|iowait)",sg_instance=~".*"}[$__rate_interval])) by(sg_instance) / count(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode="system",sg_instance=~".*"}) by (sg_instance) * 100`
@@ -18530,7 +18530,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100801`
Technical details -Query: `rate(node_pressure_cpu_waiting_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])` +Query: `rate(node_pressure_cpu_waiting_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])`
@@ -18550,7 +18550,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100810`
Technical details -Query: `(1 - sum(node_memory_MemAvailable_bytes{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}) by (instance) / sum(node_memory_MemTotal_bytes{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}) by (instance)) * 100` +Query: `(1 - sum(node_memory_MemAvailable_bytes{sg_job=~"sourcegraph-executors",sg_instance=~".*"}) by (sg_instance) / sum(node_memory_MemTotal_bytes{sg_job=~"sourcegraph-executors",sg_instance=~".*"}) by (sg_instance)) * 100`
@@ -18570,7 +18570,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100811`
Technical details -Query: `(rate(node_vmstat_pgsteal_anon{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) * 100` +Query: `(rate(node_vmstat_pgsteal_anon{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) * 100`
@@ -18590,7 +18590,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100812`
Technical details -Query: `rate(node_pressure_memory_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])` +Query: `rate(node_pressure_memory_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])`
@@ -18610,7 +18610,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100820`
Technical details -Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk) * 100` +Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk) * 100`
@@ -18630,7 +18630,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100821`
Technical details -Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk)` +Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk)`
@@ -18650,7 +18650,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100822`
Technical details -Query: `rate(node_pressure_io_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])` +Query: `rate(node_pressure_io_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])`
@@ -18670,7 +18670,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100830`
Technical details -Query: `sum(rate(node_network_receive_bytes_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance) * 8` +Query: `sum(rate(node_network_receive_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance) * 8`
@@ -18690,7 +18690,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100831`
Technical details -Query: `sum(rate(node_network_receive_drop_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_receive_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
@@ -18710,7 +18710,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100832`
Technical details -Query: `sum(rate(node_network_receive_errs_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_receive_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
@@ -18730,7 +18730,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100840`
Technical details -Query: `sum(rate(node_network_transmit_bytes_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance) * 8` +Query: `sum(rate(node_network_transmit_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance) * 8`
@@ -18750,7 +18750,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100841`
Technical details -Query: `sum(rate(node_network_transmit_drop_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_transmit_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
@@ -18770,7 +18770,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100842`
Technical details -Query: `sum(rate(node_network_transmit_errs_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)` +Query: `sum(rate(node_network_transmit_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
@@ -18793,7 +18793,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100900`
Technical details -Query: `max by(instance) (go_goroutines{job=~".*(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors)"})` +Query: `max by(sg_instance) (go_goroutines{sg_job=~".*sourcegraph-executors"})`
@@ -18812,7 +18812,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100901`
Technical details -Query: `max by(instance) (go_gc_duration_seconds{job=~".*(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors)"})` +Query: `max by(sg_instance) (go_gc_duration_seconds{sg_job=~".*sourcegraph-executors"})`
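The queries above now select on `sg_job`/`sg_instance` rather than Prometheus' own scrape labels because executor metrics are no longer scraped directly: the executor serializes its registry into the text exposition format, ships it with the heartbeat, and the frontend re-labels it before exposing it to Prometheus (see the apiclient and routes changes below). A minimal, self-contained sketch of that encode, decode, and re-label round trip, using purely illustrative metric and instance names (`example_jobs_total` and `executor-1` are not used anywhere in this patch):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
)

func main() {
	// Executor side: gather from a registry and serialize to the Prometheus
	// text exposition format, as the heartbeat payload does.
	reg := prometheus.NewRegistry()
	c := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_jobs_total", // illustrative name only
		Help: "Example counter.",
	})
	reg.MustRegister(c)
	c.Inc()

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	var buf bytes.Buffer
	enc := expfmt.NewEncoder(&buf, expfmt.FmtText)
	for _, mf := range mfs {
		if err := enc.Encode(mf); err != nil {
			panic(err)
		}
	}

	// Frontend side: decode the text dump and attach the sg_job/sg_instance
	// labels that the rewritten dashboard queries select on.
	job, instance := "sourcegraph-executors", "executor-1" // instance value is illustrative
	dec := expfmt.NewDecoder(strings.NewReader(buf.String()), expfmt.FmtText)
	for {
		var mf dto.MetricFamily
		if err := dec.Decode(&mf); err != nil {
			if err == io.EOF {
				break
			}
			panic(err)
		}
		sgJob, sgInstance := "sg_job", "sg_instance"
		for _, m := range mf.Metric {
			m.Label = append(m.Label,
				&dto.LabelPair{Name: &sgJob, Value: &job},
				&dto.LabelPair{Name: &sgInstance, Value: &instance},
			)
		}
		fmt.Println(mf.String())
	}
}
```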
diff --git a/enterprise/cmd/executor/config.go b/enterprise/cmd/executor/config.go index ca411d84105..698c4e07155 100644 --- a/enterprise/cmd/executor/config.go +++ b/enterprise/cmd/executor/config.go @@ -18,25 +18,27 @@ import ( type Config struct { env.BaseConfig - FrontendURL string - FrontendAuthorizationToken string - QueueName string - QueuePollInterval time.Duration - MaximumNumJobs int - FirecrackerImage string - VMStartupScriptPath string - VMPrefix string - KeepWorkspaces bool - DockerHostMountPath string - UseFirecracker bool - JobNumCPUs int - JobMemory string - FirecrackerDiskSpace string - MaximumRuntimePerJob time.Duration - CleanupTaskInterval time.Duration - NumTotalJobs int - MaxActiveTime time.Duration - WorkerHostname string + FrontendURL string + FrontendAuthorizationToken string + QueueName string + QueuePollInterval time.Duration + MaximumNumJobs int + FirecrackerImage string + VMStartupScriptPath string + VMPrefix string + KeepWorkspaces bool + DockerHostMountPath string + UseFirecracker bool + JobNumCPUs int + JobMemory string + FirecrackerDiskSpace string + MaximumRuntimePerJob time.Duration + CleanupTaskInterval time.Duration + NumTotalJobs int + MaxActiveTime time.Duration + NodeExporterURL string + DockerRegistryNodeExporterURL string + WorkerHostname string } func (c *Config) Load() { @@ -57,6 +59,8 @@ func (c *Config) Load() { c.MaximumRuntimePerJob = c.GetInterval("EXECUTOR_MAXIMUM_RUNTIME_PER_JOB", "30m", "The maximum wall time that can be spent on a single job.") c.CleanupTaskInterval = c.GetInterval("EXECUTOR_CLEANUP_TASK_INTERVAL", "1m", "The frequency with which to run periodic cleanup tasks.") c.NumTotalJobs = c.GetInt("EXECUTOR_NUM_TOTAL_JOBS", "0", "The maximum number of jobs that will be dequeued by the worker.") + c.NodeExporterURL = c.GetOptional("NODE_EXPORTER_URL", "The URL of the node_exporter instance, without the /metrics path.") + c.DockerRegistryNodeExporterURL = c.GetOptional("DOCKER_REGISTRY_NODE_EXPORTER_URL", "The URL of the Docker Registry instance's node_exporter, without the /metrics path.") c.MaxActiveTime = c.GetInterval("EXECUTOR_MAX_ACTIVE_TIME", "0", "The maximum time that can be spent by the worker dequeueing records to be handled.") hn := hostname.Get() @@ -88,6 +92,9 @@ func (c *Config) APIWorkerOptions(telemetryOptions apiclient.TelemetryOptions) a // git repositories that make it into commands or stdout/stderr streams. 
c.FrontendAuthorizationToken: "SECRET_REMOVED", }, + + NodeExporterEndpoint: c.NodeExporterURL, + DockerRegistryNodeExporterEndpoint: c.DockerRegistryNodeExporterURL, } } diff --git a/enterprise/cmd/executor/internal/apiclient/client.go b/enterprise/cmd/executor/internal/apiclient/client.go index 836e623c382..160cfc239cb 100644 --- a/enterprise/cmd/executor/internal/apiclient/client.go +++ b/enterprise/cmd/executor/internal/apiclient/client.go @@ -1,6 +1,7 @@ package apiclient import ( + "bytes" "context" "fmt" "net/http" @@ -8,19 +9,27 @@ import ( "path/filepath" "strconv" "strings" + "time" - "github.com/opentracing/opentracing-go/log" + otlog "github.com/opentracing/opentracing-go/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/expfmt" + + "github.com/sourcegraph/log" "github.com/sourcegraph/sourcegraph/enterprise/internal/executor" "github.com/sourcegraph/sourcegraph/internal/observation" "github.com/sourcegraph/sourcegraph/internal/workerutil" + "github.com/sourcegraph/sourcegraph/lib/errors" ) // Client is the client used to communicate with a remote job queue API. type Client struct { - options Options - client *BaseClient - operations *operations + options Options + client *BaseClient + logger log.Logger + metricsGatherer prometheus.Gatherer + operations *operations } type Options struct { @@ -48,17 +57,19 @@ type EndpointOptions struct { Token string } -func New(options Options, observationContext *observation.Context) *Client { +func New(options Options, metricsGatherer prometheus.Gatherer, observationContext *observation.Context) *Client { return &Client{ - options: options, - client: NewBaseClient(options.BaseClientOptions), - operations: newOperations(observationContext), + options: options, + client: NewBaseClient(options.BaseClientOptions), + logger: log.Scoped("executor-api-client", "The API client adapter for executors to use dbworkers over HTTP"), + metricsGatherer: metricsGatherer, + operations: newOperations(observationContext), } } func (c *Client) Dequeue(ctx context.Context, queueName string, job *executor.Job) (_ bool, err error) { - ctx, _, endObservation := c.operations.dequeue.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", queueName), + ctx, _, endObservation := c.operations.dequeue.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), }}) defer endObservation(1, observation.Args{}) @@ -73,9 +84,9 @@ func (c *Client) Dequeue(ctx context.Context, queueName string, job *executor.Jo } func (c *Client) AddExecutionLogEntry(ctx context.Context, queueName string, jobID int, entry workerutil.ExecutionLogEntry) (entryID int, err error) { - ctx, _, endObservation := c.operations.addExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", queueName), - log.Int("jobID", jobID), + ctx, _, endObservation := c.operations.addExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), + otlog.Int("jobID", jobID), }}) defer endObservation(1, observation.Args{}) @@ -93,10 +104,10 @@ func (c *Client) AddExecutionLogEntry(ctx context.Context, queueName string, job } func (c *Client) UpdateExecutionLogEntry(ctx context.Context, queueName string, jobID, entryID int, entry workerutil.ExecutionLogEntry) (err error) { - ctx, _, endObservation := c.operations.updateExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", 
queueName), - log.Int("jobID", jobID), - log.Int("entryID", entryID), + ctx, _, endObservation := c.operations.updateExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), + otlog.Int("jobID", jobID), + otlog.Int("entryID", entryID), }}) defer endObservation(1, observation.Args{}) @@ -114,9 +125,9 @@ func (c *Client) UpdateExecutionLogEntry(ctx context.Context, queueName string, } func (c *Client) MarkComplete(ctx context.Context, queueName string, jobID int) (err error) { - ctx, _, endObservation := c.operations.markComplete.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", queueName), - log.Int("jobID", jobID), + ctx, _, endObservation := c.operations.markComplete.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), + otlog.Int("jobID", jobID), }}) defer endObservation(1, observation.Args{}) @@ -132,9 +143,9 @@ func (c *Client) MarkComplete(ctx context.Context, queueName string, jobID int) } func (c *Client) MarkErrored(ctx context.Context, queueName string, jobID int, errorMessage string) (err error) { - ctx, _, endObservation := c.operations.markErrored.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", queueName), - log.Int("jobID", jobID), + ctx, _, endObservation := c.operations.markErrored.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), + otlog.Int("jobID", jobID), }}) defer endObservation(1, observation.Args{}) @@ -151,9 +162,9 @@ func (c *Client) MarkErrored(ctx context.Context, queueName string, jobID int, e } func (c *Client) MarkFailed(ctx context.Context, queueName string, jobID int, errorMessage string) (err error) { - ctx, _, endObservation := c.operations.markFailed.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", queueName), - log.Int("jobID", jobID), + ctx, _, endObservation := c.operations.markFailed.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), + otlog.Int("jobID", jobID), }}) defer endObservation(1, observation.Args{}) @@ -197,12 +208,18 @@ func (c *Client) Ping(ctx context.Context, queueName string, jobIDs []int) (err } func (c *Client) Heartbeat(ctx context.Context, queueName string, jobIDs []int) (knownIDs []int, err error) { - ctx, _, endObservation := c.operations.heartbeat.With(ctx, &err, observation.Args{LogFields: []log.Field{ - log.String("queueName", queueName), - log.String("jobIDs", intsToString(jobIDs)), + ctx, _, endObservation := c.operations.heartbeat.With(ctx, &err, observation.Args{LogFields: []otlog.Field{ + otlog.String("queueName", queueName), + otlog.String("jobIDs", intsToString(jobIDs)), }}) defer endObservation(1, observation.Args{}) + metrics, err := gatherMetrics(c.logger, c.metricsGatherer) + if err != nil { + c.logger.Error("Failed to collect prometheus metrics for heartbeat", log.Error(err)) + // Continue, no metrics should not prevent heartbeats. 
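+		// In that case the heartbeat request simply carries an empty metrics string.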
+ } + req, err := c.makeRequest("POST", fmt.Sprintf("%s/heartbeat", queueName), executor.HeartbeatRequest{ ExecutorName: c.options.ExecutorName, JobIDs: jobIDs, @@ -214,6 +231,8 @@ func (c *Client) Heartbeat(ctx context.Context, queueName string, jobIDs []int) GitVersion: c.options.TelemetryOptions.GitVersion, IgniteVersion: c.options.TelemetryOptions.IgniteVersion, SrcCliVersion: c.options.TelemetryOptions.SrcCliVersion, + + PrometheusMetrics: metrics, }) if err != nil { return nil, err @@ -264,3 +283,27 @@ func intsToString(ints []int) string { return strings.Join(segments, ", ") } + +func gatherMetrics(logger log.Logger, gatherer prometheus.Gatherer) (string, error) { + maxDuration := 3 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), maxDuration) + defer cancel() + go func() { + <-ctx.Done() + if ctx.Err() == context.DeadlineExceeded { + logger.Warn("gathering metrics took longer than expected", log.Duration("maxDuration", maxDuration)) + } + }() + mfs, err := gatherer.Gather() + if err != nil { + return "", err + } + var buf bytes.Buffer + enc := expfmt.NewEncoder(&buf, expfmt.FmtText) + for _, mf := range mfs { + if err := enc.Encode(mf); err != nil { + return "", errors.Wrap(err, "encoding metric family") + } + } + return buf.String(), nil +} diff --git a/enterprise/cmd/executor/internal/apiclient/client_test.go b/enterprise/cmd/executor/internal/apiclient/client_test.go index 5d19b61a52c..4e73b02482d 100644 --- a/enterprise/cmd/executor/internal/apiclient/client_test.go +++ b/enterprise/cmd/executor/internal/apiclient/client_test.go @@ -11,6 +11,8 @@ import ( "time" "github.com/google/go-cmp/cmp" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/sourcegraph/sourcegraph/enterprise/internal/executor" "github.com/sourcegraph/sourcegraph/internal/observation" @@ -359,7 +361,9 @@ func TestHeartbeat(t *testing.T) { "executorVersion": "test-executor-version", "gitVersion": "test-git-version", "igniteVersion": "test-ignite-version", - "srcCliVersion": "test-src-cli-version" + "srcCliVersion": "test-src-cli-version", + + "prometheusMetrics": "" }`, responseStatus: http.StatusOK, responsePayload: `[1]`, @@ -393,7 +397,9 @@ func TestHeartbeatBadResponse(t *testing.T) { "executorVersion": "test-executor-version", "gitVersion": "test-git-version", "igniteVersion": "test-ignite-version", - "srcCliVersion": "test-src-cli-version" + "srcCliVersion": "test-src-cli-version", + + "prometheusMetrics": "" }`, responseStatus: http.StatusInternalServerError, responsePayload: ``, @@ -438,7 +444,8 @@ func testRoute(t *testing.T, spec routeSpec, f func(client *Client)) { }, } - f(New(options, &observation.TestContext)) + client := New(options, prometheus.GathererFunc(func() ([]*dto.MetricFamily, error) { return nil, nil }), &observation.TestContext) + f(client) } func testServer(t *testing.T, spec routeSpec) *httptest.Server { diff --git a/enterprise/cmd/executor/internal/metrics/metrics.go b/enterprise/cmd/executor/internal/metrics/metrics.go new file mode 100644 index 00000000000..7d78f3092f8 --- /dev/null +++ b/enterprise/cmd/executor/internal/metrics/metrics.go @@ -0,0 +1,144 @@ +package metrics + +import ( + "bytes" + "io" + "net/http" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + + "github.com/sourcegraph/log" + + "github.com/sourcegraph/sourcegraph/lib/errors" +) + +type metricsSyncPoint struct { 
+	notify *sync.Cond
+	result chan metricsResult
+}
+
+func newMetricsSyncPoint() metricsSyncPoint {
+	return metricsSyncPoint{
+		notify: sync.NewCond(&sync.Mutex{}),
+		result: make(chan metricsResult, 1),
+	}
+}
+
+type metricsResult struct {
+	metrics map[string]*dto.MetricFamily
+	err     error
+}
+
+// MakeExecutorMetricsGatherer uses the given prometheus gatherer to collect all current
+// metrics, and optionally also gathers metrics from node exporter and the docker
+// registry mirror, if configured.
+func MakeExecutorMetricsGatherer(
+	logger log.Logger,
+	gatherer prometheus.Gatherer,
+	// nodeExporterEndpoint is the URL of the local node_exporter endpoint, without
+	// the /metrics path. Disabled when an empty string is passed.
+	nodeExporterEndpoint string,
+	// dockerRegistryNodeExporterEndpoint is the URL of the intermediary caching docker
+	// registry's node_exporter, for scraping and forwarding metrics. Disabled when an
+	// empty string is passed.
+	dockerRegistryNodeExporterEndpoint string,
+) prometheus.GathererFunc {
+	nodeMetrics := newMetricsSyncPoint()
+	registryMetrics := newMetricsSyncPoint()
+
+	go backgroundCollectNodeExporterMetrics(nodeExporterEndpoint, nodeMetrics)
+	go backgroundCollectNodeExporterMetrics(dockerRegistryNodeExporterEndpoint, registryMetrics)
+
+	return func() (mfs []*dto.MetricFamily, err error) {
+		// notify to start a scrape
+		nodeMetrics.notify.Signal()
+		registryMetrics.notify.Signal()
+
+		mfs, err = gatherer.Gather()
+		if err != nil {
+			return nil, errors.Wrap(err, "getting default gatherer")
+		}
+
+		if nodeExporterEndpoint != "" {
+			result := <-nodeMetrics.result
+			if result.err != nil {
+				logger.Warn("failed to get metrics for node exporter", log.Error(result.err))
+			}
+			for key, mf := range result.metrics {
+				if strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") {
+					continue
+				}
+
+				mfs = append(mfs, mf)
+			}
+		}
+
+		if dockerRegistryNodeExporterEndpoint != "" {
+			result := <-registryMetrics.result
+			if result.err != nil {
+				logger.Warn("failed to get metrics for docker registry", log.Error(result.err))
+			}
+			for key, mf := range result.metrics {
+				if strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") {
+					continue
+				}
+
+				// should only be 1 registry, so we give it a set instance value
+				metricLabelInstance := "sg_instance"
+				instanceName := "docker-registry"
+				for _, m := range mf.Metric {
+					m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName})
+				}
+
+				mfs = append(mfs, mf)
+			}
+		}
+
+		return mfs, nil
+	}
+}
+
+// On notify, scrapes the specified endpoint for prometheus metrics and sends them down the
+// associated channel. If the endpoint is "", then the channel is closed so that subsequent
+// reads return an empty value instead of blocking indefinitely.
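+// Scrapes are triggered by the sync.Cond signal sent from the gatherer above; each
+// scrape issues a single HTTP GET to <endpoint>/metrics with a 2-second timeout.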
+func backgroundCollectNodeExporterMetrics(endpoint string, syncPoint metricsSyncPoint) { + if endpoint == "" { + close(syncPoint.result) + return + } + + collect := func() (map[string]*dto.MetricFamily, error) { + resp, err := (&http.Client{ + Timeout: 2 * time.Second, + }).Get(endpoint + "/metrics") + if err != nil { + return nil, err + } + defer resp.Body.Close() + + b, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + var parser expfmt.TextParser + mfMap, err := parser.TextToMetricFamilies(bytes.NewReader(b)) + return mfMap, errors.Wrapf(err, "parsing node_exporter metrics, response: %s", string(b)) + } + + for { + syncPoint.notify.L.Lock() + syncPoint.notify.Wait() + mfMap, err := collect() + if err != nil { + syncPoint.result <- metricsResult{err: err} + } else { + syncPoint.result <- metricsResult{metrics: mfMap} + } + syncPoint.notify.L.Unlock() + } +} diff --git a/enterprise/cmd/executor/internal/worker/worker.go b/enterprise/cmd/executor/internal/worker/worker.go index d59074b22f1..226d0b7ea78 100644 --- a/enterprise/cmd/executor/internal/worker/worker.go +++ b/enterprise/cmd/executor/internal/worker/worker.go @@ -8,10 +8,14 @@ import ( "time" "github.com/inconshreveable/log15" + "github.com/prometheus/client_golang/prometheus" + + "github.com/sourcegraph/log" "github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/apiclient" "github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/command" "github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/janitor" + "github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/metrics" "github.com/sourcegraph/sourcegraph/internal/goroutine" "github.com/sourcegraph/sourcegraph/internal/observation" "github.com/sourcegraph/sourcegraph/internal/workerutil" @@ -60,6 +64,14 @@ type Options struct { // ResourceOptions configures the resource limits of docker container and Firecracker // virtual machines running on the executor. ResourceOptions command.ResourceOptions + + // NodeExporterEndpoint is the URL of the local node_exporter endpoint, without + // the /metrics path. + NodeExporterEndpoint string + + // DockerRegsitryEndpoint is the URL of the intermediary caching docker registry, + // for scraping and forwarding metrics. + DockerRegistryNodeExporterEndpoint string } // NewWorker creates a worker that polls a remote job queue API for work. The returned @@ -67,8 +79,9 @@ type Options struct { // as a heartbeat routine that will periodically hit the remote API with the work that is // currently being performed, which is necessary so the job queue API doesn't hand out jobs // it thinks may have been dropped. 
-func NewWorker(nameSet *janitor.NameSet, options Options, observationContext *observation.Context) (worker goroutine.WaitableBackgroundRoutine) { - queueStore := apiclient.New(options.ClientOptions, observationContext) +func NewWorker(nameSet *janitor.NameSet, options Options, observationContext *observation.Context) goroutine.WaitableBackgroundRoutine { + gatherer := metrics.MakeExecutorMetricsGatherer(log.Scoped("executor-worker.metrics-gatherer", ""), prometheus.DefaultGatherer, options.NodeExporterEndpoint, options.DockerRegistryNodeExporterEndpoint) + queueStore := apiclient.New(options.ClientOptions, gatherer, observationContext) store := &storeShim{queueName: options.QueueName, queueStore: queueStore} if !connectToFrontend(queueStore, options) { diff --git a/enterprise/cmd/executor/vm-image/install.sh b/enterprise/cmd/executor/vm-image/install.sh index 692220c31c8..7a2ecbc37fc 100755 --- a/enterprise/cmd/executor/vm-image/install.sh +++ b/enterprise/cmd/executor/vm-image/install.sh @@ -6,7 +6,7 @@ export CNI_VERSION=v0.9.1 export KERNEL_IMAGE="weaveworks/ignite-kernel:5.10.51" export EXECUTOR_FIRECRACKER_IMAGE="sourcegraph/ignite-ubuntu:insiders" export NODE_EXPORTER_VERSION=1.2.2 -export EXPORTER_EXPORTER_VERSION=0.4.5 +export NODE_EXPORTER_ADDR="127.0.0.1:9100" ## Install ops agent ## Reference: https://cloud.google.com/logging/docs/agent/ops-agent/installation @@ -100,6 +100,7 @@ Environment=HOME="%h" Environment=SRC_LOG_LEVEL=dbug Environment=SRC_PROF_HTTP=127.0.0.1:6060 Environment=EXECUTOR_FIRECRACKER_IMAGE="${EXECUTOR_FIRECRACKER_IMAGE}" +Environment=NODE_EXPORTER_URL="http://${NODE_EXPORTER_ADDR}" [Install] WantedBy=multi-user.target @@ -147,7 +148,7 @@ Description=Node Exporter [Service] User=node_exporter ExecStart=/usr/local/bin/node_exporter \ - --web.listen-address="127.0.0.1:9100" \ + --web.listen-address="${NODE_EXPORTER_ADDR}" \ --collector.disable-defaults \ --collector.cpu \ --collector.loadavg \ @@ -169,42 +170,6 @@ EOF systemctl enable node_exporter } -function install_exporter_exporter() { - useradd --system --shell /bin/false exporter_exporter - - wget https://github.com/QubitProducts/exporter_exporter/releases/download/v${EXPORTER_EXPORTER_VERSION}/exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64.tar.gz - tar xvfz exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64.tar.gz - mv exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64/exporter_exporter /usr/local/bin/exporter_exporter - rm -rf exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64 exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64.tar.gz - - chown exporter_exporter:exporter_exporter /usr/local/bin/exporter_exporter - - cat </usr/local/bin/exporter_exporter.yaml -modules: - node: - method: http - http: - port: 9100 - executor: - method: http - http: - port: 6060 -EOF - - cat </etc/systemd/system/exporter_exporter.service -[Unit] -Description=Exporter Exporter -[Service] -User=exporter_exporter -ExecStart=/usr/local/bin/exporter_exporter -config.file "/usr/local/bin/exporter_exporter.yaml" -[Install] -WantedBy=multi-user.target -EOF - - systemctl daemon-reload - systemctl enable exporter_exporter -} - # Install src-cli to the host system. It's needed for src steps outside of firecracker. 
function install_src_cli() { curl -f -L -o src-cli.tar.gz "https://github.com/sourcegraph/src-cli/releases/download/${SRC_CLI_VERSION}/src-cli_${SRC_CLI_VERSION}_linux_amd64.tar.gz" @@ -252,7 +217,6 @@ install_ignite # Services install_executor install_node_exporter -install_exporter_exporter # Service prep and cleanup generate_ignite_base_image diff --git a/enterprise/cmd/frontend/internal/executorqueue/handler/handler.go b/enterprise/cmd/frontend/internal/executorqueue/handler/handler.go index a55b354fdf5..8f7c2625b78 100644 --- a/enterprise/cmd/frontend/internal/executorqueue/handler/handler.go +++ b/enterprise/cmd/frontend/internal/executorqueue/handler/handler.go @@ -7,6 +7,7 @@ import ( "github.com/sourcegraph/log" apiclient "github.com/sourcegraph/sourcegraph/enterprise/internal/executor" + metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store" executor "github.com/sourcegraph/sourcegraph/internal/services/executors/store" "github.com/sourcegraph/sourcegraph/internal/types" "github.com/sourcegraph/sourcegraph/internal/workerutil" @@ -17,6 +18,8 @@ import ( type handler struct { QueueOptions executorStore executor.Store + metricsStore metricsstore.DistributedStore + logger log.Logger } type QueueOptions struct { @@ -31,9 +34,11 @@ type QueueOptions struct { RecordTransformer func(ctx context.Context, record workerutil.Record) (apiclient.Job, error) } -func newHandler(executorStore executor.Store, queueOptions QueueOptions) *handler { +func newHandler(executorStore executor.Store, metricsStore metricsstore.DistributedStore, queueOptions QueueOptions) *handler { return &handler{ executorStore: executorStore, + metricsStore: metricsStore, + logger: log.Scoped("executor-queue-handler", "The route handler for all executor dbworker API tunnel endpoints"), QueueOptions: queueOptions, } } diff --git a/enterprise/cmd/frontend/internal/executorqueue/handler/handler_test.go b/enterprise/cmd/frontend/internal/executorqueue/handler/handler_test.go index 507880399f4..f2161e00cce 100644 --- a/enterprise/cmd/frontend/internal/executorqueue/handler/handler_test.go +++ b/enterprise/cmd/frontend/internal/executorqueue/handler/handler_test.go @@ -7,6 +7,7 @@ import ( "github.com/google/go-cmp/cmp" apiclient "github.com/sourcegraph/sourcegraph/enterprise/internal/executor" + metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store" "github.com/sourcegraph/sourcegraph/internal/types" "github.com/sourcegraph/sourcegraph/internal/workerutil" "github.com/sourcegraph/sourcegraph/internal/workerutil/dbworker/store" @@ -39,8 +40,9 @@ func TestDequeue(t *testing.T) { } executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() - handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) job, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -58,7 +60,10 @@ func TestDequeue(t *testing.T) { } func TestDequeueNoRecord(t *testing.T) { - handler := newHandler(NewMockStore(), QueueOptions{Store: workerstoremocks.NewMockStore()}) + executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() + + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: workerstoremocks.NewMockStore()}) _, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -79,8 +84,9 @@ func TestAddExecutionLogEntry(t 
*testing.T) { store.AddExecutionLogEntryFunc.SetDefaultReturn(fakeEntryID, nil) executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() - handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) job, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -118,7 +124,8 @@ func TestAddExecutionLogEntryUnknownJob(t *testing.T) { store := workerstoremocks.NewMockStore() store.AddExecutionLogEntryFunc.SetDefaultReturn(0, workerstore.ErrExecutionLogEntryNotUpdated) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) entry := workerutil.ExecutionLogEntry{ Command: []string{"ls", "-a"}, @@ -137,8 +144,9 @@ func TestUpdateExecutionLogEntry(t *testing.T) { } executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() - handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) job, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -176,7 +184,8 @@ func TestUpdateExecutionLogEntryUnknownJob(t *testing.T) { store := workerstoremocks.NewMockStore() store.UpdateExecutionLogEntryFunc.SetDefaultReturn(workerstore.ErrExecutionLogEntryNotUpdated) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) entry := workerutil.ExecutionLogEntry{ Command: []string{"ls", "-a"}, @@ -196,8 +205,9 @@ func TestMarkComplete(t *testing.T) { } executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() - handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) job, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -224,7 +234,8 @@ func TestMarkCompleteUnknownJob(t *testing.T) { store := workerstoremocks.NewMockStore() store.MarkCompleteFunc.SetDefaultReturn(false, nil) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) if err := handler.markComplete(context.Background(), "deadbeef", 42); err != ErrUnknownJob { t.Fatalf("unexpected error. 
want=%q have=%q", ErrUnknownJob, err) @@ -236,7 +247,8 @@ func TestMarkCompleteStoreError(t *testing.T) { internalErr := errors.New("something went wrong") store.MarkCompleteFunc.SetDefaultReturn(false, internalErr) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) if err := handler.markComplete(context.Background(), "deadbeef", 42); err == nil || errors.UnwrapAll(err).Error() != internalErr.Error() { t.Fatalf("unexpected error. want=%q have=%q", internalErr, errors.UnwrapAll(err)) @@ -252,8 +264,9 @@ func TestMarkErrored(t *testing.T) { } executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() - handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) job, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -283,7 +296,8 @@ func TestMarkErroredUnknownJob(t *testing.T) { store := workerstoremocks.NewMockStore() store.MarkErroredFunc.SetDefaultReturn(false, nil) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) if err := handler.markErrored(context.Background(), "deadbeef", 42, "OH NO"); err != ErrUnknownJob { t.Fatalf("unexpected error. want=%q have=%q", ErrUnknownJob, err) @@ -295,7 +309,8 @@ func TestMarkErroredStoreError(t *testing.T) { storeErr := errors.New("something went wrong") store.MarkErroredFunc.SetDefaultReturn(false, storeErr) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) if err := handler.markErrored(context.Background(), "deadbeef", 42, "OH NO"); err == nil || errors.UnwrapAll(err).Error() != storeErr.Error() { t.Fatalf("unexpected error. want=%q have=%q", storeErr, errors.UnwrapAll(err)) @@ -311,8 +326,9 @@ func TestMarkFailed(t *testing.T) { } executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() - handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer}) job, dequeued, err := handler.dequeue(context.Background(), "deadbeef") if err != nil { @@ -342,7 +358,8 @@ func TestMarkFailedUnknownJob(t *testing.T) { store := workerstoremocks.NewMockStore() store.MarkFailedFunc.SetDefaultReturn(false, nil) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) if err := handler.markFailed(context.Background(), "deadbeef", 42, "OH NO"); err != ErrUnknownJob { t.Fatalf("unexpected error. 
want=%q have=%q", ErrUnknownJob, err) @@ -354,7 +371,8 @@ func TestMarkFailedStoreError(t *testing.T) { storeErr := errors.New("something went wrong") store.MarkFailedFunc.SetDefaultReturn(false, storeErr) executorStore := NewMockStore() - handler := newHandler(executorStore, QueueOptions{Store: store}) + metricsStore := metricsstore.NewMockDistributedStore() + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store}) if err := handler.markFailed(context.Background(), "deadbeef", 42, "OH NO"); err == nil || errors.UnwrapAll(err).Error() != storeErr.Error() { t.Fatalf("unexpected error. want=%q have=%q", storeErr, errors.UnwrapAll(err)) @@ -372,6 +390,7 @@ func TestHeartbeat(t *testing.T) { }) executorStore := NewMockStore() + metricsStore := metricsstore.NewMockDistributedStore() executor := types.Executor{ Hostname: "test-hostname", @@ -385,7 +404,7 @@ func TestHeartbeat(t *testing.T) { SrcCliVersion: "test-src-cli-version", } - handler := newHandler(executorStore, QueueOptions{Store: s, RecordTransformer: recordTransformer}) + handler := newHandler(executorStore, metricsStore, QueueOptions{Store: s, RecordTransformer: recordTransformer}) if knownIDs, err := handler.heartbeat(context.Background(), executor, []int{testKnownID, 10}); err != nil { t.Fatalf("unexpected error performing heartbeat: %s", err) diff --git a/enterprise/cmd/frontend/internal/executorqueue/handler/routes.go b/enterprise/cmd/frontend/internal/executorqueue/handler/routes.go index ca2896f5539..a779ca8d722 100644 --- a/enterprise/cmd/frontend/internal/executorqueue/handler/routes.go +++ b/enterprise/cmd/frontend/internal/executorqueue/handler/routes.go @@ -6,21 +6,29 @@ import ( "fmt" "io" "net/http" + "strings" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" "github.com/gorilla/mux" "github.com/grafana/regexp" "github.com/inconshreveable/log15" + "github.com/sourcegraph/log" + apiclient "github.com/sourcegraph/sourcegraph/enterprise/internal/executor" + metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store" executor "github.com/sourcegraph/sourcegraph/internal/services/executors/store" "github.com/sourcegraph/sourcegraph/internal/types" + "github.com/sourcegraph/sourcegraph/lib/errors" ) // SetupRoutes registers all route handlers required for all configured executor // queues with the given router. -func SetupRoutes(executorStore executor.Store, queueOptionsMap []QueueOptions, router *mux.Router) { +func SetupRoutes(executorStore executor.Store, metricsStore metricsstore.DistributedStore, queueOptionsMap []QueueOptions, router *mux.Router) { for _, queueOptions := range queueOptionsMap { - h := newHandler(executorStore, queueOptions) + h := newHandler(executorStore, metricsStore, queueOptions) subRouter := router.PathPrefix(fmt.Sprintf("/{queueName:(?:%s)}/", regexp.QuoteMeta(queueOptions.Name))).Subrouter() routes := map[string]func(w http.ResponseWriter, r *http.Request){ @@ -132,6 +140,22 @@ func (h *handler) handleHeartbeat(w http.ResponseWriter, r *http.Request) { SrcCliVersion: payload.SrcCliVersion, } + // Handle metrics in the background, this should not delay the heartbeat response being + // delivered. It is critical for keeping jobs alive. + go func() { + metrics, err := decodeAndLabelMetrics(payload.PrometheusMetrics, payload.ExecutorName) + if err != nil { + // Just log the error but don't panic. The heartbeat is more important. 
+ h.logger.Error("failed to decode metrics and apply labels for executor heartbeat", log.Error(err)) + return + } + + if err := h.metricsStore.Ingest(payload.ExecutorName, metrics); err != nil { + // Just log the error but don't panic. The heartbeat is more important. + h.logger.Error("failed to ingest metrics for executor heartbeat", log.Error(err)) + } + }() + unknownIDs, err := h.heartbeat(r.Context(), executor, payload.JobIDs) return http.StatusOK, unknownIDs, err }) @@ -184,3 +208,40 @@ func (h *handler) wrapHandler(w http.ResponseWriter, r *http.Request, payload an _, _ = io.Copy(w, bytes.NewReader(data)) } } + +// decodeAndLabelMetrics decodes the text serialized prometheus metrics dump and then +// applies common labels. +func decodeAndLabelMetrics(encodedMetrics, instanceName string) ([]*dto.MetricFamily, error) { + data := []*dto.MetricFamily{} + + dec := expfmt.NewDecoder(strings.NewReader(encodedMetrics), expfmt.FmtText) + for { + var mf dto.MetricFamily + if err := dec.Decode(&mf); err != nil { + if err == io.EOF { + break + } + + return nil, errors.Wrap(err, "decoding metric family") + } + + // Attach the extra labels. + metricLabelInstance := "sg_instance" + metricLabelJob := "sg_job" + job := "sourcegraph-executors" + for _, m := range mf.Metric { + var found bool + for _, l := range m.Label { + found = found || *l.Name == metricLabelInstance + } + if !found { + m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName}) + } + m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelJob, Value: &job}) + } + + data = append(data, &mf) + } + + return data, nil +} diff --git a/enterprise/cmd/frontend/internal/executorqueue/queuehandler.go b/enterprise/cmd/frontend/internal/executorqueue/queuehandler.go index 1fffcb1d545..164193f079c 100644 --- a/enterprise/cmd/frontend/internal/executorqueue/queuehandler.go +++ b/enterprise/cmd/frontend/internal/executorqueue/queuehandler.go @@ -12,10 +12,12 @@ import ( "github.com/sourcegraph/sourcegraph/internal/actor" "github.com/sourcegraph/sourcegraph/internal/database" "github.com/sourcegraph/sourcegraph/internal/gitserver" + metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store" executorDB "github.com/sourcegraph/sourcegraph/internal/services/executors/store/db" ) func newExecutorQueueHandler(db database.DB, queueOptions []handler.QueueOptions, accessToken func() string, uploadHandler http.Handler) (func() http.Handler, error) { + metricsStore := metricsstore.NewDistributedStore("executors:") executorStore := executorDB.New(db) gitserverClient := gitserver.NewClient(db) @@ -29,7 +31,7 @@ func newExecutorQueueHandler(db database.DB, queueOptions []handler.QueueOptions base.Path("/git/{RepoName:.*}/git-upload-pack").Handler(gitserverProxy(gitserverClient, "/git-upload-pack")) // Serve the executor queue API. - handler.SetupRoutes(executorStore, queueOptions, base.PathPrefix("/queue/").Subrouter()) + handler.SetupRoutes(executorStore, metricsStore, queueOptions, base.PathPrefix("/queue/").Subrouter()) // Upload LSIF indexes without a sudo access token or github tokens. 
base.Path("/lsif/upload").Methods("POST").Handler(uploadHandler) diff --git a/enterprise/cmd/worker/internal/executors/metricsserver_config.go b/enterprise/cmd/worker/internal/executors/metricsserver_config.go new file mode 100644 index 00000000000..51ce5119485 --- /dev/null +++ b/enterprise/cmd/worker/internal/executors/metricsserver_config.go @@ -0,0 +1,17 @@ +package executors + +import ( + "github.com/sourcegraph/sourcegraph/internal/env" +) + +type metricsServerConfig struct { + env.BaseConfig + + MetricsServerPort int +} + +var metricsServerConfigInst = &metricsServerConfig{} + +func (c *metricsServerConfig) Load() { + c.MetricsServerPort = c.GetInt("EXECUTORS_METRICS_SERVER_PORT", "6996", "The port to listen on to serve the metrics from executors.") +} diff --git a/enterprise/cmd/worker/internal/executors/metricsserver_job.go b/enterprise/cmd/worker/internal/executors/metricsserver_job.go new file mode 100644 index 00000000000..9f0abe9235c --- /dev/null +++ b/enterprise/cmd/worker/internal/executors/metricsserver_job.go @@ -0,0 +1,50 @@ +package executors + +import ( + "context" + "net" + "net/http" + "strconv" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sourcegraph/log" + + "github.com/sourcegraph/sourcegraph/cmd/worker/job" + "github.com/sourcegraph/sourcegraph/internal/env" + "github.com/sourcegraph/sourcegraph/internal/goroutine" + "github.com/sourcegraph/sourcegraph/internal/httpserver" + metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store" +) + +type metricsServerJob struct{} + +func NewMetricsServerJob() job.Job { + return &metricsServerJob{} +} + +func (j *metricsServerJob) Description() string { + return "HTTP server exposing the metrics collected from executors to Prometheus" +} + +func (j *metricsServerJob) Config() []env.Config { + return []env.Config{metricsServerConfigInst} +} + +func (j *metricsServerJob) Routines(ctx context.Context, logger log.Logger) ([]goroutine.BackgroundRoutine, error) { + host := "" + if env.InsecureDev { + host = "127.0.0.1" + } + addr := net.JoinHostPort(host, strconv.Itoa(metricsServerConfigInst.MetricsServerPort)) + + metricsStore := metricsstore.NewDistributedStore("executors:") + + handler := promhttp.HandlerFor(prometheus.GathererFunc(metricsStore.Gather), promhttp.HandlerOpts{}) + + routines := []goroutine.BackgroundRoutine{ + httpserver.NewFromAddr(addr, &http.Server{Handler: handler}), + } + + return routines, nil +} diff --git a/enterprise/cmd/worker/main.go b/enterprise/cmd/worker/main.go index 93517552b81..07a07c596d3 100644 --- a/enterprise/cmd/worker/main.go +++ b/enterprise/cmd/worker/main.go @@ -51,6 +51,7 @@ func main() { "batches-bulk-processor": batches.NewBulkOperationProcessorJob(), "batches-workspace-resolver": batches.NewWorkspaceResolverJob(), "executors-janitor": executors.NewJanitorJob(), + "executors-metricsserver": executors.NewMetricsServerJob(), "codemonitors-job": codemonitors.NewCodeMonitorJob(), "bitbucket-project-permissions": permissions.NewBitbucketProjectPermissionsJob(), "export-usage-telemetry": telemetry.NewTelemetryJob(), diff --git a/enterprise/dev/src-prof-services.json b/enterprise/dev/src-prof-services.json index 718c5dbc8ec..eca7b983df9 100644 --- a/enterprise/dev/src-prof-services.json +++ b/enterprise/dev/src-prof-services.json @@ -5,5 +5,6 @@ { "Name": "symbols", "Host": "127.0.0.1:6071" }, { "Name": "repo-updater", "Host": "127.0.0.1:6074" }, { "Name": "precise-code-intel-worker", "Host": 
"127.0.0.1:6088" }, - { "Name": "worker", "Host": "127.0.0.1:6089" } + { "Name": "worker", "Host": "127.0.0.1:6089" }, + { "Name": "worker-executors", "Host": "127.0.0.1:6969" } ] diff --git a/enterprise/internal/executor/client_types.go b/enterprise/internal/executor/client_types.go index 1fd39640dd3..b8cec48f292 100644 --- a/enterprise/internal/executor/client_types.go +++ b/enterprise/internal/executor/client_types.go @@ -122,6 +122,8 @@ type HeartbeatRequest struct { GitVersion string `json:"gitVersion"` IgniteVersion string `json:"igniteVersion"` SrcCliVersion string `json:"srcCliVersion"` + + PrometheusMetrics string `json:"prometheusMetrics"` } type CanceledJobsRequest struct { diff --git a/go.mod b/go.mod index f70d1db76bd..413c6bbeb4f 100644 --- a/go.mod +++ b/go.mod @@ -360,7 +360,7 @@ require ( github.com/pkg/profile v1.6.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pquerna/cachecontrol v0.1.0 // indirect - github.com/prometheus/client_model v0.2.0 // indirect + github.com/prometheus/client_model v0.2.0 github.com/prometheus/common/sigv4 v0.1.0 // indirect github.com/prometheus/procfs v0.7.3 // indirect github.com/pseudomuto/protoc-gen-doc v1.5.1 // indirect diff --git a/internal/metrics/store/mocks_temp.go b/internal/metrics/store/mocks_temp.go new file mode 100644 index 00000000000..ae2e4e9235d --- /dev/null +++ b/internal/metrics/store/mocks_temp.go @@ -0,0 +1,426 @@ +// Code generated by go-mockgen 1.3.3; DO NOT EDIT. +// +// This file was generated by running `sg generate` (or `go-mockgen`) at the root of +// this repository. To add additional mocks to this or another package, add a new entry +// to the mockgen.yaml file in the root of this repository. + +package store + +import ( + "sync" + + go1 "github.com/prometheus/client_model/go" +) + +// MockDistributedStore is a mock implementation of the DistributedStore +// interface (from the package +// github.com/sourcegraph/sourcegraph/internal/metrics/store) used for unit +// testing. +type MockDistributedStore struct { + // GatherFunc is an instance of a mock function object controlling the + // behavior of the method Gather. + GatherFunc *DistributedStoreGatherFunc + // IngestFunc is an instance of a mock function object controlling the + // behavior of the method Ingest. + IngestFunc *DistributedStoreIngestFunc +} + +// NewMockDistributedStore creates a new mock of the DistributedStore +// interface. All methods return zero values for all results, unless +// overwritten. +func NewMockDistributedStore() *MockDistributedStore { + return &MockDistributedStore{ + GatherFunc: &DistributedStoreGatherFunc{ + defaultHook: func() (r0 []*go1.MetricFamily, r1 error) { + return + }, + }, + IngestFunc: &DistributedStoreIngestFunc{ + defaultHook: func(string, []*go1.MetricFamily) (r0 error) { + return + }, + }, + } +} + +// NewStrictMockDistributedStore creates a new mock of the DistributedStore +// interface. All methods panic on invocation, unless overwritten. +func NewStrictMockDistributedStore() *MockDistributedStore { + return &MockDistributedStore{ + GatherFunc: &DistributedStoreGatherFunc{ + defaultHook: func() ([]*go1.MetricFamily, error) { + panic("unexpected invocation of MockDistributedStore.Gather") + }, + }, + IngestFunc: &DistributedStoreIngestFunc{ + defaultHook: func(string, []*go1.MetricFamily) error { + panic("unexpected invocation of MockDistributedStore.Ingest") + }, + }, + } +} + +// NewMockDistributedStoreFrom creates a new mock of the +// MockDistributedStore interface. 
All methods delegate to the given +// implementation, unless overwritten. +func NewMockDistributedStoreFrom(i DistributedStore) *MockDistributedStore { + return &MockDistributedStore{ + GatherFunc: &DistributedStoreGatherFunc{ + defaultHook: i.Gather, + }, + IngestFunc: &DistributedStoreIngestFunc{ + defaultHook: i.Ingest, + }, + } +} + +// DistributedStoreGatherFunc describes the behavior when the Gather method +// of the parent MockDistributedStore instance is invoked. +type DistributedStoreGatherFunc struct { + defaultHook func() ([]*go1.MetricFamily, error) + hooks []func() ([]*go1.MetricFamily, error) + history []DistributedStoreGatherFuncCall + mutex sync.Mutex +} + +// Gather delegates to the next hook function in the queue and stores the +// parameter and result values of this invocation. +func (m *MockDistributedStore) Gather() ([]*go1.MetricFamily, error) { + r0, r1 := m.GatherFunc.nextHook()() + m.GatherFunc.appendCall(DistributedStoreGatherFuncCall{r0, r1}) + return r0, r1 +} + +// SetDefaultHook sets function that is called when the Gather method of the +// parent MockDistributedStore instance is invoked and the hook queue is +// empty. +func (f *DistributedStoreGatherFunc) SetDefaultHook(hook func() ([]*go1.MetricFamily, error)) { + f.defaultHook = hook +} + +// PushHook adds a function to the end of hook queue. Each invocation of the +// Gather method of the parent MockDistributedStore instance invokes the +// hook at the front of the queue and discards it. After the queue is empty, +// the default hook function is invoked for any future action. +func (f *DistributedStoreGatherFunc) PushHook(hook func() ([]*go1.MetricFamily, error)) { + f.mutex.Lock() + f.hooks = append(f.hooks, hook) + f.mutex.Unlock() +} + +// SetDefaultReturn calls SetDefaultHook with a function that returns the +// given values. +func (f *DistributedStoreGatherFunc) SetDefaultReturn(r0 []*go1.MetricFamily, r1 error) { + f.SetDefaultHook(func() ([]*go1.MetricFamily, error) { + return r0, r1 + }) +} + +// PushReturn calls PushHook with a function that returns the given values. +func (f *DistributedStoreGatherFunc) PushReturn(r0 []*go1.MetricFamily, r1 error) { + f.PushHook(func() ([]*go1.MetricFamily, error) { + return r0, r1 + }) +} + +func (f *DistributedStoreGatherFunc) nextHook() func() ([]*go1.MetricFamily, error) { + f.mutex.Lock() + defer f.mutex.Unlock() + + if len(f.hooks) == 0 { + return f.defaultHook + } + + hook := f.hooks[0] + f.hooks = f.hooks[1:] + return hook +} + +func (f *DistributedStoreGatherFunc) appendCall(r0 DistributedStoreGatherFuncCall) { + f.mutex.Lock() + f.history = append(f.history, r0) + f.mutex.Unlock() +} + +// History returns a sequence of DistributedStoreGatherFuncCall objects +// describing the invocations of this function. +func (f *DistributedStoreGatherFunc) History() []DistributedStoreGatherFuncCall { + f.mutex.Lock() + history := make([]DistributedStoreGatherFuncCall, len(f.history)) + copy(history, f.history) + f.mutex.Unlock() + + return history +} + +// DistributedStoreGatherFuncCall is an object that describes an invocation +// of method Gather on an instance of MockDistributedStore. +type DistributedStoreGatherFuncCall struct { + // Result0 is the value of the 1st result returned from this method + // invocation. + Result0 []*go1.MetricFamily + // Result1 is the value of the 2nd result returned from this method + // invocation. + Result1 error +} + +// Args returns an interface slice containing the arguments of this +// invocation. 
+func (c DistributedStoreGatherFuncCall) Args() []interface{} { + return []interface{}{} +} + +// Results returns an interface slice containing the results of this +// invocation. +func (c DistributedStoreGatherFuncCall) Results() []interface{} { + return []interface{}{c.Result0, c.Result1} +} + +// DistributedStoreIngestFunc describes the behavior when the Ingest method +// of the parent MockDistributedStore instance is invoked. +type DistributedStoreIngestFunc struct { + defaultHook func(string, []*go1.MetricFamily) error + hooks []func(string, []*go1.MetricFamily) error + history []DistributedStoreIngestFuncCall + mutex sync.Mutex +} + +// Ingest delegates to the next hook function in the queue and stores the +// parameter and result values of this invocation. +func (m *MockDistributedStore) Ingest(v0 string, v1 []*go1.MetricFamily) error { + r0 := m.IngestFunc.nextHook()(v0, v1) + m.IngestFunc.appendCall(DistributedStoreIngestFuncCall{v0, v1, r0}) + return r0 +} + +// SetDefaultHook sets function that is called when the Ingest method of the +// parent MockDistributedStore instance is invoked and the hook queue is +// empty. +func (f *DistributedStoreIngestFunc) SetDefaultHook(hook func(string, []*go1.MetricFamily) error) { + f.defaultHook = hook +} + +// PushHook adds a function to the end of hook queue. Each invocation of the +// Ingest method of the parent MockDistributedStore instance invokes the +// hook at the front of the queue and discards it. After the queue is empty, +// the default hook function is invoked for any future action. +func (f *DistributedStoreIngestFunc) PushHook(hook func(string, []*go1.MetricFamily) error) { + f.mutex.Lock() + f.hooks = append(f.hooks, hook) + f.mutex.Unlock() +} + +// SetDefaultReturn calls SetDefaultHook with a function that returns the +// given values. +func (f *DistributedStoreIngestFunc) SetDefaultReturn(r0 error) { + f.SetDefaultHook(func(string, []*go1.MetricFamily) error { + return r0 + }) +} + +// PushReturn calls PushHook with a function that returns the given values. +func (f *DistributedStoreIngestFunc) PushReturn(r0 error) { + f.PushHook(func(string, []*go1.MetricFamily) error { + return r0 + }) +} + +func (f *DistributedStoreIngestFunc) nextHook() func(string, []*go1.MetricFamily) error { + f.mutex.Lock() + defer f.mutex.Unlock() + + if len(f.hooks) == 0 { + return f.defaultHook + } + + hook := f.hooks[0] + f.hooks = f.hooks[1:] + return hook +} + +func (f *DistributedStoreIngestFunc) appendCall(r0 DistributedStoreIngestFuncCall) { + f.mutex.Lock() + f.history = append(f.history, r0) + f.mutex.Unlock() +} + +// History returns a sequence of DistributedStoreIngestFuncCall objects +// describing the invocations of this function. +func (f *DistributedStoreIngestFunc) History() []DistributedStoreIngestFuncCall { + f.mutex.Lock() + history := make([]DistributedStoreIngestFuncCall, len(f.history)) + copy(history, f.history) + f.mutex.Unlock() + + return history +} + +// DistributedStoreIngestFuncCall is an object that describes an invocation +// of method Ingest on an instance of MockDistributedStore. +type DistributedStoreIngestFuncCall struct { + // Arg0 is the value of the 1st argument passed to this method + // invocation. + Arg0 string + // Arg1 is the value of the 2nd argument passed to this method + // invocation. + Arg1 []*go1.MetricFamily + // Result0 is the value of the 1st result returned from this method + // invocation. + Result0 error +} + +// Args returns an interface slice containing the arguments of this +// invocation. 
+func (c DistributedStoreIngestFuncCall) Args() []interface{} { + return []interface{}{c.Arg0, c.Arg1} +} + +// Results returns an interface slice containing the results of this +// invocation. +func (c DistributedStoreIngestFuncCall) Results() []interface{} { + return []interface{}{c.Result0} +} + +// MockStore is a mock implementation of the Store interface (from the +// package github.com/sourcegraph/sourcegraph/internal/metrics/store) used +// for unit testing. +type MockStore struct { + // GatherFunc is an instance of a mock function object controlling the + // behavior of the method Gather. + GatherFunc *StoreGatherFunc +} + +// NewMockStore creates a new mock of the Store interface. All methods +// return zero values for all results, unless overwritten. +func NewMockStore() *MockStore { + return &MockStore{ + GatherFunc: &StoreGatherFunc{ + defaultHook: func() (r0 []*go1.MetricFamily, r1 error) { + return + }, + }, + } +} + +// NewStrictMockStore creates a new mock of the Store interface. All methods +// panic on invocation, unless overwritten. +func NewStrictMockStore() *MockStore { + return &MockStore{ + GatherFunc: &StoreGatherFunc{ + defaultHook: func() ([]*go1.MetricFamily, error) { + panic("unexpected invocation of MockStore.Gather") + }, + }, + } +} + +// NewMockStoreFrom creates a new mock of the MockStore interface. All +// methods delegate to the given implementation, unless overwritten. +func NewMockStoreFrom(i Store) *MockStore { + return &MockStore{ + GatherFunc: &StoreGatherFunc{ + defaultHook: i.Gather, + }, + } +} + +// StoreGatherFunc describes the behavior when the Gather method of the +// parent MockStore instance is invoked. +type StoreGatherFunc struct { + defaultHook func() ([]*go1.MetricFamily, error) + hooks []func() ([]*go1.MetricFamily, error) + history []StoreGatherFuncCall + mutex sync.Mutex +} + +// Gather delegates to the next hook function in the queue and stores the +// parameter and result values of this invocation. +func (m *MockStore) Gather() ([]*go1.MetricFamily, error) { + r0, r1 := m.GatherFunc.nextHook()() + m.GatherFunc.appendCall(StoreGatherFuncCall{r0, r1}) + return r0, r1 +} + +// SetDefaultHook sets function that is called when the Gather method of the +// parent MockStore instance is invoked and the hook queue is empty. +func (f *StoreGatherFunc) SetDefaultHook(hook func() ([]*go1.MetricFamily, error)) { + f.defaultHook = hook +} + +// PushHook adds a function to the end of hook queue. Each invocation of the +// Gather method of the parent MockStore instance invokes the hook at the +// front of the queue and discards it. After the queue is empty, the default +// hook function is invoked for any future action. +func (f *StoreGatherFunc) PushHook(hook func() ([]*go1.MetricFamily, error)) { + f.mutex.Lock() + f.hooks = append(f.hooks, hook) + f.mutex.Unlock() +} + +// SetDefaultReturn calls SetDefaultHook with a function that returns the +// given values. +func (f *StoreGatherFunc) SetDefaultReturn(r0 []*go1.MetricFamily, r1 error) { + f.SetDefaultHook(func() ([]*go1.MetricFamily, error) { + return r0, r1 + }) +} + +// PushReturn calls PushHook with a function that returns the given values. 
+func (f *StoreGatherFunc) PushReturn(r0 []*go1.MetricFamily, r1 error) { + f.PushHook(func() ([]*go1.MetricFamily, error) { + return r0, r1 + }) +} + +func (f *StoreGatherFunc) nextHook() func() ([]*go1.MetricFamily, error) { + f.mutex.Lock() + defer f.mutex.Unlock() + + if len(f.hooks) == 0 { + return f.defaultHook + } + + hook := f.hooks[0] + f.hooks = f.hooks[1:] + return hook +} + +func (f *StoreGatherFunc) appendCall(r0 StoreGatherFuncCall) { + f.mutex.Lock() + f.history = append(f.history, r0) + f.mutex.Unlock() +} + +// History returns a sequence of StoreGatherFuncCall objects describing the +// invocations of this function. +func (f *StoreGatherFunc) History() []StoreGatherFuncCall { + f.mutex.Lock() + history := make([]StoreGatherFuncCall, len(f.history)) + copy(history, f.history) + f.mutex.Unlock() + + return history +} + +// StoreGatherFuncCall is an object that describes an invocation of method +// Gather on an instance of MockStore. +type StoreGatherFuncCall struct { + // Result0 is the value of the 1st result returned from this method + // invocation. + Result0 []*go1.MetricFamily + // Result1 is the value of the 2nd result returned from this method + // invocation. + Result1 error +} + +// Args returns an interface slice containing the arguments of this +// invocation. +func (c StoreGatherFuncCall) Args() []interface{} { + return []interface{}{} +} + +// Results returns an interface slice containing the results of this +// invocation. +func (c StoreGatherFuncCall) Results() []interface{} { + return []interface{}{c.Result0, c.Result1} +} diff --git a/internal/metrics/store/store.go b/internal/metrics/store/store.go new file mode 100644 index 00000000000..dcc771408e5 --- /dev/null +++ b/internal/metrics/store/store.go @@ -0,0 +1,117 @@ +package store + +import ( + "bytes" + "io" + "strings" + + "github.com/gomodule/redigo/redis" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + + "github.com/sourcegraph/sourcegraph/internal/redispool" + "github.com/sourcegraph/sourcegraph/lib/errors" +) + +const DefaultMetricsExpiry = 30 + +type Store interface { + prometheus.Gatherer +} + +func NewDefaultStore() Store { + return &defaultStore{} +} + +type defaultStore struct{} + +func (*defaultStore) Gather() ([]*dto.MetricFamily, error) { + return prometheus.DefaultGatherer.Gather() +} + +type DistributedStore interface { + Store + Ingest(instance string, mfs []*dto.MetricFamily) error +} + +func NewDistributedStore(prefix string) DistributedStore { + return &distributedStore{ + prefix: prefix, + expiry: DefaultMetricsExpiry, + } +} + +type distributedStore struct { + prefix string + expiry int +} + +func (d *distributedStore) Gather() ([]*dto.MetricFamily, error) { + reConn := redispool.Cache.Get() + defer reConn.Close() + + // First, list all the keys for which we hold metrics. + keys, err := redis.Values(reConn.Do("KEYS", d.prefix+"*")) + if err != nil { + return nil, errors.Wrap(err, "listing entries from redis") + } + + if len(keys) == 0 { + return nil, nil + } + + // Then bulk retrieve all the metrics blobs for all the instances. + encodedMetrics, err := redis.Strings(reConn.Do("MGET", keys...)) + if err != nil { + return nil, errors.Wrap(err, "retrieving blobs from redis") + } + + // Then decode the serialized metrics into proper metric families required + // by the Gatherer interface. 
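+	// Families from all live instances are appended to one flat slice; instances whose
+	// keys have expired (see Ingest below) simply stop contributing.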
+ mfs := []*dto.MetricFamily{} + for _, metrics := range encodedMetrics { + // Decode each metrics blob separately. + dec := expfmt.NewDecoder(strings.NewReader(metrics), expfmt.FmtText) + for { + var mf dto.MetricFamily + if err := dec.Decode(&mf); err != nil { + if err == io.EOF { + break + } + + return nil, errors.Wrap(err, "decoding metrics data") + } + mfs = append(mfs, &mf) + } + } + + return mfs, nil +} + +func (d *distributedStore) Ingest(instance string, mfs []*dto.MetricFamily) error { + // First, encode the metrics to text format so we can store them. + var enc bytes.Buffer + encoder := expfmt.NewEncoder(&enc, expfmt.FmtText) + + for _, a := range mfs { + if err := encoder.Encode(a); err != nil { + return errors.Wrap(err, "encoding metric family") + } + } + + encodedMetrics := enc.String() + + reConn := redispool.Cache.Get() + defer reConn.Close() + + // Store the metrics and set an expiry on the key, if we haven't retrieved + // an updated set of metric data, we consider the host down and prune it + // from the gatherer. + err := reConn.Send("SETEX", d.prefix+instance, d.expiry, encodedMetrics) + if err != nil { + return errors.Wrap(err, "writing metrics blob to redis") + } + + return nil +} diff --git a/mockgen.temp.yaml b/mockgen.temp.yaml index b7c2c2e3951..a154c9785e9 100644 --- a/mockgen.temp.yaml +++ b/mockgen.temp.yaml @@ -111,3 +111,8 @@ path: github.com/sourcegraph/sourcegraph/internal/workerutil/dbworker/store interfaces: - Store +- filename: internal/metrics/store/mocks_temp.go + path: github.com/sourcegraph/sourcegraph/internal/metrics/store + interfaces: + - Store + - DistributedStore diff --git a/monitoring/definitions/executor.go b/monitoring/definitions/executor.go index ad76b022302..7baa6a05134 100644 --- a/monitoring/definitions/executor.go +++ b/monitoring/definitions/executor.go @@ -6,7 +6,8 @@ import ( ) func Executor() *monitoring.Dashboard { - const containerName = "(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors)" + // sg_job value is hard-coded, see enterprise/cmd/frontend/internal/executorqueue/handler/routes.go + const containerName = "sourcegraph-executors" // frontend is sometimes called sourcegraph-frontend in various contexts const queueContainerName = "(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors)" @@ -26,7 +27,7 @@ func Executor() *monitoring.Dashboard { { Label: "Compute instance", Name: "instance", - OptionsQuery: "label_values(node_exporter_build_info{job=\"sourcegraph-executor-nodes\"}, instance)", + OptionsQuery: "label_values(node_exporter_build_info{sg_job=\"sourcegraph-executor-nodes\"}, instance)", // The options query can generate a massive result set that can cause issues. 
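For context on how the new DistributedStore is meant to be wired up, here is a minimal sketch (not part of the patch) of an executor-side push loop and a worker-side scrape endpoint. Only NewDistributedStore, Ingest, Gather and DefaultMetricsExpiry come from the store package added above; the "executors:" prefix, the instance name, the 5-second push interval, the /metrics/executors route and the :6996 listen address (taken from the sg.config.yaml entry in this patch) are assumptions for illustration.

```go
package main

// Illustrative only: one possible way to use the DistributedStore added in
// internal/metrics/store. Route, prefix, interval and instance name are
// assumptions, not the PR's actual wiring.
import (
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
)

func main() {
	dstore := metricsstore.NewDistributedStore("executors:")

	// Executor side: periodically gather the local registry and push the
	// metric families to Redis under this instance's key.
	go func() {
		for range time.Tick(5 * time.Second) {
			mfs, err := prometheus.DefaultGatherer.Gather()
			if err != nil {
				continue // real code would log the error
			}
			_ = dstore.Ingest("executor-1", mfs) // instance name is an assumption
		}
	}()

	// Worker side: expose the aggregated metrics of all live executors.
	// An instance whose key expired (no push within DefaultMetricsExpiry
	// seconds) simply drops out of the Gather output, as described in the
	// Ingest comment above.
	http.Handle("/metrics/executors", promhttp.HandlerFor(dstore, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":6996", nil)
}
```

Because distributedStore implements prometheus.Gatherer, it can be passed directly to promhttp.HandlerFor, which is what makes the Redis-backed aggregation transparent to Prometheus.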
// shared.NewNodeExporterGroup filters by job as well so this is safe to use @@ -42,11 +43,14 @@ func Executor() *monitoring.Dashboard { shared.CodeIntelligence.NewExecutorExecutionCommandGroup(containerName), shared.CodeIntelligence.NewExecutorTeardownCommandGroup(containerName), - shared.NewNodeExporterGroup(containerName, "(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)", "Compute", "$instance"), - shared.NewNodeExporterGroup(containerName, "(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)", "Docker Registry Mirror", ".*"), + shared.NewNodeExporterGroup(containerName, "Compute", "$instance"), + shared.NewNodeExporterGroup(containerName, "Docker Registry Mirror", ".*"), // Resource monitoring - shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil), + shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, &shared.GolangMonitoringOptions{ + InstanceLabelName: "sg_instance", + JobLabelName: "sg_job", + }), }, } } diff --git a/monitoring/definitions/frontend.go b/monitoring/definitions/frontend.go index 0fe14536138..af13dc76bd3 100644 --- a/monitoring/definitions/frontend.go +++ b/monitoring/definitions/frontend.go @@ -576,7 +576,8 @@ func Frontend() *monitoring.Dashboard { Interpretation: `Account lockouts per minute`, }, }, - }}, + }, + }, { Title: "Organisation GraphQL API requests", Hidden: true, @@ -910,11 +911,11 @@ func orgMetricRows(orgMetricSpec []struct { name string route string description string -}) []monitoring.Row { +}, +) []monitoring.Row { result := []monitoring.Row{} for _, m := range orgMetricSpec { result = append(result, monitoring.Row{ - { Name: m.name + "_rate", Description: "rate of " + m.description, diff --git a/monitoring/definitions/postgres.go b/monitoring/definitions/postgres.go index 6a50319e5f5..6c6c723977c 100644 --- a/monitoring/definitions/postgres.go +++ b/monitoring/definitions/postgres.go @@ -24,43 +24,44 @@ func Postgres() *monitoring.Dashboard { Groups: []monitoring.Group{ { Title: "General", - Rows: []monitoring.Row{{ - monitoring.Observable{ - Name: "connections", - Description: "active connections", - Owner: monitoring.ObservableOwnerDevOps, - DataMustExist: false, // not deployed on docker-compose - Query: `sum by (job) (pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`, - Panel: monitoring.Panel().LegendFormat("{{datname}}"), - Warning: monitoring.Alert().LessOrEqual(5).For(5 * time.Minute), - NextSteps: "none", - }, - monitoring.Observable{ - Name: "usage_connections_percentage", - Description: "connection in use", - Owner: monitoring.ObservableOwnerDevOps, - DataMustExist: false, - Query: `sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100`, - Panel: monitoring.Panel().LegendFormat("{{job}}").Unit(monitoring.Percentage).Max(100).Min(0), - Warning: monitoring.Alert().GreaterOrEqual(80).For(5 * time.Minute), - Critical: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute), - NextSteps: ` + Rows: []monitoring.Row{ + { + monitoring.Observable{ + Name: "connections", + Description: "active connections", + Owner: monitoring.ObservableOwnerDevOps, + DataMustExist: false, // not deployed on docker-compose + Query: `sum by (job) 
(pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`, + Panel: monitoring.Panel().LegendFormat("{{datname}}"), + Warning: monitoring.Alert().LessOrEqual(5).For(5 * time.Minute), + NextSteps: "none", + }, + monitoring.Observable{ + Name: "usage_connections_percentage", + Description: "connection in use", + Owner: monitoring.ObservableOwnerDevOps, + DataMustExist: false, + Query: `sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100`, + Panel: monitoring.Panel().LegendFormat("{{job}}").Unit(monitoring.Percentage).Max(100).Min(0), + Warning: monitoring.Alert().GreaterOrEqual(80).For(5 * time.Minute), + Critical: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute), + NextSteps: ` - Consider increasing [max_connections](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-MAX-CONNECTIONS) of the database instance, [learn more](https://docs.sourcegraph.com/admin/config/postgres-conf) `, + }, + monitoring.Observable{ + Name: "transaction_durations", + Description: "maximum transaction durations", + Owner: monitoring.ObservableOwnerDevOps, + DataMustExist: false, // not deployed on docker-compose + // Ignore in codeintel-db because Rockskip processing involves long transactions + // during normal operation. + Query: `sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`, + Panel: monitoring.Panel().LegendFormat("{{datname}}").Unit(monitoring.Seconds), + Warning: monitoring.Alert().GreaterOrEqual(0.3).For(5 * time.Minute), + NextSteps: "none", + }, }, - monitoring.Observable{ - Name: "transaction_durations", - Description: "maximum transaction durations", - Owner: monitoring.ObservableOwnerDevOps, - DataMustExist: false, // not deployed on docker-compose - // Ignore in codeintel-db because Rockskip processing involves long transactions - // during normal operation. 
- Query: `sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`, - Panel: monitoring.Panel().LegendFormat("{{datname}}").Unit(monitoring.Seconds), - Warning: monitoring.Alert().GreaterOrEqual(0.3).For(5 * time.Minute), - NextSteps: "none", - }, - }, }, }, { diff --git a/monitoring/definitions/shared/codeintel.go b/monitoring/definitions/shared/codeintel.go index 6ba07708221..efddf0f6e65 100644 --- a/monitoring/definitions/shared/codeintel.go +++ b/monitoring/definitions/shared/codeintel.go @@ -323,7 +323,7 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori // src_executor_run_lock_held_total func (codeIntelligence) NewExecutorExecutionRunLockContentionGroup(containerName string) monitoring.Group { constructor := func(metricNameRoot, legend string) Observable { - filters := makeFilters(containerName) + filters := makeFilters("sg_jobs", containerName) return Observable{ Name: metricNameRoot + "_total", Description: fmt.Sprintf("milliseconds %s every 5m", legend), diff --git a/monitoring/definitions/shared/constructor.go b/monitoring/definitions/shared/constructor.go index 46979ea4143..2214981784a 100644 --- a/monitoring/definitions/shared/constructor.go +++ b/monitoring/definitions/shared/constructor.go @@ -29,6 +29,9 @@ type ObservableConstructorOptions struct { // will add a prefix to the constructed legend. MetricDescriptionRoot string + // JobLabel is the name of the label used to denote the job name. If unset, "job" is used. + JobLabel string + // Filters are additional prometheus filter expressions used to select or hide values // for a given label pattern. Filters []string @@ -76,8 +79,12 @@ type GroupConstructorOptions struct { // expressions. The given container name may be string or pattern, which will be matched // against the prefix of the value of the job label. Note that this excludes replicas like // -0 and -1 in docker-compose. 
-func makeFilters(containerName string, filters ...string) string { - filters = append(filters, fmt.Sprintf(`job=~"^%s.*"`, containerName)) +func makeFilters(containerLabel, containerName string, filters ...string) string { + if containerLabel == "" { + containerLabel = "job" + } + + filters = append(filters, fmt.Sprintf(`%s=~"^%s.*"`, containerLabel, containerName)) return strings.Join(filters, ",") } diff --git a/monitoring/definitions/shared/go.go b/monitoring/definitions/shared/go.go index 0093c22d946..55287899682 100644 --- a/monitoring/definitions/shared/go.go +++ b/monitoring/definitions/shared/go.go @@ -14,28 +14,32 @@ import ( const TitleGolangMonitoring = "Golang runtime monitoring" var ( - GoGoroutines sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable { - return Observable{ - Name: "go_goroutines", - Description: "maximum active goroutines", - Query: fmt.Sprintf(`max by(instance) (go_goroutines{job=~".*%s"})`, containerName), - Warning: monitoring.Alert().GreaterOrEqual(10000).For(10 * time.Minute), - Panel: monitoring.Panel().LegendFormat("{{name}}"), - Owner: owner, - Interpretation: "A high value here indicates a possible goroutine leak.", - NextSteps: "none", + GoGoroutines = func(jobLabel, instanceLabel string) sharedObservable { + return func(containerName string, owner monitoring.ObservableOwner) Observable { + return Observable{ + Name: "go_goroutines", + Description: "maximum active goroutines", + Query: fmt.Sprintf(`max by(%s) (go_goroutines{%s=~".*%s"})`, instanceLabel, jobLabel, containerName), + Warning: monitoring.Alert().GreaterOrEqual(10000).For(10 * time.Minute), + Panel: monitoring.Panel().LegendFormat("{{name}}"), + Owner: owner, + Interpretation: "A high value here indicates a possible goroutine leak.", + NextSteps: "none", + } } } - GoGcDuration sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable { - return Observable{ - Name: "go_gc_duration_seconds", - Description: "maximum go garbage collection duration", - Query: fmt.Sprintf(`max by(instance) (go_gc_duration_seconds{job=~".*%s"})`, containerName), - Warning: monitoring.Alert().GreaterOrEqual(2), - Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds), - Owner: owner, - NextSteps: "none", + GoGcDuration = func(jobLabel, instanceLabel string) sharedObservable { + return func(containerName string, owner monitoring.ObservableOwner) Observable { + return Observable{ + Name: "go_gc_duration_seconds", + Description: "maximum go garbage collection duration", + Query: fmt.Sprintf(`max by(%s) (go_gc_duration_seconds{%s=~".*%s"})`, instanceLabel, jobLabel, containerName), + Warning: monitoring.Alert().GreaterOrEqual(2), + Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds), + Owner: owner, + NextSteps: "none", + } } } ) @@ -46,6 +50,10 @@ type GolangMonitoringOptions struct { // GCDuration transforms the default observable used to construct the Go GC duration panel. 
GCDuration ObservableOption + + JobLabelName string + + InstanceLabelName string } // NewGolangMonitoringGroup creates a group containing panels displaying Go monitoring @@ -55,13 +63,20 @@ func NewGolangMonitoringGroup(containerName string, owner monitoring.ObservableO options = &GolangMonitoringOptions{} } + if options.InstanceLabelName == "" { + options.InstanceLabelName = "instance" + } + if options.JobLabelName == "" { + options.JobLabelName = "job" + } + return monitoring.Group{ Title: TitleGolangMonitoring, Hidden: true, Rows: []monitoring.Row{ { - options.Goroutines.safeApply(GoGoroutines(containerName, owner)).Observable(), - options.GCDuration.safeApply(GoGcDuration(containerName, owner)).Observable(), + options.Goroutines.safeApply(GoGoroutines(options.JobLabelName, options.InstanceLabelName)(containerName, owner)).Observable(), + options.GCDuration.safeApply(GoGcDuration(options.JobLabelName, options.InstanceLabelName)(containerName, owner)).Observable(), }, }, } diff --git a/monitoring/definitions/shared/kubernetes.go b/monitoring/definitions/shared/kubernetes.go index ac3708a1e22..e79f7615489 100644 --- a/monitoring/definitions/shared/kubernetes.go +++ b/monitoring/definitions/shared/kubernetes.go @@ -13,24 +13,22 @@ import ( // metrics in a way that only applies in Kubernetes deployments. const TitleKubernetesMonitoring = "Kubernetes monitoring (only available on Kubernetes)" -var ( - KubernetesPodsAvailable sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable { - return Observable{ - Name: "pods_available_percentage", - Description: "percentage pods available", - // the 'app' label is only available in Kubernetes deloyments - it indicates the pod. - Query: fmt.Sprintf(`sum by(app) (up{app=~".*%[1]s"}) / count by (app) (up{app=~".*%[1]s"}) * 100`, containerName), - Critical: monitoring.Alert().LessOrEqual(90).For(10 * time.Minute), - Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Percentage).Max(100).Min(0), - Owner: owner, - // Solutions similar to the ContainerMissing solutions. - NextSteps: fmt.Sprintf(` +var KubernetesPodsAvailable sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable { + return Observable{ + Name: "pods_available_percentage", + Description: "percentage pods available", + // the 'app' label is only available in Kubernetes deloyments - it indicates the pod. + Query: fmt.Sprintf(`sum by(app) (up{app=~".*%[1]s"}) / count by (app) (up{app=~".*%[1]s"}) * 100`, containerName), + Critical: monitoring.Alert().LessOrEqual(90).For(10 * time.Minute), + Panel: monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Percentage).Max(100).Min(0), + Owner: owner, + // Solutions similar to the ContainerMissing solutions. + NextSteps: fmt.Sprintf(` - Determine if the pod was OOM killed using 'kubectl describe pod %[1]s' (look for 'OOMKilled: true') and, if so, consider increasing the memory limit in the relevant 'Deployment.yaml'. - Check the logs before the container restarted to see if there are 'panic:' messages or similar using 'kubectl logs -p %[1]s'. `, containerName), - } } -) +} type KubernetesMonitoringOptions struct { // PodsAvailable transforms the default observable used to construct the pods available panel. 
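To make the effect of the new JobLabelName/InstanceLabelName options concrete, the following sketch (not part of the patch) reproduces the query template from GoGoroutines above and prints it with the default labels and with the overrides the executor dashboard sets in executor.go. Only the format string comes from the diff; the "worker" container name in the first call is a placeholder.

```go
package main

// Illustrative only: shows what the generated PromQL looks like before and
// after the sg_job/sg_instance label overrides introduced in this patch.
import "fmt"

func main() {
	query := func(jobLabel, instanceLabel, containerName string) string {
		// Same template as GoGoroutines in monitoring/definitions/shared/go.go.
		return fmt.Sprintf(`max by(%s) (go_goroutines{%s=~".*%s"})`, instanceLabel, jobLabel, containerName)
	}

	// Default labels, as used by every other dashboard:
	fmt.Println(query("job", "instance", "worker"))
	// => max by(instance) (go_goroutines{job=~".*worker"})

	// Executor dashboard overrides (see executor.go above): executor metrics
	// are pushed through the worker, so the originating host is identified by
	// the sg_job/sg_instance labels rather than Prometheus' own job/instance.
	fmt.Println(query("sg_job", "sg_instance", "sourcegraph-executors"))
	// => max by(sg_instance) (go_goroutines{sg_job=~".*sourcegraph-executors"})
}
```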
diff --git a/monitoring/definitions/shared/node_exporter.go b/monitoring/definitions/shared/node_exporter.go index e31d54a0d8b..4f6367ac8d9 100644 --- a/monitoring/definitions/shared/node_exporter.go +++ b/monitoring/definitions/shared/node_exporter.go @@ -8,7 +8,7 @@ import ( const TitleNodeExporter = "Executor: %s instance metrics" -func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) monitoring.Group { +func NewNodeExporterGroup(job, jobTitle, instanceFilter string) monitoring.Group { return monitoring.Group{ Title: fmt.Sprintf(TitleNodeExporter, jobTitle), Hidden: true, @@ -17,30 +17,30 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m { Name: "node_cpu_utilization", Description: "CPU utilization (minus idle/iowait)", - Query: "sum(rate(node_cpu_seconds_total{job=~\"" + job + "\",mode!~\"(idle|iowait)\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance) / count(node_cpu_seconds_total{job=~\"" + job + "\",mode=\"system\",instance=~\"" + instanceFilter + "\"}) by (instance) * 100", + Query: "sum(rate(node_cpu_seconds_total{sg_job=~\"" + job + "\",mode!~\"(idle|iowait)\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance) / count(node_cpu_seconds_total{sg_job=~\"" + job + "\",mode=\"system\",sg_instance=~\"" + instanceFilter + "\"}) by (sg_instance) * 100", NoAlert: true, Interpretation: "Indicates the amount of CPU time excluding idle and iowait time, divided by the number of cores, as a percentage.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage).Max(100), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Percentage).Max(100), }, { Name: "node_cpu_saturation_cpu_wait", Description: "CPU saturation (time waiting)", - Query: "rate(node_pressure_cpu_waiting_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])", + Query: "rate(node_pressure_cpu_waiting_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])", NoAlert: true, Interpretation: "Indicates the average summed time a number of (but strictly not all) non-idle processes spent waiting for CPU time. If this is higher than normal, then the CPU is underpowered for the workload and more powerful machines should be provisioned. " + "This only represents a \"less-than-all processes\" time, because for processes to be waiting for CPU time there must be other process(es) consuming CPU time.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Seconds), }, }, { { Name: "node_memory_utilization", Description: "memory utilization", - Query: "(1 - sum(node_memory_MemAvailable_bytes{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}) by (instance)) * 100", + Query: "(1 - sum(node_memory_MemAvailable_bytes{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}) by (sg_instance) / sum(node_memory_MemTotal_bytes{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}) by (sg_instance)) * 100", NoAlert: true, Interpretation: "Indicates the amount of available memory (including cache and buffers) as a percentage. 
Consistently high numbers are generally fine so long memory saturation figures are within acceptable ranges, " + "these figures may be more useful for informing executor provisioning decisions, such as increasing worker parallelism, down-sizing machines etc.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage).Max(100), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Percentage).Max(100), }, // Please see the following article(s) on how we arrive at using these particular metrics. It is stupid complicated and underdocumented beyond anything. // Page 27 of https://documentation.suse.com/sles/11-SP4/pdf/book-sle-tuning_color_en.pdf @@ -50,20 +50,20 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m { Name: "node_memory_saturation_vmeff", Description: "memory saturation (vmem efficiency)", - Query: "(rate(node_vmstat_pgsteal_anon{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) " + - "/ (rate(node_vmstat_pgscan_anon{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) * 100", + Query: "(rate(node_vmstat_pgsteal_anon{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) " + + "/ (rate(node_vmstat_pgscan_anon{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) * 100", NoAlert: true, Interpretation: "Indicates the efficiency of page reclaim, calculated as pgsteal/pgscan. Optimal figures are short spikes of near 100% and above, indicating that a high ratio of scanned pages are actually being freed, " + "or exactly 0%, indicating that pages arent being scanned as there is no memory pressure. 
Sustained numbers >~100% may be sign of imminent memory exhaustion, while sustained 0% < x < ~100% figures are very serious.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Percentage), }, { Name: "node_memory_saturation_pressure_stalled", Description: "memory saturation (fully stalled)", - Query: "rate(node_pressure_memory_stalled_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])", + Query: "rate(node_pressure_memory_stalled_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])", NoAlert: true, Interpretation: "Indicates the amount of time all non-idle processes were stalled waiting on memory operations to complete. This is often correlated with vmem efficiency ratio when pressure on available memory is high. If they're not correlated, this could indicate issues with the machine hardware and/or configuration.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Seconds), }, }, { @@ -73,84 +73,84 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m { Name: "node_io_disk_utilization", Description: "disk IO utilization (percentage time spent in IO)", - Query: "sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(instance,disk) * 100", + Query: "sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(sg_instance,disk) * 100", NoAlert: true, Interpretation: "Indicates the percentage of time a disk was busy. If this is less than 100%, then the disk has spare utilization capacity. However, a value of 100% does not necesarily indicate the disk is at max capacity. " + "For single, serial request-serving devices, 100% may indicate maximum saturation, but for SSDs and RAID arrays this is less likely to be the case, as they are capable of serving multiple requests in parallel, other metrics such as " + "throughput and request queue size should be factored in.", - Panel: monitoring.Panel().LegendFormat("{{instance}}: {{disk}}").Unit(monitoring.Percentage), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}: {{disk}}").Unit(monitoring.Percentage), }, { Name: "node_io_disk_saturation", Description: "disk IO saturation (avg IO queue size)", - Query: "sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(instance,disk)", + Query: "sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(sg_instance,disk)", NoAlert: true, Interpretation: "Indicates the number of outstanding/queued IO requests. 
High but short-lived queue sizes may not present an issue, but if theyre consistently/often high and/or monotonically increasing, the disk may be failing or simply too slow for the amount of activity required. " + "Consider replacing the drive(s) with SSDs if they are not already and/or replacing the faulty drive(s), if any.", - Panel: monitoring.Panel().LegendFormat("{{instance}}: {{disk}}"), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}: {{disk}}"), }, { Name: "node_io_disk_saturation_pressure_full", Description: "disk IO saturation (avg time of all processes stalled)", - Query: "rate(node_pressure_io_stalled_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])", + Query: "rate(node_pressure_io_stalled_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])", NoAlert: true, Interpretation: "Indicates the averaged amount of time for which all non-idle processes were stalled waiting for IO to complete simultaneously aka where no processes could make progress.", // TODO: more - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Seconds), }, }, { { Name: "node_io_network_utilization", Description: "network IO utilization (Rx)", - Query: "sum(rate(node_network_receive_bytes_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance) * 8", + Query: "sum(rate(node_network_receive_bytes_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance) * 8", NoAlert: true, Interpretation: "Indicates the average summed receiving throughput of all network interfaces. This is often predominantly composed of the WAN/internet-connected interface, and knowing normal/good figures depends on knowing the bandwidth of the " + "underlying hardware and the workloads.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.BitsPerSecond), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.BitsPerSecond), }, { Name: "node_io_network_saturation", Description: "network IO saturation (Rx packets dropped)", - Query: "sum(rate(node_network_receive_drop_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)", + Query: "sum(rate(node_network_receive_drop_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)", NoAlert: true, Interpretation: "Number of dropped received packets. This can happen if the receive queues/buffers become full due to slow packet processing throughput. The queues/buffers could be configured to be larger as a stop-gap " + "but the processing application should be investigated as soon as possible. 
https://www.kernel.org/doc/html/latest/networking/statistics.html#:~:text=not%20otherwise%20counted.-,rx_dropped,-Number%20of%20packets", - Panel: monitoring.Panel().LegendFormat("{{instance}}"), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"), }, { Name: "node_io_network_saturation", Description: "network IO errors (Rx)", - Query: "sum(rate(node_network_receive_errs_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)", + Query: "sum(rate(node_network_receive_errs_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)", NoAlert: true, Interpretation: "Number of bad/malformed packets received. https://www.kernel.org/doc/html/latest/networking/statistics.html#:~:text=excluding%20the%20FCS.-,rx_errors,-Total%20number%20of", - Panel: monitoring.Panel().LegendFormat("{{instance}}"), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"), }, }, { { Name: "node_io_network_utilization", Description: "network IO utilization (Tx)", - Query: "sum(rate(node_network_transmit_bytes_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance) * 8", + Query: "sum(rate(node_network_transmit_bytes_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance) * 8", NoAlert: true, Interpretation: "Indicates the average summed transmitted throughput of all network interfaces. This is often predominantly composed of the WAN/internet-connected interface, and knowing normal/good figures depends on knowing the bandwidth of the " + "underlying hardware and the workloads.", - Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.BitsPerSecond), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.BitsPerSecond), }, { Name: "node_io_network_saturation", Description: "network IO saturation (Tx packets dropped)", - Query: "sum(rate(node_network_transmit_drop_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)", + Query: "sum(rate(node_network_transmit_drop_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)", NoAlert: true, Interpretation: "Number of dropped transmitted packets. This can happen if the receiving side's receive queues/buffers become full due to slow packet processing throughput, the network link is congested etc.", - Panel: monitoring.Panel().LegendFormat("{{instance}}"), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"), }, { Name: "node_io_network_saturation", Description: "network IO errors (Tx)", - Query: "sum(rate(node_network_transmit_errs_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)", + Query: "sum(rate(node_network_transmit_errs_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)", NoAlert: true, Interpretation: "Number of packet transmission errors. 
This is distinct from tx packet dropping, and can indicate a failing NIC, improperly configured network options anywhere along the line, signal noise etc.", - Panel: monitoring.Panel().LegendFormat("{{instance}}"), + Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"), }, }, }, @@ -164,7 +164,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m { Name: "node_cpu_saturation_load1", Description: "host CPU saturation (1min average)", - Query: "sum(node_load1{job=~\""+job+"\",instance=~\"$instance\"}) by (instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",instance=~\"$instance\"}) by (instance) * 100", + Query: "sum(node_load1{job=~\""+job+"\",sg_instance=~\"$instance\"}) by (sg_instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",sg_instance=~\"$instance\"}) by (sg_instance) * 100", NoAlert: true, Interpretation: "banana", Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage), @@ -172,7 +172,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m { Name: "node_cpu_saturation_load5", Description: "host CPU saturation (5min average)", - Query: "sum(node_load5{job=~\""+job+"\",instance=~\"$instance\"}) by (instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",instance=~\"$instance\"}) by (instance) * 100", + Query: "sum(node_load5{job=~\""+job+"\",sg_instance=~\"$instance\"}) by (sg_instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",sg_instance=~\"$instance\"}) by (sg_instance) * 100", NoAlert: true, Interpretation: "banana", Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage), @@ -183,7 +183,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m /* { Name: "node_memory_saturation", Description: "host memory saturation (major page fault rate)", - Query: "sum(rate(node_vmstat_pgmajfault{job=~\""+job+"\",instance=~\"$instance\"}[$__rate_interval])) by (instance)", + Query: "sum(rate(node_vmstat_pgmajfault{job=~\""+job+"\",sg_instance=~\"$instance\"}[$__rate_interval])) by (sg_instance)", NoAlert: true, Interpretation: "banana", Panel: monitoring.Panel().LegendFormat("{{instance}}"), @@ -193,7 +193,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m /* { Name: "node_io_disk_saturation_pressure_some", Description: "disk IO saturation (some-processes time waiting)", - Query: "rate(node_pressure_io_waiting_seconds_total{job=~\""+job+"\",instance=~\"$instance\"}[$__rate_interval])-rate(node_pressure_io_stalled_seconds_total{job=~\""+job+"\",instance=~\"$instance\"}[$__rate_interval])", + Query: "rate(node_pressure_io_waiting_seconds_total{job=~\""+job+"\",sg_instance=~\"$instance\"}[$__rate_interval])-rate(node_pressure_io_stalled_seconds_total{job=~\""+job+"\",sg_instance=~\"$instance\"}[$__rate_interval])", NoAlert: true, Interpretation: "banana", Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds), diff --git a/monitoring/definitions/shared/observation.go b/monitoring/definitions/shared/observation.go index 6414cb13108..3e3eaec1dd9 100644 --- a/monitoring/definitions/shared/observation.go +++ b/monitoring/definitions/shared/observation.go @@ -80,6 +80,10 @@ type ObservationGroupOptions struct { func (observationConstructor) NewGroup(containerName string, owner monitoring.ObservableOwner, options ObservationGroupOptions) monitoring.Group { rows := make([]monitoring.Row, 0, 2) + if options.JobLabel 
== "" { + options.JobLabel = "job" + } + if len(options.By) == 0 { if options.Aggregate != nil { panic("Aggregate must not be supplied when By is not set") diff --git a/monitoring/definitions/shared/queues.go b/monitoring/definitions/shared/queues.go index 173defce25e..a07e5ca6926 100644 --- a/monitoring/definitions/shared/queues.go +++ b/monitoring/definitions/shared/queues.go @@ -19,7 +19,7 @@ type queueConstructor struct{} // Requires a gauge of the format `src_{options.MetricNameRoot}_total` func (queueConstructor) Size(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) return Observable{ @@ -40,7 +40,7 @@ func (queueConstructor) Size(options ObservableConstructorOptions) sharedObserva // - counter of the format `src_{options.MetricNameRoot}_processor_total` func (queueConstructor) GrowthRate(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) return Observable{ @@ -60,7 +60,7 @@ func (queueConstructor) GrowthRate(options ObservableConstructorOptions) sharedO // - counter of the format `src_{options.MetricNameRoot}_queued_duration_seconds_total` func (queueConstructor) MaxAge(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) return Observable{ diff --git a/monitoring/definitions/shared/standard.go b/monitoring/definitions/shared/standard.go index 7cda1908819..711e5961fc1 100644 --- a/monitoring/definitions/shared/standard.go +++ b/monitoring/definitions/shared/standard.go @@ -29,7 +29,7 @@ func (standardConstructor) Count(legend string) observableConstructor { return func(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) return Observable{ @@ -56,7 +56,7 @@ func (standardConstructor) Duration(legend string) observableConstructor { return func(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, _ := makeBy(append([]string{"le"}, options.By...)...) observable := Observable{ @@ -99,7 +99,7 @@ func (standardConstructor) Errors(legend string) observableConstructor { return func(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) 
return Observable{ @@ -128,7 +128,7 @@ func (standardConstructor) ErrorRate(legend string) observableConstructor { return func(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) return Observable{ @@ -145,7 +145,7 @@ func (standardConstructor) ErrorRate(legend string) observableConstructor { // LastOverTime creates a last-over-time aggregate for the error-rate metric, stretching back over the lookback-window time range. func (standardConstructor) LastOverTimeErrorRate(containerName string, lookbackWindow model.Duration, options ObservableConstructorOptions) string { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, _ := makeBy(options.By...) return fmt.Sprintf(`last_over_time(sum%[1]s(increase(src_%[2]s_errors_total{%[3]s}[5m]))[%[4]s:]) / (last_over_time(sum%[1]s(increase(src_%[2]s_total{%[3]s}[5m]))[%[4]s:]) + last_over_time(sum%[1]s(increase(src_%[2]s_errors_total{%[3]s}[5m]))[%[4]s:])) * 100`, by, options.MetricNameRoot, filters, lookbackWindow) diff --git a/monitoring/definitions/shared/workerutil.go b/monitoring/definitions/shared/workerutil.go index f46f3392270..903fc57825e 100644 --- a/monitoring/definitions/shared/workerutil.go +++ b/monitoring/definitions/shared/workerutil.go @@ -59,7 +59,7 @@ func (workerutilConstructor) ErrorRate(options ObservableConstructorOptions) sha // Requires a gauge of the format `src_{options.MetricNameRoot}_processor_handlers` func (workerutilConstructor) Handlers(options ObservableConstructorOptions) sharedObservable { return func(containerName string, owner monitoring.ObservableOwner) Observable { - filters := makeFilters(containerName, options.Filters...) + filters := makeFilters(options.JobLabel, containerName, options.Filters...) by, legendPrefix := makeBy(options.By...) return Observable{ diff --git a/monitoring/definitions/zoekt.go b/monitoring/definitions/zoekt.go index 16bde2c7748..ba056c985fa 100644 --- a/monitoring/definitions/zoekt.go +++ b/monitoring/definitions/zoekt.go @@ -188,7 +188,6 @@ func Zoekt() *monitoring.Dashboard { Title: "Git fetch durations", Rows: []monitoring.Row{ { - { Name: "90th_percentile_successful_git_fetch_durations_5m", Description: "90th percentile successful git fetch durations over 5m", diff --git a/sg.config.yaml b/sg.config.yaml index b06e51c3824..56151ff76e2 100644 --- a/sg.config.yaml +++ b/sg.config.yaml @@ -56,6 +56,7 @@ env: { "Name": "codeintel-worker", "Host": "127.0.0.1:6088" }, { "Name": "oss-worker", "Host": "127.0.0.1:6089" }, { "Name": "worker", "Host": "127.0.0.1:6089" }, + { "Name": "worker-executors", "Host": "127.0.0.1:6996" }, { "Name": "executor-codeintel", "Host": "127.0.0.1:6092" }, { "Name": "executor-batches", "Host": "127.0.0.1:6093" }, { "Name": "zoekt-index-0", "Host": "127.0.0.1:6072" },
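Finally, a minimal sketch (not part of the patch) of how the generated MockStore from internal/metrics/store/mocks_temp.go might be exercised in a unit test. The test name and assertions are illustrative; only the mock API (NewMockStore, GatherFunc, PushReturn, History) comes from the generated code above.

```go
package store_test

// Illustrative only: exercises the go-mockgen-generated MockStore. The test
// scenario is an assumption, not a test added by this PR.
import (
	"testing"

	dto "github.com/prometheus/client_model/go"

	metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
)

func TestGatherIsCalledOnce(t *testing.T) {
	mock := metricsstore.NewMockStore()

	// Queue a single canned response for the next Gather invocation.
	name := "go_goroutines"
	mock.GatherFunc.PushReturn([]*dto.MetricFamily{{Name: &name}}, nil)

	// Code under test would receive the mock through the Store interface.
	mfs, err := mock.Gather()
	if err != nil {
		t.Fatal(err)
	}
	if len(mfs) != 1 || mfs[0].GetName() != "go_goroutines" {
		t.Fatalf("unexpected metric families: %+v", mfs)
	}

	// The mock records every invocation for later assertions.
	if calls := mock.GatherFunc.History(); len(calls) != 1 {
		t.Fatalf("expected exactly one Gather call, got %d", len(calls))
	}
}
```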