mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:51:59 +00:00
Push executor metrics (#36969)
This commit is contained in:
parent
96c3a70621
commit
dcbd01f545
@ -18,8 +18,9 @@ env:
|
||||
{ "Name": "precise-code-intel-worker", "Host": "127.0.0.1:6088" },
|
||||
{ "Name": "worker", "Host": "127.0.0.1:6089" },
|
||||
{ "Name": "enterprise-worker", "Host": "127.0.0.1:6089" },
|
||||
{ "Name": "enterprise-worker-executors", "Host": "127.0.0.1:6996" },
|
||||
{ "Name": "executor-codeintel", "Host": "127.0.0.1:6092" },
|
||||
{ "Name": "executor-batches", "Host": "127.0.0.1:6093" },
|
||||
"Name": "executor-batches", "Host": "127.0.0.1:6093" },
|
||||
{ "Name": "zoekt-indexserver-0", "Host": "127.0.0.1:6072" },
|
||||
{ "Name": "zoekt-indexserver-1", "Host": "127.0.0.1:6073" },
|
||||
{ "Name": "zoekt-webserver-0", "Host": "127.0.0.1:3070", "DefaultPath": "/debug/requests/" },
|
||||
|
||||
@ -47,15 +47,10 @@
|
||||
# worker
|
||||
- host.docker.internal:6089
|
||||
- labels:
|
||||
job: sourcegraph-code-intel-indexers
|
||||
job: worker-executors
|
||||
targets:
|
||||
# sourcegraph-code-intel-indexers
|
||||
- host.docker.internal:6092
|
||||
- labels:
|
||||
job: executor-batches
|
||||
targets:
|
||||
# executor-batches
|
||||
- host.docker.internal:6093
|
||||
# worker
|
||||
- host.docker.internal:6996
|
||||
- labels:
|
||||
job: postgres_exporter
|
||||
targets:
|
||||
|
||||
@ -47,15 +47,10 @@
|
||||
# worker
|
||||
- 127.0.0.1:6089
|
||||
- labels:
|
||||
job: sourcegraph-code-intel-indexers
|
||||
job: worker-executors
|
||||
targets:
|
||||
# sourcegraph-code-intel-indexers
|
||||
- 127.0.0.1:6092
|
||||
- labels:
|
||||
job: executor-batches
|
||||
targets:
|
||||
# executor-batches
|
||||
- 127.0.0.1:6093
|
||||
# worker
|
||||
- 127.0.0.1:6996
|
||||
- labels:
|
||||
job: postgres_exporter
|
||||
targets:
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
{ "Name": "symbols", "Host": "127.0.0.1:6071" },
|
||||
{ "Name": "repo-updater", "Host": "127.0.0.1:6074" },
|
||||
{ "Name": "worker", "Host": "127.0.0.1:6089" },
|
||||
{ "Name": "worker-executors", "Host": "127.0.0.1:6969" },
|
||||
{ "Name": "precise-code-intel-worker", "Host": "127.0.0.1:6088" },
|
||||
{ "Name": "executor-codeintel", "Host": "127.0.0.1:6092" },
|
||||
{ "Name": "executor-batches", "Host": "127.0.0.1:6093" },
|
||||
|
||||
@ -31,7 +31,6 @@ That means, in order to deploy executors that can talk to the Sourcegraph instan
|
||||
- [Using binaries](#binaries)
|
||||
1. [Confirm executors can reach Sourcegraph instance](#confirm-executors-are-working)
|
||||
1. Optional: [Configuring auto scaling](#configuring-auto-scaling)
|
||||
1. Optional: [Configuring observability](#configuring-observability)
|
||||
|
||||
### Configure Sourcegraph
|
||||
|
||||
@ -368,235 +367,3 @@ To test if the metric is correctly reported into the Cloud provider:
|
||||
- On AWS, this can be found in the CloudWatch metrics section. Under **All metrics**, select the namespace `sourcegraph-executor` and then the metric `environment, queueName`. Make sure there are entries returned.
|
||||
|
||||
Next, you can test whether the number of executors rises and shrinks as load spikes occur. Keep in mind that auto-scaling is not a real-time operation on most cloud providers and usually takes a short moment and can have some delays between the metric going down and the desired machine count adjusting.
|
||||
|
||||
## Configuring observability
|
||||
|
||||
> NOTE: Observability features are currently not supported when [downloading and running executor binaries yourself](#binaries), and on managed instances since they require deployment adjustments.
|
||||
|
||||
Sourcegraph [ships with dashboards](observability/metrics.md) that can display executor metrics. We highly encourage setting this up to help make informed decisions on scaling and to make debugging easier.
|
||||
|
||||
In order to do that, the Prometheus instance bundled with your Sourcegraph deployment must be able to scrape the executor metrics endpoint.
|
||||
|
||||
That requires two things:
|
||||
|
||||
1. Provide Prometheus with service account credentials that allow it to get a list of active compute instances from the cloud provider.
|
||||
2. Add additional scrape jobs to Prometheus.
|
||||
|
||||
To add service account credentials, you can use the `credentials` submodule in both our [AWS](https://sourcegraph.com/github.com/sourcegraph/terraform-aws-executors/-/tree/modules/credentials) and [GCP](https://sourcegraph.com/github.com/sourcegraph/terraform-google-executors/-/tree/modules/credentials) executor modules.
|
||||
|
||||
```terraform
|
||||
module "credentials" {
|
||||
source = "sourcegraph/executors/<cloud>//modules/credentials"
|
||||
version = "<version>"
|
||||
|
||||
region = <region>
|
||||
resource_prefix = ""
|
||||
}
|
||||
|
||||
# For Google:
|
||||
output "instance_scraper_credentials_file" {
|
||||
value = module.my-credentials.instance_scraper_credentials_file
|
||||
}
|
||||
|
||||
# For AWS:
|
||||
output "instance_scraper_access_key_id" {
|
||||
value = module.my-credentials.instance_scraper_access_key_id
|
||||
}
|
||||
|
||||
output "instance_scraper_access_secret_key" {
|
||||
value = module.my-credentials.instance_scraper_access_secret_key
|
||||
}
|
||||
```
|
||||
|
||||
Just as with [auto scaling](#configuring-auto-scaling), you use the `credentials` submodule to get properly configured credentials in the Terraform outputs. When applied, this will yield something like this:
|
||||
|
||||
```
|
||||
# For AWS:
|
||||
instance_scraper_access_key_id = <THE_ACCESS_KEY_TO_CONFIGURE>
|
||||
instance_scraper_access_secret_key = <THE_SECRET_KEY_TO_CONFIGURE>
|
||||
|
||||
# For Google:
|
||||
instance_scraper_credentials_file = <THE_CREDENTIALS_FILE_CONTENT>
|
||||
```
|
||||
|
||||
Now we can use these credentials for the different cloud providers.
|
||||
|
||||
### Google
|
||||
|
||||
Credentials need to be added to the Prometheus container of your Sourcegraph deployment and a new scrape job needs to be added.
|
||||
|
||||
In a Kubernetes deployment, credentials can be added by mounting the credentials file obtained from the `credentials` module in the last step, and pointing to it from an environment variable.
|
||||
|
||||
**Step 1:** Create a secret called `prometheus-secrets` containing the credentials file content:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
type: Opaque
|
||||
metadata:
|
||||
name: prometheus-secrets
|
||||
data:
|
||||
# The Terraform output for `instance_scraper_credentials_file`
|
||||
GCP_ACCOUNT_JSON: <THE_CREDENTIALS_FILE_CONTENT>
|
||||
```
|
||||
|
||||
**Step 2:** Modify the Prometheus deployment manifest:
|
||||
|
||||
```yaml
|
||||
containers:
|
||||
- name: prometheus
|
||||
# [...]
|
||||
env:
|
||||
- name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
value: /credentials/google_application_credentials.json
|
||||
volumeMounts:
|
||||
- mountPath: /credentials/google_application_credentials.json
|
||||
name: credentials
|
||||
subPath: google_application_credentials.json
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: credentials
|
||||
secret:
|
||||
secretName: prometheus-secrets
|
||||
items:
|
||||
- key: GCP_ACCOUNT_JSON
|
||||
path: google_application_credentials.json
|
||||
|
||||
```
|
||||
|
||||
**Step 3:** Add the following scraping job that uses [GCE service discovery configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) to the Prometheus configuration. To do that, you can edit the Prometheus `ConfigMap` and modify the contents of the `prometheus.yml` file. Under [`scrape_configs:`](https://sourcegraph.com/github.com/sourcegraph/deploy-sourcegraph@0938b6686f0c94d80e8331e36f5ddac4659027b1/-/blob/base/prometheus/prometheus.ConfigMap.yaml?L43:5) add the following and make sure to replace `{GCP_PROJECT}`, `{GCP_ZONE}` and `{INSTANCE_TAG}`. The `{INSTANCE_TAG}` value must be the same as [`instance_tag`](https://sourcegraph.com/search?q=context:global+repo:%5Egithub.com/sourcegraph/terraform-aws-executors%24+variable+%22instance_tag%22&patternType=literal).
|
||||
|
||||
```yaml
|
||||
- job_name: 'sourcegraph-executors'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [executor]
|
||||
gce_sd_configs: &executor_gce_config
|
||||
- project: {GCP_PROJECT} # Change this to the GCP project ID
|
||||
port: 9999
|
||||
zone: {GCP_ZONE} # Change this to the GCP zone
|
||||
filter: '(labels.executor_tag = {INSTANCE_TAG})' # Change {INSTANCE_TAG} to the `executor_instance_tag` set in the Terraform modules
|
||||
relabel_configs: &executor_relabel_config
|
||||
- source_labels: [__meta_gce_public_ip]
|
||||
target_label: __address__
|
||||
replacement: "${1}${2}:9999"
|
||||
separator: ''
|
||||
- source_labels: [__meta_gce_zone]
|
||||
regex: ".+/([^/]+)"
|
||||
target_label: zone
|
||||
separator: ''
|
||||
- source_labels: [__meta_gce_project]
|
||||
target_label: project
|
||||
- source_labels: [__meta_gce_instance_name]
|
||||
target_label: instance
|
||||
separator: ''
|
||||
- regex: "__meta_gce_metadata_(image_.+)"
|
||||
action: labelmap
|
||||
- job_name: 'sourcegraph-executor-nodes'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [node]
|
||||
gce_sd_configs: *executor_gce_config
|
||||
relabel_configs: *executor_relabel_config
|
||||
# If you've also used the Terraform modules to provision Docker registry
|
||||
# mirrors for executors:
|
||||
- job_name: 'sourcegraph-executors-docker-registry-mirrors'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [registry]
|
||||
gce_sd_configs: &gce_executor_mirror_config
|
||||
- project: {GCP_PROJECT} # Change this to the GCP project ID
|
||||
port: 9999
|
||||
zone: {GCP_ZONE} # Change this to the GCP zone
|
||||
filter: '(labels.executor_tag = {INSTANCE_TAG}-docker-mirror)' # Change {INSTANCE_TAG} to the `executor_instance_tag` set in the Terraform modules
|
||||
relabel_configs: *executor_relabel_config
|
||||
- job_name: 'sourcegraph-executors-docker-registry-mirror-nodes'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [node]
|
||||
gce_sd_configs: *gce_executor_mirror_config
|
||||
relabel_configs: *executor_relabel_config
|
||||
```
|
||||
|
||||
**Step 4:** Restart Prometheus.
|
||||
|
||||
If you currently have any executors or Docker registry mirrors running, you should start seeing metrics on the _Executors_ dashboard in Grafana. Alternatively, you can check if the executors can be scraped, by [port-forwarding the Prometheus UI to your local machine and checkin in the UI](./observability/metrics.md#accessing-prometheus-directly).
|
||||
|
||||
### AWS
|
||||
|
||||
Credentials need to be added to the Prometheus container of your Sourcegraph deployment and a new scrape job needs to be added.
|
||||
|
||||
In a Kubernetes deployment, credentials can be added by setting the two secrets obtained from the `credentials` module in the last step as environment variables.
|
||||
|
||||
**Step 1:** Modify the Prometheus deployment manifest:
|
||||
|
||||
```yaml
|
||||
containers:
|
||||
- name: prometheus
|
||||
# [...]
|
||||
env:
|
||||
- name: AWS_ACCESS_KEY_ID
|
||||
# The Terraform output for `instance_scraper_access_key_id`
|
||||
value: <THE_ACCESS_KEY_TO_CONFIGURE>
|
||||
- name: AWS_SECRET_ACCESS_KEY
|
||||
# The Terraform output for `instance_scraper_access_secret_key`
|
||||
value: <THE_SECRET_KEY_TO_CONFIGURE>
|
||||
```
|
||||
|
||||
**Step 2:** Add the following scraping job that uses [EC2 service discovery configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) to the Prometheus configuration. To do that, you can edit the Prometheus `ConfigMap` and modify the contents of the `prometheus.yml` file. Under [`scrape_configs:`](https://sourcegraph.com/github.com/sourcegraph/deploy-sourcegraph@master/-/blob/base/prometheus/prometheus.ConfigMap.yaml?L43:5) add the following and make sure to replace `{AWS_REGION}` and `{INSTANCE_TAG}`. The `{INSTANCE_TAG}` value must be the same as [`instance_tag`](https://sourcegraph.com/search?q=context:global+repo:%5Egithub.com/sourcegraph/terraform-aws-executors%24+variable+%22instance_tag%22&patternType=literal).
|
||||
|
||||
```yaml
|
||||
- job_name: 'sourcegraph-executors'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [executor]
|
||||
ec2_sd_configs: &executor_ec2_config
|
||||
- region: {AWS_REGION} # Change this to the AWS region
|
||||
port: 9999
|
||||
filters:
|
||||
- name: tag:executor_tag
|
||||
values: [{INSTANCE_TAG}] # Change {INSTANCE_TAG} to the `executor_instance_tag` set in the Terraform modules
|
||||
relabel_configs: &executor_relabel_config
|
||||
- source_labels: [__meta_ec2_public_ip]
|
||||
target_label: __address__
|
||||
replacement: "${1}${2}:9999"
|
||||
separator: ''
|
||||
- source_labels: [__meta_ec2_availability_zone]
|
||||
regex: ".+/([^/]+)"
|
||||
target_label: zone
|
||||
separator: ''
|
||||
- source_labels: [__meta_ec2_instance_id]
|
||||
target_label: instance
|
||||
separator: ''
|
||||
- source_labels: [__meta_ec2_ami]
|
||||
target_label: version
|
||||
- job_name: 'sourcegraph-executor-nodes'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [node]
|
||||
ec2_sd_configs: *executor_ec2_config
|
||||
relabel_configs: *executor_relabel_config
|
||||
# If you've also used the Terraform modules to provision Docker registry
|
||||
# mirrors for executors:
|
||||
- job_name: 'sourcegraph-executors-docker-registry-mirrors'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [registry]
|
||||
ec2_sd_configs: &ec2_executor_mirror_config
|
||||
- region: {AWS_REGION}
|
||||
port: 9999
|
||||
filters:
|
||||
- name: tag:executor_tag
|
||||
values: [{INSTANCE_TAG}-docker-mirror]
|
||||
relabel_configs: *executor_relabel_config
|
||||
- job_name: 'sourcegraph-executors-docker-registry-mirror-nodes'
|
||||
metrics_path: /proxy
|
||||
params:
|
||||
module: [node]
|
||||
ec2_sd_configs: *ec2_executor_mirror_config
|
||||
relabel_configs: *executor_relabel_config
|
||||
```
|
||||
|
||||
**Step 3:** Restart Prometheus.
|
||||
|
||||
If you currently have any executors or Docker registry mirrors running, you should start seeing metrics on the _Executors_ dashboard in Grafana. Alternatively, you can check if the executors can be scraped, by [port-forwarding the Prometheus UI to your local machine and checkin in the UI](./observability/metrics.md#accessing-prometheus-directly).
|
||||
|
||||
@ -5931,7 +5931,7 @@ with your code hosts connections or networking issues affecting communication wi
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Custom alert query: `last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))[5h:]) / (last_over_time(sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))[5h:]) + last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))[5h:])) * 100`
|
||||
Custom alert query: `last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:]) / (last_over_time(sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:]) + last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:])) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
138
doc/admin/observability/dashboards.md
generated
138
doc/admin/observability/dashboards.md
generated
@ -17470,7 +17470,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100100`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"})`
|
||||
Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"})`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17489,7 +17489,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17508,7 +17508,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100111`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17527,7 +17527,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100112`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17546,7 +17546,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100113`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17569,7 +17569,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100200`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_executor_run_lock_wait_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_executor_run_lock_wait_total{sg_jobs=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17590,7 +17590,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100201`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_executor_run_lock_held_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_executor_run_lock_held_total{sg_jobs=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17611,7 +17611,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100300`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17630,7 +17630,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100301`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (le)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (le)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17649,7 +17649,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100302`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17668,7 +17668,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100303`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17687,7 +17687,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100310`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17706,7 +17706,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100311`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))`
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^sourcegraph-executors.*"}[5m])))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17725,7 +17725,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100312`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17744,7 +17744,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100313`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_apiclient_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17765,7 +17765,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100400`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17784,7 +17784,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100401`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17803,7 +17803,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100402`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17822,7 +17822,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100403`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17841,7 +17841,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100410`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17860,7 +17860,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100411`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))`
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17879,7 +17879,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100412`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17898,7 +17898,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100413`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17919,7 +17919,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100500`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17938,7 +17938,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100501`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17957,7 +17957,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100502`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17976,7 +17976,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100503`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -17995,7 +17995,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100510`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18014,7 +18014,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100511`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))`
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18033,7 +18033,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100512`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18052,7 +18052,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100513`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18073,7 +18073,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100600`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18092,7 +18092,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100601`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18111,7 +18111,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100602`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18130,7 +18130,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100603`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18149,7 +18149,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100610`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18168,7 +18168,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100611`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])))`
|
||||
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18187,7 +18187,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100612`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18206,7 +18206,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100613`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors).*"}[5m]))) * 100`
|
||||
Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18228,7 +18228,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100700`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",mode!~"(idle|iowait)",instance=~"$instance"}[$__rate_interval])) by(instance) / count(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",mode="system",instance=~"$instance"}) by (instance) * 100`
|
||||
Query: `sum(rate(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode!~"(idle|iowait)",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance) / count(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode="system",sg_instance=~"$instance"}) by (sg_instance) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18248,7 +18248,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100701`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `rate(node_pressure_cpu_waiting_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])`
|
||||
Query: `rate(node_pressure_cpu_waiting_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18268,7 +18268,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100710`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `(1 - sum(node_memory_MemAvailable_bytes{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}) by (instance) / sum(node_memory_MemTotal_bytes{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}) by (instance)) * 100`
|
||||
Query: `(1 - sum(node_memory_MemAvailable_bytes{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}) by (sg_instance) / sum(node_memory_MemTotal_bytes{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}) by (sg_instance)) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18288,7 +18288,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100711`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `(rate(node_vmstat_pgsteal_anon{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) * 100`
|
||||
Query: `(rate(node_vmstat_pgsteal_anon{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18308,7 +18308,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100712`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `rate(node_pressure_memory_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])`
|
||||
Query: `rate(node_pressure_memory_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18328,7 +18328,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100720`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk) * 100`
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18348,7 +18348,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100721`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk)`
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18368,7 +18368,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100722`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `rate(node_pressure_io_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])`
|
||||
Query: `rate(node_pressure_io_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18388,7 +18388,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100730`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_receive_bytes_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance) * 8`
|
||||
Query: `sum(rate(node_network_receive_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance) * 8`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18408,7 +18408,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100731`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_receive_drop_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_receive_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18428,7 +18428,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100732`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_receive_errs_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_receive_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18448,7 +18448,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100740`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_transmit_bytes_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance) * 8`
|
||||
Query: `sum(rate(node_network_transmit_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance) * 8`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18468,7 +18468,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100741`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_transmit_drop_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_transmit_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18488,7 +18488,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100742`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_transmit_errs_total{job=~"(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)",instance=~"$instance"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_transmit_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~"$instance"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18510,7 +18510,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100800`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",mode!~"(idle|iowait)",instance=~".*"}[$__rate_interval])) by(instance) / count(node_cpu_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",mode="system",instance=~".*"}) by (instance) * 100`
|
||||
Query: `sum(rate(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode!~"(idle|iowait)",sg_instance=~".*"}[$__rate_interval])) by(sg_instance) / count(node_cpu_seconds_total{sg_job=~"sourcegraph-executors",mode="system",sg_instance=~".*"}) by (sg_instance) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18530,7 +18530,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100801`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `rate(node_pressure_cpu_waiting_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])`
|
||||
Query: `rate(node_pressure_cpu_waiting_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18550,7 +18550,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100810`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `(1 - sum(node_memory_MemAvailable_bytes{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}) by (instance) / sum(node_memory_MemTotal_bytes{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}) by (instance)) * 100`
|
||||
Query: `(1 - sum(node_memory_MemAvailable_bytes{sg_job=~"sourcegraph-executors",sg_instance=~".*"}) by (sg_instance) / sum(node_memory_MemTotal_bytes{sg_job=~"sourcegraph-executors",sg_instance=~".*"}) by (sg_instance)) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18570,7 +18570,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100811`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `(rate(node_vmstat_pgsteal_anon{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) * 100`
|
||||
Query: `(rate(node_vmstat_pgsteal_anon{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) / (rate(node_vmstat_pgscan_anon{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18590,7 +18590,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100812`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `rate(node_pressure_memory_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])`
|
||||
Query: `rate(node_pressure_memory_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18610,7 +18610,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100820`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk) * 100`
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk) * 100`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18630,7 +18630,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100821`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(instance,disk)`
|
||||
Query: `sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval]), "disk", "$1", "device", "^([^d].+)"), "disk", "ignite", "device", "dm-.*")) by(sg_instance,disk)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18650,7 +18650,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100822`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `rate(node_pressure_io_stalled_seconds_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])`
|
||||
Query: `rate(node_pressure_io_stalled_seconds_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18670,7 +18670,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100830`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_receive_bytes_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance) * 8`
|
||||
Query: `sum(rate(node_network_receive_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance) * 8`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18690,7 +18690,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100831`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_receive_drop_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_receive_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18710,7 +18710,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100832`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_receive_errs_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_receive_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18730,7 +18730,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100840`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_transmit_bytes_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance) * 8`
|
||||
Query: `sum(rate(node_network_transmit_bytes_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance) * 8`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18750,7 +18750,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100841`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_transmit_drop_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_transmit_drop_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18770,7 +18770,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100842`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `sum(rate(node_network_transmit_errs_total{job=~"(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)",instance=~".*"}[$__rate_interval])) by(instance)`
|
||||
Query: `sum(rate(node_network_transmit_errs_total{sg_job=~"sourcegraph-executors",sg_instance=~".*"}[$__rate_interval])) by(sg_instance)`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18793,7 +18793,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100900`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `max by(instance) (go_goroutines{job=~".*(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors)"})`
|
||||
Query: `max by(sg_instance) (go_goroutines{sg_job=~".*sourcegraph-executors"})`
|
||||
|
||||
</details>
|
||||
|
||||
@ -18812,7 +18812,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100901`
|
||||
<details>
|
||||
<summary>Technical details</summary>
|
||||
|
||||
Query: `max by(instance) (go_gc_duration_seconds{job=~".*(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors)"})`
|
||||
Query: `max by(sg_instance) (go_gc_duration_seconds{sg_job=~".*sourcegraph-executors"})`
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
@ -18,25 +18,27 @@ import (
|
||||
type Config struct {
|
||||
env.BaseConfig
|
||||
|
||||
FrontendURL string
|
||||
FrontendAuthorizationToken string
|
||||
QueueName string
|
||||
QueuePollInterval time.Duration
|
||||
MaximumNumJobs int
|
||||
FirecrackerImage string
|
||||
VMStartupScriptPath string
|
||||
VMPrefix string
|
||||
KeepWorkspaces bool
|
||||
DockerHostMountPath string
|
||||
UseFirecracker bool
|
||||
JobNumCPUs int
|
||||
JobMemory string
|
||||
FirecrackerDiskSpace string
|
||||
MaximumRuntimePerJob time.Duration
|
||||
CleanupTaskInterval time.Duration
|
||||
NumTotalJobs int
|
||||
MaxActiveTime time.Duration
|
||||
WorkerHostname string
|
||||
FrontendURL string
|
||||
FrontendAuthorizationToken string
|
||||
QueueName string
|
||||
QueuePollInterval time.Duration
|
||||
MaximumNumJobs int
|
||||
FirecrackerImage string
|
||||
VMStartupScriptPath string
|
||||
VMPrefix string
|
||||
KeepWorkspaces bool
|
||||
DockerHostMountPath string
|
||||
UseFirecracker bool
|
||||
JobNumCPUs int
|
||||
JobMemory string
|
||||
FirecrackerDiskSpace string
|
||||
MaximumRuntimePerJob time.Duration
|
||||
CleanupTaskInterval time.Duration
|
||||
NumTotalJobs int
|
||||
MaxActiveTime time.Duration
|
||||
NodeExporterURL string
|
||||
DockerRegistryNodeExporterURL string
|
||||
WorkerHostname string
|
||||
}
|
||||
|
||||
func (c *Config) Load() {
|
||||
@ -57,6 +59,8 @@ func (c *Config) Load() {
|
||||
c.MaximumRuntimePerJob = c.GetInterval("EXECUTOR_MAXIMUM_RUNTIME_PER_JOB", "30m", "The maximum wall time that can be spent on a single job.")
|
||||
c.CleanupTaskInterval = c.GetInterval("EXECUTOR_CLEANUP_TASK_INTERVAL", "1m", "The frequency with which to run periodic cleanup tasks.")
|
||||
c.NumTotalJobs = c.GetInt("EXECUTOR_NUM_TOTAL_JOBS", "0", "The maximum number of jobs that will be dequeued by the worker.")
|
||||
c.NodeExporterURL = c.GetOptional("NODE_EXPORTER_URL", "The URL of the node_exporter instance, without the /metrics path.")
|
||||
c.DockerRegistryNodeExporterURL = c.GetOptional("DOCKER_REGISTRY_NODE_EXPORTER_URL", "The URL of the Docker Registry instance's node_exporter, without the /metrics path.")
|
||||
c.MaxActiveTime = c.GetInterval("EXECUTOR_MAX_ACTIVE_TIME", "0", "The maximum time that can be spent by the worker dequeueing records to be handled.")
|
||||
|
||||
hn := hostname.Get()
|
||||
@ -88,6 +92,9 @@ func (c *Config) APIWorkerOptions(telemetryOptions apiclient.TelemetryOptions) a
|
||||
// git repositories that make it into commands or stdout/stderr streams.
|
||||
c.FrontendAuthorizationToken: "SECRET_REMOVED",
|
||||
},
|
||||
|
||||
NodeExporterEndpoint: c.NodeExporterURL,
|
||||
DockerRegistryNodeExporterEndpoint: c.DockerRegistryNodeExporterURL,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package apiclient
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@ -8,19 +9,27 @@ import (
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/opentracing/opentracing-go/log"
|
||||
otlog "github.com/opentracing/opentracing-go/log"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/internal/executor"
|
||||
"github.com/sourcegraph/sourcegraph/internal/observation"
|
||||
"github.com/sourcegraph/sourcegraph/internal/workerutil"
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
)
|
||||
|
||||
// Client is the client used to communicate with a remote job queue API.
|
||||
type Client struct {
|
||||
options Options
|
||||
client *BaseClient
|
||||
operations *operations
|
||||
options Options
|
||||
client *BaseClient
|
||||
logger log.Logger
|
||||
metricsGatherer prometheus.Gatherer
|
||||
operations *operations
|
||||
}
|
||||
|
||||
type Options struct {
|
||||
@ -48,17 +57,19 @@ type EndpointOptions struct {
|
||||
Token string
|
||||
}
|
||||
|
||||
func New(options Options, observationContext *observation.Context) *Client {
|
||||
func New(options Options, metricsGatherer prometheus.Gatherer, observationContext *observation.Context) *Client {
|
||||
return &Client{
|
||||
options: options,
|
||||
client: NewBaseClient(options.BaseClientOptions),
|
||||
operations: newOperations(observationContext),
|
||||
options: options,
|
||||
client: NewBaseClient(options.BaseClientOptions),
|
||||
logger: log.Scoped("executor-api-client", "The API client adapter for executors to use dbworkers over HTTP"),
|
||||
metricsGatherer: metricsGatherer,
|
||||
operations: newOperations(observationContext),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) Dequeue(ctx context.Context, queueName string, job *executor.Job) (_ bool, err error) {
|
||||
ctx, _, endObservation := c.operations.dequeue.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
ctx, _, endObservation := c.operations.dequeue.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
@ -73,9 +84,9 @@ func (c *Client) Dequeue(ctx context.Context, queueName string, job *executor.Jo
|
||||
}
|
||||
|
||||
func (c *Client) AddExecutionLogEntry(ctx context.Context, queueName string, jobID int, entry workerutil.ExecutionLogEntry) (entryID int, err error) {
|
||||
ctx, _, endObservation := c.operations.addExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
log.Int("jobID", jobID),
|
||||
ctx, _, endObservation := c.operations.addExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
otlog.Int("jobID", jobID),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
@ -93,10 +104,10 @@ func (c *Client) AddExecutionLogEntry(ctx context.Context, queueName string, job
|
||||
}
|
||||
|
||||
func (c *Client) UpdateExecutionLogEntry(ctx context.Context, queueName string, jobID, entryID int, entry workerutil.ExecutionLogEntry) (err error) {
|
||||
ctx, _, endObservation := c.operations.updateExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
log.Int("jobID", jobID),
|
||||
log.Int("entryID", entryID),
|
||||
ctx, _, endObservation := c.operations.updateExecutionLogEntry.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
otlog.Int("jobID", jobID),
|
||||
otlog.Int("entryID", entryID),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
@ -114,9 +125,9 @@ func (c *Client) UpdateExecutionLogEntry(ctx context.Context, queueName string,
|
||||
}
|
||||
|
||||
func (c *Client) MarkComplete(ctx context.Context, queueName string, jobID int) (err error) {
|
||||
ctx, _, endObservation := c.operations.markComplete.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
log.Int("jobID", jobID),
|
||||
ctx, _, endObservation := c.operations.markComplete.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
otlog.Int("jobID", jobID),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
@ -132,9 +143,9 @@ func (c *Client) MarkComplete(ctx context.Context, queueName string, jobID int)
|
||||
}
|
||||
|
||||
func (c *Client) MarkErrored(ctx context.Context, queueName string, jobID int, errorMessage string) (err error) {
|
||||
ctx, _, endObservation := c.operations.markErrored.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
log.Int("jobID", jobID),
|
||||
ctx, _, endObservation := c.operations.markErrored.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
otlog.Int("jobID", jobID),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
@ -151,9 +162,9 @@ func (c *Client) MarkErrored(ctx context.Context, queueName string, jobID int, e
|
||||
}
|
||||
|
||||
func (c *Client) MarkFailed(ctx context.Context, queueName string, jobID int, errorMessage string) (err error) {
|
||||
ctx, _, endObservation := c.operations.markFailed.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
log.Int("jobID", jobID),
|
||||
ctx, _, endObservation := c.operations.markFailed.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
otlog.Int("jobID", jobID),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
@ -197,12 +208,18 @@ func (c *Client) Ping(ctx context.Context, queueName string, jobIDs []int) (err
|
||||
}
|
||||
|
||||
func (c *Client) Heartbeat(ctx context.Context, queueName string, jobIDs []int) (knownIDs []int, err error) {
|
||||
ctx, _, endObservation := c.operations.heartbeat.With(ctx, &err, observation.Args{LogFields: []log.Field{
|
||||
log.String("queueName", queueName),
|
||||
log.String("jobIDs", intsToString(jobIDs)),
|
||||
ctx, _, endObservation := c.operations.heartbeat.With(ctx, &err, observation.Args{LogFields: []otlog.Field{
|
||||
otlog.String("queueName", queueName),
|
||||
otlog.String("jobIDs", intsToString(jobIDs)),
|
||||
}})
|
||||
defer endObservation(1, observation.Args{})
|
||||
|
||||
metrics, err := gatherMetrics(c.logger, c.metricsGatherer)
|
||||
if err != nil {
|
||||
c.logger.Error("Failed to collect prometheus metrics for heartbeat", log.Error(err))
|
||||
// Continue; missing metrics should not prevent heartbeats.
|
||||
}
|
||||
|
||||
req, err := c.makeRequest("POST", fmt.Sprintf("%s/heartbeat", queueName), executor.HeartbeatRequest{
|
||||
ExecutorName: c.options.ExecutorName,
|
||||
JobIDs: jobIDs,
|
||||
@ -214,6 +231,8 @@ func (c *Client) Heartbeat(ctx context.Context, queueName string, jobIDs []int)
|
||||
GitVersion: c.options.TelemetryOptions.GitVersion,
|
||||
IgniteVersion: c.options.TelemetryOptions.IgniteVersion,
|
||||
SrcCliVersion: c.options.TelemetryOptions.SrcCliVersion,
|
||||
|
||||
PrometheusMetrics: metrics,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -264,3 +283,27 @@ func intsToString(ints []int) string {
|
||||
|
||||
return strings.Join(segments, ", ")
|
||||
}
|
||||
|
||||
func gatherMetrics(logger log.Logger, gatherer prometheus.Gatherer) (string, error) {
|
||||
maxDuration := 3 * time.Second
|
||||
ctx, cancel := context.WithTimeout(context.Background(), maxDuration)
|
||||
defer cancel()
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
logger.Warn("gathering metrics took longer than expected", log.Duration("maxDuration", maxDuration))
|
||||
}
|
||||
}()
|
||||
mfs, err := gatherer.Gather()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
enc := expfmt.NewEncoder(&buf, expfmt.FmtText)
|
||||
for _, mf := range mfs {
|
||||
if err := enc.Encode(mf); err != nil {
|
||||
return "", errors.Wrap(err, "encoding metric family")
|
||||
}
|
||||
}
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
@ -11,6 +11,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/internal/executor"
|
||||
"github.com/sourcegraph/sourcegraph/internal/observation"
|
||||
@ -359,7 +361,9 @@ func TestHeartbeat(t *testing.T) {
|
||||
"executorVersion": "test-executor-version",
|
||||
"gitVersion": "test-git-version",
|
||||
"igniteVersion": "test-ignite-version",
|
||||
"srcCliVersion": "test-src-cli-version"
|
||||
"srcCliVersion": "test-src-cli-version",
|
||||
|
||||
"prometheusMetrics": ""
|
||||
}`,
|
||||
responseStatus: http.StatusOK,
|
||||
responsePayload: `[1]`,
|
||||
@ -393,7 +397,9 @@ func TestHeartbeatBadResponse(t *testing.T) {
|
||||
"executorVersion": "test-executor-version",
|
||||
"gitVersion": "test-git-version",
|
||||
"igniteVersion": "test-ignite-version",
|
||||
"srcCliVersion": "test-src-cli-version"
|
||||
"srcCliVersion": "test-src-cli-version",
|
||||
|
||||
"prometheusMetrics": ""
|
||||
}`,
|
||||
responseStatus: http.StatusInternalServerError,
|
||||
responsePayload: ``,
|
||||
@ -438,7 +444,8 @@ func testRoute(t *testing.T, spec routeSpec, f func(client *Client)) {
|
||||
},
|
||||
}
|
||||
|
||||
f(New(options, &observation.TestContext))
|
||||
client := New(options, prometheus.GathererFunc(func() ([]*dto.MetricFamily, error) { return nil, nil }), &observation.TestContext)
|
||||
f(client)
|
||||
}
|
||||
|
||||
func testServer(t *testing.T, spec routeSpec) *httptest.Server {
|
||||
|
||||
144
enterprise/cmd/executor/internal/metrics/metrics.go
Normal file
@ -0,0 +1,144 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
)
|
||||
|
||||
type metricsSyncPoint struct {
|
||||
notify *sync.Cond
|
||||
result chan metricsResult
|
||||
}
|
||||
|
||||
func newMetricsSyncPoint() metricsSyncPoint {
|
||||
return metricsSyncPoint{
|
||||
notify: sync.NewCond(&sync.Mutex{}),
|
||||
result: make(chan metricsResult, 1),
|
||||
}
|
||||
}
|
||||
|
||||
type metricsResult struct {
|
||||
metrics map[string]*dto.MetricFamily
|
||||
err error
|
||||
}
|
||||
|
||||
// MakeExecutorMetricsGatherer uses the given prometheus gatherer to collect all current
|
||||
// metrics, and optionally also gathers metrics from node exporter and the docker
|
||||
// registry mirror, if configured.
|
||||
func MakeExecutorMetricsGatherer(
|
||||
logger log.Logger,
|
||||
gatherer prometheus.Gatherer,
|
||||
// nodeExporterEndpoint is the URL of the local node_exporter endpoint, without
|
||||
// the /metrics path. Disabled when empty.
|
||||
nodeExporterEndpoint string,
|
||||
// dockerRegistryNodeExporterEndpoint is the URL of the node_exporter on the intermediary caching Docker registry,
|
||||
// for scraping and forwarding metrics. Disabled when empty.
|
||||
dockerRegistryNodeExporterEndpoint string,
|
||||
) prometheus.GathererFunc {
|
||||
nodeMetrics := newMetricsSyncPoint()
|
||||
registryMetrics := newMetricsSyncPoint()
|
||||
|
||||
go backgroundCollectNodeExporterMetrics(nodeExporterEndpoint, nodeMetrics)
|
||||
go backgroundCollectNodeExporterMetrics(dockerRegistryNodeExporterEndpoint, registryMetrics)
|
||||
|
||||
return func() (mfs []*dto.MetricFamily, err error) {
|
||||
// notify to start a scrape
|
||||
nodeMetrics.notify.Signal()
|
||||
registryMetrics.notify.Signal()
|
||||
|
||||
mfs, err = gatherer.Gather()
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "getting default gatherer")
|
||||
}
|
||||
|
||||
if nodeExporterEndpoint != "" {
|
||||
result := <-nodeMetrics.result
|
||||
if result.err != nil {
|
||||
logger.Warn("failed to get metrics for node exporter", log.Error(result.err))
|
||||
}
|
||||
for key, mf := range result.metrics {
|
||||
if strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") {
|
||||
continue
|
||||
}
|
||||
|
||||
mfs = append(mfs, mf)
|
||||
}
|
||||
}
|
||||
|
||||
if dockerRegistryNodeExporterEndpoint != "" {
|
||||
result := <-registryMetrics.result
|
||||
if result.err != nil {
|
||||
logger.Warn("failed to get metrics for docker registry", log.Error(result.err))
|
||||
}
|
||||
for key, mf := range result.metrics {
|
||||
if strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") {
|
||||
continue
|
||||
}
|
||||
|
||||
// There should only be one registry, so give it a fixed instance value.
|
||||
metricLabelInstance := "sg_instance"
|
||||
instanceName := "docker-registry"
|
||||
for _, m := range mf.Metric {
|
||||
m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName})
|
||||
}
|
||||
|
||||
mfs = append(mfs, mf)
|
||||
}
|
||||
}
|
||||
|
||||
return mfs, nil
|
||||
}
|
||||
}
|
||||
|
||||
// On notify, scrapes the specified endpoint for prometheus metrics and sends them down the
|
||||
// associated channel. If the endpoint is "", then the channel is closed so that subsequent
|
||||
// reads return an empty value instead of blocking indefinitely.
|
||||
func backgroundCollectNodeExporterMetrics(endpoint string, syncPoint metricsSyncPoint) {
|
||||
if endpoint == "" {
|
||||
close(syncPoint.result)
|
||||
return
|
||||
}
|
||||
|
||||
collect := func() (map[string]*dto.MetricFamily, error) {
|
||||
resp, err := (&http.Client{
|
||||
Timeout: 2 * time.Second,
|
||||
}).Get(endpoint + "/metrics")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
b, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var parser expfmt.TextParser
|
||||
mfMap, err := parser.TextToMetricFamilies(bytes.NewReader(b))
|
||||
return mfMap, errors.Wrapf(err, "parsing node_exporter metrics, response: %s", string(b))
|
||||
}
|
||||
|
||||
for {
|
||||
syncPoint.notify.L.Lock()
|
||||
syncPoint.notify.Wait()
|
||||
mfMap, err := collect()
|
||||
if err != nil {
|
||||
syncPoint.result <- metricsResult{err: err}
|
||||
} else {
|
||||
syncPoint.result <- metricsResult{metrics: mfMap}
|
||||
}
|
||||
syncPoint.notify.L.Unlock()
|
||||
}
|
||||
}
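
For comparison, here is a minimal, illustrative sketch (not part of this change; all names below are made up) of merging several local gatherers with the stock `prometheus.Gatherers` type. The custom `GathererFunc` above is used instead because it also has to scrape remote node_exporter endpoints over HTTP and filter out `go_`/`promhttp_`/`process_` series before forwarding.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// A dedicated registry standing in for an extra metrics source.
	extra := prometheus.NewRegistry()
	extra.MustRegister(prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_jobs_total",
		Help: "Illustrative counter registered on a secondary registry.",
	}))

	// prometheus.Gatherers merges the output of multiple gatherers;
	// inconsistent duplicates are reported as errors on Gather.
	merged := prometheus.Gatherers{prometheus.DefaultGatherer, extra}

	mfs, err := merged.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName())
	}
}
```
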
|
||||
@ -8,10 +8,14 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/inconshreveable/log15"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/apiclient"
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/command"
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/janitor"
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/cmd/executor/internal/metrics"
|
||||
"github.com/sourcegraph/sourcegraph/internal/goroutine"
|
||||
"github.com/sourcegraph/sourcegraph/internal/observation"
|
||||
"github.com/sourcegraph/sourcegraph/internal/workerutil"
|
||||
@ -60,6 +64,14 @@ type Options struct {
|
||||
// ResourceOptions configures the resource limits of docker container and Firecracker
|
||||
// virtual machines running on the executor.
|
||||
ResourceOptions command.ResourceOptions
|
||||
|
||||
// NodeExporterEndpoint is the URL of the local node_exporter endpoint, without
|
||||
// the /metrics path.
|
||||
NodeExporterEndpoint string
|
||||
|
||||
// DockerRegistryNodeExporterEndpoint is the URL of the node_exporter on the intermediary caching Docker registry,
|
||||
// for scraping and forwarding metrics.
|
||||
DockerRegistryNodeExporterEndpoint string
|
||||
}
|
||||
|
||||
// NewWorker creates a worker that polls a remote job queue API for work. The returned
|
||||
@ -67,8 +79,9 @@ type Options struct {
|
||||
// as a heartbeat routine that will periodically hit the remote API with the work that is
|
||||
// currently being performed, which is necessary so the job queue API doesn't hand out jobs
|
||||
// it thinks may have been dropped.
|
||||
func NewWorker(nameSet *janitor.NameSet, options Options, observationContext *observation.Context) (worker goroutine.WaitableBackgroundRoutine) {
|
||||
queueStore := apiclient.New(options.ClientOptions, observationContext)
|
||||
func NewWorker(nameSet *janitor.NameSet, options Options, observationContext *observation.Context) goroutine.WaitableBackgroundRoutine {
|
||||
gatherer := metrics.MakeExecutorMetricsGatherer(log.Scoped("executor-worker.metrics-gatherer", ""), prometheus.DefaultGatherer, options.NodeExporterEndpoint, options.DockerRegistryNodeExporterEndpoint)
|
||||
queueStore := apiclient.New(options.ClientOptions, gatherer, observationContext)
|
||||
store := &storeShim{queueName: options.QueueName, queueStore: queueStore}
|
||||
|
||||
if !connectToFrontend(queueStore, options) {
|
||||
|
||||
@ -6,7 +6,7 @@ export CNI_VERSION=v0.9.1
|
||||
export KERNEL_IMAGE="weaveworks/ignite-kernel:5.10.51"
|
||||
export EXECUTOR_FIRECRACKER_IMAGE="sourcegraph/ignite-ubuntu:insiders"
|
||||
export NODE_EXPORTER_VERSION=1.2.2
|
||||
export EXPORTER_EXPORTER_VERSION=0.4.5
|
||||
export NODE_EXPORTER_ADDR="127.0.0.1:9100"
|
||||
|
||||
## Install ops agent
|
||||
## Reference: https://cloud.google.com/logging/docs/agent/ops-agent/installation
|
||||
@ -100,6 +100,7 @@ Environment=HOME="%h"
|
||||
Environment=SRC_LOG_LEVEL=dbug
|
||||
Environment=SRC_PROF_HTTP=127.0.0.1:6060
|
||||
Environment=EXECUTOR_FIRECRACKER_IMAGE="${EXECUTOR_FIRECRACKER_IMAGE}"
|
||||
Environment=NODE_EXPORTER_URL="http://${NODE_EXPORTER_ADDR}"
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -147,7 +148,7 @@ Description=Node Exporter
|
||||
[Service]
|
||||
User=node_exporter
|
||||
ExecStart=/usr/local/bin/node_exporter \
|
||||
--web.listen-address="127.0.0.1:9100" \
|
||||
--web.listen-address="${NODE_EXPORTER_ADDR}" \
|
||||
--collector.disable-defaults \
|
||||
--collector.cpu \
|
||||
--collector.loadavg \
|
||||
@ -169,42 +170,6 @@ EOF
|
||||
systemctl enable node_exporter
|
||||
}
|
||||
|
||||
function install_exporter_exporter() {
|
||||
useradd --system --shell /bin/false exporter_exporter
|
||||
|
||||
wget https://github.com/QubitProducts/exporter_exporter/releases/download/v${EXPORTER_EXPORTER_VERSION}/exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64.tar.gz
|
||||
tar xvfz exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64.tar.gz
|
||||
mv exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64/exporter_exporter /usr/local/bin/exporter_exporter
|
||||
rm -rf exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64 exporter_exporter-${EXPORTER_EXPORTER_VERSION}.linux-amd64.tar.gz
|
||||
|
||||
chown exporter_exporter:exporter_exporter /usr/local/bin/exporter_exporter
|
||||
|
||||
cat <<EOF >/usr/local/bin/exporter_exporter.yaml
|
||||
modules:
|
||||
node:
|
||||
method: http
|
||||
http:
|
||||
port: 9100
|
||||
executor:
|
||||
method: http
|
||||
http:
|
||||
port: 6060
|
||||
EOF
|
||||
|
||||
cat <<EOF >/etc/systemd/system/exporter_exporter.service
|
||||
[Unit]
|
||||
Description=Exporter Exporter
|
||||
[Service]
|
||||
User=exporter_exporter
|
||||
ExecStart=/usr/local/bin/exporter_exporter -config.file "/usr/local/bin/exporter_exporter.yaml"
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOF
|
||||
|
||||
systemctl daemon-reload
|
||||
systemctl enable exporter_exporter
|
||||
}
|
||||
|
||||
# Install src-cli to the host system. It's needed for src steps outside of firecracker.
|
||||
function install_src_cli() {
|
||||
curl -f -L -o src-cli.tar.gz "https://github.com/sourcegraph/src-cli/releases/download/${SRC_CLI_VERSION}/src-cli_${SRC_CLI_VERSION}_linux_amd64.tar.gz"
|
||||
@ -252,7 +217,6 @@ install_ignite
|
||||
# Services
|
||||
install_executor
|
||||
install_node_exporter
|
||||
install_exporter_exporter
|
||||
|
||||
# Service prep and cleanup
|
||||
generate_ignite_base_image
|
||||
|
||||
@ -7,6 +7,7 @@ import (
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
apiclient "github.com/sourcegraph/sourcegraph/enterprise/internal/executor"
|
||||
metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
|
||||
executor "github.com/sourcegraph/sourcegraph/internal/services/executors/store"
|
||||
"github.com/sourcegraph/sourcegraph/internal/types"
|
||||
"github.com/sourcegraph/sourcegraph/internal/workerutil"
|
||||
@ -17,6 +18,8 @@ import (
|
||||
type handler struct {
|
||||
QueueOptions
|
||||
executorStore executor.Store
|
||||
metricsStore metricsstore.DistributedStore
|
||||
logger log.Logger
|
||||
}
|
||||
|
||||
type QueueOptions struct {
|
||||
@ -31,9 +34,11 @@ type QueueOptions struct {
|
||||
RecordTransformer func(ctx context.Context, record workerutil.Record) (apiclient.Job, error)
|
||||
}
|
||||
|
||||
func newHandler(executorStore executor.Store, queueOptions QueueOptions) *handler {
|
||||
func newHandler(executorStore executor.Store, metricsStore metricsstore.DistributedStore, queueOptions QueueOptions) *handler {
|
||||
return &handler{
|
||||
executorStore: executorStore,
|
||||
metricsStore: metricsStore,
|
||||
logger: log.Scoped("executor-queue-handler", "The route handler for all executor dbworker API tunnel endpoints"),
|
||||
QueueOptions: queueOptions,
|
||||
}
|
||||
}
|
||||
|
||||
@ -7,6 +7,7 @@ import (
|
||||
"github.com/google/go-cmp/cmp"
|
||||
|
||||
apiclient "github.com/sourcegraph/sourcegraph/enterprise/internal/executor"
|
||||
metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
|
||||
"github.com/sourcegraph/sourcegraph/internal/types"
|
||||
"github.com/sourcegraph/sourcegraph/internal/workerutil"
|
||||
"github.com/sourcegraph/sourcegraph/internal/workerutil/dbworker/store"
|
||||
@ -39,8 +40,9 @@ func TestDequeue(t *testing.T) {
|
||||
}
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
|
||||
job, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -58,7 +60,10 @@ func TestDequeue(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestDequeueNoRecord(t *testing.T) {
|
||||
handler := newHandler(NewMockStore(), QueueOptions{Store: workerstoremocks.NewMockStore()})
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: workerstoremocks.NewMockStore()})
|
||||
|
||||
_, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -79,8 +84,9 @@ func TestAddExecutionLogEntry(t *testing.T) {
|
||||
store.AddExecutionLogEntryFunc.SetDefaultReturn(fakeEntryID, nil)
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
|
||||
job, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -118,7 +124,8 @@ func TestAddExecutionLogEntryUnknownJob(t *testing.T) {
|
||||
store := workerstoremocks.NewMockStore()
|
||||
store.AddExecutionLogEntryFunc.SetDefaultReturn(0, workerstore.ErrExecutionLogEntryNotUpdated)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
entry := workerutil.ExecutionLogEntry{
|
||||
Command: []string{"ls", "-a"},
|
||||
@ -137,8 +144,9 @@ func TestUpdateExecutionLogEntry(t *testing.T) {
|
||||
}
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
|
||||
job, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -176,7 +184,8 @@ func TestUpdateExecutionLogEntryUnknownJob(t *testing.T) {
|
||||
store := workerstoremocks.NewMockStore()
|
||||
store.UpdateExecutionLogEntryFunc.SetDefaultReturn(workerstore.ErrExecutionLogEntryNotUpdated)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
entry := workerutil.ExecutionLogEntry{
|
||||
Command: []string{"ls", "-a"},
|
||||
@ -196,8 +205,9 @@ func TestMarkComplete(t *testing.T) {
|
||||
}
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
|
||||
job, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -224,7 +234,8 @@ func TestMarkCompleteUnknownJob(t *testing.T) {
|
||||
store := workerstoremocks.NewMockStore()
|
||||
store.MarkCompleteFunc.SetDefaultReturn(false, nil)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
if err := handler.markComplete(context.Background(), "deadbeef", 42); err != ErrUnknownJob {
|
||||
t.Fatalf("unexpected error. want=%q have=%q", ErrUnknownJob, err)
|
||||
@ -236,7 +247,8 @@ func TestMarkCompleteStoreError(t *testing.T) {
|
||||
internalErr := errors.New("something went wrong")
|
||||
store.MarkCompleteFunc.SetDefaultReturn(false, internalErr)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
if err := handler.markComplete(context.Background(), "deadbeef", 42); err == nil || errors.UnwrapAll(err).Error() != internalErr.Error() {
|
||||
t.Fatalf("unexpected error. want=%q have=%q", internalErr, errors.UnwrapAll(err))
|
||||
@ -252,8 +264,9 @@ func TestMarkErrored(t *testing.T) {
|
||||
}
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
|
||||
job, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -283,7 +296,8 @@ func TestMarkErroredUnknownJob(t *testing.T) {
|
||||
store := workerstoremocks.NewMockStore()
|
||||
store.MarkErroredFunc.SetDefaultReturn(false, nil)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
if err := handler.markErrored(context.Background(), "deadbeef", 42, "OH NO"); err != ErrUnknownJob {
|
||||
t.Fatalf("unexpected error. want=%q have=%q", ErrUnknownJob, err)
|
||||
@ -295,7 +309,8 @@ func TestMarkErroredStoreError(t *testing.T) {
|
||||
storeErr := errors.New("something went wrong")
|
||||
store.MarkErroredFunc.SetDefaultReturn(false, storeErr)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
if err := handler.markErrored(context.Background(), "deadbeef", 42, "OH NO"); err == nil || errors.UnwrapAll(err).Error() != storeErr.Error() {
|
||||
t.Fatalf("unexpected error. want=%q have=%q", storeErr, errors.UnwrapAll(err))
|
||||
@ -311,8 +326,9 @@ func TestMarkFailed(t *testing.T) {
|
||||
}
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store, RecordTransformer: recordTransformer})
|
||||
|
||||
job, dequeued, err := handler.dequeue(context.Background(), "deadbeef")
|
||||
if err != nil {
|
||||
@ -342,7 +358,8 @@ func TestMarkFailedUnknownJob(t *testing.T) {
|
||||
store := workerstoremocks.NewMockStore()
|
||||
store.MarkFailedFunc.SetDefaultReturn(false, nil)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
if err := handler.markFailed(context.Background(), "deadbeef", 42, "OH NO"); err != ErrUnknownJob {
|
||||
t.Fatalf("unexpected error. want=%q have=%q", ErrUnknownJob, err)
|
||||
@ -354,7 +371,8 @@ func TestMarkFailedStoreError(t *testing.T) {
|
||||
storeErr := errors.New("something went wrong")
|
||||
store.MarkFailedFunc.SetDefaultReturn(false, storeErr)
|
||||
executorStore := NewMockStore()
|
||||
handler := newHandler(executorStore, QueueOptions{Store: store})
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: store})
|
||||
|
||||
if err := handler.markFailed(context.Background(), "deadbeef", 42, "OH NO"); err == nil || errors.UnwrapAll(err).Error() != storeErr.Error() {
|
||||
t.Fatalf("unexpected error. want=%q have=%q", storeErr, errors.UnwrapAll(err))
|
||||
@ -372,6 +390,7 @@ func TestHeartbeat(t *testing.T) {
|
||||
})
|
||||
|
||||
executorStore := NewMockStore()
|
||||
metricsStore := metricsstore.NewMockDistributedStore()
|
||||
|
||||
executor := types.Executor{
|
||||
Hostname: "test-hostname",
|
||||
@ -385,7 +404,7 @@ func TestHeartbeat(t *testing.T) {
|
||||
SrcCliVersion: "test-src-cli-version",
|
||||
}
|
||||
|
||||
handler := newHandler(executorStore, QueueOptions{Store: s, RecordTransformer: recordTransformer})
|
||||
handler := newHandler(executorStore, metricsStore, QueueOptions{Store: s, RecordTransformer: recordTransformer})
|
||||
|
||||
if knownIDs, err := handler.heartbeat(context.Background(), executor, []int{testKnownID, 10}); err != nil {
|
||||
t.Fatalf("unexpected error performing heartbeat: %s", err)
|
||||
|
||||
@ -6,21 +6,29 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/grafana/regexp"
|
||||
"github.com/inconshreveable/log15"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
apiclient "github.com/sourcegraph/sourcegraph/enterprise/internal/executor"
|
||||
metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
|
||||
executor "github.com/sourcegraph/sourcegraph/internal/services/executors/store"
|
||||
"github.com/sourcegraph/sourcegraph/internal/types"
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
)
|
||||
|
||||
// SetupRoutes registers all route handlers required for all configured executor
|
||||
// queues with the given router.
|
||||
func SetupRoutes(executorStore executor.Store, queueOptionsMap []QueueOptions, router *mux.Router) {
|
||||
func SetupRoutes(executorStore executor.Store, metricsStore metricsstore.DistributedStore, queueOptionsMap []QueueOptions, router *mux.Router) {
|
||||
for _, queueOptions := range queueOptionsMap {
|
||||
h := newHandler(executorStore, queueOptions)
|
||||
h := newHandler(executorStore, metricsStore, queueOptions)
|
||||
|
||||
subRouter := router.PathPrefix(fmt.Sprintf("/{queueName:(?:%s)}/", regexp.QuoteMeta(queueOptions.Name))).Subrouter()
|
||||
routes := map[string]func(w http.ResponseWriter, r *http.Request){
|
||||
@ -132,6 +140,22 @@ func (h *handler) handleHeartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
SrcCliVersion: payload.SrcCliVersion,
|
||||
}
|
||||
|
||||
// Handle metrics in the background; this should not delay the heartbeat response being
|
||||
// delivered. It is critical for keeping jobs alive.
|
||||
go func() {
|
||||
metrics, err := decodeAndLabelMetrics(payload.PrometheusMetrics, payload.ExecutorName)
|
||||
if err != nil {
|
||||
// Just log the error but don't panic. The heartbeat is more important.
|
||||
h.logger.Error("failed to decode metrics and apply labels for executor heartbeat", log.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.metricsStore.Ingest(payload.ExecutorName, metrics); err != nil {
|
||||
// Just log the error but don't panic. The heartbeat is more important.
|
||||
h.logger.Error("failed to ingest metrics for executor heartbeat", log.Error(err))
|
||||
}
|
||||
}()
|
||||
|
||||
unknownIDs, err := h.heartbeat(r.Context(), executor, payload.JobIDs)
|
||||
return http.StatusOK, unknownIDs, err
|
||||
})
|
||||
@ -184,3 +208,40 @@ func (h *handler) wrapHandler(w http.ResponseWriter, r *http.Request, payload an
|
||||
_, _ = io.Copy(w, bytes.NewReader(data))
|
||||
}
|
||||
}
|
||||
|
||||
// decodeAndLabelMetrics decodes the text serialized prometheus metrics dump and then
|
||||
// applies common labels.
|
||||
func decodeAndLabelMetrics(encodedMetrics, instanceName string) ([]*dto.MetricFamily, error) {
|
||||
data := []*dto.MetricFamily{}
|
||||
|
||||
dec := expfmt.NewDecoder(strings.NewReader(encodedMetrics), expfmt.FmtText)
|
||||
for {
|
||||
var mf dto.MetricFamily
|
||||
if err := dec.Decode(&mf); err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
|
||||
return nil, errors.Wrap(err, "decoding metric family")
|
||||
}
|
||||
|
||||
// Attach the extra labels.
|
||||
metricLabelInstance := "sg_instance"
|
||||
metricLabelJob := "sg_job"
|
||||
job := "sourcegraph-executors"
|
||||
for _, m := range mf.Metric {
|
||||
var found bool
|
||||
for _, l := range m.Label {
|
||||
found = found || *l.Name == metricLabelInstance
|
||||
}
|
||||
if !found {
|
||||
m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName})
|
||||
}
|
||||
m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelJob, Value: &job})
|
||||
}
|
||||
|
||||
data = append(data, &mf)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
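
As a rough illustration of the decode side (the sample payload and metric names below are made up), the Prometheus text exposition format carried in the heartbeat can be parsed back into `dto.MetricFamily` values like so, which is the shape `decodeAndLabelMetrics` operates on before attaching the `sg_instance` and `sg_job` labels:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// A tiny, made-up metrics dump in the Prometheus text exposition format.
	payload := `# HELP example_requests_total Illustrative counter.
# TYPE example_requests_total counter
example_requests_total{path="/heartbeat"} 42
`

	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(payload))
	if err != nil {
		panic(err)
	}

	for name, mf := range families {
		for _, m := range mf.Metric {
			fmt.Printf("%s labels=%v value=%v\n", name, m.Label, m.Counter.GetValue())
		}
	}
}
```
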
|
||||
|
||||
@ -12,10 +12,12 @@ import (
|
||||
"github.com/sourcegraph/sourcegraph/internal/actor"
|
||||
"github.com/sourcegraph/sourcegraph/internal/database"
|
||||
"github.com/sourcegraph/sourcegraph/internal/gitserver"
|
||||
metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
|
||||
executorDB "github.com/sourcegraph/sourcegraph/internal/services/executors/store/db"
|
||||
)
|
||||
|
||||
func newExecutorQueueHandler(db database.DB, queueOptions []handler.QueueOptions, accessToken func() string, uploadHandler http.Handler) (func() http.Handler, error) {
|
||||
metricsStore := metricsstore.NewDistributedStore("executors:")
|
||||
executorStore := executorDB.New(db)
|
||||
gitserverClient := gitserver.NewClient(db)
|
||||
|
||||
@ -29,7 +31,7 @@ func newExecutorQueueHandler(db database.DB, queueOptions []handler.QueueOptions
|
||||
base.Path("/git/{RepoName:.*}/git-upload-pack").Handler(gitserverProxy(gitserverClient, "/git-upload-pack"))
|
||||
|
||||
// Serve the executor queue API.
|
||||
handler.SetupRoutes(executorStore, queueOptions, base.PathPrefix("/queue/").Subrouter())
|
||||
handler.SetupRoutes(executorStore, metricsStore, queueOptions, base.PathPrefix("/queue/").Subrouter())
|
||||
|
||||
// Upload LSIF indexes without a sudo access token or github tokens.
|
||||
base.Path("/lsif/upload").Methods("POST").Handler(uploadHandler)
|
||||
|
||||
@ -0,0 +1,17 @@
|
||||
package executors
|
||||
|
||||
import (
|
||||
"github.com/sourcegraph/sourcegraph/internal/env"
|
||||
)
|
||||
|
||||
type metricsServerConfig struct {
|
||||
env.BaseConfig
|
||||
|
||||
MetricsServerPort int
|
||||
}
|
||||
|
||||
var metricsServerConfigInst = &metricsServerConfig{}
|
||||
|
||||
func (c *metricsServerConfig) Load() {
|
||||
c.MetricsServerPort = c.GetInt("EXECUTORS_METRICS_SERVER_PORT", "6996", "The port to listen on to serve the metrics from executors.")
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package executors
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/cmd/worker/job"
|
||||
"github.com/sourcegraph/sourcegraph/internal/env"
|
||||
"github.com/sourcegraph/sourcegraph/internal/goroutine"
|
||||
"github.com/sourcegraph/sourcegraph/internal/httpserver"
|
||||
metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
|
||||
)
|
||||
|
||||
type metricsServerJob struct{}
|
||||
|
||||
func NewMetricsServerJob() job.Job {
|
||||
return &metricsServerJob{}
|
||||
}
|
||||
|
||||
func (j *metricsServerJob) Description() string {
|
||||
return "HTTP server exposing the metrics collected from executors to Prometheus"
|
||||
}
|
||||
|
||||
func (j *metricsServerJob) Config() []env.Config {
|
||||
return []env.Config{metricsServerConfigInst}
|
||||
}
|
||||
|
||||
func (j *metricsServerJob) Routines(ctx context.Context, logger log.Logger) ([]goroutine.BackgroundRoutine, error) {
|
||||
host := ""
|
||||
if env.InsecureDev {
|
||||
host = "127.0.0.1"
|
||||
}
|
||||
addr := net.JoinHostPort(host, strconv.Itoa(metricsServerConfigInst.MetricsServerPort))
|
||||
|
||||
metricsStore := metricsstore.NewDistributedStore("executors:")
|
||||
|
||||
handler := promhttp.HandlerFor(prometheus.GathererFunc(metricsStore.Gather), promhttp.HandlerOpts{})
|
||||
|
||||
routines := []goroutine.BackgroundRoutine{
|
||||
httpserver.NewFromAddr(addr, &http.Server{Handler: handler}),
|
||||
}
|
||||
|
||||
return routines, nil
|
||||
}
|
||||
@ -51,6 +51,7 @@ func main() {
|
||||
"batches-bulk-processor": batches.NewBulkOperationProcessorJob(),
|
||||
"batches-workspace-resolver": batches.NewWorkspaceResolverJob(),
|
||||
"executors-janitor": executors.NewJanitorJob(),
|
||||
"executors-metricsserver": executors.NewMetricsServerJob(),
|
||||
"codemonitors-job": codemonitors.NewCodeMonitorJob(),
|
||||
"bitbucket-project-permissions": permissions.NewBitbucketProjectPermissionsJob(),
|
||||
"export-usage-telemetry": telemetry.NewTelemetryJob(),
|
||||
|
||||
@ -5,5 +5,6 @@
|
||||
{ "Name": "symbols", "Host": "127.0.0.1:6071" },
|
||||
{ "Name": "repo-updater", "Host": "127.0.0.1:6074" },
|
||||
{ "Name": "precise-code-intel-worker", "Host": "127.0.0.1:6088" },
|
||||
{ "Name": "worker", "Host": "127.0.0.1:6089" }
|
||||
{ "Name": "worker", "Host": "127.0.0.1:6089" },
|
||||
{ "Name": "worker-executors", "Host": "127.0.0.1:6969" }
|
||||
]
|
||||
|
||||
@ -122,6 +122,8 @@ type HeartbeatRequest struct {
|
||||
GitVersion string `json:"gitVersion"`
|
||||
IgniteVersion string `json:"igniteVersion"`
|
||||
SrcCliVersion string `json:"srcCliVersion"`
|
||||
|
||||
PrometheusMetrics string `json:"prometheusMetrics"`
|
||||
}
|
||||
|
||||
type CanceledJobsRequest struct {
|
||||
|
||||
2
go.mod
@ -360,7 +360,7 @@ require (
|
||||
github.com/pkg/profile v1.6.0 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/pquerna/cachecontrol v0.1.0 // indirect
|
||||
github.com/prometheus/client_model v0.2.0 // indirect
|
||||
github.com/prometheus/client_model v0.2.0
|
||||
github.com/prometheus/common/sigv4 v0.1.0 // indirect
|
||||
github.com/prometheus/procfs v0.7.3 // indirect
|
||||
github.com/pseudomuto/protoc-gen-doc v1.5.1 // indirect
|
||||
|
||||
426
internal/metrics/store/mocks_temp.go
Normal file
@ -0,0 +1,426 @@
|
||||
// Code generated by go-mockgen 1.3.3; DO NOT EDIT.
|
||||
//
|
||||
// This file was generated by running `sg generate` (or `go-mockgen`) at the root of
|
||||
// this repository. To add additional mocks to this or another package, add a new entry
|
||||
// to the mockgen.yaml file in the root of this repository.
|
||||
|
||||
package store
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
go1 "github.com/prometheus/client_model/go"
|
||||
)
|
||||
|
||||
// MockDistributedStore is a mock implementation of the DistributedStore
|
||||
// interface (from the package
|
||||
// github.com/sourcegraph/sourcegraph/internal/metrics/store) used for unit
|
||||
// testing.
|
||||
type MockDistributedStore struct {
|
||||
// GatherFunc is an instance of a mock function object controlling the
|
||||
// behavior of the method Gather.
|
||||
GatherFunc *DistributedStoreGatherFunc
|
||||
// IngestFunc is an instance of a mock function object controlling the
|
||||
// behavior of the method Ingest.
|
||||
IngestFunc *DistributedStoreIngestFunc
|
||||
}
|
||||
|
||||
// NewMockDistributedStore creates a new mock of the DistributedStore
|
||||
// interface. All methods return zero values for all results, unless
|
||||
// overwritten.
|
||||
func NewMockDistributedStore() *MockDistributedStore {
|
||||
return &MockDistributedStore{
|
||||
GatherFunc: &DistributedStoreGatherFunc{
|
||||
defaultHook: func() (r0 []*go1.MetricFamily, r1 error) {
|
||||
return
|
||||
},
|
||||
},
|
||||
IngestFunc: &DistributedStoreIngestFunc{
|
||||
defaultHook: func(string, []*go1.MetricFamily) (r0 error) {
|
||||
return
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// NewStrictMockDistributedStore creates a new mock of the DistributedStore
|
||||
// interface. All methods panic on invocation, unless overwritten.
|
||||
func NewStrictMockDistributedStore() *MockDistributedStore {
|
||||
return &MockDistributedStore{
|
||||
GatherFunc: &DistributedStoreGatherFunc{
|
||||
defaultHook: func() ([]*go1.MetricFamily, error) {
|
||||
panic("unexpected invocation of MockDistributedStore.Gather")
|
||||
},
|
||||
},
|
||||
IngestFunc: &DistributedStoreIngestFunc{
|
||||
defaultHook: func(string, []*go1.MetricFamily) error {
|
||||
panic("unexpected invocation of MockDistributedStore.Ingest")
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// NewMockDistributedStoreFrom creates a new mock of the
|
||||
// MockDistributedStore interface. All methods delegate to the given
|
||||
// implementation, unless overwritten.
|
||||
func NewMockDistributedStoreFrom(i DistributedStore) *MockDistributedStore {
|
||||
return &MockDistributedStore{
|
||||
GatherFunc: &DistributedStoreGatherFunc{
|
||||
defaultHook: i.Gather,
|
||||
},
|
||||
IngestFunc: &DistributedStoreIngestFunc{
|
||||
defaultHook: i.Ingest,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// DistributedStoreGatherFunc describes the behavior when the Gather method
|
||||
// of the parent MockDistributedStore instance is invoked.
|
||||
type DistributedStoreGatherFunc struct {
|
||||
defaultHook func() ([]*go1.MetricFamily, error)
|
||||
hooks []func() ([]*go1.MetricFamily, error)
|
||||
history []DistributedStoreGatherFuncCall
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
// Gather delegates to the next hook function in the queue and stores the
|
||||
// parameter and result values of this invocation.
|
||||
func (m *MockDistributedStore) Gather() ([]*go1.MetricFamily, error) {
|
||||
r0, r1 := m.GatherFunc.nextHook()()
|
||||
m.GatherFunc.appendCall(DistributedStoreGatherFuncCall{r0, r1})
|
||||
return r0, r1
|
||||
}
|
||||
|
||||
// SetDefaultHook sets function that is called when the Gather method of the
|
||||
// parent MockDistributedStore instance is invoked and the hook queue is
|
||||
// empty.
|
||||
func (f *DistributedStoreGatherFunc) SetDefaultHook(hook func() ([]*go1.MetricFamily, error)) {
|
||||
f.defaultHook = hook
|
||||
}
|
||||
|
||||
// PushHook adds a function to the end of hook queue. Each invocation of the
|
||||
// Gather method of the parent MockDistributedStore instance invokes the
|
||||
// hook at the front of the queue and discards it. After the queue is empty,
|
||||
// the default hook function is invoked for any future action.
|
||||
func (f *DistributedStoreGatherFunc) PushHook(hook func() ([]*go1.MetricFamily, error)) {
|
||||
f.mutex.Lock()
|
||||
f.hooks = append(f.hooks, hook)
|
||||
f.mutex.Unlock()
|
||||
}
|
||||
|
||||
// SetDefaultReturn calls SetDefaultHook with a function that returns the
|
||||
// given values.
|
||||
func (f *DistributedStoreGatherFunc) SetDefaultReturn(r0 []*go1.MetricFamily, r1 error) {
|
||||
f.SetDefaultHook(func() ([]*go1.MetricFamily, error) {
|
||||
return r0, r1
|
||||
})
|
||||
}
|
||||
|
||||
// PushReturn calls PushHook with a function that returns the given values.
|
||||
func (f *DistributedStoreGatherFunc) PushReturn(r0 []*go1.MetricFamily, r1 error) {
|
||||
f.PushHook(func() ([]*go1.MetricFamily, error) {
|
||||
return r0, r1
|
||||
})
|
||||
}
|
||||
|
||||
func (f *DistributedStoreGatherFunc) nextHook() func() ([]*go1.MetricFamily, error) {
|
||||
f.mutex.Lock()
|
||||
defer f.mutex.Unlock()
|
||||
|
||||
if len(f.hooks) == 0 {
|
||||
return f.defaultHook
|
||||
}
|
||||
|
||||
hook := f.hooks[0]
|
||||
f.hooks = f.hooks[1:]
|
||||
return hook
|
||||
}
|
||||
|
||||
func (f *DistributedStoreGatherFunc) appendCall(r0 DistributedStoreGatherFuncCall) {
|
||||
f.mutex.Lock()
|
||||
f.history = append(f.history, r0)
|
||||
f.mutex.Unlock()
|
||||
}
|
||||
|
||||
// History returns a sequence of DistributedStoreGatherFuncCall objects
|
||||
// describing the invocations of this function.
|
||||
func (f *DistributedStoreGatherFunc) History() []DistributedStoreGatherFuncCall {
|
||||
f.mutex.Lock()
|
||||
history := make([]DistributedStoreGatherFuncCall, len(f.history))
|
||||
copy(history, f.history)
|
||||
f.mutex.Unlock()
|
||||
|
||||
return history
|
||||
}
|
||||
|
||||
// DistributedStoreGatherFuncCall is an object that describes an invocation
|
||||
// of method Gather on an instance of MockDistributedStore.
|
||||
type DistributedStoreGatherFuncCall struct {
|
||||
// Result0 is the value of the 1st result returned from this method
|
||||
// invocation.
|
||||
Result0 []*go1.MetricFamily
|
||||
// Result1 is the value of the 2nd result returned from this method
|
||||
// invocation.
|
||||
Result1 error
|
||||
}
|
||||
|
||||
// Args returns an interface slice containing the arguments of this
|
||||
// invocation.
|
||||
func (c DistributedStoreGatherFuncCall) Args() []interface{} {
|
||||
return []interface{}{}
|
||||
}
|
||||
|
||||
// Results returns an interface slice containing the results of this
|
||||
// invocation.
|
||||
func (c DistributedStoreGatherFuncCall) Results() []interface{} {
|
||||
return []interface{}{c.Result0, c.Result1}
|
||||
}
|
||||
|
||||
// DistributedStoreIngestFunc describes the behavior when the Ingest method
|
||||
// of the parent MockDistributedStore instance is invoked.
|
||||
type DistributedStoreIngestFunc struct {
|
||||
defaultHook func(string, []*go1.MetricFamily) error
|
||||
hooks []func(string, []*go1.MetricFamily) error
|
||||
history []DistributedStoreIngestFuncCall
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
// Ingest delegates to the next hook function in the queue and stores the
|
||||
// parameter and result values of this invocation.
|
||||
func (m *MockDistributedStore) Ingest(v0 string, v1 []*go1.MetricFamily) error {
|
||||
r0 := m.IngestFunc.nextHook()(v0, v1)
|
||||
m.IngestFunc.appendCall(DistributedStoreIngestFuncCall{v0, v1, r0})
|
||||
return r0
|
||||
}
|
||||
|
||||
// SetDefaultHook sets function that is called when the Ingest method of the
|
||||
// parent MockDistributedStore instance is invoked and the hook queue is
|
||||
// empty.
|
||||
func (f *DistributedStoreIngestFunc) SetDefaultHook(hook func(string, []*go1.MetricFamily) error) {
|
||||
f.defaultHook = hook
|
||||
}
|
||||
|
||||
// PushHook adds a function to the end of hook queue. Each invocation of the
|
||||
// Ingest method of the parent MockDistributedStore instance invokes the
|
||||
// hook at the front of the queue and discards it. After the queue is empty,
|
||||
// the default hook function is invoked for any future action.
|
||||
func (f *DistributedStoreIngestFunc) PushHook(hook func(string, []*go1.MetricFamily) error) {
|
||||
f.mutex.Lock()
|
||||
f.hooks = append(f.hooks, hook)
|
||||
f.mutex.Unlock()
|
||||
}
|
||||
|
||||
// SetDefaultReturn calls SetDefaultHook with a function that returns the
|
||||
// given values.
|
||||
func (f *DistributedStoreIngestFunc) SetDefaultReturn(r0 error) {
|
||||
f.SetDefaultHook(func(string, []*go1.MetricFamily) error {
|
||||
return r0
|
||||
})
|
||||
}
|
||||
|
||||
// PushReturn calls PushHook with a function that returns the given values.
|
||||
func (f *DistributedStoreIngestFunc) PushReturn(r0 error) {
|
||||
f.PushHook(func(string, []*go1.MetricFamily) error {
|
||||
return r0
|
||||
})
|
||||
}
|
||||
|
||||
func (f *DistributedStoreIngestFunc) nextHook() func(string, []*go1.MetricFamily) error {
|
||||
f.mutex.Lock()
|
||||
defer f.mutex.Unlock()
|
||||
|
||||
if len(f.hooks) == 0 {
|
||||
return f.defaultHook
|
||||
}
|
||||
|
||||
hook := f.hooks[0]
|
||||
f.hooks = f.hooks[1:]
|
||||
return hook
|
||||
}
|
||||
|
||||
func (f *DistributedStoreIngestFunc) appendCall(r0 DistributedStoreIngestFuncCall) {
|
||||
f.mutex.Lock()
|
||||
f.history = append(f.history, r0)
|
||||
f.mutex.Unlock()
|
||||
}
|
||||
|
||||
// History returns a sequence of DistributedStoreIngestFuncCall objects
|
||||
// describing the invocations of this function.
|
||||
func (f *DistributedStoreIngestFunc) History() []DistributedStoreIngestFuncCall {
|
||||
f.mutex.Lock()
|
||||
history := make([]DistributedStoreIngestFuncCall, len(f.history))
|
||||
copy(history, f.history)
|
||||
f.mutex.Unlock()
|
||||
|
||||
return history
|
||||
}
|
||||
|
||||
// DistributedStoreIngestFuncCall is an object that describes an invocation
|
||||
// of method Ingest on an instance of MockDistributedStore.
|
||||
type DistributedStoreIngestFuncCall struct {
|
||||
// Arg0 is the value of the 1st argument passed to this method
|
||||
// invocation.
|
||||
Arg0 string
|
||||
// Arg1 is the value of the 2nd argument passed to this method
|
||||
// invocation.
|
||||
Arg1 []*go1.MetricFamily
|
||||
// Result0 is the value of the 1st result returned from this method
|
||||
// invocation.
|
||||
Result0 error
|
||||
}
|
||||
|
||||
// Args returns an interface slice containing the arguments of this
|
||||
// invocation.
|
||||
func (c DistributedStoreIngestFuncCall) Args() []interface{} {
|
||||
return []interface{}{c.Arg0, c.Arg1}
|
||||
}
|
||||
|
||||
// Results returns an interface slice containing the results of this
|
||||
// invocation.
|
||||
func (c DistributedStoreIngestFuncCall) Results() []interface{} {
|
||||
return []interface{}{c.Result0}
|
||||
}
|
||||
|
||||
// MockStore is a mock implementation of the Store interface (from the
|
||||
// package github.com/sourcegraph/sourcegraph/internal/metrics/store) used
|
||||
// for unit testing.
|
||||
type MockStore struct {
|
||||
// GatherFunc is an instance of a mock function object controlling the
|
||||
// behavior of the method Gather.
|
||||
GatherFunc *StoreGatherFunc
|
||||
}
|
||||
|
||||
// NewMockStore creates a new mock of the Store interface. All methods
|
||||
// return zero values for all results, unless overwritten.
|
||||
func NewMockStore() *MockStore {
|
||||
return &MockStore{
|
||||
GatherFunc: &StoreGatherFunc{
|
||||
defaultHook: func() (r0 []*go1.MetricFamily, r1 error) {
|
||||
return
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// NewStrictMockStore creates a new mock of the Store interface. All methods
|
||||
// panic on invocation, unless overwritten.
|
||||
func NewStrictMockStore() *MockStore {
|
||||
return &MockStore{
|
||||
GatherFunc: &StoreGatherFunc{
|
||||
defaultHook: func() ([]*go1.MetricFamily, error) {
|
||||
panic("unexpected invocation of MockStore.Gather")
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// NewMockStoreFrom creates a new mock of the MockStore interface. All
|
||||
// methods delegate to the given implementation, unless overwritten.
|
||||
func NewMockStoreFrom(i Store) *MockStore {
|
||||
return &MockStore{
|
||||
GatherFunc: &StoreGatherFunc{
|
||||
defaultHook: i.Gather,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// StoreGatherFunc describes the behavior when the Gather method of the
|
||||
// parent MockStore instance is invoked.
|
||||
type StoreGatherFunc struct {
|
||||
defaultHook func() ([]*go1.MetricFamily, error)
|
||||
hooks []func() ([]*go1.MetricFamily, error)
|
||||
history []StoreGatherFuncCall
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
// Gather delegates to the next hook function in the queue and stores the
|
||||
// parameter and result values of this invocation.
|
||||
func (m *MockStore) Gather() ([]*go1.MetricFamily, error) {
|
||||
r0, r1 := m.GatherFunc.nextHook()()
|
||||
m.GatherFunc.appendCall(StoreGatherFuncCall{r0, r1})
|
||||
return r0, r1
|
||||
}
|
||||
|
||||
// SetDefaultHook sets function that is called when the Gather method of the
|
||||
// parent MockStore instance is invoked and the hook queue is empty.
|
||||
func (f *StoreGatherFunc) SetDefaultHook(hook func() ([]*go1.MetricFamily, error)) {
|
||||
f.defaultHook = hook
|
||||
}
|
||||
|
||||
// PushHook adds a function to the end of hook queue. Each invocation of the
|
||||
// Gather method of the parent MockStore instance invokes the hook at the
|
||||
// front of the queue and discards it. After the queue is empty, the default
|
||||
// hook function is invoked for any future action.
|
||||
func (f *StoreGatherFunc) PushHook(hook func() ([]*go1.MetricFamily, error)) {
|
||||
f.mutex.Lock()
|
||||
f.hooks = append(f.hooks, hook)
|
||||
f.mutex.Unlock()
|
||||
}
|
||||
|
||||
// SetDefaultReturn calls SetDefaultHook with a function that returns the
|
||||
// given values.
|
||||
func (f *StoreGatherFunc) SetDefaultReturn(r0 []*go1.MetricFamily, r1 error) {
|
||||
f.SetDefaultHook(func() ([]*go1.MetricFamily, error) {
|
||||
return r0, r1
|
||||
})
|
||||
}
|
||||
|
||||
// PushReturn calls PushHook with a function that returns the given values.
|
||||
func (f *StoreGatherFunc) PushReturn(r0 []*go1.MetricFamily, r1 error) {
|
||||
f.PushHook(func() ([]*go1.MetricFamily, error) {
|
||||
return r0, r1
|
||||
})
|
||||
}
|
||||
|
||||
func (f *StoreGatherFunc) nextHook() func() ([]*go1.MetricFamily, error) {
|
||||
f.mutex.Lock()
|
||||
defer f.mutex.Unlock()
|
||||
|
||||
if len(f.hooks) == 0 {
|
||||
return f.defaultHook
|
||||
}
|
||||
|
||||
hook := f.hooks[0]
|
||||
f.hooks = f.hooks[1:]
|
||||
return hook
|
||||
}
|
||||
|
||||
func (f *StoreGatherFunc) appendCall(r0 StoreGatherFuncCall) {
|
||||
f.mutex.Lock()
|
||||
f.history = append(f.history, r0)
|
||||
f.mutex.Unlock()
|
||||
}
|
||||
|
||||
// History returns a sequence of StoreGatherFuncCall objects describing the
|
||||
// invocations of this function.
|
||||
func (f *StoreGatherFunc) History() []StoreGatherFuncCall {
|
||||
f.mutex.Lock()
|
||||
history := make([]StoreGatherFuncCall, len(f.history))
|
||||
copy(history, f.history)
|
||||
f.mutex.Unlock()
|
||||
|
||||
return history
|
||||
}
|
||||
|
||||
// StoreGatherFuncCall is an object that describes an invocation of method
|
||||
// Gather on an instance of MockStore.
|
||||
type StoreGatherFuncCall struct {
|
||||
// Result0 is the value of the 1st result returned from this method
|
||||
// invocation.
|
||||
Result0 []*go1.MetricFamily
|
||||
// Result1 is the value of the 2nd result returned from this method
|
||||
// invocation.
|
||||
Result1 error
|
||||
}
|
||||
|
||||
// Args returns an interface slice containing the arguments of this
|
||||
// invocation.
|
||||
func (c StoreGatherFuncCall) Args() []interface{} {
|
||||
return []interface{}{}
|
||||
}
|
||||
|
||||
// Results returns an interface slice containing the results of this
|
||||
// invocation.
|
||||
func (c StoreGatherFuncCall) Results() []interface{} {
|
||||
return []interface{}{c.Result0, c.Result1}
|
||||
}
|
||||
117 internal/metrics/store/store.go Normal file
@ -0,0 +1,117 @@
package store

import (
	"bytes"
	"io"
	"strings"

	"github.com/gomodule/redigo/redis"
	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"

	"github.com/sourcegraph/sourcegraph/internal/redispool"
	"github.com/sourcegraph/sourcegraph/lib/errors"
)

const DefaultMetricsExpiry = 30

type Store interface {
	prometheus.Gatherer
}

func NewDefaultStore() Store {
	return &defaultStore{}
}

type defaultStore struct{}

func (*defaultStore) Gather() ([]*dto.MetricFamily, error) {
	return prometheus.DefaultGatherer.Gather()
}

type DistributedStore interface {
	Store
	Ingest(instance string, mfs []*dto.MetricFamily) error
}

func NewDistributedStore(prefix string) DistributedStore {
	return &distributedStore{
		prefix: prefix,
		expiry: DefaultMetricsExpiry,
	}
}

type distributedStore struct {
	prefix string
	expiry int
}

func (d *distributedStore) Gather() ([]*dto.MetricFamily, error) {
	reConn := redispool.Cache.Get()
	defer reConn.Close()

	// First, list all the keys for which we hold metrics.
	keys, err := redis.Values(reConn.Do("KEYS", d.prefix+"*"))
	if err != nil {
		return nil, errors.Wrap(err, "listing entries from redis")
	}

	if len(keys) == 0 {
		return nil, nil
	}

	// Then bulk retrieve all the metrics blobs for all the instances.
	encodedMetrics, err := redis.Strings(reConn.Do("MGET", keys...))
	if err != nil {
		return nil, errors.Wrap(err, "retrieving blobs from redis")
	}

	// Then decode the serialized metrics into proper metric families required
	// by the Gatherer interface.
	mfs := []*dto.MetricFamily{}
	for _, metrics := range encodedMetrics {
		// Decode each metrics blob separately.
		dec := expfmt.NewDecoder(strings.NewReader(metrics), expfmt.FmtText)
		for {
			var mf dto.MetricFamily
			if err := dec.Decode(&mf); err != nil {
				if err == io.EOF {
					break
				}

				return nil, errors.Wrap(err, "decoding metrics data")
			}
			mfs = append(mfs, &mf)
		}
	}

	return mfs, nil
}

func (d *distributedStore) Ingest(instance string, mfs []*dto.MetricFamily) error {
	// First, encode the metrics to text format so we can store them.
	var enc bytes.Buffer
	encoder := expfmt.NewEncoder(&enc, expfmt.FmtText)

	for _, a := range mfs {
		if err := encoder.Encode(a); err != nil {
			return errors.Wrap(err, "encoding metric family")
		}
	}

	encodedMetrics := enc.String()

	reConn := redispool.Cache.Get()
	defer reConn.Close()

	// Store the metrics and set an expiry on the key, if we haven't retrieved
	// an updated set of metric data, we consider the host down and prune it
	// from the gatherer.
	err := reConn.Send("SETEX", d.prefix+instance, d.expiry, encodedMetrics)
	if err != nil {
		return errors.Wrap(err, "writing metrics blob to redis")
	}

	return nil
}

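For orientation, a minimal sketch of how the two halves of this store could be wired together; the HTTP route, header name, and port below are illustrative assumptions and not part of this change, only NewDistributedStore, Ingest, and the prometheus.Gatherer behaviour come from the file above.

package main

import (
	"io"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"

	metricsstore "github.com/sourcegraph/sourcegraph/internal/metrics/store"
)

func main() {
	store := metricsstore.NewDistributedStore("executors:")

	// Executors push text-encoded metric families here; each payload is keyed
	// by instance name and silently expires after DefaultMetricsExpiry seconds
	// unless refreshed. Route and header name are hypothetical.
	http.HandleFunc("/push-metrics", func(w http.ResponseWriter, r *http.Request) {
		var mfs []*dto.MetricFamily
		dec := expfmt.NewDecoder(r.Body, expfmt.FmtText)
		for {
			var mf dto.MetricFamily
			if err := dec.Decode(&mf); err != nil {
				if err == io.EOF {
					break
				}
				http.Error(w, err.Error(), http.StatusBadRequest)
				return
			}
			mfs = append(mfs, &mf)
		}
		if err := store.Ingest(r.Header.Get("X-Instance-Name"), mfs); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
	})

	// Prometheus scrapes the union of all live executors' metrics here,
	// because the distributed store satisfies prometheus.Gatherer.
	http.Handle("/metrics", promhttp.HandlerFor(store, promhttp.HandlerOpts{}))

	_ = http.ListenAndServe(":6996", nil)
}

The sketch only shows the shape of the interaction, not the transport the executors actually use in this change.
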
@ -111,3 +111,8 @@
  path: github.com/sourcegraph/sourcegraph/internal/workerutil/dbworker/store
  interfaces:
    - Store
- filename: internal/metrics/store/mocks_temp.go
  path: github.com/sourcegraph/sourcegraph/internal/metrics/store
  interfaces:
    - Store
    - DistributedStore

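A sketch of how the generated DistributedStore mock might be used in a unit test; the error message and instance names are invented, and the test assumes it lives in the same package as the generated code.

package store

import (
	"errors"
	"testing"
)

func TestMockDistributedStoreIngest(t *testing.T) {
	mock := NewMockDistributedStore()
	mock.IngestFunc.PushReturn(errors.New("redis unavailable")) // first call fails
	mock.IngestFunc.SetDefaultReturn(nil)                       // later calls succeed

	_ = mock.Ingest("executor-1", nil)
	_ = mock.Ingest("executor-2", nil)

	// Every invocation is recorded, regardless of the hook that served it.
	if calls := mock.IngestFunc.History(); len(calls) != 2 {
		t.Fatalf("expected 2 recorded calls, got %d", len(calls))
	}
}
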
@ -6,7 +6,8 @@ import (
)

func Executor() *monitoring.Dashboard {
	const containerName = "(executor|sourcegraph-code-intel-indexers|executor-batches|sourcegraph-executors)"
	// sg_job value is hard-coded, see enterprise/cmd/frontend/internal/executorqueue/handler/routes.go
	const containerName = "sourcegraph-executors"

	// frontend is sometimes called sourcegraph-frontend in various contexts
	const queueContainerName = "(executor|sourcegraph-code-intel-indexers|executor-batches|frontend|sourcegraph-frontend|worker|sourcegraph-executors)"
@ -26,7 +27,7 @@ func Executor() *monitoring.Dashboard {
		{
			Label: "Compute instance",
			Name:  "instance",
			OptionsQuery: "label_values(node_exporter_build_info{job=\"sourcegraph-executor-nodes\"}, instance)",
			OptionsQuery: "label_values(node_exporter_build_info{sg_job=\"sourcegraph-executor-nodes\"}, instance)",

			// The options query can generate a massive result set that can cause issues.
			// shared.NewNodeExporterGroup filters by job as well so this is safe to use
@ -42,11 +43,14 @@ func Executor() *monitoring.Dashboard {
			shared.CodeIntelligence.NewExecutorExecutionCommandGroup(containerName),
			shared.CodeIntelligence.NewExecutorTeardownCommandGroup(containerName),

			shared.NewNodeExporterGroup(containerName, "(sourcegraph-code-intel-indexer-nodes|sourcegraph-executor-nodes)", "Compute", "$instance"),
			shared.NewNodeExporterGroup(containerName, "(sourcegraph-code-intel-indexer-docker-registry-mirror-nodes|sourcegraph-executors-docker-registry-mirror-nodes)", "Docker Registry Mirror", ".*"),
			shared.NewNodeExporterGroup(containerName, "Compute", "$instance"),
			shared.NewNodeExporterGroup(containerName, "Docker Registry Mirror", ".*"),

			// Resource monitoring
			shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, nil),
			shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerCodeIntel, &shared.GolangMonitoringOptions{
				InstanceLabelName: "sg_instance",
				JobLabelName:      "sg_job",
			}),
		},
	}
}

@ -576,7 +576,8 @@ func Frontend() *monitoring.Dashboard {
						Interpretation: `Account lockouts per minute`,
					},
				},
			}},
				},
			},
		},
		{
			Title:  "Organisation GraphQL API requests",
			Hidden: true,
@ -910,11 +911,11 @@ func orgMetricRows(orgMetricSpec []struct {
	name        string
	route       string
	description string
}) []monitoring.Row {
},
) []monitoring.Row {
	result := []monitoring.Row{}
	for _, m := range orgMetricSpec {
		result = append(result, monitoring.Row{

			{
				Name:        m.name + "_rate",
				Description: "rate of " + m.description,

@ -24,43 +24,44 @@ func Postgres() *monitoring.Dashboard {
|
||||
Groups: []monitoring.Group{
|
||||
{
|
||||
Title: "General",
|
||||
Rows: []monitoring.Row{{
|
||||
monitoring.Observable{
|
||||
Name: "connections",
|
||||
Description: "active connections",
|
||||
Owner: monitoring.ObservableOwnerDevOps,
|
||||
DataMustExist: false, // not deployed on docker-compose
|
||||
Query: `sum by (job) (pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{datname}}"),
|
||||
Warning: monitoring.Alert().LessOrEqual(5).For(5 * time.Minute),
|
||||
NextSteps: "none",
|
||||
},
|
||||
monitoring.Observable{
|
||||
Name: "usage_connections_percentage",
|
||||
Description: "connection in use",
|
||||
Owner: monitoring.ObservableOwnerDevOps,
|
||||
DataMustExist: false,
|
||||
Query: `sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{job}}").Unit(monitoring.Percentage).Max(100).Min(0),
|
||||
Warning: monitoring.Alert().GreaterOrEqual(80).For(5 * time.Minute),
|
||||
Critical: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
NextSteps: `
|
||||
Rows: []monitoring.Row{
|
||||
{
|
||||
monitoring.Observable{
|
||||
Name: "connections",
|
||||
Description: "active connections",
|
||||
Owner: monitoring.ObservableOwnerDevOps,
|
||||
DataMustExist: false, // not deployed on docker-compose
|
||||
Query: `sum by (job) (pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{datname}}"),
|
||||
Warning: monitoring.Alert().LessOrEqual(5).For(5 * time.Minute),
|
||||
NextSteps: "none",
|
||||
},
|
||||
monitoring.Observable{
|
||||
Name: "usage_connections_percentage",
|
||||
Description: "connection in use",
|
||||
Owner: monitoring.ObservableOwnerDevOps,
|
||||
DataMustExist: false,
|
||||
Query: `sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{job}}").Unit(monitoring.Percentage).Max(100).Min(0),
|
||||
Warning: monitoring.Alert().GreaterOrEqual(80).For(5 * time.Minute),
|
||||
Critical: monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute),
|
||||
NextSteps: `
|
||||
- Consider increasing [max_connections](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-MAX-CONNECTIONS) of the database instance, [learn more](https://docs.sourcegraph.com/admin/config/postgres-conf)
|
||||
`,
|
||||
},
|
||||
monitoring.Observable{
|
||||
Name: "transaction_durations",
|
||||
Description: "maximum transaction durations",
|
||||
Owner: monitoring.ObservableOwnerDevOps,
|
||||
DataMustExist: false, // not deployed on docker-compose
|
||||
// Ignore in codeintel-db because Rockskip processing involves long transactions
|
||||
// during normal operation.
|
||||
Query: `sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{datname}}").Unit(monitoring.Seconds),
|
||||
Warning: monitoring.Alert().GreaterOrEqual(0.3).For(5 * time.Minute),
|
||||
NextSteps: "none",
|
||||
},
|
||||
},
|
||||
monitoring.Observable{
|
||||
Name: "transaction_durations",
|
||||
Description: "maximum transaction durations",
|
||||
Owner: monitoring.ObservableOwnerDevOps,
|
||||
DataMustExist: false, // not deployed on docker-compose
|
||||
// Ignore in codeintel-db because Rockskip processing involves long transactions
|
||||
// during normal operation.
|
||||
Query: `sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`,
|
||||
Panel: monitoring.Panel().LegendFormat("{{datname}}").Unit(monitoring.Seconds),
|
||||
Warning: monitoring.Alert().GreaterOrEqual(0.3).For(5 * time.Minute),
|
||||
NextSteps: "none",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
|
||||
@ -323,7 +323,7 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori
// src_executor_run_lock_held_total
func (codeIntelligence) NewExecutorExecutionRunLockContentionGroup(containerName string) monitoring.Group {
	constructor := func(metricNameRoot, legend string) Observable {
		filters := makeFilters(containerName)
		filters := makeFilters("sg_jobs", containerName)
		return Observable{
			Name:        metricNameRoot + "_total",
			Description: fmt.Sprintf("milliseconds %s every 5m", legend),

@ -29,6 +29,9 @@ type ObservableConstructorOptions struct {
	// will add a prefix to the constructed legend.
	MetricDescriptionRoot string

	// JobLabel is the name of the label used to denote the job name. If unset, "job" is used.
	JobLabel string

	// Filters are additional prometheus filter expressions used to select or hide values
	// for a given label pattern.
	Filters []string
@ -76,8 +79,12 @@ type GroupConstructorOptions struct {
// expressions. The given container name may be string or pattern, which will be matched
// against the prefix of the value of the job label. Note that this excludes replicas like
// -0 and -1 in docker-compose.
func makeFilters(containerName string, filters ...string) string {
	filters = append(filters, fmt.Sprintf(`job=~"^%s.*"`, containerName))
func makeFilters(containerLabel, containerName string, filters ...string) string {
	if containerLabel == "" {
		containerLabel = "job"
	}

	filters = append(filters, fmt.Sprintf(`%s=~"^%s.*"`, containerLabel, containerName))
	return strings.Join(filters, ",")
}

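As a quick illustration of the new signature (the label, container, and filter values here are examples only), the helper now lets callers pick the job label, while an empty label name keeps the old behaviour:

// makeFilters("sg_job", "sourcegraph-executors", `queue="batches"`)
//   -> queue="batches",sg_job=~"^sourcegraph-executors.*"
// makeFilters("", "worker")
//   -> job=~"^worker.*"
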
@ -14,28 +14,32 @@ import (
const TitleGolangMonitoring = "Golang runtime monitoring"

var (
	GoGoroutines sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable {
		return Observable{
			Name:           "go_goroutines",
			Description:    "maximum active goroutines",
			Query:          fmt.Sprintf(`max by(instance) (go_goroutines{job=~".*%s"})`, containerName),
			Warning:        monitoring.Alert().GreaterOrEqual(10000).For(10 * time.Minute),
			Panel:          monitoring.Panel().LegendFormat("{{name}}"),
			Owner:          owner,
			Interpretation: "A high value here indicates a possible goroutine leak.",
			NextSteps:      "none",
	GoGoroutines = func(jobLabel, instanceLabel string) sharedObservable {
		return func(containerName string, owner monitoring.ObservableOwner) Observable {
			return Observable{
				Name:           "go_goroutines",
				Description:    "maximum active goroutines",
				Query:          fmt.Sprintf(`max by(%s) (go_goroutines{%s=~".*%s"})`, instanceLabel, jobLabel, containerName),
				Warning:        monitoring.Alert().GreaterOrEqual(10000).For(10 * time.Minute),
				Panel:          monitoring.Panel().LegendFormat("{{name}}"),
				Owner:          owner,
				Interpretation: "A high value here indicates a possible goroutine leak.",
				NextSteps:      "none",
			}
		}
	}

	GoGcDuration sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable {
		return Observable{
			Name:        "go_gc_duration_seconds",
			Description: "maximum go garbage collection duration",
			Query:       fmt.Sprintf(`max by(instance) (go_gc_duration_seconds{job=~".*%s"})`, containerName),
			Warning:     monitoring.Alert().GreaterOrEqual(2),
			Panel:       monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds),
			Owner:       owner,
			NextSteps:   "none",
	GoGcDuration = func(jobLabel, instanceLabel string) sharedObservable {
		return func(containerName string, owner monitoring.ObservableOwner) Observable {
			return Observable{
				Name:        "go_gc_duration_seconds",
				Description: "maximum go garbage collection duration",
				Query:       fmt.Sprintf(`max by(%s) (go_gc_duration_seconds{%s=~".*%s"})`, instanceLabel, jobLabel, containerName),
				Warning:     monitoring.Alert().GreaterOrEqual(2),
				Panel:       monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Seconds),
				Owner:       owner,
				NextSteps:   "none",
			}
		}
	}
)
@ -46,6 +50,10 @@ type GolangMonitoringOptions struct {

	// GCDuration transforms the default observable used to construct the Go GC duration panel.
	GCDuration ObservableOption

	JobLabelName string

	InstanceLabelName string
}

// NewGolangMonitoringGroup creates a group containing panels displaying Go monitoring
@ -55,13 +63,20 @@ func NewGolangMonitoringGroup(containerName string, owner monitoring.ObservableO
		options = &GolangMonitoringOptions{}
	}

	if options.InstanceLabelName == "" {
		options.InstanceLabelName = "instance"
	}
	if options.JobLabelName == "" {
		options.JobLabelName = "job"
	}

	return monitoring.Group{
		Title:  TitleGolangMonitoring,
		Hidden: true,
		Rows: []monitoring.Row{
			{
				options.Goroutines.safeApply(GoGoroutines(containerName, owner)).Observable(),
				options.GCDuration.safeApply(GoGcDuration(containerName, owner)).Observable(),
				options.Goroutines.safeApply(GoGoroutines(options.JobLabelName, options.InstanceLabelName)(containerName, owner)).Observable(),
				options.GCDuration.safeApply(GoGcDuration(options.JobLabelName, options.InstanceLabelName)(containerName, owner)).Observable(),
			},
		},
	}

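To make the parameterization concrete (the label names are simply the values the executor dashboard passes above; nothing else is implied), the curried observable now yields queries such as:

// GoGoroutines("sg_job", "sg_instance")("sourcegraph-executors", owner) builds:
//   max by(sg_instance) (go_goroutines{sg_job=~".*sourcegraph-executors"})
// while GoGoroutines("job", "instance") preserves the previous query shape.
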
@ -13,24 +13,22 @@ import (
// metrics in a way that only applies in Kubernetes deployments.
const TitleKubernetesMonitoring = "Kubernetes monitoring (only available on Kubernetes)"

var (
	KubernetesPodsAvailable sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable {
		return Observable{
			Name:        "pods_available_percentage",
			Description: "percentage pods available",
			// the 'app' label is only available in Kubernetes deloyments - it indicates the pod.
			Query:    fmt.Sprintf(`sum by(app) (up{app=~".*%[1]s"}) / count by (app) (up{app=~".*%[1]s"}) * 100`, containerName),
			Critical: monitoring.Alert().LessOrEqual(90).For(10 * time.Minute),
			Panel:    monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Percentage).Max(100).Min(0),
			Owner:    owner,
			// Solutions similar to the ContainerMissing solutions.
			NextSteps: fmt.Sprintf(`
var KubernetesPodsAvailable sharedObservable = func(containerName string, owner monitoring.ObservableOwner) Observable {
	return Observable{
		Name:        "pods_available_percentage",
		Description: "percentage pods available",
		// the 'app' label is only available in Kubernetes deloyments - it indicates the pod.
		Query:    fmt.Sprintf(`sum by(app) (up{app=~".*%[1]s"}) / count by (app) (up{app=~".*%[1]s"}) * 100`, containerName),
		Critical: monitoring.Alert().LessOrEqual(90).For(10 * time.Minute),
		Panel:    monitoring.Panel().LegendFormat("{{name}}").Unit(monitoring.Percentage).Max(100).Min(0),
		Owner:    owner,
		// Solutions similar to the ContainerMissing solutions.
		NextSteps: fmt.Sprintf(`
- Determine if the pod was OOM killed using 'kubectl describe pod %[1]s' (look for 'OOMKilled: true') and, if so, consider increasing the memory limit in the relevant 'Deployment.yaml'.
- Check the logs before the container restarted to see if there are 'panic:' messages or similar using 'kubectl logs -p %[1]s'.
`, containerName),
	}
}
)
}

type KubernetesMonitoringOptions struct {
	// PodsAvailable transforms the default observable used to construct the pods available panel.

@ -8,7 +8,7 @@ import (

const TitleNodeExporter = "Executor: %s instance metrics"

func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) monitoring.Group {
func NewNodeExporterGroup(job, jobTitle, instanceFilter string) monitoring.Group {
	return monitoring.Group{
		Title:  fmt.Sprintf(TitleNodeExporter, jobTitle),
		Hidden: true,
@ -17,30 +17,30 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
{
|
||||
Name: "node_cpu_utilization",
|
||||
Description: "CPU utilization (minus idle/iowait)",
|
||||
Query: "sum(rate(node_cpu_seconds_total{job=~\"" + job + "\",mode!~\"(idle|iowait)\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance) / count(node_cpu_seconds_total{job=~\"" + job + "\",mode=\"system\",instance=~\"" + instanceFilter + "\"}) by (instance) * 100",
|
||||
Query: "sum(rate(node_cpu_seconds_total{sg_job=~\"" + job + "\",mode!~\"(idle|iowait)\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance) / count(node_cpu_seconds_total{sg_job=~\"" + job + "\",mode=\"system\",sg_instance=~\"" + instanceFilter + "\"}) by (sg_instance) * 100",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the amount of CPU time excluding idle and iowait time, divided by the number of cores, as a percentage.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage).Max(100),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Percentage).Max(100),
|
||||
},
|
||||
{
|
||||
Name: "node_cpu_saturation_cpu_wait",
|
||||
Description: "CPU saturation (time waiting)",
|
||||
Query: "rate(node_pressure_cpu_waiting_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])",
|
||||
Query: "rate(node_pressure_cpu_waiting_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the average summed time a number of (but strictly not all) non-idle processes spent waiting for CPU time. If this is higher than normal, then the CPU is underpowered for the workload and more powerful machines should be provisioned. " +
|
||||
"This only represents a \"less-than-all processes\" time, because for processes to be waiting for CPU time there must be other process(es) consuming CPU time.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Seconds),
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "node_memory_utilization",
|
||||
Description: "memory utilization",
|
||||
Query: "(1 - sum(node_memory_MemAvailable_bytes{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}) by (instance)) * 100",
|
||||
Query: "(1 - sum(node_memory_MemAvailable_bytes{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}) by (sg_instance) / sum(node_memory_MemTotal_bytes{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}) by (sg_instance)) * 100",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the amount of available memory (including cache and buffers) as a percentage. Consistently high numbers are generally fine so long memory saturation figures are within acceptable ranges, " +
|
||||
"these figures may be more useful for informing executor provisioning decisions, such as increasing worker parallelism, down-sizing machines etc.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage).Max(100),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Percentage).Max(100),
|
||||
},
|
||||
// Please see the following article(s) on how we arrive at using these particular metrics. It is stupid complicated and underdocumented beyond anything.
|
||||
// Page 27 of https://documentation.suse.com/sles/11-SP4/pdf/book-sle-tuning_color_en.pdf
|
||||
@ -50,20 +50,20 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
{
|
||||
Name: "node_memory_saturation_vmeff",
|
||||
Description: "memory saturation (vmem efficiency)",
|
||||
Query: "(rate(node_vmstat_pgsteal_anon{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) " +
|
||||
"/ (rate(node_vmstat_pgscan_anon{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) * 100",
|
||||
Query: "(rate(node_vmstat_pgsteal_anon{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_direct{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_file{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgsteal_kswapd{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) " +
|
||||
"/ (rate(node_vmstat_pgscan_anon{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_direct{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_file{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]) + rate(node_vmstat_pgscan_kswapd{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) * 100",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the efficiency of page reclaim, calculated as pgsteal/pgscan. Optimal figures are short spikes of near 100% and above, indicating that a high ratio of scanned pages are actually being freed, " +
|
||||
"or exactly 0%, indicating that pages arent being scanned as there is no memory pressure. Sustained numbers >~100% may be sign of imminent memory exhaustion, while sustained 0% < x < ~100% figures are very serious.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Percentage),
|
||||
},
|
||||
{
|
||||
Name: "node_memory_saturation_pressure_stalled",
|
||||
Description: "memory saturation (fully stalled)",
|
||||
Query: "rate(node_pressure_memory_stalled_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])",
|
||||
Query: "rate(node_pressure_memory_stalled_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the amount of time all non-idle processes were stalled waiting on memory operations to complete. This is often correlated with vmem efficiency ratio when pressure on available memory is high. If they're not correlated, this could indicate issues with the machine hardware and/or configuration.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Seconds),
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -73,84 +73,84 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
{
|
||||
Name: "node_io_disk_utilization",
|
||||
Description: "disk IO utilization (percentage time spent in IO)",
|
||||
Query: "sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(instance,disk) * 100",
|
||||
Query: "sum(label_replace(label_replace(rate(node_disk_io_time_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(sg_instance,disk) * 100",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the percentage of time a disk was busy. If this is less than 100%, then the disk has spare utilization capacity. However, a value of 100% does not necesarily indicate the disk is at max capacity. " +
|
||||
"For single, serial request-serving devices, 100% may indicate maximum saturation, but for SSDs and RAID arrays this is less likely to be the case, as they are capable of serving multiple requests in parallel, other metrics such as " +
|
||||
"throughput and request queue size should be factored in.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}: {{disk}}").Unit(monitoring.Percentage),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}: {{disk}}").Unit(monitoring.Percentage),
|
||||
},
|
||||
{
|
||||
Name: "node_io_disk_saturation",
|
||||
Description: "disk IO saturation (avg IO queue size)",
|
||||
Query: "sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(instance,disk)",
|
||||
Query: "sum(label_replace(label_replace(rate(node_disk_io_time_weighted_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval]), \"disk\", \"$1\", \"device\", \"^([^d].+)\"), \"disk\", \"ignite\", \"device\", \"dm-.*\")) by(sg_instance,disk)",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the number of outstanding/queued IO requests. High but short-lived queue sizes may not present an issue, but if theyre consistently/often high and/or monotonically increasing, the disk may be failing or simply too slow for the amount of activity required. " +
|
||||
"Consider replacing the drive(s) with SSDs if they are not already and/or replacing the faulty drive(s), if any.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}: {{disk}}"),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}: {{disk}}"),
|
||||
},
|
||||
{
|
||||
Name: "node_io_disk_saturation_pressure_full",
|
||||
Description: "disk IO saturation (avg time of all processes stalled)",
|
||||
Query: "rate(node_pressure_io_stalled_seconds_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])",
|
||||
Query: "rate(node_pressure_io_stalled_seconds_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the averaged amount of time for which all non-idle processes were stalled waiting for IO to complete simultaneously aka where no processes could make progress.", // TODO: more
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.Seconds),
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "node_io_network_utilization",
|
||||
Description: "network IO utilization (Rx)",
|
||||
Query: "sum(rate(node_network_receive_bytes_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance) * 8",
|
||||
Query: "sum(rate(node_network_receive_bytes_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance) * 8",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the average summed receiving throughput of all network interfaces. This is often predominantly composed of the WAN/internet-connected interface, and knowing normal/good figures depends on knowing the bandwidth of the " +
|
||||
"underlying hardware and the workloads.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.BitsPerSecond),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.BitsPerSecond),
|
||||
},
|
||||
{
|
||||
Name: "node_io_network_saturation",
|
||||
Description: "network IO saturation (Rx packets dropped)",
|
||||
Query: "sum(rate(node_network_receive_drop_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)",
|
||||
Query: "sum(rate(node_network_receive_drop_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)",
|
||||
NoAlert: true,
|
||||
Interpretation: "Number of dropped received packets. This can happen if the receive queues/buffers become full due to slow packet processing throughput. The queues/buffers could be configured to be larger as a stop-gap " +
|
||||
"but the processing application should be investigated as soon as possible. https://www.kernel.org/doc/html/latest/networking/statistics.html#:~:text=not%20otherwise%20counted.-,rx_dropped,-Number%20of%20packets",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"),
|
||||
},
|
||||
{
|
||||
Name: "node_io_network_saturation",
|
||||
Description: "network IO errors (Rx)",
|
||||
Query: "sum(rate(node_network_receive_errs_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)",
|
||||
Query: "sum(rate(node_network_receive_errs_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)",
|
||||
NoAlert: true,
|
||||
Interpretation: "Number of bad/malformed packets received. https://www.kernel.org/doc/html/latest/networking/statistics.html#:~:text=excluding%20the%20FCS.-,rx_errors,-Total%20number%20of",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"),
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "node_io_network_utilization",
|
||||
Description: "network IO utilization (Tx)",
|
||||
Query: "sum(rate(node_network_transmit_bytes_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance) * 8",
|
||||
Query: "sum(rate(node_network_transmit_bytes_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance) * 8",
|
||||
NoAlert: true,
|
||||
Interpretation: "Indicates the average summed transmitted throughput of all network interfaces. This is often predominantly composed of the WAN/internet-connected interface, and knowing normal/good figures depends on knowing the bandwidth of the " +
|
||||
"underlying hardware and the workloads.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.BitsPerSecond),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}").Unit(monitoring.BitsPerSecond),
|
||||
},
|
||||
{
|
||||
Name: "node_io_network_saturation",
|
||||
Description: "network IO saturation (Tx packets dropped)",
|
||||
Query: "sum(rate(node_network_transmit_drop_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)",
|
||||
Query: "sum(rate(node_network_transmit_drop_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)",
|
||||
NoAlert: true,
|
||||
Interpretation: "Number of dropped transmitted packets. This can happen if the receiving side's receive queues/buffers become full due to slow packet processing throughput, the network link is congested etc.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"),
|
||||
},
|
||||
{
|
||||
Name: "node_io_network_saturation",
|
||||
Description: "network IO errors (Tx)",
|
||||
Query: "sum(rate(node_network_transmit_errs_total{job=~\"" + job + "\",instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(instance)",
|
||||
Query: "sum(rate(node_network_transmit_errs_total{sg_job=~\"" + job + "\",sg_instance=~\"" + instanceFilter + "\"}[$__rate_interval])) by(sg_instance)",
|
||||
NoAlert: true,
|
||||
Interpretation: "Number of packet transmission errors. This is distinct from tx packet dropping, and can indicate a failing NIC, improperly configured network options anywhere along the line, signal noise etc.",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
||||
Panel: monitoring.Panel().LegendFormat("{{sg_instance}}"),
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -164,7 +164,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
{
|
||||
Name: "node_cpu_saturation_load1",
|
||||
Description: "host CPU saturation (1min average)",
|
||||
Query: "sum(node_load1{job=~\""+job+"\",instance=~\"$instance\"}) by (instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",instance=~\"$instance\"}) by (instance) * 100",
|
||||
Query: "sum(node_load1{job=~\""+job+"\",sg_instance=~\"$instance\"}) by (sg_instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",sg_instance=~\"$instance\"}) by (sg_instance) * 100",
|
||||
NoAlert: true,
|
||||
Interpretation: "banana",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage),
|
||||
@ -172,7 +172,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
{
|
||||
Name: "node_cpu_saturation_load5",
|
||||
Description: "host CPU saturation (5min average)",
|
||||
Query: "sum(node_load5{job=~\""+job+"\",instance=~\"$instance\"}) by (instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",instance=~\"$instance\"}) by (instance) * 100",
|
||||
Query: "sum(node_load5{job=~\""+job+"\",sg_instance=~\"$instance\"}) by (sg_instance) / count(node_cpu_seconds_total{job=~\""+job+"\",mode=\"system\",sg_instance=~\"$instance\"}) by (sg_instance) * 100",
|
||||
NoAlert: true,
|
||||
Interpretation: "banana",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Percentage),
|
||||
@ -183,7 +183,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
/* {
|
||||
Name: "node_memory_saturation",
|
||||
Description: "host memory saturation (major page fault rate)",
|
||||
Query: "sum(rate(node_vmstat_pgmajfault{job=~\""+job+"\",instance=~\"$instance\"}[$__rate_interval])) by (instance)",
|
||||
Query: "sum(rate(node_vmstat_pgmajfault{job=~\""+job+"\",sg_instance=~\"$instance\"}[$__rate_interval])) by (sg_instance)",
|
||||
NoAlert: true,
|
||||
Interpretation: "banana",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
||||
@ -193,7 +193,7 @@ func NewNodeExporterGroup(containerName, job, jobTitle, instanceFilter string) m
|
||||
/* {
|
||||
Name: "node_io_disk_saturation_pressure_some",
|
||||
Description: "disk IO saturation (some-processes time waiting)",
|
||||
Query: "rate(node_pressure_io_waiting_seconds_total{job=~\""+job+"\",instance=~\"$instance\"}[$__rate_interval])-rate(node_pressure_io_stalled_seconds_total{job=~\""+job+"\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
Query: "rate(node_pressure_io_waiting_seconds_total{job=~\""+job+"\",sg_instance=~\"$instance\"}[$__rate_interval])-rate(node_pressure_io_stalled_seconds_total{job=~\""+job+"\",sg_instance=~\"$instance\"}[$__rate_interval])",
|
||||
NoAlert: true,
|
||||
Interpretation: "banana",
|
||||
Panel: monitoring.Panel().LegendFormat("{{instance}}").Unit(monitoring.Seconds),
|
||||
|
||||
@ -80,6 +80,10 @@ type ObservationGroupOptions struct {
func (observationConstructor) NewGroup(containerName string, owner monitoring.ObservableOwner, options ObservationGroupOptions) monitoring.Group {
	rows := make([]monitoring.Row, 0, 2)

	if options.JobLabel == "" {
		options.JobLabel = "job"
	}

	if len(options.By) == 0 {
		if options.Aggregate != nil {
			panic("Aggregate must not be supplied when By is not set")

@ -19,7 +19,7 @@ type queueConstructor struct{}
// Requires a gauge of the format `src_{options.MetricNameRoot}_total`
func (queueConstructor) Size(options ObservableConstructorOptions) sharedObservable {
	return func(containerName string, owner monitoring.ObservableOwner) Observable {
		filters := makeFilters(containerName, options.Filters...)
		filters := makeFilters(options.JobLabel, containerName, options.Filters...)
		by, legendPrefix := makeBy(options.By...)

		return Observable{
@ -40,7 +40,7 @@ func (queueConstructor) Size(options ObservableConstructorOptions) sharedObserva
// - counter of the format `src_{options.MetricNameRoot}_processor_total`
func (queueConstructor) GrowthRate(options ObservableConstructorOptions) sharedObservable {
	return func(containerName string, owner monitoring.ObservableOwner) Observable {
		filters := makeFilters(containerName, options.Filters...)
		filters := makeFilters(options.JobLabel, containerName, options.Filters...)
		by, legendPrefix := makeBy(options.By...)

		return Observable{
@ -60,7 +60,7 @@ func (queueConstructor) GrowthRate(options ObservableConstructorOptions) sharedO
// - counter of the format `src_{options.MetricNameRoot}_queued_duration_seconds_total`
func (queueConstructor) MaxAge(options ObservableConstructorOptions) sharedObservable {
	return func(containerName string, owner monitoring.ObservableOwner) Observable {
		filters := makeFilters(containerName, options.Filters...)
		filters := makeFilters(options.JobLabel, containerName, options.Filters...)
		by, legendPrefix := makeBy(options.By...)

		return Observable{

@ -29,7 +29,7 @@ func (standardConstructor) Count(legend string) observableConstructor {

	return func(options ObservableConstructorOptions) sharedObservable {
		return func(containerName string, owner monitoring.ObservableOwner) Observable {
			filters := makeFilters(containerName, options.Filters...)
			filters := makeFilters(options.JobLabel, containerName, options.Filters...)
			by, legendPrefix := makeBy(options.By...)

			return Observable{
@ -56,7 +56,7 @@ func (standardConstructor) Duration(legend string) observableConstructor {

	return func(options ObservableConstructorOptions) sharedObservable {
		return func(containerName string, owner monitoring.ObservableOwner) Observable {
			filters := makeFilters(containerName, options.Filters...)
			filters := makeFilters(options.JobLabel, containerName, options.Filters...)
			by, _ := makeBy(append([]string{"le"}, options.By...)...)

			observable := Observable{
@ -99,7 +99,7 @@ func (standardConstructor) Errors(legend string) observableConstructor {

	return func(options ObservableConstructorOptions) sharedObservable {
		return func(containerName string, owner monitoring.ObservableOwner) Observable {
			filters := makeFilters(containerName, options.Filters...)
			filters := makeFilters(options.JobLabel, containerName, options.Filters...)
			by, legendPrefix := makeBy(options.By...)

			return Observable{
@ -128,7 +128,7 @@ func (standardConstructor) ErrorRate(legend string) observableConstructor {

	return func(options ObservableConstructorOptions) sharedObservable {
		return func(containerName string, owner monitoring.ObservableOwner) Observable {
			filters := makeFilters(containerName, options.Filters...)
			filters := makeFilters(options.JobLabel, containerName, options.Filters...)
			by, legendPrefix := makeBy(options.By...)

			return Observable{
@ -145,7 +145,7 @@ func (standardConstructor) ErrorRate(legend string) observableConstructor {

// LastOverTime creates a last-over-time aggregate for the error-rate metric, stretching back over the lookback-window time range.
func (standardConstructor) LastOverTimeErrorRate(containerName string, lookbackWindow model.Duration, options ObservableConstructorOptions) string {
	filters := makeFilters(containerName, options.Filters...)
	filters := makeFilters(options.JobLabel, containerName, options.Filters...)
	by, _ := makeBy(options.By...)
	return fmt.Sprintf(`last_over_time(sum%[1]s(increase(src_%[2]s_errors_total{%[3]s}[5m]))[%[4]s:]) / (last_over_time(sum%[1]s(increase(src_%[2]s_total{%[3]s}[5m]))[%[4]s:]) + last_over_time(sum%[1]s(increase(src_%[2]s_errors_total{%[3]s}[5m]))[%[4]s:])) * 100`,
		by, options.MetricNameRoot, filters, lookbackWindow)

@ -59,7 +59,7 @@ func (workerutilConstructor) ErrorRate(options ObservableConstructorOptions) sha
// Requires a gauge of the format `src_{options.MetricNameRoot}_processor_handlers`
func (workerutilConstructor) Handlers(options ObservableConstructorOptions) sharedObservable {
	return func(containerName string, owner monitoring.ObservableOwner) Observable {
		filters := makeFilters(containerName, options.Filters...)
		filters := makeFilters(options.JobLabel, containerName, options.Filters...)
		by, legendPrefix := makeBy(options.By...)

		return Observable{

@ -188,7 +188,6 @@ func Zoekt() *monitoring.Dashboard {
			Title: "Git fetch durations",
			Rows: []monitoring.Row{
				{

					{
						Name:        "90th_percentile_successful_git_fetch_durations_5m",
						Description: "90th percentile successful git fetch durations over 5m",

@ -56,6 +56,7 @@ env:
    { "Name": "codeintel-worker", "Host": "127.0.0.1:6088" },
    { "Name": "oss-worker", "Host": "127.0.0.1:6089" },
    { "Name": "worker", "Host": "127.0.0.1:6089" },
    { "Name": "worker-executors", "Host": "127.0.0.1:6996" },
    { "Name": "executor-codeintel", "Host": "127.0.0.1:6092" },
    { "Name": "executor-batches", "Host": "127.0.0.1:6093" },
    { "Name": "zoekt-index-0", "Host": "127.0.0.1:6072" },
