From 6c7389f37c7696d6bb070636692ba71d38c3e7bf Mon Sep 17 00:00:00 2001 From: William Bezuidenhout Date: Mon, 19 Dec 2022 14:18:51 +0200 Subject: [PATCH] otel: add collector dashboard (#45009) * add initial dashboard for otel * add failed sent dashboard * extra panels * use sum and rate for resource queries * review comments * add warning alerts * Update monitoring/definitions/otel_collector.go * review comments * run go generate * Update monitoring/definitions/otel_collector.go Co-authored-by: Robert Lin * Update monitoring/definitions/otel_collector.go Co-authored-by: Robert Lin * Update monitoring/definitions/otel_collector.go Co-authored-by: Robert Lin * Update monitoring/definitions/otel_collector.go Co-authored-by: Robert Lin * Update monitoring/definitions/otel_collector.go Co-authored-by: Robert Lin * Update monitoring/definitions/otel_collector.go Co-authored-by: Robert Lin * review comments * review feedback also drop two panels * remove brackets in metrics * update docs * fix goimport * gogenerate Co-authored-by: Robert Lin Co-authored-by: Jean-Hadrien Chabran --- dev/prometheus/all/prometheus_targets.yml | 5 + dev/prometheus/linux/prometheus_targets.yml | 5 + doc/admin/observability/alerts.md | 158 ++++++++++ doc/admin/observability/dashboards.md | 298 ++++++++++++++++++ .../configs/jaeger.yaml | 3 + monitoring/definitions/dashboards.go | 1 + monitoring/definitions/otel_collector.go | 145 +++++++++ sg.config.yaml | 1 + 8 files changed, 616 insertions(+) create mode 100644 monitoring/definitions/otel_collector.go diff --git a/dev/prometheus/all/prometheus_targets.yml b/dev/prometheus/all/prometheus_targets.yml index 2c9115b600b..ffc19646647 100644 --- a/dev/prometheus/all/prometheus_targets.yml +++ b/dev/prometheus/all/prometheus_targets.yml @@ -63,3 +63,8 @@ targets: # github proxy - host.docker.internal:6090 +- labels: + job: otel-collector + targets: + # opentelemetry collector + - host.docker.internal:8888 diff --git a/dev/prometheus/linux/prometheus_targets.yml b/dev/prometheus/linux/prometheus_targets.yml index ca635c0ca3f..8f94e01133c 100644 --- a/dev/prometheus/linux/prometheus_targets.yml +++ b/dev/prometheus/linux/prometheus_targets.yml @@ -63,3 +63,8 @@ targets: # github proxy - 127.0.0.1:6090 +- labels: + job: otel-collector + targets: + # opentelemetry collector + - host.docker.internal:8888 diff --git a/doc/admin/observability/alerts.md b/doc/admin/observability/alerts.md index df74195a91d..4c61c4420d8 100644 --- a/doc/admin/observability/alerts.md +++ b/doc/admin/observability/alerts.md @@ -7851,3 +7851,161 @@ Generated query for warning alert: `max((rate(src_telemetry_job_total{op="SendEv
+## otel-collector: otel_span_refused + +

spans refused per receiver

+ +**Descriptions** + +- warning otel-collector: 1+ spans refused per receiver for 5m0s + +**Next steps** + +- Check logs of the collector and configuration of the receiver +- More help interpreting this metric is available in the [dashboards reference](./dashboards.md#otel-collector-otel-span-refused). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_otel-collector_otel_span_refused" +] +``` + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Generated query for warning alert: `max((sum by(receiver) (rate(otelcol_receiver_refused_spans[1m]))) > 1)` + +
+ +
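For the "configuration of the receiver" check above, the relevant piece is the `receivers:` block of the collector config. A minimal sketch, assuming an OTLP receiver on the standard ports forwarded in `sg.config.yaml` (4317/4318); component names and endpoints are illustrative, not copied from the shipped config:

```yaml
# Illustrative sketch of a receiver block; refused spans usually point at a
# mismatch between this block and what clients actually send.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:4317"   # gRPC OTLP
      http:
        endpoint: "0.0.0.0:4318"   # HTTP OTLP
```

A receiver only counts toward these metrics once it is referenced from `service.pipelines`.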
+ +## otel-collector: otel_span_export_failures + +

span export failures by exporter

+
+**Descriptions**
+
+- warning otel-collector: 1+ span export failures by exporter for 5m0s
+
+**Next steps**
+
+- Check the configuration of the exporter and whether the service being exported to is up
+- More help interpreting this metric is available in the [dashboards reference](./dashboards.md#otel-collector-otel-span-export-failures).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_otel-collector_otel_span_export_failures"
+]
+```
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Generated query for warning alert: `max((sum by(exporter) (rate(otelcol_exporter_send_failed_spans[1m]))) > 1)` + +
+ +
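For the exporter-configuration check above, the settings that most often matter when sends fail are the exporter's retry and queue options. A hedged sketch, assuming an OTLP exporter; the endpoint and values are placeholders:

```yaml
# Illustrative sketch: retry/queue settings decide what happens to spans the
# backend temporarily refuses.
exporters:
  otlp:
    endpoint: "jaeger:4317"      # placeholder backend address
    retry_on_failure:
      enabled: true
      max_elapsed_time: 300s     # give up (and count a failure) after 5 minutes
    sending_queue:
      enabled: true
      queue_size: 5000           # spans buffered in memory while the backend is down
```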
+ +## otel-collector: container_cpu_usage + +

container cpu usage total (1m average) across all cores by instance

+
+**Descriptions**
+
+- warning otel-collector: 99%+ container cpu usage total (1m average) across all cores by instance
+
+**Next steps**
+
+- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
+- **Docker Compose:** Consider increasing `cpus:` of the otel-collector container in `docker-compose.yml`.
+- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#otel-collector-container-cpu-usage).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_otel-collector_container_cpu_usage"
+]
+```
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Generated query for warning alert: `max((cadvisor_container_cpu_usage_percentage_total{name=~"^otel-collector.*"}) >= 99)` + +
+ +
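For the Kubernetes next step above, the change goes into the container's `resources` section of the otel-collector Deployment. A minimal sketch assuming a standard Deployment layout; names and values are illustrative, not taken from the Sourcegraph manifests:

```yaml
# Illustrative sketch: raise the CPU limit if the container sits at 99% of it.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-collector
spec:
  template:
    spec:
      containers:
        - name: otel-collector
          resources:
            requests:
              cpu: 500m
            limits:
              cpu: "2"
```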
+ +## otel-collector: container_memory_usage + +

container memory usage by instance

+ +**Descriptions** + +- warning otel-collector: 99%+ container memory usage by instance + +**Next steps** + +- **Kubernetes:** Consider increasing memory limit in relevant `Deployment.yaml`. +- **Docker Compose:** Consider increasing `memory:` of otel-collector container in `docker-compose.yml`. +- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#otel-collector-container-memory-usage). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_otel-collector_container_memory_usage" +] +``` + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Generated query for warning alert: `max((cadvisor_container_memory_usage_percentage_total{name=~"^otel-collector.*"}) >= 99)` + +
+ +
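For the Docker Compose next step above, the memory (and CPU) caps live on the otel-collector service entry. A rough sketch; the exact keys (`mem_limit` vs. a `deploy.resources` block) depend on the Compose file version in use:

```yaml
# Illustrative sketch: loosen the memory cap if the container keeps hitting 99%.
services:
  otel-collector:
    cpus: 1
    mem_limit: "2g"
```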
+ +## otel-collector: pods_available_percentage + +

percentage pods available

+ +**Descriptions** + +- critical otel-collector: less than 90% percentage pods available for 10m0s + +**Next steps** + +- Determine if the pod was OOM killed using `kubectl describe pod otel-collector` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. +- Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p otel-collector`. +- Learn more about the related dashboard panel in the [dashboards reference](./dashboards.md#otel-collector-pods-available-percentage). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "critical_otel-collector_pods_available_percentage" +] +``` + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Generated query for critical alert: `min((sum by(app) (up{app=~".*otel-collector"}) / count by(app) (up{app=~".*otel-collector"}) * 100) <= 90)` + +
+ +
+ diff --git a/doc/admin/observability/dashboards.md b/doc/admin/observability/dashboards.md index 340ec267852..6ec011a81b5 100644 --- a/doc/admin/observability/dashboards.md +++ b/doc/admin/observability/dashboards.md @@ -24485,3 +24485,301 @@ Query: `rate(src_telemetry_job_total{op="SendEvents"}[1h]) / on() group_right()
+## OpenTelemetry Collector + +

The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.

+ +To see this dashboard, visit `/-/debug/grafana/d/otel-collector/otel-collector` on your Sourcegraph instance. + +### OpenTelemetry Collector: Receivers + +#### otel-collector: otel_span_receive_rate + +

Spans received per receiver per minute

+
+ Shows the rate of spans accepted by the configured receiver.
+
+ A Trace is a collection of spans and a span represents a unit of work or operation. Spans are the building blocks of Traces.
+ The spans have only been accepted by the receiver, which means they still have to move through the configured pipeline to be exported.
+ For more information on tracing and configuration of an OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers.
+
+ See the Exporters section to see spans that have made it through the pipeline and are exported.
+
+ Depending on the configured processors, received spans might be dropped and not exported. For more information on configuring processors see
+ https://opentelemetry.io/docs/collector/configuration/#processors.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100000` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Query: `sum by (receiver) (rate(otelcol_receiver_accepted_spans[1m]))` + +
+ +
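The Receivers and Exporters groups on this dashboard map directly onto the stages of a collector pipeline: spans counted here enter on the receiver side, pass through any processors, and only reach the export counters below if nothing drops them. A minimal sketch of that shape (component names and endpoints are illustrative; the real config ships in `docker-images/opentelemetry-collector/configs/` and is only partially visible in this diff):

```yaml
# Illustrative pipeline sketch: the Receivers panels count the left side,
# the Exporters panels count the right side.
receivers:
  otlp:
    protocols:
      grpc:
processors:
  batch:
exporters:
  jaeger:
    endpoint: "jaeger:14250"   # placeholder backend address
    tls:
      insecure: true
service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [jaeger]
```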
+ +#### otel-collector: otel_span_refused + +

Spans refused per receiver

+
+ Shows the number of spans that have been refused by a receiver.
+
+ A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces.
+
+ Spans can be rejected either due to a misconfigured receiver or receiving spans in the wrong format. The log of the collector will have more information on why a span was rejected.
+ For more information on tracing and configuration of an OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers.
+
+Refer to the [alerts reference](./alerts.md#otel-collector-otel-span-refused) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100001` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Query: `sum by (receiver) (rate(otelcol_receiver_refused_spans[1m]))` + +
+ +
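The interpretation above points at the collector log to find out why a span was refused. If the default log level is too quiet, the collector's own `service.telemetry` section can be turned up; a sketch, with the log level value being an assumption rather than Sourcegraph's default:

```yaml
# Illustrative sketch: raise the collector's own log verbosity while debugging
# refused spans, then revert.
service:
  telemetry:
    logs:
      level: debug
    metrics:
      address: ":8888"   # the same self-metrics endpoint this patch scrapes
```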
+ +### OpenTelemetry Collector: Exporters + +#### otel-collector: otel_span_export_rate + +

Spans exported per exporter per minute

+
+ Shows the rate of spans being sent by the exporter.
+
+ A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces.
+ The rate of spans here indicates spans that have made it through the configured pipeline and have been sent to the configured export destination.
+
+ For more information on configuring an exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100100` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Query: `sum by (exporter) (rate(otelcol_exporter_sent_spans[1m]))` + +
+ +
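A quick way to cross-check this panel is to temporarily tee spans to the collector's own log next to the real exporter. A sketch, assuming the `logging` exporter is included in the collector build (the `jaeger` exporter here stands in for whatever the deployment actually exports to):

```yaml
# Illustrative sketch: confirm spans really leave the pipeline by logging them.
exporters:
  logging:
    loglevel: debug
service:
  pipelines:
    traces:
      receivers: [otlp]               # defined elsewhere in the config
      exporters: [jaeger, logging]    # jaeger defined elsewhere in the config
```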
+ +#### otel-collector: otel_span_export_failures + +

Span export failures by exporter

+
+ Shows the rate of spans that failed to be sent by the configured exporter. A number higher than 0 for a long period can indicate a problem with the exporter configuration or with the service that is being exported to.
+
+ For more information on configuring an exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.
+
+Refer to the [alerts reference](./alerts.md#otel-collector-otel-span-export-failures) for 1 alert related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100101` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Query: `sum by (exporter) (rate(otelcol_exporter_send_failed_spans[1m]))` + +
+ +
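When this panel shows sustained failures, the collector's diagnostic extensions are a useful complement: the `jaeger.yaml` context in this patch already lists `health_check` and `zpages`, and `sg.config.yaml` publishes port 55679, which is conventionally the zpages endpoint. A sketch of that block (the health_check endpoint shown is an assumed default, not a value from this patch):

```yaml
# Illustrative sketch: zpages serves live pipeline/trace debug pages such as /debug/tracez.
extensions:
  health_check:
    endpoint: ":13133"   # assumed default port
  zpages:
    endpoint: ":55679"   # matches the port published in sg.config.yaml
service:
  extensions: [health_check, zpages]
```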
+ +### OpenTelemetry Collector: Collector resource usage + +#### otel-collector: otel_cpu_usage + +

Cpu usage of the collector

+ + Shows the cpu usage of the OpenTelemetry collector + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100200` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Query: `sum by (job) (rate(otelcol_process_cpu_seconds{job=~"^.*"}[1m]))` + +
+ +
+ +#### otel-collector: otel_memory_resident_set_size + +

Memory allocated to the otel collector

+ + Shows the memory Resident Set Size (RSS) allocated to the OpenTelemetry collector + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100201` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Query: `sum by (job) (rate(otelcol_process_memory_rss{job=~"^.*"}[1m]))` + +
+ +
+ +#### otel-collector: otel_memory_usage + +

Memory used by the collector

+
+ Shows how much memory is being used by the otel collector. High memory usage might indicate that the configured pipeline is keeping a lot of spans in memory for processing, for example when:
+
+ * spans are failing to be sent and the exporter is configured to retry
+ * a batch processor is configured with a high batch count
+
+ For more information on configuring processors for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#processors.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100202` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Query: `sum by (job) (rate(otelcol_process_runtime_total_alloc_bytes{job=~"^.*"}[1m]))` + +
+ +
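Since the interpretation above names exporter retries and the batch processor as the usual sources of memory growth, the two processors typically used to bound that memory are `memory_limiter` and `batch`. A sketch with illustrative values, not Sourcegraph's defaults:

```yaml
# Illustrative sketch: cap collector memory and keep batches small.
processors:
  memory_limiter:
    check_interval: 1s
    limit_mib: 1500        # hard ceiling; above this the collector starts refusing data
    spike_limit_mib: 300
  batch:
    send_batch_size: 512   # smaller batches mean fewer spans held in memory at once
    timeout: 5s
service:
  pipelines:
    traces:
      processors: [memory_limiter, batch]   # receivers/exporters omitted for brevity
```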
+ +### OpenTelemetry Collector: Container monitoring (not available on server) + +#### otel-collector: container_missing + +

Container missing

+ +This value is the number of times a container has not been seen for more than one minute. If you observe this +value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. + +- **Kubernetes:** + - Determine if the pod was OOM killed using `kubectl describe pod otel-collector` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. + - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p otel-collector`. +- **Docker Compose:** + - Determine if the pod was OOM killed using `docker inspect -f '{{json .State}}' otel-collector` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the otel-collector container in `docker-compose.yml`. + - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs otel-collector` (note this will include logs from the previous and currently running container). + +This panel has no related alerts. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100300` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Query: `count by(name) ((time() - container_last_seen{name=~"^otel-collector.*"}) > 60)` + +
+ +
+ +#### otel-collector: container_cpu_usage + +

Container cpu usage total (1m average) across all cores by instance

+ +Refer to the [alerts reference](./alerts.md#otel-collector-container-cpu-usage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100301` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Query: `cadvisor_container_cpu_usage_percentage_total{name=~"^otel-collector.*"}` + +
+ +
+ +#### otel-collector: container_memory_usage + +

Container memory usage by instance

+ +Refer to the [alerts reference](./alerts.md#otel-collector-container-memory-usage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100302` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Query: `cadvisor_container_memory_usage_percentage_total{name=~"^otel-collector.*"}` + +
+ +
+ +#### otel-collector: fs_io_operations + +

Filesystem reads and writes rate by instance over 1h

+
+This value indicates the number of filesystem read and write operations by containers of this service.
+When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with otel-collector issues.
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100303` on your Sourcegraph instance.
+
+*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).*
+
+Technical details + +Query: `sum by(name) (rate(container_fs_reads_total{name=~"^otel-collector.*"}[1h]) + rate(container_fs_writes_total{name=~"^otel-collector.*"}[1h]))` + +
+ +
+ +### OpenTelemetry Collector: Kubernetes monitoring (only available on Kubernetes) + +#### otel-collector: pods_available_percentage + +

Percentage pods available

+ +Refer to the [alerts reference](./alerts.md#otel-collector-pods-available-percentage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100400` on your Sourcegraph instance. + +*Managed by the [Sourcegraph Cloud DevOps team](https://handbook.sourcegraph.com/departments/engineering/teams/devops).* + +
+Technical details + +Query: `sum by(app) (up{app=~".*otel-collector"}) / count by (app) (up{app=~".*otel-collector"}) * 100` + +
+ +
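All of the `otelcol_*` series behind this dashboard come from the collector's own telemetry endpoint, which the rest of this patch wires up: `service.telemetry.metrics.address: ":8888"` in `jaeger.yaml`, port 8888 published in `sg.config.yaml`, and a new scrape target in the dev Prometheus target files. Put together, the moving parts look roughly like this (a sketch that echoes those changes rather than copying the files verbatim):

```yaml
# Collector side: expose the collector's own metrics (jaeger.yaml in this patch).
service:
  telemetry:
    metrics:
      address: ":8888"
---
# Prometheus side: scrape those metrics (dev/prometheus/*/prometheus_targets.yml in this patch).
- labels:
    job: otel-collector
  targets:
    # opentelemetry collector
    - host.docker.internal:8888
```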
+ diff --git a/docker-images/opentelemetry-collector/configs/jaeger.yaml b/docker-images/opentelemetry-collector/configs/jaeger.yaml index 3c91e659bba..3c58f49657b 100644 --- a/docker-images/opentelemetry-collector/configs/jaeger.yaml +++ b/docker-images/opentelemetry-collector/configs/jaeger.yaml @@ -24,6 +24,9 @@ extensions: endpoint: ":55679" service: + telemetry: + metrics: + address: ":8888" extensions: [health_check,zpages] pipelines: traces: diff --git a/monitoring/definitions/dashboards.go b/monitoring/definitions/dashboards.go index b8e8006a1a3..b8ec2edc66c 100644 --- a/monitoring/definitions/dashboards.go +++ b/monitoring/definitions/dashboards.go @@ -31,6 +31,7 @@ func Default() Dashboards { CodeIntelRanking(), CodeIntelUploads(), Telemetry(), + OtelCollector(), } } diff --git a/monitoring/definitions/otel_collector.go b/monitoring/definitions/otel_collector.go new file mode 100644 index 00000000000..ecfea3c7be5 --- /dev/null +++ b/monitoring/definitions/otel_collector.go @@ -0,0 +1,145 @@ +package definitions + +import ( + "time" + + "github.com/sourcegraph/sourcegraph/monitoring/definitions/shared" + "github.com/sourcegraph/sourcegraph/monitoring/monitoring" +) + +func OtelCollector() *monitoring.Dashboard { + containerName := "otel-collector" + + return &monitoring.Dashboard{ + Name: containerName, + Title: "OpenTelemetry Collector", + Description: "The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.", + Groups: []monitoring.Group{ + { + Title: "Receivers", + Hidden: false, + Rows: []monitoring.Row{ + { + { + Name: "otel_span_receive_rate", + Description: "spans received per receiver per minute", + Panel: monitoring.Panel().Unit(monitoring.Number).LegendFormat("receiver: {{receiver}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (receiver) (rate(otelcol_receiver_accepted_spans[1m]))", + NoAlert: true, + Interpretation: ` + Shows the rate of spans accepted by the configured reveiver + + A Trace is a collection of spans and a span represents a unit of work or operation. Spans are the building blocks of Traces. + The spans have only been accepted by the receiver, which means they still have to move through the configured pipeline to be exported. + For more information on tracing and configuration of a OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers. + + See the Exporters section see spans that have made it through the pipeline and are exported. + + Depending the configured processors, received spans might be dropped and not exported. For more information on configuring processors see + https://opentelemetry.io/docs/collector/configuration/#processors.`, + }, + { + Name: "otel_span_refused", + Description: "spans refused per receiver", + Panel: monitoring.Panel().Unit(monitoring.Number).LegendFormat("receiver: {{receiver}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (receiver) (rate(otelcol_receiver_refused_spans[1m]))", + Warning: monitoring.Alert().Greater(1).For(5 * time.Minute), + NextSteps: "Check logs of the collector and configuration of the receiver", + Interpretation: ` + Shows the amount of spans that have been refused by a receiver. + + A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces. + + Spans can be rejected either due to a misconfigured receiver or receiving spans in the wrong format. 
The log of the collector will have more information on why a span was rejected. + For more information on tracing and configuration of a OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers.`, + }, + }, + }, + }, + { + Title: "Exporters", + Hidden: false, + Rows: []monitoring.Row{ + { + { + Name: "otel_span_export_rate", + Description: "spans exported per exporter per minute", + Panel: monitoring.Panel().Unit(monitoring.Number).LegendFormat("exporter: {{exporter}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (exporter) (rate(otelcol_exporter_sent_spans[1m]))", + NoAlert: true, + Interpretation: ` + Shows the rate of spans being sent by the exporter + + A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces. + The rate of spans here indicates spans that have made it through the configured pipeline and have been sent to the configured export destination. + + For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.`, + }, + { + Name: "otel_span_export_failures", + Description: "span export failures by exporter", + Panel: monitoring.Panel().Unit(monitoring.Number).LegendFormat("exporter: {{exporter}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (exporter) (rate(otelcol_exporter_send_failed_spans[1m]))", + Warning: monitoring.Alert().Greater(1).For(5 * time.Minute), + NextSteps: "Check the configuration of the exporter and if the service being exported is up", + Interpretation: ` + Shows the rate of spans failed to be sent by the configured reveiver. A number higher than 0 for a long period can indicate a problem with the exporter configuration or with the service that is being exported too + + For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.`, + }, + }, + }, + }, + { + Title: "Collector resource usage", + Hidden: false, + Rows: []monitoring.Row{ + { + { + Name: "otel_cpu_usage", + Description: "cpu usage of the collector", + Panel: monitoring.Panel().Unit(monitoring.Seconds).LegendFormat("{{job}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (job) (rate(otelcol_process_cpu_seconds{job=~\"^.*\"}[1m]))", + NoAlert: true, + Interpretation: ` + Shows the cpu usage of the OpenTelemetry collector`, + }, + { + Name: "otel_memory_resident_set_size", + Description: "memory allocated to the otel collector", + Panel: monitoring.Panel().Unit(monitoring.Bytes).LegendFormat("{{job}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (job) (rate(otelcol_process_memory_rss{job=~\"^.*\"}[1m]))", + NoAlert: true, + Interpretation: ` + Shows the memory Resident Set Size (RSS) allocated to the OpenTelemetry collector`, + }, + { + Name: "otel_memory_usage", + Description: "memory used by the collector", + Panel: monitoring.Panel().Unit(monitoring.Bytes).LegendFormat("{{job}}"), + Owner: monitoring.ObservableOwnerDevOps, + Query: "sum by (job) (rate(otelcol_process_runtime_total_alloc_bytes{job=~\"^.*\"}[1m]))", + NoAlert: true, + Interpretation: ` + Shows how much memory is being used by the otel collector. 
+ + * High memory usage might indicate thad the configured pipeline is keeping a lot of spans in memory for processing + * Spans failing to be sent and the exporter is configured to retry + * A high batch count by using a batch processor + + For more information on configuring processors for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#processors.`, + }, + }, + }, + }, + shared.NewContainerMonitoringGroup("otel-collector", monitoring.ObservableOwnerDevOps, nil), + shared.NewKubernetesMonitoringGroup("otel-collector", monitoring.ObservableOwnerDevOps, nil), + }, + } +} diff --git a/sg.config.yaml b/sg.config.yaml index 0cb8e67fa10..10a9619dff4 100644 --- a/sg.config.yaml +++ b/sg.config.yaml @@ -752,6 +752,7 @@ commands: docker container rm otel-collector docker run --rm --name=otel-collector $DOCKER_NET $DOCKER_ARGS \ -p 4317:4317 -p 4318:4318 -p 55679:55679 -p 55670:55670 \ + -p 8888:8888 \ -e JAEGER_HOST=$JAEGER_HOST \ -e HONEYCOMB_API_KEY=$HONEYCOMB_API_KEY \ -e HONEYCOMB_DATASET=$HONEYCOMB_DATASET \