Fill out dashboards for precise-code-intel services (#10496)

2026-02-06 19:51:50 +00:00 · 2020-05-12 10:12:45 -05:00 · 2020-05-12 10:12:45 -05:00 · 47a84cd769
commit 47a84cd769
parent 53043edfa1
9 changed files with 334 additions and 15 deletions
--- a/cmd/precise-code-intel-bundle-manager/env.go
+++ b/cmd/precise-code-intel-bundle-manager/env.go
@ -10,9 +10,9 @@ import (

 var (
 	rawBundleDir            = env.Get("PRECISE_CODE_INTEL_BUNDLE_DIR", "/lsif-storage", "Root dir containing uploads and converted bundles.")
-	rawDatabaseCacheSize    = env.Get("PRECISE_CODE_INTEL_CONNECTION_CACHE_CAPACITY", "100", "Number of SQLite connections that can be opened at once.")
-	rawDocumentCacheSize    = env.Get("PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY", "100", "Maximum number of decoded documents that can be held in memory at once.")
-	rawResultChunkCacheSize = env.Get("PRECISE_CODE_INTEL_RESULT_CHUNK_CACHE_CAPACITY", "100", "Maximum number of decoded result chunks that can be held in memory at once.")
+	rawDatabaseCacheSize    = env.Get("PRECISE_CODE_INTEL_CONNECTION_CACHE_CAPACITY", "100", "Maximum number of SQLite connections that can be open at once.")
+	rawDocumentCacheSize    = env.Get("PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY", "1000000", "Size of decoded document cache. A document's cost is the number of fields.")
+	rawResultChunkCacheSize = env.Get("PRECISE_CODE_INTEL_RESULT_CHUNK_CACHE_CAPACITY", "1000000", "Size of decoded result chunk cache. A result chunk's cost is the number of fields.")
 	rawDesiredPercentFree   = env.Get("PRECISE_CODE_INTEL_DESIRED_PERCENT_FREE", "10", "Target percentage of free space on disk.")
 	rawJanitorInterval      = env.Get("PRECISE_CODE_INTEL_JANITOR_INTERVAL", "1m", "Interval between cleanup runs.")
 	rawMaxUploadAge         = env.Get("PRECISE_CODE_INTEL_MAX_UPLOAD_AGE", "24h", "The maximum time an upload can sit on disk.")
--- a/cmd/precise-code-intel-bundle-manager/internal/database/observability.go
+++ b/cmd/precise-code-intel-bundle-manager/internal/database/observability.go
@ -67,7 +67,7 @@ func NewObserved(database Database, observationContext *observation.Context) Dat
 		}),
 		monikerResultsOperation: observationContext.Operation(observation.Op{
 			Name:         "Database.MonikerResults",
-			MetricLabels: []string{"monike_results"},
+			MetricLabels: []string{"moniker_results"},
 			Metrics:      metrics,
 		}),
 		packageInformationOperation: observationContext.Operation(observation.Op{
--- a/cmd/precise-code-intel-bundle-manager/main.go
+++ b/cmd/precise-code-intel-bundle-manager/main.go
@ -6,7 +6,6 @@ import (
 	"os/signal"
 	"syscall"

-	"github.com/dgraph-io/ristretto"
 	"github.com/inconshreveable/log15"
 	"github.com/opentracing/opentracing-go"
 	"github.com/pkg/errors"
@ -102,14 +101,9 @@ func prepCaches(r prometheus.Registerer, databaseCacheSize, documentCacheSize, r
 		log.Fatal(errors.Wrap(err, "failed to initialize result chunk cache"))
 	}

-	cacheMetrics := map[string]*ristretto.Metrics{
-		"precise-code-intel-database":     databaseCacheMetrics,
-		"precise-code-intel-document":     documentCacheMetrics,
-		"precise-code-intel-result-chunk": resultChunkCacheMetrics,
-	}
-	for cacheName, metrics := range cacheMetrics {
-		MustRegisterCacheMonitor(r, cacheName, metrics)
-	}
+	MustRegisterCacheMonitor(r, "precise-code-intel-database", databaseCacheSize, databaseCacheMetrics)
+	MustRegisterCacheMonitor(r, "precise-code-intel-document", documentCacheSize, documentCacheMetrics)
+	MustRegisterCacheMonitor(r, "precise-code-intel-result-chunk", resultChunkCacheSize, resultChunkCacheMetrics)

 	return databaseCache, documentCache, resultChunkCache
 }
--- a/cmd/precise-code-intel-bundle-manager/metrics.go
+++ b/cmd/precise-code-intel-bundle-manager/metrics.go
@ -6,7 +6,16 @@ import (
 )

 // MustRegisterCacheMonitor emits metrics for a ristretto cache.
-func MustRegisterCacheMonitor(r prometheus.Registerer, cacheName string, metrics *ristretto.Metrics) {
+func MustRegisterCacheMonitor(r prometheus.Registerer, cacheName string, capacity int, metrics *ristretto.Metrics) {
+	cacheCapacity := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name:        "src_cache_capacity",
+		Help:        "Capacity of the cache.",
+		ConstLabels: prometheus.Labels{"cache": cacheName},
+	}, func() float64 {
+		return float64(capacity)
+	})
+	r.MustRegister(cacheCapacity)
+
 	cacheCost := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name:        "src_cache_cost",
 		Help:        "Current cost of the cache.",
--- a/doc/admin/observability/alert_solutions.md
+++ b/doc/admin/observability/alert_solutions.md
@ -339,6 +339,22 @@ for assistance.
 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the github-proxy container in `docker-compose.yml`.

+# precise-code-intel-api-server: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-api-server: 5+ frontend-internal API error responses every 5m by route_
+
+**Possible solutions:**
+
+- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `precise-code-intel-api-server` that indicate requests to the frontend service are failing.
+- **Kubernetes:**
+	- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
+	- Check `kubectl logs precise-code-intel-api-server` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Docker Compose:**
+	- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
+	- Check `docker logs precise-code-intel-api-server` for logs indicating request failures to `frontend` or `frontend-internal`.
+
 # precise-code-intel-api-server: container_restarts

 **Descriptions:**
@ -376,6 +392,35 @@ for assistance.
 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-api-server container in `docker-compose.yml`.

+# precise-code-intel-bundle-manager: disk_space_remaining
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: less than 25% disk space remaining by instance_
+
+
+- _precise-code-intel-bundle-manager: less than 15% disk space remaining by instance_
+
+**Possible solutions:**
+
+- **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining.
+
+# precise-code-intel-bundle-manager: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: 5+ frontend-internal API error responses every 5m by route_
+
+**Possible solutions:**
+
+- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `precise-code-intel-bundle-manager` that indicate requests to the frontend service are failing.
+- **Kubernetes:**
+	- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
+	- Check `kubectl logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Docker Compose:**
+	- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
+	- Check `docker logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
+
 # precise-code-intel-bundle-manager: container_restarts

 **Descriptions:**
@ -413,6 +458,22 @@ for assistance.
 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-bundle-manager container in `docker-compose.yml`.

+# precise-code-intel-worker: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-worker: 5+ frontend-internal API error responses every 5m by route_
+
+**Possible solutions:**
+
+- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `precise-code-intel-worker` that indicate requests to the frontend service are failing.
+- **Kubernetes:**
+	- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
+	- Check `kubectl logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Docker Compose:**
+	- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
+	- Check `docker logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
+
 # precise-code-intel-worker: container_restarts

 **Descriptions:**
--- a/internal/codeintel/db/observability.go
+++ b/internal/codeintel/db/observability.go
@ -46,7 +46,7 @@ var _ DB = &ObservedDB{}
 func NewObserved(db DB, observationContext *observation.Context) DB {
 	metrics := metrics.NewOperationMetrics(
 		observationContext.Registerer,
-		"codeintel_db",
+		"code_intel_db",
 		metrics.WithLabels("op"),
 		metrics.WithCountHelp("Total number of results returned"),
 	)
--- a/monitoring/precise_code_intel_api_server.go
+++ b/monitoring/precise_code_intel_api_server.go
@ -6,6 +6,78 @@ func PreciseCodeIntelAPIServer() *Container {
 		Title:       "Precise Code Intel API Server",
 		Description: "Serves precise code intelligence requests.",
 		Groups: []Group{
+			{
+				Title: "General",
+				Rows: []Row{
+					{
+						{
+							Name:        "99th_percentile_code_intel_api_duration",
+							Description: "99th percentile successful code intel api query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query:             `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_api_duration_seconds_bucket[5m])))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "code_intel_api_errors",
+							Description:       "code intel api errors every 5m",
+							Query:             `sum by (op)(increase(src_code_intel_api_errors_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					// TODO(efritz) - add bundle manager request meter
+					// TODO(efritz) - add gitserver request meter
+					{
+						{
+							Name:        "99th_percentile_db_duration",
+							Description: "99th percentile successful database query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query:             `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_db_duration_seconds_bucket{job="precise-code-intel-api-server"}[5m])))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "db_errors",
+							Description:       "database errors every 5m",
+							Query:             `sum by (op)(increase(src_code_intel_db_errors_total{job="precise-code-intel-api-server"}[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name:              "processing_uploads_reset",
+							Description:       "jobs reset to queued state every 5m",
+							Query:             `sum(increase(src_upload_queue_resets_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("jobs"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "upload_resetter_errors",
+							Description:       "upload resetter errors every 5m",
+							Query:             `sum(increase(src_upload_queue_reset_errors_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("errors"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						sharedFrontendInternalAPIErrorResponses("precise-code-intel-api-server"),
+					},
+				},
+			},
 			{
 				Title:  "Container monitoring (not available on k8s or server)",
 				Hidden: true,
--- a/monitoring/precise_code_intel_bundle_manager.go
+++ b/monitoring/precise_code_intel_bundle_manager.go
@ -6,6 +6,128 @@ func PreciseCodeIntelBundleManager() *Container {
 		Title:       "Precise Code Intel Bundle Manager",
 		Description: "Stores and manages precise code intelligence bundles.",
 		Groups: []Group{
+			{
+				Title: "General",
+				Rows: []Row{
+					{
+						{
+							Name:        "99th_percentile_bundle_database_duration",
+							Description: "99th percentile successful bundle database query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query:             `histogram_quantile(0.99, sum by (le,op)(rate(src_bundle_database_duration_seconds_bucket[5m])))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "bundle_database_errors",
+							Description:       "bundle database errors every 5m",
+							Query:             `sum by (op)(increase(src_bundle_database_errors_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name:        "99th_percentile_bundle_reader_duration",
+							Description: "99th percentile successful bundle reader query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query:             `histogram_quantile(0.99, sum by (le,op)(rate(src_bundle_reader_duration_seconds_bucket[5m])))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "bundle_reader_errors",
+							Description:       "bundle reader errors every 5m",
+							Query:             `sum by (op)(increase(src_bundle_reader_errors_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name:            "disk_space_remaining",
+							Description:     "disk space remaining by instance",
+							Query:           `(src_disk_space_available_bytes / src_disk_space_total_bytes) * 100`,
+							DataMayNotExist: true,
+							Warning:         Alert{LessOrEqual: 25},
+							Critical:        Alert{LessOrEqual: 15},
+							PanelOptions:    PanelOptions().LegendFormat("{{instance}}").Unit(Percentage),
+							PossibleSolutions: `
+								- **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining.
+							`,
+						},
+					},
+					{
+						{
+							Name:              "cache_utilization",
+							Description:       "cache utilization",
+							Query:             `(src_cache_cost / src_cache_capacity) * 100`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 110},
+							PanelOptions:      PanelOptions().LegendFormat("{{cache}}").Unit(Percentage),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "cache_miss_percentage",
+							Description:       "percentage of cache misses over all cache activity every 5m",
+							Query:             `(increase(src_cache_misses_total[5m]) / (increase(src_cache_hits_total[5m]) + increase(src_cache_misses_total[5m]))) * 100`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 80},
+							PanelOptions:      PanelOptions().LegendFormat("{{cache}}").Unit(Percentage),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name:              "janitor_errors",
+							Description:       "janitor errors every 5m",
+							Query:             `sum(increase(src_bundle_manager_janitor_errors_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("errors"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "janitor_old_uploads",
+							Description:       "upload files removed (due to age) every 5m",
+							Query:             `sum(increase(src_bundle_manager_janitor_upload_files_removed_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("files removed"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "janitor_orphaned_dumps",
+							Description:       "bundle files removed (with no corresponding database entry) every 5m",
+							Query:             `sum(increase(src_bundle_manager_janitor_orphaned_bundle_files_removed_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("files removed"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "janitor_old_dumps",
+							Description:       "bundle files removed (after evicting them from the database) every 5m",
+							Query:             `sum(increase(src_bundle_manager_janitor_evicted_bundle_files_removed_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("files removed"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						sharedFrontendInternalAPIErrorResponses("precise-code-intel-bundle-manager"),
+					},
+				},
+			},
 			{
 				Title:  "Container monitoring (not available on k8s or server)",
 				Hidden: true,
--- a/monitoring/precise_code_intel_worker.go
+++ b/monitoring/precise_code_intel_worker.go
@ -6,6 +6,67 @@ func PreciseCodeIntelWorker() *Container {
 		Title:       "Precise Code Intel Worker",
 		Description: "Handles conversion of uploaded precise code intelligence bundles.",
 		Groups: []Group{
+			{
+				Title: "General",
+				Rows: []Row{
+					{
+						{
+							Name:              "upload_queue_size",
+							Description:       "upload queue size",
+							Query:             `max(src_upload_queue_uploads_total)`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 100},
+							PanelOptions:      PanelOptions().LegendFormat("uploads queued for processing"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "upload_queue_growth_rate",
+							Description:       "upload queue growth rate every 5m",
+							Query:             `sum(increase(src_upload_queue_uploads_total[5m])) / sum(increase(src_upload_queue_processor_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 5},
+							PanelOptions:      PanelOptions().LegendFormat("upload queue growth rate"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:        "upload_process_errors",
+							Description: "upload process errors every 5m",
+							// TODO(efritz) - ensure these differentiate malformed dumps and system errors
+							Query:             `sum(increase(src_upload_queue_processor_errors_total[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("errors"),
+							PossibleSolutions: "none",
+						},
+					},
+					// TODO(efritz) - add bundle manager request meter
+					// TODO(efritz) - add gitserver request meter
+					{
+						{
+							Name:        "99th_percentile_db_duration",
+							Description: "99th percentile successful database query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query:             `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_db_duration_seconds_bucket{job="precise-code-intel-worker"}[5m])))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name:              "db_errors",
+							Description:       "database errors every 5m",
+							Query:             `sum by (op)(increase(src_code_intel_db_errors_total{job="precise-code-intel-worker"}[5m]))`,
+							DataMayNotExist:   true,
+							Warning:           Alert{GreaterOrEqual: 20},
+							PanelOptions:      PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						sharedFrontendInternalAPIErrorResponses("precise-code-intel-worker"),
+					},
+				},
+			},
 			{
 				Title:  "Container monitoring (not available on k8s or server)",
 				Hidden: true,