Fill out dashboards for precise-code-intel services (#10496)

This commit is contained in:
Eric Fritz 2020-05-12 10:12:45 -05:00 committed by GitHub
parent 53043edfa1
commit 47a84cd769
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 334 additions and 15 deletions

View File

@ -10,9 +10,9 @@ import (
var (
rawBundleDir = env.Get("PRECISE_CODE_INTEL_BUNDLE_DIR", "/lsif-storage", "Root dir containing uploads and converted bundles.")
rawDatabaseCacheSize = env.Get("PRECISE_CODE_INTEL_CONNECTION_CACHE_CAPACITY", "100", "Number of SQLite connections that can be opened at once.")
rawDocumentCacheSize = env.Get("PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY", "100", "Maximum number of decoded documents that can be held in memory at once.")
rawResultChunkCacheSize = env.Get("PRECISE_CODE_INTEL_RESULT_CHUNK_CACHE_CAPACITY", "100", "Maximum number of decoded result chunks that can be held in memory at once.")
rawDatabaseCacheSize = env.Get("PRECISE_CODE_INTEL_CONNECTION_CACHE_CAPACITY", "100", "Maximum number of SQLite connections that can be open at once.")
rawDocumentCacheSize = env.Get("PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY", "1000000", "Size of decoded document cache. A document's cost is the number of fields.")
rawResultChunkCacheSize = env.Get("PRECISE_CODE_INTEL_RESULT_CHUNK_CACHE_CAPACITY", "1000000", "Size of decoded result chunk cache. A result chunk's cost is the number of fields.")
rawDesiredPercentFree = env.Get("PRECISE_CODE_INTEL_DESIRED_PERCENT_FREE", "10", "Target percentage of free space on disk.")
rawJanitorInterval = env.Get("PRECISE_CODE_INTEL_JANITOR_INTERVAL", "1m", "Interval between cleanup runs.")
rawMaxUploadAge = env.Get("PRECISE_CODE_INTEL_MAX_UPLOAD_AGE", "24h", "The maximum time an upload can sit on disk.")

View File

@ -67,7 +67,7 @@ func NewObserved(database Database, observationContext *observation.Context) Dat
}),
monikerResultsOperation: observationContext.Operation(observation.Op{
Name: "Database.MonikerResults",
MetricLabels: []string{"monike_results"},
MetricLabels: []string{"moniker_results"},
Metrics: metrics,
}),
packageInformationOperation: observationContext.Operation(observation.Op{

View File

@ -6,7 +6,6 @@ import (
"os/signal"
"syscall"
"github.com/dgraph-io/ristretto"
"github.com/inconshreveable/log15"
"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
@ -102,14 +101,9 @@ func prepCaches(r prometheus.Registerer, databaseCacheSize, documentCacheSize, r
log.Fatal(errors.Wrap(err, "failed to initialize result chunk cache"))
}
cacheMetrics := map[string]*ristretto.Metrics{
"precise-code-intel-database": databaseCacheMetrics,
"precise-code-intel-document": documentCacheMetrics,
"precise-code-intel-result-chunk": resultChunkCacheMetrics,
}
for cacheName, metrics := range cacheMetrics {
MustRegisterCacheMonitor(r, cacheName, metrics)
}
MustRegisterCacheMonitor(r, "precise-code-intel-database", databaseCacheSize, databaseCacheMetrics)
MustRegisterCacheMonitor(r, "precise-code-intel-document", documentCacheSize, documentCacheMetrics)
MustRegisterCacheMonitor(r, "precise-code-intel-result-chunk", resultChunkCacheSize, resultChunkCacheMetrics)
return databaseCache, documentCache, resultChunkCache
}

View File

@ -6,7 +6,16 @@ import (
)
// MustRegisterCacheMonitor emits metrics for a ristretto cache.
func MustRegisterCacheMonitor(r prometheus.Registerer, cacheName string, metrics *ristretto.Metrics) {
func MustRegisterCacheMonitor(r prometheus.Registerer, cacheName string, capacity int, metrics *ristretto.Metrics) {
cacheCapacity := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "src_cache_capacity",
Help: "Capacity of the cache.",
ConstLabels: prometheus.Labels{"cache": cacheName},
}, func() float64 {
return float64(capacity)
})
r.MustRegister(cacheCapacity)
cacheCost := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "src_cache_cost",
Help: "Current cost of the cache.",

View File

@ -339,6 +339,22 @@ for assistance.
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the github-proxy container in `docker-compose.yml`.
# precise-code-intel-api-server: frontend_internal_api_error_responses
**Descriptions:**
- _precise-code-intel-api-server: 5+ frontend-internal API error responses every 5m by route_
**Possible solutions:**
- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `precise-code-intel-api-server` that indicate requests to the frontend service are failing.
- **Kubernetes:**
- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
- Check `kubectl logs precise-code-intel-api-server` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Docker Compose:**
- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
- Check `docker logs precise-code-intel-api-server` for logs indicating request failures to `frontend` or `frontend-internal`.
# precise-code-intel-api-server: container_restarts
**Descriptions:**
@ -376,6 +392,35 @@ for assistance.
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-api-server container in `docker-compose.yml`.
# precise-code-intel-bundle-manager: disk_space_remaining
**Descriptions:**
- _precise-code-intel-bundle-manager: less than 25% disk space remaining by instance_
- _precise-code-intel-bundle-manager: less than 15% disk space remaining by instance_
**Possible solutions:**
- **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining.
# precise-code-intel-bundle-manager: frontend_internal_api_error_responses
**Descriptions:**
- _precise-code-intel-bundle-manager: 5+ frontend-internal API error responses every 5m by route_
**Possible solutions:**
- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `precise-code-intel-bundle-manager` that indicate requests to the frontend service are failing.
- **Kubernetes:**
- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
- Check `kubectl logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Docker Compose:**
- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
- Check `docker logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
# precise-code-intel-bundle-manager: container_restarts
**Descriptions:**
@ -413,6 +458,22 @@ for assistance.
- **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
- **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-bundle-manager container in `docker-compose.yml`.
# precise-code-intel-worker: frontend_internal_api_error_responses
**Descriptions:**
- _precise-code-intel-worker: 5+ frontend-internal API error responses every 5m by route_
**Possible solutions:**
- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `precise-code-intel-worker` that indicate requests to the frontend service are failing.
- **Kubernetes:**
- Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
- Check `kubectl logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
- **Docker Compose:**
- Confirm that `docker ps` shows the `frontend-internal` container is healthy.
- Check `docker logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
# precise-code-intel-worker: container_restarts
**Descriptions:**

View File

@ -46,7 +46,7 @@ var _ DB = &ObservedDB{}
func NewObserved(db DB, observationContext *observation.Context) DB {
metrics := metrics.NewOperationMetrics(
observationContext.Registerer,
"codeintel_db",
"code_intel_db",
metrics.WithLabels("op"),
metrics.WithCountHelp("Total number of results returned"),
)

View File

@ -6,6 +6,78 @@ func PreciseCodeIntelAPIServer() *Container {
Title: "Precise Code Intel API Server",
Description: "Serves precise code intelligence requests.",
Groups: []Group{
{
Title: "General",
Rows: []Row{
{
{
Name: "99th_percentile_code_intel_api_duration",
Description: "99th percentile successful code intel api query duration over 5m",
// TODO(efritz) - ensure these exclude error durations
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_api_duration_seconds_bucket[5m])))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
PossibleSolutions: "none",
},
{
Name: "code_intel_api_errors",
Description: "code intel api errors every 5m",
Query: `sum by (op)(increase(src_code_intel_api_errors_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}"),
PossibleSolutions: "none",
},
},
// TODO(efritz) - add bundle manager request meter
// TODO(efritz) - add gitserver request meter
{
{
Name: "99th_percentile_db_duration",
Description: "99th percentile successful database query duration over 5m",
// TODO(efritz) - ensure these exclude error durations
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_db_duration_seconds_bucket{job="precise-code-intel-api-server"}[5m])))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
PossibleSolutions: "none",
},
{
Name: "db_errors",
Description: "database errors every 5m",
Query: `sum by (op)(increase(src_code_intel_db_errors_total{job="precise-code-intel-api-server"}[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}"),
PossibleSolutions: "none",
},
},
{
{
Name: "processing_uploads_reset",
Description: "jobs reset to queued state every 5m",
Query: `sum(increase(src_upload_queue_resets_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("jobs"),
PossibleSolutions: "none",
},
{
Name: "upload_resetter_errors",
Description: "upload resetter errors every 5m",
Query: `sum(increase(src_upload_queue_reset_errors_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("errors"),
PossibleSolutions: "none",
},
},
{
sharedFrontendInternalAPIErrorResponses("precise-code-intel-api-server"),
},
},
},
{
Title: "Container monitoring (not available on k8s or server)",
Hidden: true,

View File

@ -6,6 +6,128 @@ func PreciseCodeIntelBundleManager() *Container {
Title: "Precise Code Intel Bundle Manager",
Description: "Stores and manages precise code intelligence bundles.",
Groups: []Group{
{
Title: "General",
Rows: []Row{
{
{
Name: "99th_percentile_bundle_database_duration",
Description: "99th percentile successful bundle database query duration over 5m",
// TODO(efritz) - ensure these exclude error durations
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_bundle_database_duration_seconds_bucket[5m])))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
PossibleSolutions: "none",
},
{
Name: "bundle_database_errors",
Description: "bundle database errors every 5m",
Query: `sum by (op)(increase(src_bundle_database_errors_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}"),
PossibleSolutions: "none",
},
},
{
{
Name: "99th_percentile_bundle_reader_duration",
Description: "99th percentile successful bundle reader query duration over 5m",
// TODO(efritz) - ensure these exclude error durations
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_bundle_reader_duration_seconds_bucket[5m])))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
PossibleSolutions: "none",
},
{
Name: "bundle_reader_errors",
Description: "bundle reader errors every 5m",
Query: `sum by (op)(increase(src_bundle_reader_errors_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}"),
PossibleSolutions: "none",
},
},
{
{
Name: "disk_space_remaining",
Description: "disk space remaining by instance",
Query: `(src_disk_space_available_bytes / src_disk_space_total_bytes) * 100`,
DataMayNotExist: true,
Warning: Alert{LessOrEqual: 25},
Critical: Alert{LessOrEqual: 15},
PanelOptions: PanelOptions().LegendFormat("{{instance}}").Unit(Percentage),
PossibleSolutions: `
- **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining.
`,
},
},
{
{
Name: "cache_utilization",
Description: "cache utilization",
Query: `(src_cache_cost / src_cache_capacity) * 100`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 110},
PanelOptions: PanelOptions().LegendFormat("{{cache}}").Unit(Percentage),
PossibleSolutions: "none",
},
{
Name: "cache_miss_percentage",
Description: "percentage of cache misses over all cache activity every 5m",
Query: `(increase(src_cache_misses_total[5m]) / (increase(src_cache_hits_total[5m]) + increase(src_cache_misses_total[5m]))) * 100`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 80},
PanelOptions: PanelOptions().LegendFormat("{{cache}}").Unit(Percentage),
PossibleSolutions: "none",
},
},
{
{
Name: "janitor_errors",
Description: "janitor errors every 5m",
Query: `sum(increase(src_bundle_manager_janitor_errors_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("errors"),
PossibleSolutions: "none",
},
{
Name: "janitor_old_uploads",
Description: "upload files removed (due to age) every 5m",
Query: `sum(increase(src_bundle_manager_janitor_upload_files_removed_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("files removed"),
PossibleSolutions: "none",
},
{
Name: "janitor_orphaned_dumps",
Description: "bundle files removed (with no corresponding database entry) every 5m",
Query: `sum(increase(src_bundle_manager_janitor_orphaned_bundle_files_removed_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("files removed"),
PossibleSolutions: "none",
},
{
Name: "janitor_old_dumps",
Description: "bundle files removed (after evicting them from the database) every 5m",
Query: `sum(increase(src_bundle_manager_janitor_evicted_bundle_files_removed_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("files removed"),
PossibleSolutions: "none",
},
},
{
sharedFrontendInternalAPIErrorResponses("precise-code-intel-bundle-manager"),
},
},
},
{
Title: "Container monitoring (not available on k8s or server)",
Hidden: true,

View File

@ -6,6 +6,67 @@ func PreciseCodeIntelWorker() *Container {
Title: "Precise Code Intel Worker",
Description: "Handles conversion of uploaded precise code intelligence bundles.",
Groups: []Group{
{
Title: "General",
Rows: []Row{
{
{
Name: "upload_queue_size",
Description: "upload queue size",
Query: `max(src_upload_queue_uploads_total)`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 100},
PanelOptions: PanelOptions().LegendFormat("uploads queued for processing"),
PossibleSolutions: "none",
},
{
Name: "upload_queue_growth_rate",
Description: "upload queue growth rate every 5m",
Query: `sum(increase(src_upload_queue_uploads_total[5m])) / sum(increase(src_upload_queue_processor_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 5},
PanelOptions: PanelOptions().LegendFormat("upload queue growth rate"),
PossibleSolutions: "none",
},
{
Name: "upload_process_errors",
Description: "upload process errors every 5m",
// TODO(efritz) - ensure these differentiate malformed dumps and system errors
Query: `sum(increase(src_upload_queue_processor_errors_total[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("errors"),
PossibleSolutions: "none",
},
},
// TODO(efritz) - add bundle manager request meter
// TODO(efritz) - add gitserver request meter
{
{
Name: "99th_percentile_db_duration",
Description: "99th percentile successful database query duration over 5m",
// TODO(efritz) - ensure these exclude error durations
Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_db_duration_seconds_bucket{job="precise-code-intel-worker"}[5m])))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
PossibleSolutions: "none",
},
{
Name: "db_errors",
Description: "database errors every 5m",
Query: `sum by (op)(increase(src_code_intel_db_errors_total{job="precise-code-intel-worker"}[5m]))`,
DataMayNotExist: true,
Warning: Alert{GreaterOrEqual: 20},
PanelOptions: PanelOptions().LegendFormat("{{op}}"),
PossibleSolutions: "none",
},
},
{
sharedFrontendInternalAPIErrorResponses("precise-code-intel-worker"),
},
},
},
{
Title: "Container monitoring (not available on k8s or server)",
Hidden: true,