Fill out dashboards for precise-code-intel services (#10496)
This commit is contained in: commit 47a84cd769 (parent 53043edfa1)
@@ -10,9 +10,9 @@ import (
 var (
 	rawBundleDir = env.Get("PRECISE_CODE_INTEL_BUNDLE_DIR", "/lsif-storage", "Root dir containing uploads and converted bundles.")
-	rawDatabaseCacheSize = env.Get("PRECISE_CODE_INTEL_CONNECTION_CACHE_CAPACITY", "100", "Number of SQLite connections that can be opened at once.")
-	rawDocumentCacheSize = env.Get("PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY", "100", "Maximum number of decoded documents that can be held in memory at once.")
-	rawResultChunkCacheSize = env.Get("PRECISE_CODE_INTEL_RESULT_CHUNK_CACHE_CAPACITY", "100", "Maximum number of decoded result chunks that can be held in memory at once.")
+	rawDatabaseCacheSize = env.Get("PRECISE_CODE_INTEL_CONNECTION_CACHE_CAPACITY", "100", "Maximum number of SQLite connections that can be open at once.")
+	rawDocumentCacheSize = env.Get("PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY", "1000000", "Size of decoded document cache. A document's cost is the number of fields.")
+	rawResultChunkCacheSize = env.Get("PRECISE_CODE_INTEL_RESULT_CHUNK_CACHE_CAPACITY", "1000000", "Size of decoded result chunk cache. A result chunk's cost is the number of fields.")
 	rawDesiredPercentFree = env.Get("PRECISE_CODE_INTEL_DESIRED_PERCENT_FREE", "10", "Target percentage of free space on disk.")
 	rawJanitorInterval = env.Get("PRECISE_CODE_INTEL_JANITOR_INTERVAL", "1m", "Interval between cleanup runs.")
 	rawMaxUploadAge = env.Get("PRECISE_CODE_INTEL_MAX_UPLOAD_AGE", "24h", "The maximum time an upload can sit on disk.")

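The new document and result-chunk values are cost budgets (total number of decoded fields) rather than entry counts. A minimal sketch of how such a cost-bounded cache can be built with ristretto; the helper name and defaults below are illustrative, not the repository's exact code:

```go
package main

import (
	"log"

	"github.com/dgraph-io/ristretto"
)

// newFieldCostCache builds a cache whose budget is a total cost (here, decoded
// fields) rather than an entry count, mirroring the semantics of the new
// PRECISE_CODE_INTEL_DOCUMENT_CACHE_CAPACITY value. Illustrative only.
func newFieldCostCache(maxFields int64) *ristretto.Cache {
	cache, err := ristretto.NewCache(&ristretto.Config{
		NumCounters: maxFields * 10, // ristretto's guidance: ~10x the expected max items/cost
		MaxCost:     maxFields,      // total budget, e.g. 1000000 fields by default
		BufferItems: 64,             // ristretto's documented default
		Metrics:     true,           // required for the cache monitors registered below
	})
	if err != nil {
		log.Fatal(err)
	}
	return cache
}

func main() {
	cache := newFieldCostCache(1000000)
	// Each Set carries the entry's cost (its field count), so eviction is cost-aware.
	cache.Set("document-key", struct{}{}, 25)
}
```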
@@ -67,7 +67,7 @@ func NewObserved(database Database, observationContext *observation.Context) Dat
 		}),
 		monikerResultsOperation: observationContext.Operation(observation.Op{
 			Name: "Database.MonikerResults",
-			MetricLabels: []string{"monike_results"},
+			MetricLabels: []string{"moniker_results"},
 			Metrics: metrics,
 		}),
 		packageInformationOperation: observationContext.Operation(observation.Op{

@@ -6,7 +6,6 @@ import (
	"os/signal"
	"syscall"

	"github.com/dgraph-io/ristretto"
	"github.com/inconshreveable/log15"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"

@@ -102,14 +101,9 @@ func prepCaches(r prometheus.Registerer, databaseCacheSize, documentCacheSize, r
 		log.Fatal(errors.Wrap(err, "failed to initialize result chunk cache"))
 	}

-	cacheMetrics := map[string]*ristretto.Metrics{
-		"precise-code-intel-database": databaseCacheMetrics,
-		"precise-code-intel-document": documentCacheMetrics,
-		"precise-code-intel-result-chunk": resultChunkCacheMetrics,
-	}
-	for cacheName, metrics := range cacheMetrics {
-		MustRegisterCacheMonitor(r, cacheName, metrics)
-	}
+	MustRegisterCacheMonitor(r, "precise-code-intel-database", databaseCacheSize, databaseCacheMetrics)
+	MustRegisterCacheMonitor(r, "precise-code-intel-document", documentCacheSize, documentCacheMetrics)
+	MustRegisterCacheMonitor(r, "precise-code-intel-result-chunk", resultChunkCacheSize, resultChunkCacheMetrics)

 	return databaseCache, documentCache, resultChunkCache
 }

@@ -6,7 +6,16 @@ import (
 )

 // MustRegisterCacheMonitor emits metrics for a ristretto cache.
-func MustRegisterCacheMonitor(r prometheus.Registerer, cacheName string, metrics *ristretto.Metrics) {
+func MustRegisterCacheMonitor(r prometheus.Registerer, cacheName string, capacity int, metrics *ristretto.Metrics) {
+	cacheCapacity := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name: "src_cache_capacity",
+		Help: "Capacity of the cache.",
+		ConstLabels: prometheus.Labels{"cache": cacheName},
+	}, func() float64 {
+		return float64(capacity)
+	})
+	r.MustRegister(cacheCapacity)
+
 	cacheCost := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name: "src_cache_cost",
 		Help: "Current cost of the cache.",

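The hunk above is truncated before the body of the `src_cache_cost` gauge. A plausible completion, assuming the current cost is derived from ristretto's counters; this is a sketch, not necessarily the repository's implementation:

```go
package monitoring

import (
	"github.com/dgraph-io/ristretto"
	"github.com/prometheus/client_golang/prometheus"
)

// registerCacheCostGauge sketches the truncated half of MustRegisterCacheMonitor:
// current cost is approximated as cost added minus cost evicted.
func registerCacheCostGauge(r prometheus.Registerer, cacheName string, metrics *ristretto.Metrics) {
	cacheCost := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name:        "src_cache_cost",
		Help:        "Current cost of the cache.",
		ConstLabels: prometheus.Labels{"cache": cacheName},
	}, func() float64 {
		return float64(metrics.CostAdded() - metrics.CostEvicted())
	})
	r.MustRegister(cacheCost)
}
```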
@@ -339,6 +339,22 @@ for assistance.
 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the github-proxy container in `docker-compose.yml`.

+# precise-code-intel-api-server: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-api-server: 5+ frontend-internal API error responses every 5m by route_
+
+**Possible solutions:**
+
+- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `repo-updater` that indicate requests to the frontend service are failing.
+- **Kubernetes:**
+  - Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
+  - Check `kubectl logs precise-code-intel-api-server` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Docker Compose:**
+  - Confirm that `docker ps` shows the `frontend-internal` container is healthy.
+  - Check `docker logs precise-code-intel-api-server` for logs indicating request failures to `frontend` or `frontend-internal`.
+
 # precise-code-intel-api-server: container_restarts

 **Descriptions:**

@@ -376,6 +392,35 @@ for assistance.
 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-api-server container in `docker-compose.yml`.

+# precise-code-intel-bundle-manager: disk_space_remaining
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: less than 25% disk space remaining by instance_
+
+- _precise-code-intel-bundle-manager: less than 15% disk space remaining by instance_
+
+**Possible solutions:**
+
+- **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining.
+
+# precise-code-intel-bundle-manager: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-bundle-manager: 5+ frontend-internal API error responses every 5m by route_
+
+**Possible solutions:**
+
+- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `repo-updater` that indicate requests to the frontend service are failing.
+- **Kubernetes:**
+  - Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
+  - Check `kubectl logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Docker Compose:**
+  - Confirm that `docker ps` shows the `frontend-internal` container is healthy.
+  - Check `docker logs precise-code-intel-bundle-manager` for logs indicating request failures to `frontend` or `frontend-internal`.
+
 # precise-code-intel-bundle-manager: container_restarts

 **Descriptions:**

@@ -413,6 +458,22 @@ for assistance.
 - **Kubernetes:** Consider increasing CPU limits in the relevant `Deployment.yaml`.
 - **Docker Compose:** Consider increasing `cpus:` of the precise-code-intel-bundle-manager container in `docker-compose.yml`.

+# precise-code-intel-worker: frontend_internal_api_error_responses
+
+**Descriptions:**
+
+- _precise-code-intel-worker: 5+ frontend-internal API error responses every 5m by route_
+
+**Possible solutions:**
+
+- **Single-container deployments:** Check `docker logs $CONTAINER_ID` for logs starting with `repo-updater` that indicate requests to the frontend service are failing.
+- **Kubernetes:**
+  - Confirm that `kubectl get pods` shows the `frontend` pods are healthy.
+  - Check `kubectl logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
+- **Docker Compose:**
+  - Confirm that `docker ps` shows the `frontend-internal` container is healthy.
+  - Check `docker logs precise-code-intel-worker` for logs indicating request failures to `frontend` or `frontend-internal`.
+
 # precise-code-intel-worker: container_restarts

 **Descriptions:**

@@ -46,7 +46,7 @@ var _ DB = &ObservedDB{}
 func NewObserved(db DB, observationContext *observation.Context) DB {
 	metrics := metrics.NewOperationMetrics(
 		observationContext.Registerer,
-		"codeintel_db",
+		"code_intel_db",
 		metrics.WithLabels("op"),
 		metrics.WithCountHelp("Total number of results returned"),
 	)

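The rename from "codeintel_db" to "code_intel_db" lines the emitted series up with the `src_code_intel_db_*` expressions in the dashboards added below. A rough sketch of the series shapes those panels assume; the names are inferred from the dashboard queries, and the real collectors are produced by the internal metrics package rather than built by hand like this:

```go
package main

import "github.com/prometheus/client_golang/prometheus"

// Assumed shapes of the series queried by the new dashboards: a per-op duration
// histogram and a per-op error counter under the src_code_intel_db_* prefix.
var (
	dbDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name: "src_code_intel_db_duration_seconds",
		Help: "Time spent performing code intel database queries.",
	}, []string{"op"})

	dbErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "src_code_intel_db_errors_total",
		Help: "Total number of errors from code intel database queries.",
	}, []string{"op"})
)

func init() {
	prometheus.MustRegister(dbDuration, dbErrors)
}
```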
@@ -6,6 +6,78 @@ func PreciseCodeIntelAPIServer() *Container {
 		Title: "Precise Code Intel API Server",
 		Description: "Serves precise code intelligence requests.",
 		Groups: []Group{
+			{
+				Title: "General",
+				Rows: []Row{
+					{
+						{
+							Name: "99th_percentile_code_intel_api_duration",
+							Description: "99th percentile successful code intel api query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_api_duration_seconds_bucket[5m])))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "code_intel_api_errors",
+							Description: "code intel api errors every 5m",
+							Query: `sum by (op)(increase(src_code_intel_api_errors_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					// TODO(efritz) - add bundle manager request meter
+					// TODO(efritz) - add gitserver request meter
+					{
+						{
+							Name: "99th_percentile_db_duration",
+							Description: "99th percentile successful database query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_db_duration_seconds_bucket{job="precise-code-intel-api-server"}[5m])))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "db_errors",
+							Description: "database errors every 5m",
+							Query: `sum by (op)(increase(src_code_intel_db_errors_total{job="precise-code-intel-api-server"}[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name: "processing_uploads_reset",
+							Description: "jobs reset to queued state every 5m",
+							Query: `sum(increase(src_upload_queue_resets_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("jobs"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "upload_resetter_errors",
+							Description: "upload resetter errors every 5m",
+							Query: `sum(increase(src_upload_queue_reset_errors_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("errors"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						sharedFrontendInternalAPIErrorResponses("precise-code-intel-api-server"),
+					},
+				},
+			},
 			{
 				Title: "Container monitoring (not available on k8s or server)",
 				Hidden: true,

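Each panel above is a plain PromQL expression, so it can be spot-checked against a running Prometheus before relying on the generated dashboard. A small sketch using the official Go client; the address and the choice of query are assumptions for illustration only:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Assumed local Prometheus address; adjust for your deployment.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// One of the panel queries from the dashboard above.
	query := `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_api_duration_seconds_bucket[5m])))`
	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```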
@@ -6,6 +6,128 @@ func PreciseCodeIntelBundleManager() *Container {
 		Title: "Precise Code Intel Bundle Manager",
 		Description: "Stores and manages precise code intelligence bundles.",
 		Groups: []Group{
+			{
+				Title: "General",
+				Rows: []Row{
+					{
+						{
+							Name: "99th_percentile_bundle_database_duration",
+							Description: "99th percentile successful bundle database query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_bundle_database_duration_seconds_bucket[5m])))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "bundle_database_errors",
+							Description: "bundle database errors every 5m",
+							Query: `sum by (op)(increase(src_bundle_database_errors_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name: "99th_percentile_bundle_reader_duration",
+							Description: "99th percentile successful bundle reader query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_bundle_reader_duration_seconds_bucket[5m])))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "bundle_reader_errors",
+							Description: "bundle reader errors every 5m",
+							Query: `sum by (op)(increase(src_bundle_reader_errors_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name: "disk_space_remaining",
+							Description: "disk space remaining by instance",
+							Query: `(src_disk_space_available_bytes / src_disk_space_total_bytes) * 100`,
+							DataMayNotExist: true,
+							Warning: Alert{LessOrEqual: 25},
+							Critical: Alert{LessOrEqual: 15},
+							PanelOptions: PanelOptions().LegendFormat("{{instance}}").Unit(Percentage),
+							PossibleSolutions: `
+								- **Provision more disk space:** Sourcegraph will begin deleting the oldest uploaded bundle files at 10% disk space remaining.
+							`,
+						},
+					},
+					{
+						{
+							Name: "cache_utilization",
+							Description: "cache utilization",
+							Query: `(src_cache_cost / src_cache_capacity) * 100`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 110},
+							PanelOptions: PanelOptions().LegendFormat("{{cache}}").Unit(Percentage),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "cache_miss_percentage",
+							Description: "percentage of cache misses over all cache activity every 5m",
+							Query: `(increase(src_cache_misses_total[5m]) / (increase(src_cache_hits_total[5m]) + increase(src_cache_misses_total[5m]))) * 100`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 80},
+							PanelOptions: PanelOptions().LegendFormat("{{cache}}").Unit(Percentage),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						{
+							Name: "janitor_errors",
+							Description: "janitor errors every 5m",
+							Query: `sum(increase(src_bundle_manager_janitor_errors_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("errors"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "janitor_old_uploads",
+							Description: "upload files removed (due to age) every 5m",
+							Query: `sum(increase(src_bundle_manager_janitor_upload_files_removed_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("files removed"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "janitor_orphaned_dumps",
+							Description: "bundle files removed (with no corresponding database entry) every 5m",
+							Query: `sum(increase(src_bundle_manager_janitor_orphaned_bundle_files_removed_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("files removed"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "janitor_old_dumps",
+							Description: "bundle files removed (after evicting them from the database) every 5m",
+							Query: `sum(increase(src_bundle_manager_janitor_evicted_bundle_files_removed_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("files removed"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						sharedFrontendInternalAPIErrorResponses("precise-code-intel-bundle-manager"),
+					},
+				},
+			},
 			{
 				Title: "Container monitoring (not available on k8s or server)",
 				Hidden: true,

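The disk_space_remaining panel above divides `src_disk_space_available_bytes` by `src_disk_space_total_bytes`. One assumed way such gauges could be derived for the bundle directory; this is a sketch, not necessarily how the bundle manager computes them:

```go
package main

import (
	"fmt"
	"syscall"
)

// diskSpacePercentFree computes available/total for a directory's filesystem,
// the same ratio the panel plots; the alert warns at <=25% and is critical at
// <=15%, while the janitor targets 10% free (PRECISE_CODE_INTEL_DESIRED_PERCENT_FREE).
func diskSpacePercentFree(dir string) (float64, error) {
	var fs syscall.Statfs_t
	if err := syscall.Statfs(dir, &fs); err != nil {
		return 0, err
	}
	total := float64(fs.Blocks) * float64(fs.Bsize)
	avail := float64(fs.Bavail) * float64(fs.Bsize)
	return (avail / total) * 100, nil
}

func main() {
	pct, err := diskSpacePercentFree("/lsif-storage")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%.1f%% free\n", pct)
}
```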
@@ -6,6 +6,67 @@ func PreciseCodeIntelWorker() *Container {
 		Title: "Precise Code Intel Worker",
 		Description: "Handles conversion of uploaded precise code intelligence bundles.",
 		Groups: []Group{
+			{
+				Title: "General",
+				Rows: []Row{
+					{
+						{
+							Name: "upload_queue_size",
+							Description: "upload queue size",
+							Query: `max(src_upload_queue_uploads_total)`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 100},
+							PanelOptions: PanelOptions().LegendFormat("uploads queued for processing"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "upload_queue_growth_rate",
+							Description: "upload queue growth rate every 5m",
+							Query: `sum(increase(src_upload_queue_uploads_total[5m])) / sum(increase(src_upload_queue_processor_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 5},
+							PanelOptions: PanelOptions().LegendFormat("upload queue growth rate"),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "upload_process_errors",
+							Description: "upload process errors every 5m",
+							// TODO(efritz) - ensure these differentiate malformed dumps and system errors
+							Query: `sum(increase(src_upload_queue_processor_errors_total[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("errors"),
+							PossibleSolutions: "none",
+						},
+					},
+					// TODO(efritz) - add bundle manager request meter
+					// TODO(efritz) - add gitserver request meter
+					{
+						{
+							Name: "99th_percentile_db_duration",
+							Description: "99th percentile successful database query duration over 5m",
+							// TODO(efritz) - ensure these exclude error durations
+							Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_code_intel_db_duration_seconds_bucket{job="precise-code-intel-worker"}[5m])))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}").Unit(Seconds),
+							PossibleSolutions: "none",
+						},
+						{
+							Name: "db_errors",
+							Description: "database errors every 5m",
+							Query: `sum by (op)(increase(src_code_intel_db_errors_total{job="precise-code-intel-worker"}[5m]))`,
+							DataMayNotExist: true,
+							Warning: Alert{GreaterOrEqual: 20},
+							PanelOptions: PanelOptions().LegendFormat("{{op}}"),
+							PossibleSolutions: "none",
+						},
+					},
+					{
+						sharedFrontendInternalAPIErrorResponses("precise-code-intel-worker"),
+					},
+				},
+			},
 			{
 				Title: "Container monitoring (not available on k8s or server)",
 				Hidden: true,
