package definitions

import (
	"fmt"
	"time"

	"github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
	"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
)

func Postgres() *monitoring.Dashboard {
	const (
		// In docker-compose, the codeintel-db container is called pgsql. In
		// Kubernetes, it is called codeintel-db. Because of this, we track all
		// database cAdvisor metrics in a single panel using this container name
		// regex to ensure we have observability on all platforms.
		containerName = "(pgsql|codeintel-db|codeinsights)"
	)

	return &monitoring.Dashboard{
		Name:                     "postgres",
		Title:                    "Postgres",
		Description:              "Postgres metrics, exported from postgres_exporter (not available on server).",
		NoSourcegraphDebugServer: true, // This is a third-party service.
		Groups: []monitoring.Group{
			{
				Title: "General",
				Rows: []monitoring.Row{
					{
						monitoring.Observable{
							Name:          "connections",
							Description:   "active connections",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false, // not deployed on docker-compose
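							// The first clause excludes the template, postgres, and cloudsqladmin
							// system databases; the OR clause re-admits codeinsights-db under a
							// looser datname filter, since the "postgres" exclusion in the first
							// clause would otherwise drop its series.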
							Query:         `sum by (job) (pg_stat_activity_count{datname!~"template.*|postgres|cloudsqladmin"}) OR sum by (job) (pg_stat_activity_count{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`,
							Panel:         monitoring.Panel().LegendFormat("{{datname}}"),
							Warning:       monitoring.Alert().LessOrEqual(5).For(5 * time.Minute),
							NextSteps:     "none",
						},
						monitoring.Observable{
							Name:          "usage_connections_percentage",
							Description:   "connections in use",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false,
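							// Usable capacity is max_connections minus superuser_reserved_connections,
							// so this can reach 100% while superuser slots remain available.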
							Query:         `sum(pg_stat_activity_count) by (job) / (sum(pg_settings_max_connections) by (job) - sum(pg_settings_superuser_reserved_connections) by (job)) * 100`,
							Panel:         monitoring.Panel().LegendFormat("{{job}}").Unit(monitoring.Percentage).Max(100).Min(0),
							Warning:       monitoring.Alert().GreaterOrEqual(80).For(5 * time.Minute),
							Critical:      monitoring.Alert().GreaterOrEqual(100).For(5 * time.Minute),
							NextSteps: `
								- Consider increasing [max_connections](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-MAX-CONNECTIONS) of the database instance, [learn more](https://sourcegraph.com/docs/admin/config/postgres-conf)
							`,
						},
						monitoring.Observable{
							Name:          "transaction_durations",
							Description:   "maximum transaction durations",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false, // not deployed on docker-compose
							// Ignore codeintel-db because Rockskip processing involves long
							// transactions during normal operation.
							Query:     `sum by (job) (pg_stat_activity_max_tx_duration{datname!~"template.*|postgres|cloudsqladmin",job!="codeintel-db"}) OR sum by (job) (pg_stat_activity_max_tx_duration{job="codeinsights-db", datname!~"template.*|cloudsqladmin"})`,
							Panel:     monitoring.Panel().LegendFormat("{{datname}}").Unit(monitoring.Seconds),
							Warning:   monitoring.Alert().GreaterOrEqual(0.3).For(5 * time.Minute),
							NextSteps: "none",
						},
					},
				},
			},
			{
				Title:  "Database and collector status",
				Hidden: true,
				Rows: []monitoring.Row{
					{
						monitoring.Observable{
							Name:          "postgres_up",
							Description:   "database availability",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false, // not deployed on docker-compose
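							// pg_up is postgres_exporter's own connectivity gauge: 1 when the
							// exporter's most recent attempt to reach the database succeeded.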
							Query:    "pg_up",
							Panel:    monitoring.Panel().LegendFormat("{{app}}"),
							Critical: monitoring.Alert().LessOrEqual(0).For(5 * time.Minute),
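							// The %[1]s indexed verb lets fmt.Sprintf substitute the single
							// containerName argument into each command below.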
							// Similar to ContainerMissing solutions
							NextSteps: fmt.Sprintf(`
								- **Kubernetes:**
									- Determine if the pod was OOM killed using 'kubectl describe pod %[1]s' (look for 'OOMKilled: true') and, if so, consider increasing the memory limit in the relevant 'Deployment.yaml'.
									- Check the logs before the container restarted to see if there are 'panic:' messages or similar using 'kubectl logs -p %[1]s'.
									- Check if there is any OOMKILL event using the provisioning panels
									- Check kernel logs using 'dmesg' for OOMKILL events on worker nodes
								- **Docker Compose:**
									- Determine if the container was OOM killed using 'docker inspect -f \'{{json .State}}\' %[1]s' (look for '"OOMKilled":true') and, if so, consider increasing the memory limit of the %[1]s container in 'docker-compose.yml'.
									- Check the logs before the container restarted to see if there are 'panic:' messages or similar using 'docker logs %[1]s' (note this will include logs from the previous and currently running container).
									- Check if there is any OOMKILL event using the provisioning panels
									- Check kernel logs using 'dmesg' for OOMKILL events
							`, containerName),
							Interpretation: "A non-zero value indicates the database is online.",
						},
						monitoring.Observable{
							Name:          "invalid_indexes",
							Description:   "invalid indexes (unusable by the query planner)",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false, // not deployed on docker-compose
							Query:         "max by (relname)(pg_invalid_index_count)",
							Panel:         monitoring.Panel().LegendFormat("{{relname}}"),
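							// Aggregate with sum so the alert reflects the total number of
							// invalid indexes across all relations, not a per-relation maximum.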
							Critical:      monitoring.Alert().GreaterOrEqual(1).AggregateBy(monitoring.AggregatorSum),
							NextSteps: `
								- Drop and re-create the invalid index - please contact Sourcegraph to supply the index definition.
							`,
							Interpretation: "A non-zero value indicates that Postgres failed to build an index. Expect degraded performance until the index is manually rebuilt.",
						},
					},
					{
						monitoring.Observable{
							Name:          "pg_exporter_err",
							Description:   "errors scraping postgres exporter",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false, // not deployed on docker-compose
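							// pg_exporter_last_scrape_error reports 1 when the exporter's most
							// recent scrape of Postgres failed, 0 otherwise.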
							Query:   "pg_exporter_last_scrape_error",
							Panel:   monitoring.Panel().LegendFormat("{{app}}"),
							Warning: monitoring.Alert().GreaterOrEqual(1).For(5 * time.Minute),
							NextSteps: `
								- Ensure the Postgres exporter can access the Postgres database. Also, check the Postgres exporter logs for errors.
							`,
							Interpretation: "This value indicates issues retrieving metrics from postgres_exporter.",
						},
						monitoring.Observable{
							Name:          "migration_in_progress",
							Description:   "active schema migration",
							Owner:         monitoring.ObservableOwnerInfraOrg,
							DataMustExist: false, // not deployed on docker-compose
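							// pg_sg_migration_status is a Sourcegraph-specific gauge (note the
							// sg prefix); it is non-zero while a schema migration is running.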
							Query:    "pg_sg_migration_status",
							Panel:    monitoring.Panel().LegendFormat("{{app}}"),
							Critical: monitoring.Alert().GreaterOrEqual(1).For(5 * time.Minute),
							Interpretation: "A 0 value indicates that no migration is in progress.",
							NextSteps: `
								The database migration has been in progress for 5 or more minutes - please contact Sourcegraph if this persists.
							`,
						},
						// TODO(@daxmc99): Blocked by https://github.com/sourcegraph/sourcegraph/issues/13300
						// need to enable `pg_stat_statements` in Postgres conf
						// monitoring.Observable{
						//	Name:        "cache_hit_ratio",
						//	Description: "ratio of cache hits over 5m",
						//	Owner:       monitoring.ObservableOwnerDevOps,
						//	Query:       `avg(rate(pg_stat_database_blks_hit{datname!~"template.*|postgres|cloudsqladmin"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*|postgres|cloudsqladmin"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*|postgres|cloudsqladmin"}[5m]))) by (datname) * 100`,
						//	DataMayNotExist: true,
						//	Warning: monitoring.Alert().LessOrEqual(0.98).For(5 * time.Minute),
						//	PossibleSolutions: "Cache hit ratio should be at least 99%, please [open an issue](https://github.com/sourcegraph/sourcegraph/issues/new/choose) " +
						//		"to add additional indexes",
						//	PanelOptions: monitoring.PanelOptions().Unit(monitoring.Percentage)},
					},
				},
			},
			{
				Title:  "Object size and bloat",
				Hidden: true,
				Rows: []monitoring.Row{
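					// Bloat is the estimated space held by dead tuples and unused pages;
					// persistently high ratios usually call for a VACUUM or REINDEX.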
					{
						monitoring.Observable{
							Name:           "pg_table_size",
							Description:    "table size",
							Owner:          monitoring.ObservableOwnerInfraOrg,
							Query:          `max by (relname)(pg_table_bloat_size)`,
							Panel:          monitoring.Panel().LegendFormat("{{relname}}").Unit(monitoring.Bytes),
							NoAlert:        true,
							Interpretation: "Total size of this table",
						},
						monitoring.Observable{
							Name:           "pg_table_bloat_ratio",
							Description:    "table bloat ratio",
							Owner:          monitoring.ObservableOwnerInfraOrg,
							Query:          `max by (relname)(pg_table_bloat_ratio) * 100`,
							Panel:          monitoring.Panel().LegendFormat("{{relname}}").Unit(monitoring.Percentage),
							NoAlert:        true,
							Interpretation: "Estimated bloat ratio of this table (high bloat = high overhead)",
						},
					},
					{
						monitoring.Observable{
							Name:           "pg_index_size",
							Description:    "index size",
							Owner:          monitoring.ObservableOwnerInfraOrg,
							Query:          `max by (relname)(pg_index_bloat_size)`,
							Panel:          monitoring.Panel().LegendFormat("{{relname}}").Unit(monitoring.Bytes),
							NoAlert:        true,
							Interpretation: "Total size of this index",
						},
						monitoring.Observable{
							Name:           "pg_index_bloat_ratio",
							Description:    "index bloat ratio",
							Owner:          monitoring.ObservableOwnerInfraOrg,
							Query:          `max by (relname)(pg_index_bloat_ratio) * 100`,
							Panel:          monitoring.Panel().LegendFormat("{{relname}}").Unit(monitoring.Percentage),
							NoAlert:        true,
							Interpretation: "Estimated bloat ratio of this index (high bloat = high overhead)",
						},
					},
				},
			},

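			// The shared groups append the standard per-container provisioning and
			// Kubernetes health panels, keyed on the containerName regex above.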
			shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
			shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerInfraOrg, nil),
		},
	}
}