mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:31:43 +00:00
grpc: retry: create Prometheus dashboards (#59607)
This PR implements the logic in the monitoring generator for displaying the gRPC retry metrics included in https://github.com/sourcegraph/sourcegraph/pull/59399 . ## Test plan I created the following screenshots (look at the bottom panel) from the gitserver + zoekt-webserver grafana dashboards after running `sg start --except zoekt-web-0 --except gitserver-1` and executing searches like `context:global type:diff test count:all r:hashicorp` or `context:global test count:all r:hashicorp` <img width="1716" alt="Screenshot 2024-01-15 at 1 28 11 PM" src="https://github.com/sourcegraph/sourcegraph/assets/9022011/a503b6d9-3e21-451c-b98b-3b6e634d4ec9"> <img width="1715" alt="Screenshot 2024-01-15 at 1 28 41 PM" src="https://github.com/sourcegraph/sourcegraph/assets/9022011/ed07244e-340d-4ae3-933d-3416abff91cc">
This commit is contained in:
parent
5ac8c8c2b4
commit
70d5012674
927
doc/admin/observability/dashboards.md
generated
927
doc/admin/observability/dashboards.md
generated
File diff suppressed because it is too large
Load Diff
@ -424,6 +424,14 @@ func Frontend() *monitoring.Dashboard {
|
||||
RawGRPCServiceName: grpcZoektConfigurationServiceName,
|
||||
Namespace: "", // intentionally empty
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendZoektConfiguration.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: "zoekt_configuration",
|
||||
RawGRPCServiceName: grpcZoektConfigurationServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendZoektConfiguration.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
@ -442,6 +450,14 @@ func Frontend() *monitoring.Dashboard {
|
||||
RawGRPCServiceName: grpcInternalAPIServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendInternalAPI.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: "internal_api",
|
||||
RawGRPCServiceName: grpcInternalAPIServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendInternalAPI.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
|
||||
@ -556,6 +556,15 @@ func GitServer() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: "gitserver",
|
||||
RawGRPCServiceName: grpcServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "gitserver",
|
||||
InstanceFilterRegex: `${shard:regex}`,
|
||||
|
||||
@ -420,6 +420,15 @@ func RepoUpdater() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSource),
|
||||
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: "repo_updater",
|
||||
RawGRPCServiceName: grpcServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSource),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "repo_updater",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
|
||||
@ -240,6 +240,16 @@ regularly above 0 it is a sign for further investigation.`,
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: "searcher",
|
||||
RawGRPCServiceName: grpcServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "searcher",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
|
||||
@ -5,9 +5,10 @@ import (
|
||||
"strings"
|
||||
|
||||
"github.com/iancoleman/strcase"
|
||||
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
|
||||
"golang.org/x/text/cases"
|
||||
"golang.org/x/text/language"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
|
||||
)
|
||||
|
||||
type GRPCServerMetricsOptions struct {
|
||||
@ -474,6 +475,118 @@ func NewGRPCInternalErrorMetricsGroup(opts GRPCInternalErrorMetricsOptions, owne
|
||||
}
|
||||
}
|
||||
|
||||
type GRPCRetryMetricsOptions struct {
|
||||
// HumanServiceName is the short, lowercase, snake_case, human-readable name of the grpc service that we're gathering metrics for.
|
||||
//
|
||||
// Example: "gitserver"
|
||||
HumanServiceName string
|
||||
|
||||
// RawGRPCServiceName is the full, dot-separated, code-generated gRPC service name that we're gathering metrics for.
|
||||
//
|
||||
// Example: "gitserver.v1.GitserverService"
|
||||
RawGRPCServiceName string
|
||||
|
||||
// MethodFilterRegex is the PromQL regex that's used to filter the
|
||||
// gRPC client retry metrics to only those emitted by the method(s) that we're interested in.
|
||||
//
|
||||
// Example: (Search | Exec)
|
||||
MethodFilterRegex string
|
||||
|
||||
// Namespace is the Prometheus metrics namespace for metrics emitted by this service.
|
||||
Namespace string
|
||||
}
|
||||
|
||||
// NewGRPCRetryMetricsGroup creates a Group containing metrics that track "internal" gRPC errors.
|
||||
func NewGRPCRetryMetricsGroup(opts GRPCRetryMetricsOptions, owner monitoring.ObservableOwner) monitoring.Group {
|
||||
opts.HumanServiceName = strcase.ToSnake(opts.HumanServiceName)
|
||||
|
||||
metric := func(base string, labelFilters ...string) string {
|
||||
m := base
|
||||
|
||||
if opts.Namespace != "" {
|
||||
m = fmt.Sprintf("%s_%s", opts.Namespace, m)
|
||||
}
|
||||
|
||||
if len(labelFilters) > 0 {
|
||||
m = fmt.Sprintf("%s{%s}", m, strings.Join(labelFilters, ","))
|
||||
}
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
sum := func(metric, duration string, groupByLabels ...string) string {
|
||||
base := fmt.Sprintf("sum(rate(%s[%s]))", metric, duration)
|
||||
|
||||
if len(groupByLabels) > 0 {
|
||||
base = fmt.Sprintf("%s by (%s)", base, strings.Join(groupByLabels, ", "))
|
||||
}
|
||||
|
||||
return fmt.Sprintf("(%s)", base)
|
||||
}
|
||||
|
||||
methodLabelFilter := fmt.Sprintf(`grpc_method=~"%s"`, opts.MethodFilterRegex)
|
||||
serviceLabelFilter := fmt.Sprintf(`grpc_service=~"%s"`, opts.RawGRPCServiceName)
|
||||
isRetriedLabelFilter := fmt.Sprintf(`is_retried="%s"`, "true")
|
||||
|
||||
percentageQuery := func(numerator, denominator string) string {
|
||||
ratio := fmt.Sprintf("((%s) / (%s))", numerator, denominator)
|
||||
return fmt.Sprintf("(100.0 * (%s))", ratio)
|
||||
}
|
||||
|
||||
titleCaser := cases.Title(language.English)
|
||||
|
||||
return monitoring.Group{
|
||||
Title: fmt.Sprintf("%s GRPC retry metrics", titleCaser.String(strings.ReplaceAll(opts.HumanServiceName, "_", " "))),
|
||||
Hidden: true,
|
||||
Rows: []monitoring.Row{
|
||||
{
|
||||
monitoring.Observable{
|
||||
Name: fmt.Sprintf("%s_grpc_clients_retry_percentage_across_all_methods", opts.HumanServiceName),
|
||||
Description: "client retry percentage across all methods over 2m",
|
||||
Query: percentageQuery(
|
||||
sum(metric("grpc_client_retry_attempts_total", serviceLabelFilter, isRetriedLabelFilter), "2m"),
|
||||
sum(metric("grpc_client_retry_attempts_total", serviceLabelFilter), "2m"),
|
||||
),
|
||||
Owner: owner,
|
||||
NoAlert: true,
|
||||
Interpretation: fmt.Sprintf("The percentage of gRPC requests that were retried across all methods, aggregated across all %q clients.", opts.HumanServiceName),
|
||||
Panel: monitoring.Panel().
|
||||
Unit(monitoring.Percentage).
|
||||
With(monitoring.PanelOptions.LegendOnRight()).
|
||||
With(monitoring.PanelOptions.ZeroIfNoData()),
|
||||
},
|
||||
monitoring.Observable{
|
||||
Name: fmt.Sprintf("%s_grpc_clients_retry_percentage_per_method", opts.HumanServiceName),
|
||||
Description: "client retry percentage per-method over 2m",
|
||||
Query: percentageQuery(
|
||||
sum(metric("grpc_client_retry_attempts_total", serviceLabelFilter, isRetriedLabelFilter, methodLabelFilter), "2m", "grpc_method"),
|
||||
sum(metric("grpc_client_retry_attempts_total", serviceLabelFilter, methodLabelFilter), "2m", "grpc_method"),
|
||||
),
|
||||
Owner: owner,
|
||||
NoAlert: true,
|
||||
Interpretation: fmt.Sprintf("The percentage of gRPC requests that were retried aggregated across all %q clients, broken out per method.", opts.HumanServiceName),
|
||||
Panel: monitoring.Panel().LegendFormat("{{grpc_method}}").
|
||||
Unit(monitoring.Percentage).
|
||||
With(monitoring.PanelOptions.LegendOnRight()).
|
||||
With(monitoring.PanelOptions.ZeroIfNoData("grpc_method")),
|
||||
},
|
||||
monitoring.Observable{
|
||||
Name: fmt.Sprintf("%s_grpc_clients_retry_count_per_method", opts.HumanServiceName),
|
||||
Description: "client retry count per-method over 2m",
|
||||
Query: sum(metric("grpc_client_retry_attempts_total", serviceLabelFilter, methodLabelFilter), "2m", "grpc_method"),
|
||||
Owner: owner,
|
||||
NoAlert: true,
|
||||
Interpretation: fmt.Sprintf("The count of gRPC requests that were retried aggregated across all %q clients, broken out per method", opts.HumanServiceName),
|
||||
Panel: monitoring.Panel().LegendFormat("{{grpc_method}}").
|
||||
Unit(monitoring.RequestsPerSecond).
|
||||
With(monitoring.PanelOptions.LegendOnRight()).
|
||||
With(monitoring.PanelOptions.ZeroIfNoData("grpc_method")),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// GRPCMethodVariable creates a container variable that contains all the gRPC methods
|
||||
// exposed by the given service.
|
||||
//
|
||||
|
||||
@ -58,6 +58,15 @@ func Symbols() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerCodeIntel),
|
||||
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: containerName,
|
||||
RawGRPCServiceName: grpcServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerCodeIntel),
|
||||
|
||||
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
||||
HumanServiceName: "symbols",
|
||||
InstanceFilterRegex: `${instance:regex}`,
|
||||
|
||||
@ -1089,6 +1089,15 @@ func Zoekt() *monitoring.Dashboard {
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewGRPCRetryMetricsGroup(
|
||||
shared.GRPCRetryMetricsOptions{
|
||||
HumanServiceName: "zoekt-webserver",
|
||||
RawGRPCServiceName: grpcServiceName,
|
||||
Namespace: "src",
|
||||
|
||||
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
||||
}, monitoring.ObservableOwnerSearchCore),
|
||||
|
||||
shared.NewDiskMetricsGroup(
|
||||
shared.DiskMetricsGroupOptions{
|
||||
DiskTitle: "data",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user