grpc: retry: create Prometheus dashboards (#59607)

This PR implements the logic in the monitoring generator for displaying the gRPC retry metrics included in https://github.com/sourcegraph/sourcegraph/pull/59399.

## Test plan

I created the following screenshots (look at the bottom panel) from the gitserver + zoekt-webserver Grafana dashboards after running `sg start --except zoekt-web-0 --except gitserver-1` and executing searches like `context:global type:diff test count:all r:hashicorp` or `context:global test count:all r:hashicorp`.


<img width="1716" alt="Screenshot 2024-01-15 at 1 28 11 PM" src="https://github.com/sourcegraph/sourcegraph/assets/9022011/a503b6d9-3e21-451c-b98b-3b6e634d4ec9">


<img width="1715" alt="Screenshot 2024-01-15 at 1 28 41 PM" src="https://github.com/sourcegraph/sourcegraph/assets/9022011/ed07244e-340d-4ae3-933d-3416abff91cc">
This commit is contained in:
Geoffrey Gilmore 2024-01-16 10:45:43 -08:00 committed by GitHub
parent 5ac8c8c2b4
commit 70d5012674
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 867 additions and 237 deletions

File diff suppressed because it is too large Load Diff

View File

@ -424,6 +424,14 @@ func Frontend() *monitoring.Dashboard {
RawGRPCServiceName: grpcZoektConfigurationServiceName,
Namespace: "", // intentionally empty
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendZoektConfiguration.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: "zoekt_configuration",
RawGRPCServiceName: grpcZoektConfigurationServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendZoektConfiguration.Name),
}, monitoring.ObservableOwnerSearchCore),
@ -442,6 +450,14 @@ func Frontend() *monitoring.Dashboard {
RawGRPCServiceName: grpcInternalAPIServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendInternalAPI.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: "internal_api",
RawGRPCServiceName: grpcInternalAPIServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariableFrontendInternalAPI.Name),
}, monitoring.ObservableOwnerSearchCore),

View File

@ -556,6 +556,15 @@ func GitServer() *monitoring.Dashboard {
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: "gitserver",
RawGRPCServiceName: grpcServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
HumanServiceName: "gitserver",
InstanceFilterRegex: `${shard:regex}`,

View File

@ -420,6 +420,15 @@ func RepoUpdater() *monitoring.Dashboard {
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSource),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: "repo_updater",
RawGRPCServiceName: grpcServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSource),
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
HumanServiceName: "repo_updater",
InstanceFilterRegex: `${instance:regex}`,

View File

@ -240,6 +240,16 @@ regularly above 0 it is a sign for further investigation.`,
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: "searcher",
RawGRPCServiceName: grpcServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
HumanServiceName: "searcher",
InstanceFilterRegex: `${instance:regex}`,

View File

@ -5,9 +5,10 @@ import (
"strings"
"github.com/iancoleman/strcase"
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
"golang.org/x/text/cases"
"golang.org/x/text/language"
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
)
type GRPCServerMetricsOptions struct {
@ -474,6 +475,118 @@ func NewGRPCInternalErrorMetricsGroup(opts GRPCInternalErrorMetricsOptions, owne
}
}
// GRPCRetryMetricsOptions configures the Group built by NewGRPCRetryMetricsGroup.
type GRPCRetryMetricsOptions struct {
// HumanServiceName is the short, lowercase, snake_case, human-readable name of the gRPC service that we're gathering metrics for.
// It is used to build the observable names and the group title.
//
// Example: "gitserver"
HumanServiceName string
// RawGRPCServiceName is the full, dot-separated, code-generated gRPC service name that we're gathering metrics for.
// It is matched against the "grpc_service" label.
//
// Example: "gitserver.v1.GitserverService"
RawGRPCServiceName string
// MethodFilterRegex is the PromQL regex that's used to filter the
// gRPC client retry metrics to only those emitted by the method(s) that we're interested in.
// It is matched against the "grpc_method" label.
//
// Example: (Search | Exec)
MethodFilterRegex string
// Namespace is the Prometheus metrics namespace for metrics emitted by this service.
// When non-empty it is prepended to the metric name, separated by an underscore;
// when empty the metric name is used without a prefix.
Namespace string
}
// NewGRPCRetryMetricsGroup creates a Group containing metrics that track how
// often gRPC clients retry requests to the service described by opts.
//
// The group is hidden by default and contains a single row with three panels,
// all derived from the "grpc_client_retry_attempts_total" counter (prefixed
// with opts.Namespace when it is non-empty):
//
//   - the percentage of attempts that were retries, across all methods
//   - the percentage of attempts that were retries, per method
//   - the rate of retried attempts, per method
//
// opts.HumanServiceName is normalized to snake_case before it is used in
// observable names. owner is assigned to every observable in the group.
func NewGRPCRetryMetricsGroup(opts GRPCRetryMetricsOptions, owner monitoring.ObservableOwner) monitoring.Group {
	opts.HumanServiceName = strcase.ToSnake(opts.HumanServiceName)

	// Every panel in this group is built from the same counter.
	const retryMetric = "grpc_client_retry_attempts_total"

	// metric renders a (namespaced) metric selector with the given label filters.
	metric := func(base string, labelFilters ...string) string {
		m := base

		if opts.Namespace != "" {
			m = fmt.Sprintf("%s_%s", opts.Namespace, m)
		}

		if len(labelFilters) > 0 {
			m = fmt.Sprintf("%s{%s}", m, strings.Join(labelFilters, ","))
		}

		return m
	}

	// sum renders "sum(rate(<metric>[<duration>]))", optionally grouped by labels.
	sum := func(metric, duration string, groupByLabels ...string) string {
		base := fmt.Sprintf("sum(rate(%s[%s]))", metric, duration)

		if len(groupByLabels) > 0 {
			base = fmt.Sprintf("%s by (%s)", base, strings.Join(groupByLabels, ", "))
		}

		return fmt.Sprintf("(%s)", base)
	}

	methodLabelFilter := fmt.Sprintf(`grpc_method=~"%s"`, opts.MethodFilterRegex)
	serviceLabelFilter := fmt.Sprintf(`grpc_service=~"%s"`, opts.RawGRPCServiceName)
	// Selects only the attempts that were retries of an earlier attempt.
	const isRetriedLabelFilter = `is_retried="true"`

	// percentageQuery renders 100 * numerator / denominator.
	percentageQuery := func(numerator, denominator string) string {
		ratio := fmt.Sprintf("((%s) / (%s))", numerator, denominator)
		return fmt.Sprintf("(100.0 * (%s))", ratio)
	}

	titleCaser := cases.Title(language.English)

	return monitoring.Group{
		Title:  fmt.Sprintf("%s GRPC retry metrics", titleCaser.String(strings.ReplaceAll(opts.HumanServiceName, "_", " "))),
		Hidden: true,
		Rows: []monitoring.Row{
			{
				monitoring.Observable{
					Name:        fmt.Sprintf("%s_grpc_clients_retry_percentage_across_all_methods", opts.HumanServiceName),
					Description: "client retry percentage across all methods over 2m",
					Query: percentageQuery(
						sum(metric(retryMetric, serviceLabelFilter, isRetriedLabelFilter), "2m"),
						sum(metric(retryMetric, serviceLabelFilter), "2m"),
					),
					Owner:          owner,
					NoAlert:        true,
					Interpretation: fmt.Sprintf("The percentage of gRPC requests that were retried across all methods, aggregated across all %q clients.", opts.HumanServiceName),
					Panel: monitoring.Panel().
						Unit(monitoring.Percentage).
						With(monitoring.PanelOptions.LegendOnRight()).
						With(monitoring.PanelOptions.ZeroIfNoData()),
				},
				monitoring.Observable{
					Name:        fmt.Sprintf("%s_grpc_clients_retry_percentage_per_method", opts.HumanServiceName),
					Description: "client retry percentage per-method over 2m",
					Query: percentageQuery(
						sum(metric(retryMetric, serviceLabelFilter, isRetriedLabelFilter, methodLabelFilter), "2m", "grpc_method"),
						sum(metric(retryMetric, serviceLabelFilter, methodLabelFilter), "2m", "grpc_method"),
					),
					Owner:          owner,
					NoAlert:        true,
					Interpretation: fmt.Sprintf("The percentage of gRPC requests that were retried aggregated across all %q clients, broken out per method.", opts.HumanServiceName),
					Panel: monitoring.Panel().LegendFormat("{{grpc_method}}").
						Unit(monitoring.Percentage).
						With(monitoring.PanelOptions.LegendOnRight()).
						With(monitoring.PanelOptions.ZeroIfNoData("grpc_method")),
				},
				monitoring.Observable{
					Name:        fmt.Sprintf("%s_grpc_clients_retry_count_per_method", opts.HumanServiceName),
					Description: "client retry count per-method over 2m",
					// Restrict to is_retried="true": without it this panel charts every
					// attempt (retried or not) rather than the retries it claims to show.
					Query:          sum(metric(retryMetric, serviceLabelFilter, isRetriedLabelFilter, methodLabelFilter), "2m", "grpc_method"),
					Owner:          owner,
					NoAlert:        true,
					Interpretation: fmt.Sprintf("The count of gRPC requests that were retried aggregated across all %q clients, broken out per method", opts.HumanServiceName),
					Panel: monitoring.Panel().LegendFormat("{{grpc_method}}").
						Unit(monitoring.RequestsPerSecond).
						With(monitoring.PanelOptions.LegendOnRight()).
						With(monitoring.PanelOptions.ZeroIfNoData("grpc_method")),
				},
			},
		},
	}
}
// GRPCMethodVariable creates a container variable that contains all the gRPC methods
// exposed by the given service.
//

View File

@ -58,6 +58,15 @@ func Symbols() *monitoring.Dashboard {
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerCodeIntel),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: containerName,
RawGRPCServiceName: grpcServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerCodeIntel),
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
HumanServiceName: "symbols",
InstanceFilterRegex: `${instance:regex}`,

View File

@ -1089,6 +1089,15 @@ func Zoekt() *monitoring.Dashboard {
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewGRPCRetryMetricsGroup(
shared.GRPCRetryMetricsOptions{
HumanServiceName: "zoekt-webserver",
RawGRPCServiceName: grpcServiceName,
Namespace: "src",
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
}, monitoring.ObservableOwnerSearchCore),
shared.NewDiskMetricsGroup(
shared.DiskMetricsGroupOptions{
DiskTitle: "data",