mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 19:21:50 +00:00
256 lines
10 KiB
Go
256 lines
10 KiB
Go
package definitions
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/sourcegraph/sourcegraph/monitoring/definitions/shared"
|
|
"github.com/sourcegraph/sourcegraph/monitoring/monitoring"
|
|
)
|
|
|
|
func Searcher() *monitoring.Dashboard {
|
|
const (
|
|
containerName = "searcher"
|
|
grpcServiceName = "searcher.v1.SearcherService"
|
|
)
|
|
|
|
grpcMethodVariable := shared.GRPCMethodVariable("searcher", grpcServiceName)
|
|
|
|
// instanceSelector is a helper for inserting the instance selector.
|
|
// Should be used on strings created via `` since you can't escape in
|
|
// those.
|
|
instanceSelector := func(s string) string {
|
|
return strings.ReplaceAll(s, "$$INSTANCE$$", "instance=~`${instance:regex}`")
|
|
}
|
|
|
|
return &monitoring.Dashboard{
|
|
Name: "searcher",
|
|
Title: "Searcher",
|
|
Description: "Performs unindexed searches (diff and commit search, text search for unindexed branches).",
|
|
Variables: []monitoring.ContainerVariable{
|
|
{
|
|
Label: "Instance",
|
|
Name: "instance",
|
|
OptionsLabelValues: monitoring.ContainerVariableOptionsLabelValues{
|
|
Query: "searcher_service_request_total",
|
|
LabelName: "instance",
|
|
ExampleOption: "searcher-7dd95df88c-5bjt9:3181",
|
|
},
|
|
Multi: true,
|
|
},
|
|
grpcMethodVariable,
|
|
},
|
|
Groups: []monitoring.Group{
|
|
{
|
|
Title: "General",
|
|
Rows: []monitoring.Row{
|
|
{
|
|
{
|
|
Name: "traffic",
|
|
Description: "requests per second by code over 10m",
|
|
Query: "sum by (code) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m]))",
|
|
Panel: monitoring.Panel().LegendFormat("{{code}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
Interpretation: `
|
|
This graph is the average number of requests per second searcher is
|
|
experiencing over the last 10 minutes.
|
|
|
|
The code is the HTTP Status code. 200 is success. We have a special code
|
|
"canceled" which is common when doing a large search request and we find
|
|
enough results before searching all possible repos.
|
|
|
|
Note: A search query is translated into an unindexed search query per unique
|
|
(repo, commit). This means a single user query may result in thousands of
|
|
requests to searcher.`,
|
|
},
|
|
{
|
|
Name: "replica_traffic",
|
|
Description: "requests per second per replica over 10m",
|
|
Query: "sum by (instance) (rate(searcher_service_request_total{instance=~`${instance:regex}`}[10m]))",
|
|
Warning: monitoring.Alert().GreaterOrEqual(5),
|
|
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NextSteps: "none",
|
|
Interpretation: `
|
|
This graph is the average number of requests per second searcher is
|
|
experiencing over the last 10 minutes broken down per replica.
|
|
|
|
The code is the HTTP Status code. 200 is success. We have a special code
|
|
"canceled" which is common when doing a large search request and we find
|
|
enough results before searching all possible repos.
|
|
|
|
Note: A search query is translated into an unindexed search query per unique
|
|
(repo, commit). This means a single user query may result in thousands of
|
|
requests to searcher.`,
|
|
},
|
|
}, {
|
|
{
|
|
Name: "concurrent_requests",
|
|
Description: "amount of in-flight unindexed search requests (per instance)",
|
|
Query: "sum by (instance) (searcher_service_running{instance=~`${instance:regex}`})",
|
|
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
Interpretation: `
|
|
This graph is the amount of in-flight unindexed search requests per instance.
|
|
Consistently high numbers here indicate you may need to scale out searcher.`,
|
|
},
|
|
{
|
|
Name: "unindexed_search_request_errors",
|
|
Description: "unindexed search request errors every 5m by code",
|
|
Query: instanceSelector(`sum by (code)(increase(searcher_service_request_total{code!="200",code!="canceled",$$INSTANCE$$}[5m])) / ignoring(code) group_left sum(increase(searcher_service_request_total{$$INSTANCE$$}[5m])) * 100`),
|
|
Warning: monitoring.Alert().GreaterOrEqual(5).For(5 * time.Minute),
|
|
Panel: monitoring.Panel().LegendFormat("{{code}}").Unit(monitoring.Percentage),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NextSteps: "none",
|
|
},
|
|
},
|
|
},
|
|
},
|
|
|
|
{
|
|
Title: "Cache store",
|
|
Hidden: true,
|
|
Rows: []monitoring.Row{
|
|
{
|
|
{
|
|
Name: "store_fetching",
|
|
Description: "amount of in-flight unindexed search requests fetching code from gitserver (per instance)",
|
|
Query: "sum by (instance) (searcher_store_fetching{instance=~`${instance:regex}`})",
|
|
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
Interpretation: `
|
|
Before we can search a commit we fetch the code from gitserver then cache it
|
|
for future search requests. This graph is the current number of search
|
|
requests which are in the state of fetching code from gitserver.
|
|
|
|
Generally this number should remain low since fetching code is fast, but
|
|
expect bursts. In the case of instances with a monorepo you would expect this
|
|
number to stay low for the duration of fetching the code (which in some cases
|
|
can take many minutes).`,
|
|
},
|
|
{
|
|
Name: "store_fetching_waiting",
|
|
Description: "amount of in-flight unindexed search requests waiting to fetch code from gitserver (per instance)",
|
|
Query: "sum by (instance) (searcher_store_fetch_queue_size{instance=~`${instance:regex}`})",
|
|
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
Interpretation: `
|
|
We limit the number of requests which can fetch code to prevent overwhelming
|
|
gitserver. This gauge is the number of requests waiting to be allowed to speak
|
|
to gitserver.`,
|
|
},
|
|
{
|
|
Name: "store_fetching_fail",
|
|
Description: "amount of unindexed search requests that failed while fetching code from gitserver over 10m (per instance)",
|
|
Query: "sum by (instance) (rate(searcher_store_fetch_failed{instance=~`${instance:regex}`}[10m]))",
|
|
Panel: monitoring.Panel().LegendFormat("{{instance}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
Interpretation: `
|
|
This graph should be zero since fetching happens in the background and will
|
|
not be influenced by user timeouts/etc. Expected upticks in this graph are
|
|
during gitserver rollouts. If you regularly see this graph have non-zero
|
|
values please reach out to support.`,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
|
|
{
|
|
Title: "Index use",
|
|
Hidden: true,
|
|
Rows: []monitoring.Row{
|
|
{
|
|
{
|
|
Name: "searcher_hybrid_final_state_total",
|
|
Description: "hybrid search final state over 10m",
|
|
Interpretation: `
|
|
This graph is about our interactions with the search index (zoekt) to help
|
|
complete unindexed search requests. Searcher will use indexed search for the
|
|
files that have not changed between the unindexed commit and the index.
|
|
|
|
This graph should mostly be "success". The next most common state should be
|
|
"search-canceled" which happens when result limits are hit or the user starts
|
|
a new search. Finally the next most common should be "diff-too-large", which
|
|
happens if the commit is too far from the indexed commit. Otherwise other
|
|
state should be rare and likely are a sign for further investigation.
|
|
|
|
Note: On sourcegraph.com "zoekt-list-missing" is also common due to it
|
|
indexing a subset of repositories. Otherwise every other state should occur
|
|
rarely.
|
|
|
|
For a full list of possible state see
|
|
[recordHybridFinalState](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+f:cmd/searcher+recordHybridFinalState).`,
|
|
Query: "sum by (state)(increase(searcher_hybrid_final_state_total{instance=~`${instance:regex}`}[10m]))",
|
|
Panel: monitoring.Panel().LegendFormat("{{state}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
},
|
|
{
|
|
Name: "searcher_hybrid_retry_total",
|
|
Description: "hybrid search retrying over 10m",
|
|
Interpretation: `
|
|
Expectation is that this graph should mostly be 0. It will trigger if a user
|
|
manages to do a search and the underlying index changes while searching or
|
|
Zoekt goes down. So occasional bursts can be expected, but if this graph is
|
|
regularly above 0 it is a sign for further investigation.`,
|
|
Query: "sum by (reason)(increase(searcher_hybrid_retry_total{instance=~`${instance:regex}`}[10m]))",
|
|
Panel: monitoring.Panel().LegendFormat("{{reason}}"),
|
|
Owner: monitoring.ObservableOwnerSearchCore,
|
|
NoAlert: true,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
|
|
shared.NewDiskMetricsGroup(
|
|
shared.DiskMetricsGroupOptions{
|
|
DiskTitle: "cache",
|
|
|
|
MetricMountNameLabel: "cacheDir",
|
|
MetricNamespace: "searcher",
|
|
|
|
ServiceName: "searcher",
|
|
InstanceFilterRegex: `${instance:regex}`,
|
|
},
|
|
monitoring.ObservableOwnerSearchCore,
|
|
),
|
|
|
|
shared.NewGRPCServerMetricsGroup(
|
|
shared.GRPCServerMetricsOptions{
|
|
HumanServiceName: "searcher",
|
|
RawGRPCServiceName: grpcServiceName,
|
|
|
|
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
|
|
|
InstanceFilterRegex: `${instance:regex}`,
|
|
MessageSizeNamespace: "src",
|
|
}, monitoring.ObservableOwnerSearchCore),
|
|
|
|
shared.NewGRPCInternalErrorMetricsGroup(
|
|
shared.GRPCInternalErrorMetricsOptions{
|
|
HumanServiceName: "searcher",
|
|
RawGRPCServiceName: grpcServiceName,
|
|
Namespace: "src",
|
|
|
|
MethodFilterRegex: fmt.Sprintf("${%s:regex}", grpcMethodVariable.Name),
|
|
}, monitoring.ObservableOwnerSearchCore),
|
|
shared.NewSiteConfigurationClientMetricsGroup(shared.SiteConfigurationMetricsOptions{
|
|
HumanServiceName: "searcher",
|
|
InstanceFilterRegex: `${instance:regex}`,
|
|
}, monitoring.ObservableOwnerDevOps),
|
|
shared.NewDatabaseConnectionsMonitoringGroup(containerName),
|
|
shared.NewFrontendInternalAPIErrorResponseMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
|
shared.NewContainerMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
|
shared.NewProvisioningIndicatorsGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
|
shared.NewGolangMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
|
shared.NewKubernetesMonitoringGroup(containerName, monitoring.ObservableOwnerSearchCore, nil),
|
|
},
|
|
}
|
|
}
|