repos: Don't alert on sync errors caused by internal rate limit (#35287)

These errors are temporary and don't indicate an issue communicating with the
code host.
This commit is contained in:
Ryan Slade 2022-05-11 14:01:47 +02:00 committed by GitHub
parent 45149830bc
commit 034d8cfcb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 6 additions and 2 deletions

View File

@ -11019,7 +11019,7 @@ To see this panel, visit `/-/debug/grafana/d/repo-updater/repo-updater?viewPanel
<details>
<summary>Technical details</summary>
Query: `max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="invalid_npm_path"}[5m]))`
Query: `max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="invalid_npm_path",reason!="internal_rate_limit"}[5m]))`
</details>

View File

@ -870,6 +870,10 @@ func syncErrorReason(err error) string {
case strings.Contains(err.Error(), "expected path in npm/(scope/)?name"):
// This is a known issue which we can filter out for now
return "invalid_npm_path"
case strings.Contains(err.Error(), "internal rate limit exceeded"):
// We want to identify these as they are not an issue communicating with the code
// host and are most likely caused by temporary traffic spikes.
return "internal_rate_limit"
default:
return "unknown"
}

View File

@ -63,7 +63,7 @@ func RepoUpdater() *monitoring.Container {
{
Name: "src_repoupdater_syncer_sync_errors_total",
Description: "site level external service sync error rate",
Query: `max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="invalid_npm_path"}[5m]))`,
Query: `max by (family) (rate(src_repoupdater_syncer_sync_errors_total{owner!="user",reason!="invalid_npm_path",reason!="internal_rate_limit"}[5m]))`,
Warning: monitoring.Alert().Greater(0.5).For(10 * time.Minute),
Critical: monitoring.Alert().Greater(1).For(10 * time.Minute),
Panel: monitoring.Panel().Unit(monitoring.Number).With(monitoring.PanelOptions.ZeroIfNoData()),