telemetry-gateway: wrap publish errors in error details for Sentry (#61895)

We aggregate all errors on a single log entry to get accurate representations of issues in Sentry, while not generating thousands of log entries at the same time. Because this means we only get higher-level logger context, we must annotate the errors directly with some hidden details to preserve Sentry grouping while adding context for diagnostics.

We do this by using CockroachDB error's `WithSafeDetails` helper, which annotates an error with details that are _not_ rendered by `err.Error()` but _are_ included in Sentry reports (via the `Message` field). Currently, we annotate the event ID, feature, action, and source (see example output below)

Context on this request: https://sourcegraph.slack.com/archives/C06CCJR4K9R/p1713054354443579

## Test plan

_Super_ jank unit tests that try to emulate how we build Sentry reports, relying on knowledge of how our Sentry report building works internally. I opened https://github.com/sourcegraph/log/issues/65 in case there are new use cases for this in the future. Running the test with `-v` demonstrates the output:

```
$ go test -timeout 30s -run ^TestSummarizeFailedEvents$ github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server -v
=== RUN   TestSummarizeFailedEvents
=== RUN   TestSummarizeFailedEvents/all_failed
=== RUN   TestSummarizeFailedEvents/all_failed/Sentry_report
    publish_events_test.go:83: Sentry Error message for field "error.0":
        
        publish_events_test.go:38: event publish failed
        (1) feature:"feature_0" action:"action_0" id:"id_0" server:{version:"TestSummarizeFailedEvents/all_failed"}  client:{name:"test_client"}
        Wraps: (2) attached stack trace
          -- stack trace:
          | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1
          |     /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38
          | testing.tRunner
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689
          | runtime.goexit
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222
        Wraps: (3) event publish failed
        Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError
        -- report composition:
        *errutil.leafError: event publish failed
        publish_events_test.go:38: *withstack.withStack (top exception)
        *safedetails.withSafeDetails: feature:"feature_0" action:"action_0" id:"id_0" server:{version:"TestSummarizeFailedEvents/all_failed"}  client:{name:"test_client"}
        
    publish_events_test.go:83: Sentry Error message for field "error.1":
        
        publish_events_test.go:38: event publish failed
        (1) feature:"feature_1" action:"action_1" id:"id_1" server:{version:"TestSummarizeFailedEvents/all_failed"}
        Wraps: (2) attached stack trace
          -- stack trace:
          | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1
          |     /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38
          | testing.tRunner
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689
          | runtime.goexit
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222
        Wraps: (3) event publish failed
        Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError
        -- report composition:
        *errutil.leafError: event publish failed
        publish_events_test.go:38: *withstack.withStack (top exception)
        *safedetails.withSafeDetails: feature:"feature_1" action:"action_1" id:"id_1" server:{version:"TestSummarizeFailedEvents/all_failed"}
        
    publish_events_test.go:83: Sentry Error message for field "error.2":
        
        publish_events_test.go:38: event publish failed
        (1) feature:"feature_2" action:"action_2" id:"id_2" server:{version:"TestSummarizeFailedEvents/all_failed"}  client:{name:"test_client"}
        Wraps: (2) attached stack trace
          -- stack trace:
          | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1
          |     /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38
          | testing.tRunner
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689
          | runtime.goexit
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222
        Wraps: (3) event publish failed
        Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError
        -- report composition:
        *errutil.leafError: event publish failed
        publish_events_test.go:38: *withstack.withStack (top exception)
        *safedetails.withSafeDetails: feature:"feature_2" action:"action_2" id:"id_2" server:{version:"TestSummarizeFailedEvents/all_failed"}  client:{name:"test_client"}
        
    publish_events_test.go:83: Sentry Error message for field "error.3":
        
        publish_events_test.go:38: event publish failed
        (1) feature:"feature_3" action:"action_3" id:"id_3" server:{version:"TestSummarizeFailedEvents/all_failed"}
        Wraps: (2) attached stack trace
          -- stack trace:
          | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1
          |     /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38
          | testing.tRunner
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689
          | runtime.goexit
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222
        Wraps: (3) event publish failed
        Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError
        -- report composition:
        *errutil.leafError: event publish failed
        publish_events_test.go:38: *withstack.withStack (top exception)
        *safedetails.withSafeDetails: feature:"feature_3" action:"action_3" id:"id_3" server:{version:"TestSummarizeFailedEvents/all_failed"}
        
    publish_events_test.go:83: Sentry Error message for field "error.4":
        
        publish_events_test.go:38: event publish failed
        (1) feature:"feature_4" action:"action_4" id:"id_4" server:{version:"TestSummarizeFailedEvents/all_failed"}  client:{name:"test_client"}
        Wraps: (2) attached stack trace
          -- stack trace:
          | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1
          |     /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38
          | testing.tRunner
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689
          | runtime.goexit
          |     /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222
        Wraps: (3) event publish failed
        Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError
        -- report composition:
        *errutil.leafError: event publish failed
        publish_events_test.go:38: *withstack.withStack (top exception)
        *safedetails.withSafeDetails: feature:"feature_4" action:"action_4" id:"id_4" server:{version:"TestSummarizeFailedEvents/all_failed"}  client:{name:"test_client"}
        
=== RUN   TestSummarizeFailedEvents/some_failed
=== RUN   TestSummarizeFailedEvents/all_succeeded
=== RUN   TestSummarizeFailedEvents/all_succeeded_(large_set)
--- PASS: TestSummarizeFailedEvents (0.23s)
    --- PASS: TestSummarizeFailedEvents/all_failed (0.22s)
        --- PASS: TestSummarizeFailedEvents/all_failed/Sentry_report (0.00s)
    --- PASS: TestSummarizeFailedEvents/some_failed (0.00s)
    --- PASS: TestSummarizeFailedEvents/all_succeeded (0.00s)
    --- PASS: TestSummarizeFailedEvents/all_succeeded_(large_set) (0.00s)
PASS
ok      github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server        (cached)
```
This commit is contained in:
Robert Lin 2024-04-16 08:19:29 +09:00 committed by GitHub
parent 1a13911ae7
commit 294d4b47f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 134 additions and 12 deletions

View File

@ -87,10 +87,31 @@ func (p *Publisher) GetSourceName() string {
type PublishEventResult struct {
// EventID is the ID of the event that was published.
EventID string
// EventFeature is the feature of the event that was published.
EventFeature string
// EventAction is the action of the event that was published.
EventAction string
// EventSource is a string representation of source of the event, as reported
// at recording time, in the default proto string format, e.g:
//
// server:{version:"..."} client:{name:"..."}
EventSource string
// PublishError, if non-nil, indicates an error occurred publishing the event.
PublishError error
}
// NewPublishEventResult returns a PublishEventResult for the given event and error.
// Should only be used internally or in testing.
func NewPublishEventResult(event *telemetrygatewayv1.Event, err error) PublishEventResult {
return PublishEventResult{
EventID: event.GetId(),
EventFeature: event.GetFeature(),
EventAction: event.GetAction(),
EventSource: event.GetSource().String(),
PublishError: err,
}
}
// Publish emits all events concurrently, up to 100 at a time for each call.
func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Event) []PublishEventResult {
wg := pool.NewWithResults[PublishEventResult]().
@ -139,6 +160,10 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
// We can't error forever, as the instance will keep trying to deliver
// this event - for now, we just pretend the event succeeded, and log
// some diagnostics.
//
// TODO: Maybe we can merge this with how we handle errors in
// summarizePublishEventsResults - for now, we stick with this
// special handling for extra visibility.
if len(payload) >= googlepubsub.MaxPublishRequestBytes {
trace.Logger(ctx, p.logger).Error("discarding oversized event",
log.Error(errors.Newf("event %s/%s is oversized",
@ -149,9 +174,13 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
redact.Safe(event.GetFeature()),
redact.Safe(event.GetAction()))),
log.String("eventID", event.GetId()),
log.String("eventSource", event.GetSource().String()),
log.Int("size", len(payload)),
// Record a section of the event content for diagnostics
log.String("eventSnippet", strings.ToValidUTF8(string(eventJSON[:256]), "<22>")))
// Record a section of the event content for diagnostics.
// Keep in mind the size of Context objects in Sentry:
// https://develop.sentry.dev/sdk/data-handling/#variable-size
// And GCP logging limits: https://cloud.google.com/logging/quotas
log.String("eventSnippet", strings.ToValidUTF8(string(eventJSON[:512]), "<22>")))
// We must return nil, pretending the publish succeeded, so that
// the client stops attempting to publish an event that will
// never succeed.
@ -174,10 +203,7 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
}
wg.Go(func() PublishEventResult {
return PublishEventResult{
EventID: event.GetId(),
PublishError: doPublish(event),
}
return NewPublishEventResult(event, doPublish(event))
})
}
return wg.Wait()

View File

@ -20,6 +20,7 @@ go_library(
"//internal/telemetrygateway/v1:telemetrygateway",
"//internal/trace",
"//lib/errors",
"@com_github_cockroachdb_redact//:redact",
"@com_github_sourcegraph_log//:log",
"@io_opentelemetry_go_otel//:otel",
"@io_opentelemetry_go_otel//attribute",
@ -39,7 +40,9 @@ go_test(
deps = [
"//cmd/telemetry-gateway/internal/events",
"//internal/telemetrygateway/v1:telemetrygateway",
"//lib/errors",
"@com_github_hexops_autogold_v2//:autogold",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
],
)

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"github.com/cockroachdb/redact"
"github.com/sourcegraph/log"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
@ -83,10 +84,26 @@ func summarizePublishEventsResults(results []events.PublishEventResult) publishE
failed = make([]events.PublishEventResult, 0)
)
// We aggregate all errors on a single log entry to get accurate
// representations of issues in Sentry, while not generating thousands of
// log entries at the same time. Because this means we only get higher-level
// logger context, we must annotate the errors with some hidden details to
// preserve Sentry grouping while adding context for diagnostics.
for i, result := range results {
if result.PublishError != nil {
failed = append(failed, result)
errFields = append(errFields, log.NamedError(fmt.Sprintf("error.%d", i), result.PublishError))
// Construct details to annotate the error with in Sentry reports
// without affecting the error itself (which is important for
// grouping within Sentry)
errFields = append(errFields, log.NamedError(fmt.Sprintf("error.%d", i),
errors.WithSafeDetails(result.PublishError,
"feature:%[1]q action:%[2]q id:%[3]q %[4]s", // mimic format of result.EventSource
redact.Safe(result.EventFeature),
redact.Safe(result.EventAction),
redact.Safe(result.EventID),
redact.Safe(result.EventSource),
),
))
} else {
succeeded = append(succeeded, result.EventID)
}

View File

@ -1,23 +1,43 @@
package server
import (
"errors"
"reflect"
"strconv"
"testing"
"github.com/hexops/autogold/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/events"
telemetrygatewayv1 "github.com/sourcegraph/sourcegraph/internal/telemetrygateway/v1"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
func TestSummarizeFailedEvents(t *testing.T) {
t.Run("all failed", func(t *testing.T) {
results := make([]events.PublishEventResult, 0)
for i := range results {
results[i].EventID = "id_" + strconv.Itoa(i)
results[i].PublishError = errors.New("failed")
const failureMessage = "event publish failed"
submitted := make([]*telemetrygatewayv1.Event, 5)
for i := range submitted {
submitted[i] = &telemetrygatewayv1.Event{
Id: "id_" + strconv.Itoa(i),
Feature: "feature_" + strconv.Itoa(i),
Action: "action_" + strconv.Itoa(i),
Source: &telemetrygatewayv1.EventSource{
Server: &telemetrygatewayv1.EventSource_Server{Version: t.Name()},
},
}
if i%2 == 0 {
submitted[i].Source.Client = &telemetrygatewayv1.EventSource_Client{
Name: "test_client",
}
}
}
results := make([]events.PublishEventResult, len(submitted))
for i, event := range submitted {
results[i] = events.NewPublishEventResult(event, errors.New(failureMessage))
}
require.False(t, len(results) == 0)
summary := summarizePublishEventsResults(results)
autogold.Expect("all events in batch failed to submit").Equal(t, summary.message)
@ -25,6 +45,48 @@ func TestSummarizeFailedEvents(t *testing.T) {
assert.Len(t, summary.errorFields, len(results))
assert.Len(t, summary.succeededEvents, 0)
assert.Len(t, summary.failedEvents, len(results))
// This sub-test asserts some behaviour specific to how we construct
// Sentry reports in sourcegraph/log:
// https://github.com/sourcegraph/log/blob/f53023898988779b0dabb75fda2c5c3b8d5ae3ae/internal/sinkcores/sentrycore/worker.go#L95-L96
//
// It requires knowledge of the internals of the logging library and
// how Sentry interprets these reports.
// TODO: https://github.com/sourcegraph/log/issues/65
t.Run("Sentry report", func(t *testing.T) {
for i, errField := range summary.errorFields {
// Our logging library wraps the error provided to log.NamedError
// to customize how it gets rendered. Internally, we extract the
// undelying error for generating the report - to emulate this
// behaviour we must reflect for the underlying error on the
// internal type:
// https://github.com/sourcegraph/log/blob/f53023898988779b0dabb75fda2c5c3b8d5ae3ae/internal/encoders/encoders.go#L59-L61
//
// TODO: https://github.com/sourcegraph/log/issues/65
logError, ok := errField.Interface.(error)
assert.Truef(t, ok, "expected errField %q to carry an error", errField.Key)
rv := reflect.Indirect(reflect.ValueOf(logError))
v := rv.FieldByName("Source")
err, ok := v.Interface().(error)
assert.Truef(t, ok, "expected errField %q to carry a log error that wraps an error as 'Source'", errField.Key)
// The undelying error should only report the original error
// exactly without any additional context. This is used in
// Sentry report generation, which preserves our desired
// grouping.
assert.Equal(t, err.Error(), failureMessage)
// Assert that the final Sentry report includes the metadata we
// are attaching to assist with diagnostics on this particular
// error.
event, _ := errors.BuildSentryReport(err)
t.Logf("Sentry Error message for field %q:\n\n%s\n\n",
errField.Key, event.Message)
assert.Contains(t, event.Message, "feature_"+strconv.Itoa(i))
assert.Contains(t, event.Message, "action_"+strconv.Itoa(i))
assert.Contains(t, event.Message, "id_"+strconv.Itoa(i))
}
})
})
t.Run("some failed", func(t *testing.T) {

View File

@ -38,6 +38,20 @@ var (
// called. Useful for sentinel errors.
WithStack = errors.WithStack
// WithSafeDetails annotates an error with the given reportable details.
// The format is made available as a PII-free string, alongside
// with a PII-free representation of every additional argument.
// Arguments can be reported as-is (without redaction) by wrapping
// them using the Safe() function.
//
// If the format is empty and there are no arguments, the
// error argument is returned unchanged.
//
// Detail is shown:
// - when formatting with `%+v`.
// - in Sentry reports.
WithSafeDetails = errors.WithSafeDetails
Is = errors.Is
IsAny = errors.IsAny
As = errors.As