mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 19:51:50 +00:00
telemetry-gateway: wrap publish errors in error details for Sentry (#61895)
We aggregate all errors on a single log entry to get accurate representations of issues in Sentry, while not generating thousands of log entries at the same time. Because this means we only get higher-level logger context, we must annotate the errors directly with some hidden details to preserve Sentry grouping while adding context for diagnostics. We do this by using CockroachDB error's `WithSafeDetails` helper, which annotates an error with details that are _not_ rendered by `err.Error()` but _are_ included in Sentry reports (via the `Message` field). Currently, we annotate the event ID, feature, action, and source (see example output below) Context on this request: https://sourcegraph.slack.com/archives/C06CCJR4K9R/p1713054354443579 ## Test plan _Super_ jank unit tests that try to emulate how we build Sentry reports, relying on knowledge of how our Sentry report building works internally. I opened https://github.com/sourcegraph/log/issues/65 in case there are new use cases for this in the future. Running the test with `-v` demonstrates the output: ``` $ go test -timeout 30s -run ^TestSummarizeFailedEvents$ github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server -v === RUN TestSummarizeFailedEvents === RUN TestSummarizeFailedEvents/all_failed === RUN TestSummarizeFailedEvents/all_failed/Sentry_report publish_events_test.go:83: Sentry Error message for field "error.0": publish_events_test.go:38: event publish failed (1) feature:"feature_0" action:"action_0" id:"id_0" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_0" action:"action_0" id:"id_0" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} publish_events_test.go:83: Sentry Error message for field "error.1": publish_events_test.go:38: event publish failed (1) feature:"feature_1" action:"action_1" id:"id_1" server:{version:"TestSummarizeFailedEvents/all_failed"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_1" action:"action_1" id:"id_1" server:{version:"TestSummarizeFailedEvents/all_failed"} publish_events_test.go:83: Sentry Error message for field "error.2": publish_events_test.go:38: event publish failed (1) feature:"feature_2" action:"action_2" id:"id_2" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_2" action:"action_2" id:"id_2" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} publish_events_test.go:83: Sentry Error message for field "error.3": publish_events_test.go:38: event publish failed (1) feature:"feature_3" action:"action_3" id:"id_3" server:{version:"TestSummarizeFailedEvents/all_failed"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_3" action:"action_3" id:"id_3" server:{version:"TestSummarizeFailedEvents/all_failed"} publish_events_test.go:83: Sentry Error message for field "error.4": publish_events_test.go:38: event publish failed (1) feature:"feature_4" action:"action_4" id:"id_4" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_4" action:"action_4" id:"id_4" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} === RUN TestSummarizeFailedEvents/some_failed === RUN TestSummarizeFailedEvents/all_succeeded === RUN TestSummarizeFailedEvents/all_succeeded_(large_set) --- PASS: TestSummarizeFailedEvents (0.23s) --- PASS: TestSummarizeFailedEvents/all_failed (0.22s) --- PASS: TestSummarizeFailedEvents/all_failed/Sentry_report (0.00s) --- PASS: TestSummarizeFailedEvents/some_failed (0.00s) --- PASS: TestSummarizeFailedEvents/all_succeeded (0.00s) --- PASS: TestSummarizeFailedEvents/all_succeeded_(large_set) (0.00s) PASS ok github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server (cached) ```
This commit is contained in:
parent
1a13911ae7
commit
294d4b47f8
@ -87,10 +87,31 @@ func (p *Publisher) GetSourceName() string {
|
||||
type PublishEventResult struct {
|
||||
// EventID is the ID of the event that was published.
|
||||
EventID string
|
||||
// EventFeature is the feature of the event that was published.
|
||||
EventFeature string
|
||||
// EventAction is the action of the event that was published.
|
||||
EventAction string
|
||||
// EventSource is a string representation of source of the event, as reported
|
||||
// at recording time, in the default proto string format, e.g:
|
||||
//
|
||||
// server:{version:"..."} client:{name:"..."}
|
||||
EventSource string
|
||||
// PublishError, if non-nil, indicates an error occurred publishing the event.
|
||||
PublishError error
|
||||
}
|
||||
|
||||
// NewPublishEventResult returns a PublishEventResult for the given event and error.
|
||||
// Should only be used internally or in testing.
|
||||
func NewPublishEventResult(event *telemetrygatewayv1.Event, err error) PublishEventResult {
|
||||
return PublishEventResult{
|
||||
EventID: event.GetId(),
|
||||
EventFeature: event.GetFeature(),
|
||||
EventAction: event.GetAction(),
|
||||
EventSource: event.GetSource().String(),
|
||||
PublishError: err,
|
||||
}
|
||||
}
|
||||
|
||||
// Publish emits all events concurrently, up to 100 at a time for each call.
|
||||
func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Event) []PublishEventResult {
|
||||
wg := pool.NewWithResults[PublishEventResult]().
|
||||
@ -139,6 +160,10 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
|
||||
// We can't error forever, as the instance will keep trying to deliver
|
||||
// this event - for now, we just pretend the event succeeded, and log
|
||||
// some diagnostics.
|
||||
//
|
||||
// TODO: Maybe we can merge this with how we handle errors in
|
||||
// summarizePublishEventsResults - for now, we stick with this
|
||||
// special handling for extra visibility.
|
||||
if len(payload) >= googlepubsub.MaxPublishRequestBytes {
|
||||
trace.Logger(ctx, p.logger).Error("discarding oversized event",
|
||||
log.Error(errors.Newf("event %s/%s is oversized",
|
||||
@ -149,9 +174,13 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
|
||||
redact.Safe(event.GetFeature()),
|
||||
redact.Safe(event.GetAction()))),
|
||||
log.String("eventID", event.GetId()),
|
||||
log.String("eventSource", event.GetSource().String()),
|
||||
log.Int("size", len(payload)),
|
||||
// Record a section of the event content for diagnostics
|
||||
log.String("eventSnippet", strings.ToValidUTF8(string(eventJSON[:256]), "<22>")))
|
||||
// Record a section of the event content for diagnostics.
|
||||
// Keep in mind the size of Context objects in Sentry:
|
||||
// https://develop.sentry.dev/sdk/data-handling/#variable-size
|
||||
// And GCP logging limits: https://cloud.google.com/logging/quotas
|
||||
log.String("eventSnippet", strings.ToValidUTF8(string(eventJSON[:512]), "<22>")))
|
||||
// We must return nil, pretending the publish succeeded, so that
|
||||
// the client stops attempting to publish an event that will
|
||||
// never succeed.
|
||||
@ -174,10 +203,7 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
|
||||
}
|
||||
|
||||
wg.Go(func() PublishEventResult {
|
||||
return PublishEventResult{
|
||||
EventID: event.GetId(),
|
||||
PublishError: doPublish(event),
|
||||
}
|
||||
return NewPublishEventResult(event, doPublish(event))
|
||||
})
|
||||
}
|
||||
return wg.Wait()
|
||||
|
||||
@ -20,6 +20,7 @@ go_library(
|
||||
"//internal/telemetrygateway/v1:telemetrygateway",
|
||||
"//internal/trace",
|
||||
"//lib/errors",
|
||||
"@com_github_cockroachdb_redact//:redact",
|
||||
"@com_github_sourcegraph_log//:log",
|
||||
"@io_opentelemetry_go_otel//:otel",
|
||||
"@io_opentelemetry_go_otel//attribute",
|
||||
@ -39,7 +40,9 @@ go_test(
|
||||
deps = [
|
||||
"//cmd/telemetry-gateway/internal/events",
|
||||
"//internal/telemetrygateway/v1:telemetrygateway",
|
||||
"//lib/errors",
|
||||
"@com_github_hexops_autogold_v2//:autogold",
|
||||
"@com_github_stretchr_testify//assert",
|
||||
"@com_github_stretchr_testify//require",
|
||||
],
|
||||
)
|
||||
|
||||
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/cockroachdb/redact"
|
||||
"github.com/sourcegraph/log"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
@ -83,10 +84,26 @@ func summarizePublishEventsResults(results []events.PublishEventResult) publishE
|
||||
failed = make([]events.PublishEventResult, 0)
|
||||
)
|
||||
|
||||
// We aggregate all errors on a single log entry to get accurate
|
||||
// representations of issues in Sentry, while not generating thousands of
|
||||
// log entries at the same time. Because this means we only get higher-level
|
||||
// logger context, we must annotate the errors with some hidden details to
|
||||
// preserve Sentry grouping while adding context for diagnostics.
|
||||
for i, result := range results {
|
||||
if result.PublishError != nil {
|
||||
failed = append(failed, result)
|
||||
errFields = append(errFields, log.NamedError(fmt.Sprintf("error.%d", i), result.PublishError))
|
||||
// Construct details to annotate the error with in Sentry reports
|
||||
// without affecting the error itself (which is important for
|
||||
// grouping within Sentry)
|
||||
errFields = append(errFields, log.NamedError(fmt.Sprintf("error.%d", i),
|
||||
errors.WithSafeDetails(result.PublishError,
|
||||
"feature:%[1]q action:%[2]q id:%[3]q %[4]s", // mimic format of result.EventSource
|
||||
redact.Safe(result.EventFeature),
|
||||
redact.Safe(result.EventAction),
|
||||
redact.Safe(result.EventID),
|
||||
redact.Safe(result.EventSource),
|
||||
),
|
||||
))
|
||||
} else {
|
||||
succeeded = append(succeeded, result.EventID)
|
||||
}
|
||||
|
||||
@ -1,23 +1,43 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/hexops/autogold/v2"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/events"
|
||||
telemetrygatewayv1 "github.com/sourcegraph/sourcegraph/internal/telemetrygateway/v1"
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
)
|
||||
|
||||
func TestSummarizeFailedEvents(t *testing.T) {
|
||||
t.Run("all failed", func(t *testing.T) {
|
||||
results := make([]events.PublishEventResult, 0)
|
||||
for i := range results {
|
||||
results[i].EventID = "id_" + strconv.Itoa(i)
|
||||
results[i].PublishError = errors.New("failed")
|
||||
const failureMessage = "event publish failed"
|
||||
submitted := make([]*telemetrygatewayv1.Event, 5)
|
||||
for i := range submitted {
|
||||
submitted[i] = &telemetrygatewayv1.Event{
|
||||
Id: "id_" + strconv.Itoa(i),
|
||||
Feature: "feature_" + strconv.Itoa(i),
|
||||
Action: "action_" + strconv.Itoa(i),
|
||||
Source: &telemetrygatewayv1.EventSource{
|
||||
Server: &telemetrygatewayv1.EventSource_Server{Version: t.Name()},
|
||||
},
|
||||
}
|
||||
if i%2 == 0 {
|
||||
submitted[i].Source.Client = &telemetrygatewayv1.EventSource_Client{
|
||||
Name: "test_client",
|
||||
}
|
||||
}
|
||||
}
|
||||
results := make([]events.PublishEventResult, len(submitted))
|
||||
for i, event := range submitted {
|
||||
results[i] = events.NewPublishEventResult(event, errors.New(failureMessage))
|
||||
}
|
||||
require.False(t, len(results) == 0)
|
||||
|
||||
summary := summarizePublishEventsResults(results)
|
||||
autogold.Expect("all events in batch failed to submit").Equal(t, summary.message)
|
||||
@ -25,6 +45,48 @@ func TestSummarizeFailedEvents(t *testing.T) {
|
||||
assert.Len(t, summary.errorFields, len(results))
|
||||
assert.Len(t, summary.succeededEvents, 0)
|
||||
assert.Len(t, summary.failedEvents, len(results))
|
||||
|
||||
// This sub-test asserts some behaviour specific to how we construct
|
||||
// Sentry reports in sourcegraph/log:
|
||||
// https://github.com/sourcegraph/log/blob/f53023898988779b0dabb75fda2c5c3b8d5ae3ae/internal/sinkcores/sentrycore/worker.go#L95-L96
|
||||
//
|
||||
// It requires knowledge of the internals of the logging library and
|
||||
// how Sentry interprets these reports.
|
||||
// TODO: https://github.com/sourcegraph/log/issues/65
|
||||
t.Run("Sentry report", func(t *testing.T) {
|
||||
for i, errField := range summary.errorFields {
|
||||
// Our logging library wraps the error provided to log.NamedError
|
||||
// to customize how it gets rendered. Internally, we extract the
|
||||
// undelying error for generating the report - to emulate this
|
||||
// behaviour we must reflect for the underlying error on the
|
||||
// internal type:
|
||||
// https://github.com/sourcegraph/log/blob/f53023898988779b0dabb75fda2c5c3b8d5ae3ae/internal/encoders/encoders.go#L59-L61
|
||||
//
|
||||
// TODO: https://github.com/sourcegraph/log/issues/65
|
||||
logError, ok := errField.Interface.(error)
|
||||
assert.Truef(t, ok, "expected errField %q to carry an error", errField.Key)
|
||||
rv := reflect.Indirect(reflect.ValueOf(logError))
|
||||
v := rv.FieldByName("Source")
|
||||
err, ok := v.Interface().(error)
|
||||
assert.Truef(t, ok, "expected errField %q to carry a log error that wraps an error as 'Source'", errField.Key)
|
||||
|
||||
// The undelying error should only report the original error
|
||||
// exactly without any additional context. This is used in
|
||||
// Sentry report generation, which preserves our desired
|
||||
// grouping.
|
||||
assert.Equal(t, err.Error(), failureMessage)
|
||||
|
||||
// Assert that the final Sentry report includes the metadata we
|
||||
// are attaching to assist with diagnostics on this particular
|
||||
// error.
|
||||
event, _ := errors.BuildSentryReport(err)
|
||||
t.Logf("Sentry Error message for field %q:\n\n%s\n\n",
|
||||
errField.Key, event.Message)
|
||||
assert.Contains(t, event.Message, "feature_"+strconv.Itoa(i))
|
||||
assert.Contains(t, event.Message, "action_"+strconv.Itoa(i))
|
||||
assert.Contains(t, event.Message, "id_"+strconv.Itoa(i))
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("some failed", func(t *testing.T) {
|
||||
|
||||
@ -38,6 +38,20 @@ var (
|
||||
// called. Useful for sentinel errors.
|
||||
WithStack = errors.WithStack
|
||||
|
||||
// WithSafeDetails annotates an error with the given reportable details.
|
||||
// The format is made available as a PII-free string, alongside
|
||||
// with a PII-free representation of every additional argument.
|
||||
// Arguments can be reported as-is (without redaction) by wrapping
|
||||
// them using the Safe() function.
|
||||
//
|
||||
// If the format is empty and there are no arguments, the
|
||||
// error argument is returned unchanged.
|
||||
//
|
||||
// Detail is shown:
|
||||
// - when formatting with `%+v`.
|
||||
// - in Sentry reports.
|
||||
WithSafeDetails = errors.WithSafeDetails
|
||||
|
||||
Is = errors.Is
|
||||
IsAny = errors.IsAny
|
||||
As = errors.As
|
||||
|
||||
Loading…
Reference in New Issue
Block a user