telemetry-gateway: wrap publish errors in error details for Sentry (#61895)

We aggregate all errors on a single log entry to get accurate representations of issues in Sentry, while not generating thousands of log entries at the same time. Because this means we only get higher-level logger context, we must annotate the errors directly with some hidden details to preserve Sentry grouping while adding context for diagnostics. We do this by using CockroachDB error's `WithSafeDetails` helper, which annotates an error with details that are _not_ rendered by `err.Error()` but _are_ included in Sentry reports (via the `Message` field). Currently, we annotate the event ID, feature, action, and source (see example output below) Context on this request: https://sourcegraph.slack.com/archives/C06CCJR4K9R/p1713054354443579 ## Test plan _Super_ jank unit tests that try to emulate how we build Sentry reports, relying on knowledge of how our Sentry report building works internally. I opened https://github.com/sourcegraph/log/issues/65 in case there are new use cases for this in the future. Running the test with `-v` demonstrates the output: ``` $ go test -timeout 30s -run ^TestSummarizeFailedEvents$ github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server -v === RUN TestSummarizeFailedEvents === RUN TestSummarizeFailedEvents/all_failed === RUN TestSummarizeFailedEvents/all_failed/Sentry_report publish_events_test.go:83: Sentry Error message for field "error.0": publish_events_test.go:38: event publish failed (1) feature:"feature_0" action:"action_0" id:"id_0" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_0" action:"action_0" id:"id_0" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} publish_events_test.go:83: Sentry Error message for field "error.1": publish_events_test.go:38: event publish failed (1) feature:"feature_1" action:"action_1" id:"id_1" server:{version:"TestSummarizeFailedEvents/all_failed"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_1" action:"action_1" id:"id_1" server:{version:"TestSummarizeFailedEvents/all_failed"} publish_events_test.go:83: Sentry Error message for field "error.2": publish_events_test.go:38: event publish failed (1) feature:"feature_2" action:"action_2" id:"id_2" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_2" action:"action_2" id:"id_2" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} publish_events_test.go:83: Sentry Error message for field "error.3": publish_events_test.go:38: event publish failed (1) feature:"feature_3" action:"action_3" id:"id_3" server:{version:"TestSummarizeFailedEvents/all_failed"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_3" action:"action_3" id:"id_3" server:{version:"TestSummarizeFailedEvents/all_failed"} publish_events_test.go:83: Sentry Error message for field "error.4": publish_events_test.go:38: event publish failed (1) feature:"feature_4" action:"action_4" id:"id_4" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} Wraps: (2) attached stack trace -- stack trace: | github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server.TestSummarizeFailedEvents.func1 | /Users/robert@sourcegraph.com/Projects/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server/publish_events_test.go:38 | testing.tRunner | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/testing/testing.go:1689 | runtime.goexit | /Users/robert@sourcegraph.com/.asdf/installs/golang/1.22.1/go/src/runtime/asm_arm64.s:1222 Wraps: (3) event publish failed Error types: (1) *safedetails.withSafeDetails (2) *withstack.withStack (3) *errutil.leafError -- report composition: *errutil.leafError: event publish failed publish_events_test.go:38: *withstack.withStack (top exception) *safedetails.withSafeDetails: feature:"feature_4" action:"action_4" id:"id_4" server:{version:"TestSummarizeFailedEvents/all_failed"} client:{name:"test_client"} === RUN TestSummarizeFailedEvents/some_failed === RUN TestSummarizeFailedEvents/all_succeeded === RUN TestSummarizeFailedEvents/all_succeeded_(large_set) --- PASS: TestSummarizeFailedEvents (0.23s) --- PASS: TestSummarizeFailedEvents/all_failed (0.22s) --- PASS: TestSummarizeFailedEvents/all_failed/Sentry_report (0.00s) --- PASS: TestSummarizeFailedEvents/some_failed (0.00s) --- PASS: TestSummarizeFailedEvents/all_succeeded (0.00s) --- PASS: TestSummarizeFailedEvents/all_succeeded_(large_set) (0.00s) PASS ok github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/server (cached) ```
2026-02-06 19:51:50 +00:00 · 2024-04-16 08:19:29 +09:00 · 2024-04-16 08:19:29 +09:00 · 294d4b47f8
commit 294d4b47f8
parent 1a13911ae7
5 changed files with 134 additions and 12 deletions
--- a/cmd/telemetry-gateway/internal/events/events.go
+++ b/cmd/telemetry-gateway/internal/events/events.go
@ -87,10 +87,31 @@ func (p *Publisher) GetSourceName() string {
 type PublishEventResult struct {
 	// EventID is the ID of the event that was published.
 	EventID string
+	// EventFeature is the feature of the event that was published.
+	EventFeature string
+	// EventAction is the action of the event that was published.
+	EventAction string
+	// EventSource is a string representation of source of the event, as reported
+	// at recording time, in the default proto string format, e.g:
+	//
+	//   server:{version:"..."}  client:{name:"..."}
+	EventSource string
 	// PublishError, if non-nil, indicates an error occurred publishing the event.
 	PublishError error
 }

+// NewPublishEventResult returns a PublishEventResult for the given event and error.
+// Should only be used internally or in testing.
+func NewPublishEventResult(event *telemetrygatewayv1.Event, err error) PublishEventResult {
+	return PublishEventResult{
+		EventID:      event.GetId(),
+		EventFeature: event.GetFeature(),
+		EventAction:  event.GetAction(),
+		EventSource:  event.GetSource().String(),
+		PublishError: err,
+	}
+}
+
 // Publish emits all events concurrently, up to 100 at a time for each call.
 func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Event) []PublishEventResult {
 	wg := pool.NewWithResults[PublishEventResult]().
@ -139,6 +160,10 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
 			// We can't error forever, as the instance will keep trying to deliver
 			// this event - for now, we just pretend the event succeeded, and log
 			// some diagnostics.
+			//
+			// TODO: Maybe we can merge this with how we handle errors in
+			// summarizePublishEventsResults - for now, we stick with this
+			// special handling for extra visibility.
 			if len(payload) >= googlepubsub.MaxPublishRequestBytes {
 				trace.Logger(ctx, p.logger).Error("discarding oversized event",
 					log.Error(errors.Newf("event %s/%s is oversized",
@ -149,9 +174,13 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
 						redact.Safe(event.GetFeature()),
 						redact.Safe(event.GetAction()))),
 					log.String("eventID", event.GetId()),
+					log.String("eventSource", event.GetSource().String()),
 					log.Int("size", len(payload)),
-					// Record a section of the event content for diagnostics
-					log.String("eventSnippet", strings.ToValidUTF8(string(eventJSON[:256]), "<22>")))
+					// Record a section of the event content for diagnostics.
+					// Keep in mind the size of Context objects in Sentry:
+					// https://develop.sentry.dev/sdk/data-handling/#variable-size
+					// And GCP logging limits: https://cloud.google.com/logging/quotas
+					log.String("eventSnippet", strings.ToValidUTF8(string(eventJSON[:512]), "<22>")))
 				// We must return nil, pretending the publish succeeded, so that
 				// the client stops attempting to publish an event that will
 				// never succeed.
@ -174,10 +203,7 @@ func (p *Publisher) Publish(ctx context.Context, events []*telemetrygatewayv1.Ev
 		}

 		wg.Go(func() PublishEventResult {
-			return PublishEventResult{
-				EventID:      event.GetId(),
-				PublishError: doPublish(event),
-			}
+			return NewPublishEventResult(event, doPublish(event))
 		})
 	}
 	return wg.Wait()
--- a/cmd/telemetry-gateway/internal/server/BUILD.bazel
+++ b/cmd/telemetry-gateway/internal/server/BUILD.bazel
@ -20,6 +20,7 @@ go_library(
        "//internal/telemetrygateway/v1:telemetrygateway",
        "//internal/trace",
        "//lib/errors",
+        "@com_github_cockroachdb_redact//:redact",
        "@com_github_sourcegraph_log//:log",
        "@io_opentelemetry_go_otel//:otel",
        "@io_opentelemetry_go_otel//attribute",
@ -39,7 +40,9 @@ go_test(
    deps = [
        "//cmd/telemetry-gateway/internal/events",
        "//internal/telemetrygateway/v1:telemetrygateway",
+        "//lib/errors",
        "@com_github_hexops_autogold_v2//:autogold",
        "@com_github_stretchr_testify//assert",
+        "@com_github_stretchr_testify//require",
    ],
 )
--- a/cmd/telemetry-gateway/internal/server/publish_events.go
+++ b/cmd/telemetry-gateway/internal/server/publish_events.go
@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"

+	"github.com/cockroachdb/redact"
 	"github.com/sourcegraph/log"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/metric"
@ -83,10 +84,26 @@ func summarizePublishEventsResults(results []events.PublishEventResult) publishE
 		failed    = make([]events.PublishEventResult, 0)
 	)

+	// We aggregate all errors on a single log entry to get accurate
+	// representations of issues in Sentry, while not generating thousands of
+	// log entries at the same time. Because this means we only get higher-level
+	// logger context, we must annotate the errors with some hidden details to
+	// preserve Sentry grouping while adding context for diagnostics.
 	for i, result := range results {
 		if result.PublishError != nil {
 			failed = append(failed, result)
-			errFields = append(errFields, log.NamedError(fmt.Sprintf("error.%d", i), result.PublishError))
+			// Construct details to annotate the error with in Sentry reports
+			// without affecting the error itself (which is important for
+			// grouping within Sentry)
+			errFields = append(errFields, log.NamedError(fmt.Sprintf("error.%d", i),
+				errors.WithSafeDetails(result.PublishError,
+					"feature:%[1]q action:%[2]q id:%[3]q %[4]s", // mimic format of result.EventSource
+					redact.Safe(result.EventFeature),
+					redact.Safe(result.EventAction),
+					redact.Safe(result.EventID),
+					redact.Safe(result.EventSource),
+				),
+			))
 		} else {
 			succeeded = append(succeeded, result.EventID)
 		}
--- a/cmd/telemetry-gateway/internal/server/publish_events_test.go
+++ b/cmd/telemetry-gateway/internal/server/publish_events_test.go
@ -1,23 +1,43 @@
 package server

 import (
-	"errors"
+	"reflect"
 	"strconv"
 	"testing"

 	"github.com/hexops/autogold/v2"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"

 	"github.com/sourcegraph/sourcegraph/cmd/telemetry-gateway/internal/events"
+	telemetrygatewayv1 "github.com/sourcegraph/sourcegraph/internal/telemetrygateway/v1"
+	"github.com/sourcegraph/sourcegraph/lib/errors"
 )

 func TestSummarizeFailedEvents(t *testing.T) {
 	t.Run("all failed", func(t *testing.T) {
-		results := make([]events.PublishEventResult, 0)
-		for i := range results {
-			results[i].EventID = "id_" + strconv.Itoa(i)
-			results[i].PublishError = errors.New("failed")
+		const failureMessage = "event publish failed"
+		submitted := make([]*telemetrygatewayv1.Event, 5)
+		for i := range submitted {
+			submitted[i] = &telemetrygatewayv1.Event{
+				Id:      "id_" + strconv.Itoa(i),
+				Feature: "feature_" + strconv.Itoa(i),
+				Action:  "action_" + strconv.Itoa(i),
+				Source: &telemetrygatewayv1.EventSource{
+					Server: &telemetrygatewayv1.EventSource_Server{Version: t.Name()},
+				},
+			}
+			if i%2 == 0 {
+				submitted[i].Source.Client = &telemetrygatewayv1.EventSource_Client{
+					Name: "test_client",
+				}
+			}
 		}
+		results := make([]events.PublishEventResult, len(submitted))
+		for i, event := range submitted {
+			results[i] = events.NewPublishEventResult(event, errors.New(failureMessage))
+		}
+		require.False(t, len(results) == 0)

 		summary := summarizePublishEventsResults(results)
 		autogold.Expect("all events in batch failed to submit").Equal(t, summary.message)
@ -25,6 +45,48 @@ func TestSummarizeFailedEvents(t *testing.T) {
 		assert.Len(t, summary.errorFields, len(results))
 		assert.Len(t, summary.succeededEvents, 0)
 		assert.Len(t, summary.failedEvents, len(results))
+
+		// This sub-test asserts some behaviour specific to how we construct
+		// Sentry reports in sourcegraph/log:
+		// https://github.com/sourcegraph/log/blob/f53023898988779b0dabb75fda2c5c3b8d5ae3ae/internal/sinkcores/sentrycore/worker.go#L95-L96
+		//
+		// It requires knowledge of the internals of the logging library and
+		// how Sentry interprets these reports.
+		// TODO: https://github.com/sourcegraph/log/issues/65
+		t.Run("Sentry report", func(t *testing.T) {
+			for i, errField := range summary.errorFields {
+				// Our logging library wraps the error provided to log.NamedError
+				// to customize how it gets rendered. Internally, we extract the
+				// undelying error for generating the report - to emulate this
+				// behaviour we must reflect for the underlying error on the
+				// internal type:
+				// https://github.com/sourcegraph/log/blob/f53023898988779b0dabb75fda2c5c3b8d5ae3ae/internal/encoders/encoders.go#L59-L61
+				//
+				// TODO: https://github.com/sourcegraph/log/issues/65
+				logError, ok := errField.Interface.(error)
+				assert.Truef(t, ok, "expected errField %q to carry an error", errField.Key)
+				rv := reflect.Indirect(reflect.ValueOf(logError))
+				v := rv.FieldByName("Source")
+				err, ok := v.Interface().(error)
+				assert.Truef(t, ok, "expected errField %q to carry a log error that wraps an error as 'Source'", errField.Key)
+
+				// The undelying error should only report the original error
+				// exactly without any additional context. This is used in
+				// Sentry report generation, which preserves our desired
+				// grouping.
+				assert.Equal(t, err.Error(), failureMessage)
+
+				// Assert that the final Sentry report includes the metadata we
+				// are attaching to assist with diagnostics on this particular
+				// error.
+				event, _ := errors.BuildSentryReport(err)
+				t.Logf("Sentry Error message for field %q:\n\n%s\n\n",
+					errField.Key, event.Message)
+				assert.Contains(t, event.Message, "feature_"+strconv.Itoa(i))
+				assert.Contains(t, event.Message, "action_"+strconv.Itoa(i))
+				assert.Contains(t, event.Message, "id_"+strconv.Itoa(i))
+			}
+		})
 	})

 	t.Run("some failed", func(t *testing.T) {
--- a/lib/errors/cockroach.go
+++ b/lib/errors/cockroach.go
@ -38,6 +38,20 @@ var (
 	// called. Useful for sentinel errors.
 	WithStack = errors.WithStack

+	// WithSafeDetails annotates an error with the given reportable details.
+	// The format is made available as a PII-free string, alongside
+	// with a PII-free representation of every additional argument.
+	// Arguments can be reported as-is (without redaction) by wrapping
+	// them using the Safe() function.
+	//
+	// If the format is empty and there are no arguments, the
+	// error argument is returned unchanged.
+	//
+	// Detail is shown:
+	// - when formatting with `%+v`.
+	// - in Sentry reports.
+	WithSafeDetails = errors.WithSafeDetails
+
 	Is        = errors.Is
 	IsAny     = errors.IsAny
 	As        = errors.As