chore/sg: remove 'sg telemetry' and related docs (#63763)

Removes the `sg telemetry` command that pertains to the legacy V1
exporter that is specific to Cloud instances.

I got asked about this recently, and especially with the new `sg
analytics` for usage of the `sg` CLI, this has the potential to be
pretty confusing.

Part of https://linear.app/sourcegraph/issue/CORE-104

## Test plan

n/a

## Changelog

- `sg`: the deprecated `sg telemetry` command for allowlisting export of
V1 telemetry from Cloud instances has been removed. Use telemetry V2
instead.
This commit is contained in:
Robert Lin 2024-07-10 17:25:04 -07:00 committed by GitHub
parent cb19d6f0a9
commit a07a1b9ed0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 1 additions and 311 deletions

View File

@ -1,4 +1,3 @@
// NOTE(naman): Remember to add events to allow list: https://docs-legacy.sourcegraph.com/dev/background-information/data-usage-pipeline#allow-list
export const enum EventName {
CODY_CHAT_PAGE_VIEWED = 'web:codyChat:pageViewed',
CODY_CHAT_SUBMIT = 'web:codyChat:submit',

View File

@ -1,6 +1,5 @@
import type { AuthProvider } from '../jscontext'
// NOTE(naman): Remember to add events to allow list: https://docs-legacy.sourcegraph.com/dev/background-information/data-usage-pipeline#allow-list
export const enum EventName {
CODY_CHAT_PAGE_VIEWED = 'web:codyChat:pageViewed',
CODY_CHAT_SUBMIT = 'web:codyChat:submit',

View File

@ -39,7 +39,6 @@ go_library(
"sg_src.go",
"sg_start.go",
"sg_teammate.go",
"sg_telemetry.go",
"sg_tests.go",
"sg_update.go",
"sg_version.go",

View File

@ -282,7 +282,6 @@ var sg = &cli.App{
dbCommand,
migrationCommand,
insightsCommand,
telemetryCommand,
monitoringCommand,
contextCommand,
deployCommand,

View File

@ -1,176 +0,0 @@
package main
import (
"fmt"
"strings"
"github.com/sourcegraph/sourcegraph/lib/output"
"github.com/sourcegraph/sourcegraph/dev/sg/internal/category"
"github.com/sourcegraph/sourcegraph/dev/sg/internal/migration"
"github.com/sourcegraph/sourcegraph/dev/sg/internal/db"
"github.com/urfave/cli/v2"
"github.com/sourcegraph/sourcegraph/dev/sg/internal/std"
)
// telemetryV1DeprecationWarning is the user-facing notice explaining that
// telemetry V1 (the event_logs export pipeline) and the 'sg telemetry'
// commands are deprecated in favor of telemetry V2. It is shown before every
// 'sg telemetry' subcommand and reused as the command group's description.
const telemetryV1DeprecationWarning = `WARNING: Telemetry V1 and related mechanisms (event_logs export) have been DEPRECATED.
This includes the 'sg telemetry' commands.
It will be replaced with Telemetry V2 - code that is instrumented with the new
Telemetry SDKs will automatically export safe telemetry in accordance with the
Sourcegraph instance's allowed policies.
For more details, see https://docs-legacy.sourcegraph.com/dev/background-information/telemetry
or reach out in #discuss-analytics.`
// renderTelemetryV1DeprecationWarning writes the telemetry V1 deprecation
// notice to standard output as a styled warning.
func renderTelemetryV1DeprecationWarning() {
std.Out.WriteWarningf(telemetryV1DeprecationWarning)
}
// telemetryCommand is the deprecated 'sg telemetry' command group for legacy
// telemetry V1 operations. Its only subcommand is 'allowlist'.
var telemetryCommand = &cli.Command{
Name: "telemetry",
// Print the deprecation warning before any subcommand runs so every
// invocation nudges users toward telemetry V2.
Before: func(ctx *cli.Context) error {
renderTelemetryV1DeprecationWarning()
return nil
},
Usage: "[DEPRECATED] Operations relating to Sourcegraph telemetry v1",
Description: telemetryV1DeprecationWarning,
Category: category.Dev,
Subcommands: []*cli.Command{
allowlistCommand,
},
}
// allowlistCommand is 'sg telemetry allowlist': it groups the 'add' and
// 'remove' subcommands that generate SQL (and optionally migration files)
// for editing the legacy telemetry v1 export allow list.
var allowlistCommand = &cli.Command{
Name: "allowlist",
Usage: "Edit the legacy telemetry v1 usage data export allow list",
Flags: []cli.Flag{},
Description: `
Utility that will generate SQL to add and remove events from the usage data allow list.
https://docs-legacy.sourcegraph.com/dev/background-information/data-usage-pipeline#allow-list
Events are keyed by event name and passed in as additional arguments to the add and remove subcommands.
`,
UsageText: `
# Generate SQL to add events from the allow list
sg telemetry allowlist add EVENT_ONE EVENT_TWO
# Generate SQL to remove events from the allow list
sg telemetry allowlist remove EVENT_ONE EVENT_TWO
# Automatically generate migration files associated with the allow list modification
sg telemetry allowlist add --migration EVENT_ONE EVENT_TWO
# Provide a specific migration name for the migration files
sg telemetry allowlist add --migration --name my_migration_name EVENT_ONE EVENT_TWO
`,
Subcommands: []*cli.Command{
addAllowlistCommand,
removeAllowlistCommand,
},
}
// addAllowlistCommand is 'sg telemetry allowlist add': it generates INSERT
// SQL for the given event names, optionally writing migration files when
// --migration is set.
var addAllowlistCommand = &cli.Command{
Name: "add",
ArgsUsage: "[event]",
Usage: "Generate the SQL required to add events to the legacy telemetry v1 export allow list",
UsageText: `
# Generate SQL to add events from the allow list
sg telemetry allowlist add EVENT_ONE EVENT_TWO
# Automatically generate migration files associated with the allow list modification
sg telemetry allowlist add --migration EVENT_ONE EVENT_TWO
# Provide a specific migration name for the migration files
sg telemetry allowlist add --migration --name my_migration_name EVENT_ONE EVENT_TWO
`,
Flags: []cli.Flag{
allowlistCreateMigrationFlag,
allowlistMigrationNameOverrideFlag,
},
Action: addAllowList,
}
// removeAllowlistCommand is 'sg telemetry allowlist remove': it generates
// DELETE SQL for the given event names, optionally writing migration files
// when --migration is set.
var removeAllowlistCommand = &cli.Command{
Name: "remove",
ArgsUsage: "[event]",
Usage: "Generate the SQL required to remove events from the legacy telemetry v1 export allow list",
UsageText: `
# Generate SQL to add events from the allow list
sg telemetry allowlist remove EVENT_ONE EVENT_TWO
# Automatically generate migration files associated with the allow list modification
sg telemetry allowlist remove --migration EVENT_ONE EVENT_TWO
# Provide a specific migration name for the migration files
sg telemetry allowlist remove --migration --name my_migration_name EVENT_ONE EVENT_TWO
`,
Flags: []cli.Flag{
allowlistCreateMigrationFlag,
allowlistMigrationNameOverrideFlag,
},
Action: removeAllowList,
}
// createMigrationFiles is set by the --migration flag; when true,
// editAllowlist also writes migration files instead of only printing SQL.
var createMigrationFiles bool
// allowlistCreateMigrationFlag is the shared --migration flag for the
// allowlist add/remove subcommands.
var allowlistCreateMigrationFlag = &cli.BoolFlag{
Name: "migration",
Usage: "Create migration files with the generated SQL.",
Value: false,
Destination: &createMigrationFiles,
}
// allowlistMigrationName is set by the --name flag and names the generated
// migration files; defaults to "sg_telemetry_allowlist".
var allowlistMigrationName string
// allowlistMigrationNameOverrideFlag is the shared --name flag for the
// allowlist add/remove subcommands.
var allowlistMigrationNameOverrideFlag = &cli.StringFlag{
Name: "name",
Usage: "Specifies the name of the resulting migration.",
Required: false,
Value: "sg_telemetry_allowlist",
Destination: &allowlistMigrationName,
}
// addAllowList is the cli action for 'sg telemetry allowlist add'. It reads
// event names from the positional arguments and generates SQL that inserts
// them into the export allow list. Exits with status 1 if no events are given.
//
// Note: the named result parameter `err` in the original signature was never
// used, so the result is now unnamed per Go convention.
func addAllowList(ctx *cli.Context) error {
	events := ctx.Args().Slice()
	if len(events) == 0 {
		return cli.Exit("no events provided", 1)
	}
	return editAllowlist(ctx, events, false)
}
// removeAllowList is the cli action for 'sg telemetry allowlist remove'. It
// reads event names from the positional arguments and generates SQL that
// deletes them from the export allow list. Exits with status 1 if no events
// are given.
//
// Note: the named result parameter `err` in the original signature was never
// used, so the result is now unnamed per Go convention.
func removeAllowList(ctx *cli.Context) error {
	events := ctx.Args().Slice()
	if len(events) == 0 {
		return cli.Exit("no events provided", 1)
	}
	return editAllowlist(ctx, events, true)
}
// editAllowlist prints the SQL that applies (and reverts) an allow-list change
// for the given events, and — when the --migration flag was set — also writes
// a pair of frontend-database migration files containing that SQL.
//
// When reverse is false the "apply" direction inserts the events; when true
// (the remove subcommand) the insert/delete statements are swapped so that
// applying the change deletes them instead.
func editAllowlist(ctx *cli.Context, events []string, reverse bool) error {
	// Comment placed at the top of generated migration files, recording the
	// exact sg subcommand that produced them.
	migrationComment := fmt.Sprintf("-- This migration was generated by the command `sg telemetry %s`", ctx.Command.FullName())

	// Postgres array literal holding every event name.
	eventArray := fmt.Sprintf(`'{%v}'`, strings.Join(events, ","))

	applySQL := fmt.Sprintf("INSERT INTO event_logs_export_allowlist (event_name) VALUES (UNNEST(%s::TEXT[])) ON CONFLICT DO NOTHING;", eventArray)
	revertSQL := fmt.Sprintf("DELETE FROM event_logs_export_allowlist WHERE event_name IN (SELECT * FROM UNNEST(%s::TEXT[]));", eventArray)
	if reverse {
		applySQL, revertSQL = revertSQL, applySQL
	}

	// Always show the generated SQL, whether or not migration files follow.
	std.Out.WriteLine(output.Styledf(output.StylePending, "\ngenerating output..."))
	std.Out.WriteLine(output.Styledf(output.StyleSuccess, "%s", applySQL))
	std.Out.WriteLine(output.Styledf(output.StyleWarning, "revert:\n%s", revertSQL))

	if !createMigrationFiles {
		return nil
	}

	std.Out.WriteLine(output.Styledf(output.StylePending, "\ncreating migration files with name: %s...\n", allowlistMigrationName))
	frontendDB, ok := db.DatabaseByName("frontend")
	if !ok {
		return cli.Exit("frontend database not found", 1)
	}
	upContents := fmt.Sprintf("%s\n%s", migrationComment, applySQL)
	downContents := fmt.Sprintf("%s\n%s", migrationComment, revertSQL)
	return migration.AddWithTemplate(frontendDB, allowlistMigrationName, upContents, downContents)
}

View File

@ -1,63 +0,0 @@
# DEPRECATED: Adding, changing, and debugging user event data
> WARNING: **This process is deprecated.** To export Telemetry events from Sourcegraph instances, refer to the new [telemetry reference](./telemetry/index.md).
This document outlines the process for adding or changing the raw user event data collected from Sourcegraph instances. This is limited to certain managed instances (cloud) where the customer has signed a corresponding data collection agreement.
### User event data philosophy
[Raw user event data](https://docs.sourcegraph.com/dev/background-information/data-usage-pipeline) is collected from logs in the `event_logs` table in the instance primary database and sent to Sourcegraph centralized analytics. These [events](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:internal/database/event_logs.go+Event+type:symbol+select:symbol.struct&patternType=standard) are a product of events performed by users or the system and represent our customers' most sensitive data. We must preserve and build trust through only careful additions and changes to events added to the egress pipeline.
All user event data must be:
1. Anonymous (with only one exception, the email address of the initial site installer)
2. Non-specific (e.g., no repo names, no usernames, no file names, no specific search queries, etc.)
### Adding events to the raw user event data pipeline
Ensure that any events added to the data pipeline are consistent with the user level data [FAQ](https://docs.google.com/document/d/1vXHoMBnvI_SlOjft4Q1Zhb5ZoScS1IjZ4V1LSKgVxv8/edit#).
### Changing the BigQuery view
Edit the query from the [cloud console](https://console.cloud.google.com/bigquery?project=telligentsourcegraph&ws=!1m5!1m4!4m3!1stelligentsourcegraph!2sdotcom_events!3sevents_usage) to add additional data that is passed through the event batches. Any data that does not exist will persist as null rather than fail — this will ensure backward compatibility.
```(
WITH data as (
SELECT JSON_EXTRACT_ARRAY(data, '$') as json
FROM `telligentsourcegraph.dotcom_events.events_usage_raw`
)
select
--flattened_data,
JSON_EXTRACT_SCALAR(flattened_data, '$.name') as name,
JSON_EXTRACT_SCALAR(flattened_data, '$.url') as url,
JSON_EXTRACT_SCALAR(flattened_data, '$.user_id') as user_id,
JSON_EXTRACT_SCALAR(flattened_data, '$.anonymous_user_id') as anonymous_user_id,
JSON_EXTRACT_SCALAR(flattened_data, '$.source') as source,
JSON_EXTRACT_SCALAR(flattened_data, '$.argument') as argument,
JSON_EXTRACT_SCALAR(flattened_data, '$.version') as version,
JSON_EXTRACT_SCALAR(flattened_data, '$.timestamp') as timestamp,
JSON_EXTRACT_SCALAR(flattened_data, '$.firstSourceURL') as firstSourceURL,
JSON_EXTRACT_SCALAR(flattened_data, '$.first_source_url') as first_source_url,
JSON_EXTRACT_SCALAR(flattened_data, '$.feature_flags') as feature_flags,
JSON_EXTRACT_SCALAR(flattened_data, '$.cohort_id') as cohort_id,
JSON_EXTRACT_SCALAR(flattened_data, '$.referrer') as referrer,
JSON_EXTRACT_SCALAR(flattened_data, '$.public_argument') as public_argument,
JSON_EXTRACT_SCALAR(flattened_data, '$.device_id') as device_id,
JSON_EXTRACT_SCALAR(flattened_data, '$.insert_id') as insert_id,
JSON_EXTRACT_SCALAR(flattened_data, '$.last_source_url') as last_source_url,
JSON_EXTRACT_SCALAR(flattened_data, '$.site_id') as site_id,
JSON_EXTRACT_SCALAR(flattened_data, '$.license_key') as license_key,
JSON_EXTRACT_SCALAR(flattened_data, '$.initial_admin_email') as initial_admin_email,
JSON_EXTRACT_SCALAR(flattened_data, '$.deploy_type') as deploy_type,
from data
cross join unnest(data.json) as flattened_data
)
```
### Debugging
Working backward from where the data is coming from, we should receive alerts for anomalies along each step of the process here.
1. At the [view](https://console.cloud.google.com/bigquery?project=telligentsourcegraph&ws=!1m5!1m4!4m3!1stelligentsourcegraph!2sdotcom_events!3sevents_usage), the details expose the raw SQL of flattening out the payload batches then extracting the desired field from the JSON structure.
2. In the [raw table](https://console.cloud.google.com/bigquery?project=telligentsourcegraph&ws=!1m5!1m4!4m3!1stelligentsourcegraph!2sdotcom_events!3sevents_usage_raw), the subscription unloads any new data into this table as an array of JSON objects. The payloads are capped at source to 10MB.
3. In the [subscription](https://console.cloud.google.com/cloudpubsub/subscription/detail/dotcom-events-usage?project=telligentsourcegraph), there are metrics available in the cloud console. There is not currently any automated snapshot generation.
4. In the [pub/sub topic](https://console.cloud.google.com/cloudpubsub/topic/detail/dotcom-events-usage?project=telligentsourcegraph), there are also metrics available in the cloud console. There is not currently any automated snapshot generation. There is currently a 7 day retention duration for messages in the queue.
5. In the instance, this would point to an issue with the instance sending data at all. Check if there is also an interruption in [ping](https://docs.sourcegraph.com/dev/background-information/adding_ping_data) or other telemetry data.

View File

@ -1,65 +0,0 @@
# DEPRECATED: Event level data usage pipeline
> WARNING: **This process is deprecated.** To export Telemetry events from Sourcegraph instances, refer to the new [telemetry reference](./telemetry/index.md).
This document outlines information about the ability to export raw user event data from Sourcegraph. This is limited
to certain managed instances (cloud) where the customer has signed a corresponding data collection agreement.
### What is it?
This process is a background job that can be enabled that will periodically scrape the `event_logs` table in the primary database
and send it to Sourcegraph centralized analytics. [Events](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:internal/database/event_logs.go+Event+type:symbol+select:symbol.struct&patternType=standard) stored in `event_logs` are product events performed by users or the system. More information can be found in [RFC 719: Managed Instance Telemetry](https://docs.google.com/document/d/1N9aO0uTlvwXI7FzdPjIUCn_d1tRkJUfsc0urWigRf6s/edit).
The [job interval](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:%cmd/worker/internal/telemetry/telemetry_job%5C.go+JobCooldownDuration&patternType=standard) determines how often the job is executed. The [batch size option](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:%cmd/worker/internal/telemetry/telemetry_job%5C.go+getBatchSize+type:symbol&patternType=standard) determines how many records can be pulled in a single scrape. The batch size has a [default value](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:%cmd/worker/internal/telemetry/telemetry_job%5C.go+MaxEventsCountDefault&patternType=standard) and can be overridden with a site setting:
``` json
"exportUsageTelemetry": {
"batchSize": 100,
}
```
The scraping job maintains state using a bookmark stored in the primary postgres database in the table `event_logs_scrape_state`. [If the bookmark is not found, one will be inserted](https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/cmd/worker/internal/telemetry/telemetry_job.go?L424-440) such that the bookmark is the most recent event at the time.
The scraping process has a crude at-least-once semantics guarantee. If any scrape should fail, the bookmark state will not be updated, causing future scrapes to retry the same set of events.
### Allow list
Only events that [exist in an allow list](https://sourcegraph.com/github.com/sourcegraph/sourcegraph@735bc0f69ce417ecce55a9194dbf349c954043e3/-/blob/internal/database/event_logs.go?L321-324) will be scraped. Events are keyed in the allow list by the `event_logs.name` column. The allow list can be found in the primary
postgres database in the table `event_logs_export_allowlist`.
#### Adding to the allow list
Modifying the allow list is performed using database migrations. To simplify this process and ensure consistency, use the [sg](https://docs.sourcegraph.com/dev/background-information/sg) tool:
``` shell
sg telemetry allowlist add --migration [event]
```
``` shell
sg telemetry allowlist remove --migration [event]
```
If you want to modify many events you can pass a file of newline delimited event names using `xargs`:
``` shell
cat /location/of/my/events/file | xargs sg telemetry allowlist add --migration
```
Create a pull request and get a review from the Data Engineering team.
#### Determine if an event is in the allow list
Currently, there is not a single document that shows the entire allow list. There are two options:
1. Start Sourcegraph and migrate to the latest version, and query the database
```postgresql
select * from event_logs_export_allowlist;
```
2. [Look through migration files](https://sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/sourcegraph%24+file:migrations+lang:sql+MY_EVENT_NAME&patternType=standard) to see if the event you are looking for has been added and not deleted
### How to enable for a managed instance
1. Ensure the managed instance has the [appropriate IAM policy](https://sourcegraph.sourcegraph.com/github.com/sourcegraph/deploy-sourcegraph-managed/-/blob/modules/terraform-managed-instance-new/iam.tf?L19-31&utm_source=raycast-sourcegraph&utm_campaign=search) applied
2. Update the managed instance deployment manifest to include the following environment variables:
1. `EXPORT_USAGE_DATA_ENABLED=true`
2. [`EXPORT_USAGE_DATA_TOPIC_NAME`](https://sourcegraph.sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/deploy-sourcegraph-managed%24+EXPORT_USAGE_DATA_TOPIC_NAME&patternType=standard)
3. [`EXPORT_USAGE_DATA_TOPIC_PROJECT`](https://sourcegraph.sourcegraph.com/search?q=context:global+repo:%5Egithub%5C.com/sourcegraph/deploy-sourcegraph-managed%24+EXPORT_USAGE_DATA_TOPIC_PROJECT&patternType=standard)
3. Deploy the updated deployment manifest and restart the `worker` service.
### Monitoring
Each Sourcegraph instance with this export job enabled will emit metrics that are prefixed with `src_telemetry_job`.

View File

@ -13,7 +13,7 @@
> NOTE: This document is a work-in-progress.
Telemetry describes the logging of user events, such as a page view or search. Telemetry data is collected by each Sourcegraph instance and is not sent to Sourcegraph.com (except in aggregate form as documented in "[Pings](../../../admin/pings.md)"). Some select managed instances enable
event level (non-aggregated) [telemetry](../data-usage-pipeline.md).
event level (non-aggregated) V1 telemetry.
## Browser extension telemetry

View File

@ -161,8 +161,6 @@ Clarification and discussion about key concepts, architecture, and development s
- [Telemetry](background-information/telemetry/index.md)
- [Adding, changing and debugging pings](background-information/adding_ping_data.md)
- [DEPRECATED: Event level data usage pipeline](background-information/data-usage-pipeline.md)
- [DEPRECATED: Adding, changing and debugging user event data](background-information/adding_event_level_data.md)
- [Deploy Sourcegraph with Helm chart (BETA)](../../../admin/deploy/kubernetes/helm.md)
- [GitHub API oddities](background-information/github-api-oddities.md)