fix(sg): acknowledge command execution state to avoid recursion when executing short running commands (#64181)

Some commands like the [`batcheshelper-builder`](https://sourcegraph.sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/sg.config.yaml?L821) aren't long running commands. This command is used to build and load an image into docker. The `cmd` section returns an `exit 0`. This behavior combined with `continueWatchOnExit` results in an infinite loop where the process is continually restarted because `sg` doesn't know that the process has finished executing and isn't a long-running process. https://github.com/user-attachments/assets/e7a027a1-6f93-403f-9240-6a791255fba9 An example of the behavior is shown below as running `sg start batches` results in the `batcheshelper-builder` command continually restarted. The fix is quite simple, we return an empty receiver channel when the process is done executing so that `sg` knows it's done and doesn't restart the command unless there's a change. ## Test plan * Manual testing with `go run ./dev/sg start batches` doesn't result in an infinite loop anymore. * Add unit tests ## Changelog
2026-02-06 13:31:54 +00:00 · 2024-07-31 22:09:44 +01:00 · 2024-07-31 22:09:44 +01:00 · 776701ba9c
commit 776701ba9c
parent b2e550c8e5
6 changed files with 107 additions and 27 deletions
--- a/dev/sg/internal/run/BUILD.bazel
+++ b/dev/sg/internal/run/BUILD.bazel
@ -41,14 +41,17 @@ go_test(
    name = "run_test",
    timeout = "short",
    srcs = [
+        "command_test.go",
        "docker_command_test.go",
        "logger_test.go",
    ],
    embed = [":run"],
    tags = [TAG_INFRA_DEVINFRA],
    deps = [
+        "//lib/errors",
        "@com_github_google_go_cmp//cmp",
        "@com_github_stretchr_testify//assert",
+        "@com_github_stretchr_testify//require",
        "@in_gopkg_yaml_v2//:yaml_v2",
    ],
 )
--- a/dev/sg/internal/run/command.go
+++ b/dev/sg/internal/run/command.go
@ -231,6 +231,8 @@ type startedCmd struct {

 	outEg  *pool.ErrorPool
 	result chan error
+
+	finished bool
 }

 type commandOptions struct {
@ -428,6 +430,13 @@ func (sc *startedCmd) getOutputWriter(ctx context.Context, opts *outputOptions,
 }

 func (sc *startedCmd) Exit() <-chan error {
+	// We track the state of a single process to avoid an infinite loop
+	// for short-running commands. When the command is done executing,
+	// we simply return an empty receiver channel instead.
+	if sc.finished {
+		fakeChan := make(<-chan error)
+		return fakeChan
+	}
 	if sc.result == nil {
 		sc.result = make(chan error)
 		go func() {
@ -440,6 +449,8 @@ func (sc *startedCmd) Exit() <-chan error {

 func (sc *startedCmd) Wait() error {
 	err := sc.wait()
+	// We are certain that the command is done executing at this point.
+	sc.finished = true
 	var e *exec.ExitError
 	if errors.As(err, &e) {
 		err = runErr{
@ -453,7 +464,12 @@ func (sc *startedCmd) Wait() error {
 	return err
 }

+var mockStartedCmdWaitFunc func() error
+
 func (sc *startedCmd) wait() error {
+	if mockStartedCmdWaitFunc != nil {
+		return mockStartedCmdWaitFunc()
+	}
 	if err := sc.outEg.Wait(); err != nil {
 		return err
 	}
--- a/dev/sg/internal/run/command_test.go
+++ b/dev/sg/internal/run/command_test.go
@ -0,0 +1,71 @@
+package run
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/sourcegraph/sourcegraph/lib/errors"
+	"github.com/stretchr/testify/require"
+)
+
+func TestStartedCmd_Exit(t *testing.T) {
+	var wg sync.WaitGroup
+
+	mockStartedCmdWaitFunc = func() error {
+		wg.Add(1)
+		defer wg.Done()
+
+		return nil
+	}
+
+	t.Cleanup(func() {
+		// Wait for all ongoing calls to finish
+		wg.Wait()
+		mockStartedCmdWaitFunc = nil
+	})
+
+	t.Run("returns fake channel when finished", func(t *testing.T) {
+		sc := &startedCmd{finished: true}
+		ch := sc.Exit()
+		require.NotNil(t, ch)
+		select {
+		case <-ch:
+			t.Error("Expected empty channel, but received a value")
+		default:
+			// This is the expected behavior
+		}
+	})
+
+	t.Run("creates and returns result channel", func(t *testing.T) {
+		sc := &startedCmd{}
+		ch := sc.Exit()
+		require.NotNil(t, ch)
+		require.NotNil(t, sc.result)
+	})
+
+	t.Run("returns existing result channel", func(t *testing.T) {
+		expectedErrChan := make(chan error, 1)
+		sc := &startedCmd{result: expectedErrChan}
+		ch := sc.Exit()
+
+		require.NotNil(t, ch)
+		_, ok := interface{}(ch).(<-chan error)
+		require.True(t, ok, "returned channel should be of type <-chan error")
+
+		// Verify channel behavior
+		go func() {
+			expectedErrChan <- errors.New("test error")
+		}()
+
+		select {
+		case err := <-ch:
+			require.Error(t, err, "should receive an error from the channel")
+		case <-time.After(time.Millisecond):
+			t.Error("timed out waiting to receive from channel")
+		}
+
+		// Clean up
+		close(expectedErrChan)
+	})
+}
--- a/dev/sg/internal/run/run.go
+++ b/dev/sg/internal/run/run.go
@ -85,20 +85,6 @@ func (runner *cmdRunner) run(ctx context.Context) error {
 					runner.WriteLine(output.Styledf(output.StyleSuccess, "%s%s stopped due to context error: %v%s", output.StyleBold, config.Name, ctx.Err(), output.StyleReset))
 					return ctx.Err()

-				// Handle process exit
-				case err := <-proc.Exit():
-					// If the process failed, we exit immediately
-					if err != nil {
-						return err
-					}
-
-					runner.WriteLine(output.Styledf(output.StyleSuccess, "%s%s exited without error%s", output.StyleBold, config.Name, output.StyleReset))
-
-					// If we shouldn't restart when the process exits, return
-					if !config.ContinueWatchOnExit {
-						return nil
-					}
-
 				// handle file watcher triggered
 				case <-wantRestart:
 					// If the command has an installer, re-run the install and determine if we should restart
@ -126,6 +112,20 @@ func (runner *cmdRunner) run(ctx context.Context) error {
 					} else {
 						runner.WriteLine(output.Styledf(output.StylePending, "Binary for %s did not change. Not restarting.", config.Name))
 					}
+
+				// Handle process exit
+				case err := <-proc.Exit():
+					// If the process failed, we exit immediately
+					if err != nil {
+						return err
+					}
+
+					runner.WriteLine(output.Styledf(output.StyleSuccess, "%s%s exited without error%s", output.StyleBold, config.Name, output.StyleReset))
+
+					// If we shouldn't restart when the process exits, return
+					if !config.ContinueWatchOnExitZero {
+						return nil
+					}
 				}
 			}
 		})
--- a/dev/sg/internal/run/sgconfig_command_options.go
+++ b/dev/sg/internal/run/sgconfig_command_options.go
@ -14,8 +14,8 @@ type SGConfigCommandOptions struct {
 	IgnoreStdout bool              `yaml:"ignoreStdout"`
 	IgnoreStderr bool              `yaml:"ignoreStderr"`
 	// If true, the runner will continue watching this commands dependencies
-	// even if the command exits with a zero status code.
-	ContinueWatchOnExit bool `yaml:"continueWatchOnExit"`
+	// if the command's last execution was successful (i.e exitCode = 0)
+	ContinueWatchOnExitZero bool `yaml:"continueWatchOnExit"`
 	// Preamble is a short and visible message, displayed when the command is launched.
 	Preamble string `yaml:"preamble"`

@ -35,7 +35,7 @@ func (opts SGConfigCommandOptions) Merge(other SGConfigCommandOptions) SGConfigC
 	merged.Args = mergeStrings(merged.Args, other.Args)
 	merged.IgnoreStdout = other.IgnoreStdout || merged.IgnoreStdout
 	merged.IgnoreStderr = other.IgnoreStderr || merged.IgnoreStderr
-	merged.ContinueWatchOnExit = other.ContinueWatchOnExit || merged.ContinueWatchOnExit
+	merged.ContinueWatchOnExitZero = other.ContinueWatchOnExitZero || merged.ContinueWatchOnExitZero
 	merged.Preamble = mergeStrings(merged.Preamble, other.Preamble)
 	merged.Logfile = mergeStrings(merged.Logfile, other.Logfile)
 	merged.RepositoryRoot = mergeStrings(merged.RepositoryRoot, other.RepositoryRoot)
--- a/sg.config.yaml
+++ b/sg.config.yaml
@ -1342,16 +1342,6 @@ bazelCommands:
      TMPDIR: $HOME/.sourcegraph/indexer-temp

 dockerCommands:
-  batcheshelper-builder:
-    # Nothing to run for this, we just want to re-run the install script every time.
-    cmd: exit 0
-    target: //cmd/batcheshelper:image_tarball
-    image: batcheshelper:candidate
-    env:
-      # TODO: This is required but should only be set on M1 Macs.
-      PLATFORM: linux/arm64
-    continueWatchOnExit: true
-
  grafana:
    target: //docker-images/grafana:image_tarball
    docker: