Batch indexing: syntactic codeintel worker scaffolding (#59747)

Currently the worker itself does nothing; it only exposes a health endpoint and loads basic environment configuration.

    Bazel build for the Docker container
    Wire in scip-treesitter-cli to make it available in the container
    Dev setup for scip-treesitter-cli (copied from scip-ctags setup for local development)
    Run configuration for the worker sg run codeintel-syntactic-worker to test
    Start configuration sg start codeintel-syntactic - contains only the minimal dependencies required to run the worker, we will expand the configuration gradually as we add more features
This commit is contained in:
Anton Sviridov 2024-01-24 13:04:20 +00:00 committed by GitHub
parent 014cad15ba
commit cd1721b43a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 398 additions and 10 deletions

View File

@ -0,0 +1,77 @@
load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
load("@rules_oci//oci:defs.bzl", "oci_image", "oci_push", "oci_tarball")
load("@rules_pkg//:pkg.bzl", "pkg_tar")
load("@container_structure_test//:defs.bzl", "container_structure_test")
load("//dev:oci_defs.bzl", "image_repository")
# Go library built from main.go. It is private to this package and is embedded
# by the :syntactic-code-intel-worker binary target.
go_library(
name = "syntactic-code-intel-worker_lib",
srcs = ["main.go"],
importpath = "github.com/sourcegraph/sourcegraph/cmd/syntactic-code-intel-worker",
visibility = ["//visibility:private"],
deps = [
"//cmd/syntactic-code-intel-worker/shared",
"//internal/sanitycheck",
"//internal/service/svcmain",
],
)
# Worker executable. x_defs fills the internal/version package's `version` and
# `timestamp` variables from the {STABLE_VERSION} and {VERSION_TIMESTAMP}
# placeholders.
go_binary(
name = "syntactic-code-intel-worker",
embed = [":syntactic-code-intel-worker_lib"],
visibility = ["//visibility:public"],
x_defs = {
"github.com/sourcegraph/sourcegraph/internal/version.version": "{STABLE_VERSION}",
"github.com/sourcegraph/sourcegraph/internal/version.timestamp": "{VERSION_TIMESTAMP}",
},
)
# Tar layer holding the worker binary. No package_dir is set; the image
# entrypoint refers to the binary as /syntactic-code-intel-worker.
pkg_tar(
name = "tar_syntactic-code-intel-worker",
srcs = [":syntactic-code-intel-worker"],
)
# Tar layer that places the scip-treesitter binary (built from the
# syntax-highlighter crates) under /usr/local/bin.
pkg_tar(
name = "tar_scip-treesitter",
srcs = ["//docker-images/syntax-highlighter/crates/scip-treesitter-cli:scip-treesitter"],
package_dir = "/usr/local/bin",
)
# Container image for the worker: the worker binary and the scip-treesitter CLI
# layered on top of @wolfi_base. The worker is started through tini and runs as
# the `sourcegraph` user.
oci_image(
    name = "image",
    base = "@wolfi_base",
    entrypoint = [
        "/sbin/tini",
        "--",
        "/syntactic-code-intel-worker",
    ],
    tars = [
        ":tar_syntactic-code-intel-worker",
        # Written with a leading ":" like the label above so both same-package
        # targets are referenced consistently.
        ":tar_scip-treesitter",
    ],
    user = "sourcegraph",
)
# Tarball of :image, tagged syntactic-code-intel-worker:candidate.
oci_tarball(
name = "image_tarball",
image = ":image",
repo_tags = ["syntactic-code-intel-worker:candidate"],
)
# Runs the checks declared in image_test.yaml against :image using the docker
# driver. Tagged "exclusive" and "requires-network".
container_structure_test(
name = "image_test",
timeout = "short",
configs = ["image_test.yaml"],
driver = "docker",
image = ":image",
tags = [
"exclusive",
"requires-network",
],
)
# Pushes :image to the repository returned by
# image_repository("syntactic-code-intel-worker").
oci_push(
name = "candidate_push",
image = ":image",
repository = image_repository("syntactic-code-intel-worker"),
)

View File

@ -0,0 +1 @@
# See https://github.com/sourcegraph/codenotify for documentation.

View File

@ -0,0 +1,8 @@
# Syntactic code intel worker
🚧 WORK IN PROGRESS 🚧
Stateless service that handles generating SCIP data for codebases
using Tree-sitter for powering syntax-based code navigation.
[Design docs](https://docs.google.com/document/d/14MHauv52o4zTFiV6gC6NOJZxcJpglK-ElWa64gqeKDo/edit) (Sourcegraph internal)

View File

@ -0,0 +1,21 @@
# Container structure tests for the syntactic-code-intel-worker image.
schemaVersion: "2.0.0"
commandTests:
# Both binaries are started with SANITY_CHECK=true. scip-treesitter exits with
# code zero when it sees that value.
# NOTE(review): the worker presumably does the same via sanitycheck.Pass() - confirm.
- name: "worker binary is runnable"
command: "/syntactic-code-intel-worker"
envVars:
- key: "SANITY_CHECK"
value: "true"
- name: "scip treesitter binary is runnable"
command: "/usr/local/bin/scip-treesitter"
envVars:
- key: "SANITY_CHECK"
value: "true"
# `id -u` must succeed and its output must not start with 0 (the root UID).
- name: "not running as root"
command: "/usr/bin/id"
args:
- -u
excludedOutput: ["^0"]
exitCode: 0

View File

@ -0,0 +1,12 @@
package main
import (
"github.com/sourcegraph/sourcegraph/cmd/syntactic-code-intel-worker/shared"
"github.com/sourcegraph/sourcegraph/internal/sanitycheck"
"github.com/sourcegraph/sourcegraph/internal/service/svcmain"
)
// main runs the sanity-check hook and then hands shared.Service to svcmain.
//
// NOTE(review): sanitycheck.Pass presumably exits the process early when
// SANITY_CHECK=true (the image test starts the binary that way), which is why
// it runs before the service starts - confirm against internal/sanitycheck.
func main() {
sanitycheck.Pass()
svcmain.SingleServiceMain(shared.Service)
}

View File

@ -0,0 +1,24 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "shared",
srcs = [
"config.go",
"service.go",
"shared.go",
],
importpath = "github.com/sourcegraph/sourcegraph/cmd/syntactic-code-intel-worker/shared",
visibility = ["//visibility:public"],
deps = [
"//internal/codeintel/shared/lsifuploadstore",
"//internal/debugserver",
"//internal/encryption/keyring",
"//internal/env",
"//internal/goroutine",
"//internal/httpserver",
"//internal/observation",
"//internal/service",
"//lib/errors",
"@com_github_sourcegraph_log//:log",
],
)

View File

@ -0,0 +1,55 @@
package shared
import (
"net"
"strconv"
"time"
"github.com/sourcegraph/sourcegraph/internal/codeintel/shared/lsifuploadstore"
"github.com/sourcegraph/sourcegraph/internal/env"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
// Config holds the settings of the syntactic code intel worker. Every field is
// populated from the environment by Load.
type Config struct {
env.BaseConfig
// WorkerPollInterval is the interval between queries to the repository
// queue (SYNTACTIC_CODE_INTEL_WORKER_POLL_INTERVAL).
WorkerPollInterval time.Duration
// WorkerConcurrency is the maximum number of repositories that can be
// processed concurrently (SYNTACTIC_CODE_INTEL_WORKER_CONCURRENCY).
WorkerConcurrency int
// WorkerBudget is the amount of compressed input data, in bytes, a worker
// can process concurrently; zero acts as an infinite budget
// (SYNTACTIC_CODE_INTEL_WORKER_BUDGET).
WorkerBudget int64
// MaximumRuntimePerJob is the maximum time a single repository indexing job
// can take (SYNTACTIC_CODE_INTEL_WORKER_MAXIMUM_RUNTIME_PER_JOB).
MaximumRuntimePerJob time.Duration
// SCIPUploadStoreConfig is the upload store configuration; it is loaded and
// validated together with this Config.
SCIPUploadStoreConfig *lsifuploadstore.Config
// CliPath is the value of SCIP_TREESITTER_COMMAND (default "scip-treesitter").
CliPath string
// ListenAddress is the address the worker's HTTP server listens on
// (SYNTACTIC_CODE_INTEL_WORKER_ADDR, with a DefaultPort-based fallback).
ListenAddress string
}
// DefaultPort is the port used to build the listen address when
// SYNTACTIC_CODE_INTEL_WORKER_ADDR is not set.
const DefaultPort = 3188
// Load populates the configuration from environment variables, including the
// nested SCIP upload store configuration.
//
// When SYNTACTIC_CODE_INTEL_WORKER_ADDR is unset, the listen address falls back
// to DefaultPort on all interfaces, or on 127.0.0.1 only when env.InsecureDev
// is set.
func (c *Config) Load() {
	c.SCIPUploadStoreConfig = &lsifuploadstore.Config{}
	c.SCIPUploadStoreConfig.Load()

	c.WorkerPollInterval = c.GetInterval("SYNTACTIC_CODE_INTEL_WORKER_POLL_INTERVAL", "1s", "Interval between queries to the repository queue")
	c.WorkerConcurrency = c.GetInt("SYNTACTIC_CODE_INTEL_WORKER_CONCURRENCY", "1", "The maximum number of repositories that can be processed concurrently.")
	c.WorkerBudget = int64(c.GetInt("SYNTACTIC_CODE_INTEL_WORKER_BUDGET", "0", "The amount of compressed input data (in bytes) a worker can process concurrently. Zero acts as an infinite budget."))
	c.MaximumRuntimePerJob = c.GetInterval("SYNTACTIC_CODE_INTEL_WORKER_MAXIMUM_RUNTIME_PER_JOB", "25m", "The maximum time a single repository indexing job can take")

	// A real description replaces the former "TODO" placeholder so the
	// variable is documented like the others.
	c.CliPath = c.Get("SCIP_TREESITTER_COMMAND", "scip-treesitter", "The command used to invoke the scip-treesitter CLI.")

	c.ListenAddress = c.GetOptional("SYNTACTIC_CODE_INTEL_WORKER_ADDR", "The address under which the syntactic codeintel worker API listens. Can include a port.")

	// Fall back to a reasonable default.
	if c.ListenAddress == "" {
		port := strconv.Itoa(DefaultPort)
		host := ""
		if env.InsecureDev {
			// Bind to loopback only in insecure dev mode.
			host = "127.0.0.1"
		}
		c.ListenAddress = net.JoinHostPort(host, port)
	}
}
// Validate reports every problem found in the base configuration and in the
// SCIP upload store configuration, accumulated into a single error via
// errors.Append.
func (c *Config) Validate() error {
	var combined error
	for _, validationErr := range []error{
		c.BaseConfig.Validate(),
		c.SCIPUploadStoreConfig.Validate(),
	} {
		combined = errors.Append(combined, validationErr)
	}
	return combined
}

View File

@ -0,0 +1,26 @@
package shared
import (
"context"
"github.com/sourcegraph/sourcegraph/internal/debugserver"
"github.com/sourcegraph/sourcegraph/internal/env"
"github.com/sourcegraph/sourcegraph/internal/observation"
"github.com/sourcegraph/sourcegraph/internal/service"
)
type svc struct{}
func (svc) Name() string { return "syntactic-code-intel-worker" }
func (svc) Configure() (env.Config, []debugserver.Endpoint) {
var config Config
config.Load()
return &config, nil
}
func (svc) Start(ctx context.Context, observationCtx *observation.Context, ready service.ReadyFunc, config env.Config) error {
return Main(ctx, observationCtx, ready, *config.(*Config))
}
var Service service.Service = svc{}

View File

@ -0,0 +1,40 @@
package shared
import (
"context"
"net/http"
"time"
"github.com/sourcegraph/log"
"github.com/sourcegraph/sourcegraph/internal/encryption/keyring"
"github.com/sourcegraph/sourcegraph/internal/goroutine"
"github.com/sourcegraph/sourcegraph/internal/httpserver"
"github.com/sourcegraph/sourcegraph/internal/observation"
"github.com/sourcegraph/sourcegraph/internal/service"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
// Main starts the syntactic code intel worker. It initializes the keyring,
// logs the effective CLI path and listen address, builds the health HTTP server
// on config.ListenAddress, signals readiness through ready, and then hands the
// server to goroutine.MonitorBackgroundRoutines.
//
// It returns an error only when keyring initialization fails.
// NOTE(review): MonitorBackgroundRoutines presumably blocks until ctx is done
// or a shutdown signal arrives - confirm against internal/goroutine.
func Main(ctx context.Context, observationCtx *observation.Context, ready service.ReadyFunc, config Config) error {
	logger := observationCtx.Logger

	if err := keyring.Init(ctx); err != nil {
		return errors.Wrap(err, "initializing keyring")
	}

	logger.Info("Syntactic code intel worker running",
		log.String("path to scip-treesitter CLI", config.CliPath),
		log.String("API address", config.ListenAddress))

	// Initialize health server
	server := httpserver.NewFromAddr(config.ListenAddress, &http.Server{
		ReadTimeout:  75 * time.Second,
		WriteTimeout: 10 * time.Minute,
		Handler:      httpserver.NewHandler(nil),
	})

	// Mark the service as ready before handing off to the monitor; previously
	// the ready callback was accepted but never invoked.
	ready()

	// Go!
	goroutine.MonitorBackgroundRoutines(ctx, server)

	return nil
}

View File

@ -18,6 +18,7 @@ allowed_prefix=(
# Transitively depends on updatecheck package which imports but does not use DB
github.com/sourcegraph/sourcegraph/cmd/pings
github.com/sourcegraph/sourcegraph/cmd/precise-code-intel-worker
github.com/sourcegraph/sourcegraph/cmd/syntactic-code-intel-worker
github.com/sourcegraph/sourcegraph/cmd/repo-updater
# Transitively depends on zoekt package which imports but does not use DB
github.com/sourcegraph/sourcegraph/cmd/searcher

View File

@ -31,6 +31,7 @@ var allowedToImport = []string{
// Transitively depends on updatecheck package which imports but does not use DB
"github.com/sourcegraph/sourcegraph/cmd/pings",
"github.com/sourcegraph/sourcegraph/cmd/precise-code-intel-worker",
"github.com/sourcegraph/sourcegraph/cmd/syntactic-code-intel-worker",
"github.com/sourcegraph/sourcegraph/cmd/repo-updater",
// Transitively depends on zoekt package which imports but does not use DB
"github.com/sourcegraph/sourcegraph/cmd/searcher",

17
dev/scip-treesitter-dev Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Wrapper for `scip-treesitter` similar to `dev/scip-ctags-dev`.
#
# Runs the binary whose path is printed by `dev/scip-treesitter-install.sh which`,
# forwarding all arguments to it. Exits with status 1 if that binary is missing.
#
# To use an alternate scip-treesitter binary for development, invoke
# `SCIP_TREESITTER_COMMAND=path/to/scip-treesitter sg start`.

# Repository root, derived from this script's own location.
root="$(dirname "${BASH_SOURCE[0]}")/.."
TARGET="$("$root/dev/scip-treesitter-install.sh" which)"

if [ ! -f "${TARGET}" ]; then
  # Diagnostics go to stderr so they cannot be mistaken for tool output.
  echo "scip-treesitter is not installed, please run ./dev/scip-treesitter-install.sh" >&2
  echo "Alternatively you can use SCIP_TREESITTER_COMMAND=path/to/scip-treesitter to use your own binary." >&2
  exit 1
fi

# Quoted so a checkout path containing spaces still resolves to one word; exec
# replaces this shell so the exit status and signals are those of the binary.
exec "${TARGET}" "$@"

34
dev/scip-treesitter-install.sh Executable file
View File

@ -0,0 +1,34 @@
#!/usr/bin/env bash
set -euf -o pipefail
pushd "$(dirname "${BASH_SOURCE[0]}")/.." >/dev/null
mkdir -p .bin
# TODO: add similar task to zoekt alpine
NAME="scip-treesitter"
TARGET="$PWD/.bin/${NAME}"
# `which` subcommand: print the path the binary is installed to and stop
# without building anything. The argument count is checked first so "$1" is
# never expanded when no argument was given (the script runs with `set -u`).
if [ $# -ne 0 ] && [ "$1" == "which" ]; then
  echo "$TARGET"
  exit 0
fi
# Handler for SIGINT (Ctrl-C): report the cancellation and exit with status 1.
function ctrl_c() {
printf "[-] Installation cancelled.\n"
exit 1
}
# Install the handler for the remainder of the script.
trap ctrl_c INT
# Builds the scip-treesitter binary with cargo and copies it to $TARGET.
# Changes the working directory to the crate directory.
function build_scip_treesitter {
  cd docker-images/syntax-highlighter/crates/scip-treesitter-cli
  # --release is required: without it cargo writes the binary to target/debug,
  # and the copy from target/release below fails.
  cargo build --release --bin scip-treesitter --target-dir target
  cp ./target/release/scip-treesitter "$TARGET"
}
build_scip_treesitter
popd >/dev/null

View File

@ -1,5 +1,5 @@
{
"checksum": "d9e91f35b8090c7c539c13c008d84bfffcc8e9ba4989fa33e0fa11e47136d117",
"checksum": "2aace7295e52e3affbef65c6d2975a2348c691c9ee9758235daa1104fcda8c37",
"crates": {
"addr2line 0.20.0": {
"name": "addr2line",

View File

@ -2,8 +2,8 @@ load("@crate_index//:defs.bzl", "aliases", "all_crate_deps")
load("@rules_rust//rust:defs.bzl", "rust_binary", "rust_library", "rust_test")
rust_binary(
name = "scip-treesitter-cli",
srcs = glob(["src/main.rs"]),
name = "scip-treesitter",
srcs = ["src/bin/scip-treesitter.rs"],
aliases = aliases(),
proc_macro_deps = all_crate_deps(
proc_macro = True,
@ -71,7 +71,7 @@ rust_test(
],
allow_empty = False,
),
data = [":scip-treesitter-cli"] +
data = [":scip-treesitter"] +
glob(
["tests/snapshots/**"],
allow_empty = False,
@ -79,12 +79,12 @@ rust_test(
env = {
"INSTA_WORKSPACE_ROOT": ".",
"RUST_BACKTRACE": "1",
"SCIP_CLI_LOCATION": "$(rootpath :scip-treesitter-cli)",
"SCIP_CLI_LOCATION": "$(rootpath :scip-treesitter)",
},
deps = all_crate_deps(
normal = True,
) + [
":scip-treesitter-cli",
":scip-treesitter",
":scip-treesitter-cli-lib",
] + WORKSPACE_DEPS,
)

View File

@ -3,7 +3,8 @@ name = "scip-treesitter-cli"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[[bin]]
name = "scip-treesitter"
[dependencies]
lazy_static = "1.0"

View File

@ -81,6 +81,19 @@ enum Commands {
}
pub fn main() {
// Exits with a code zero if the environment variable SANITY_CHECK equals
// to "true". This enables testing that the current program is in a runnable
// state against the platform it's being executed on.
//
// See https://github.com/GoogleContainerTools/container-structure-test
match std::env::var("SANITY_CHECK") {
Ok(v) if v == "true" => {
println!("Sanity check passed, exiting without error");
std::process::exit(0)
}
_ => {}
};
let cli = Cli::parse();
match cli.command {

View File

@ -59,6 +59,8 @@ env:
{ "Name": "embeddings", "Host": "127.0.0.1:6099" },
{ "Name": "zoekt-index-0", "Host": "127.0.0.1:6072" },
{ "Name": "zoekt-index-1", "Host": "127.0.0.1:6073" },
{ "Name": "syntactic-code-intel-worker-0", "Host": "127.0.0.1:6075" },
{ "Name": "syntactic-code-intel-worker-1", "Host": "127.0.0.1:6076" },
{ "Name": "zoekt-web-0", "Host": "127.0.0.1:3070", "DefaultPath": "/debug/requests/" },
{ "Name": "zoekt-web-1", "Host": "127.0.0.1:3071", "DefaultPath": "/debug/requests/" }
]
@ -182,7 +184,7 @@ commands:
fi
go build -gcflags="$GCFLAGS" -o .bin/gitserver github.com/sourcegraph/sourcegraph/cmd/gitserver
checkBinary: .bin/gitserver
env: &gitserverenv
env:
HOSTNAME: 127.0.0.1:3178
watch:
- lib
@ -197,7 +199,6 @@ commands:
gitserver-0:
<<: *gitserver_template
env:
<<: *gitserverenv
GITSERVER_EXTERNAL_ADDR: 127.0.0.1:3501
GITSERVER_ADDR: 127.0.0.1:3501
SRC_REPOS_DIR: $HOME/.sourcegraph/repos_1
@ -206,7 +207,6 @@ commands:
gitserver-1:
<<: *gitserver_template
env:
<<: *gitserverenv
GITSERVER_EXTERNAL_ADDR: 127.0.0.1:3502
GITSERVER_ADDR: 127.0.0.1:3502
SRC_REPOS_DIR: $HOME/.sourcegraph/repos_2
@ -280,6 +280,7 @@ commands:
-e QDRANT_INIT_FILE_PATH=/data/.qdrant-initialized \
--entrypoint /usr/local/bin/qdrant \
sourcegraph/qdrant:insiders
worker:
cmd: |
export SOURCEGRAPH_LICENSE_GENERATION_KEY=$(cat ../dev-private/enterprise/dev/test-license-generation-key.pem)
@ -563,6 +564,46 @@ commands:
- cmd/precise-code-intel-worker
- lib/codeintel
# Shared definition for the syntactic code intel worker instances: how to build
# and run the binary, which paths to watch, and the environment common to every
# instance.
syntactic-codeintel-worker-template: &syntactic_codeintel_worker_template
  cmd: |
    export SOURCEGRAPH_LICENSE_GENERATION_KEY=$(cat ../dev-private/enterprise/dev/test-license-generation-key.pem)
    .bin/syntactic-code-intel-worker
  install: |
    if [ -n "$DELVE" ]; then
      export GCFLAGS='all=-N -l'
    fi

    # Quoted so the check still works when the checkout path contains spaces.
    if [ ! -f "$(./dev/scip-treesitter-install.sh which)" ]; then
      echo "Building scip-treesitter"
      ./dev/scip-treesitter-install.sh
    fi

    echo "Building syntactic-code-intel-worker"
    go build -gcflags="$GCFLAGS" -o .bin/syntactic-code-intel-worker github.com/sourcegraph/sourcegraph/cmd/syntactic-code-intel-worker
  checkBinary: .bin/syntactic-code-intel-worker
  watch:
    - lib
    - internal
    - cmd/syntactic-code-intel-worker
    - lib/codeintel
  # Anchored separately: a YAML merge key is shallow, so an instance that
  # declares its own `env` replaces this map instead of extending it. Instances
  # merge this anchor explicitly to keep SCIP_TREESITTER_COMMAND.
  env: &syntactic_codeintel_worker_env
    SCIP_TREESITTER_COMMAND: dev/scip-treesitter-dev

syntactic-code-intel-worker-0:
  <<: *syntactic_codeintel_worker_template
  env:
    <<: *syntactic_codeintel_worker_env
    SYNTACTIC_CODE_INTEL_WORKER_ADDR: 127.0.0.1:6075

# Inherits `cmd` from the template; only the listen address differs.
syntactic-code-intel-worker-1:
  <<: *syntactic_codeintel_worker_template
  env:
    <<: *syntactic_codeintel_worker_env
    SYNTACTIC_CODE_INTEL_WORKER_ADDR: 127.0.0.1:6076
executor-template:
&executor_template # TMPDIR is set here so it's not set in the `install` process, which would trip up `go build`.
cmd: |
@ -1039,6 +1080,7 @@ bazelCommands:
GITSERVER_ADDR: 127.0.0.1:3502
SRC_REPOS_DIR: $HOME/.sourcegraph/repos_2
SRC_PROF_HTTP: 127.0.0.1:3552
codeintel-worker:
precmd: |
export SOURCEGRAPH_LICENSE_GENERATION_KEY=$(cat ../dev-private/enterprise/dev/test-license-generation-key.pem)
@ -1195,6 +1237,21 @@ commandsets:
- grafana
- prometheus
# Command set for running the two syntactic code intel worker instances
# together with frontend, worker and blobstore. Requires dev-private and the
# docker, redis, postgres and git checks.
codeintel-syntactic:
requiresDevPrivate: true
checks:
- docker
- redis
- postgres
- git
commands:
- frontend
- worker
- blobstore
- syntactic-code-intel-worker-0
- syntactic-code-intel-worker-1
codeintel:
requiresDevPrivate: true
checks: