Syntactic Indexing: add TAR archive indexing mode to scip-syntax CLI (#63097)

Fixes GRAPH-651
Fixes GRAPH-650

New features:
- `index tar <input>` indexes a tar archive, read from the file `input`,
or from `stdin` when `input` is the literal string `-`
- **BREAKING: Instead of flags controlling the type of input source, we
now have subcommands: `index files`, `index workspace`, `index tar`**

Refactoring:
- Tests were improved, with helper functions broken down into composable
pieces, and producing 1 snapshot per test, making it easier to manage
- Closures were removed from the indexing code, instead replaced with
functions.
- Calls to `.unwrap` were replaced with better error handling code
- Path canonicalisation was replaced with path cleanup + absolutisation
to avoid following symlinks (which is what `canonicalize` would do)

Most of the refactoring was triggered by the changes required to add
more tests.

## Test plan

- New integration tests

---------

Co-authored-by: Christoph Hegemann <christoph.hegemann@sourcegraph.com>
This commit is contained in:
Anton Sviridov 2024-06-06 19:33:58 +01:00 committed by GitHub
parent c157fa82ff
commit dfa60d6c9b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 1918 additions and 372 deletions

File diff suppressed because it is too large Load Diff

View File

@ -625,23 +625,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.2"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f"
checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
"windows-sys 0.52.0",
]
[[package]]
@ -656,9 +645,9 @@ dependencies = [
[[package]]
name = "fastrand"
version = "2.0.0"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
[[package]]
name = "fd-lock"
@ -685,6 +674,18 @@ dependencies = [
"version_check",
]
[[package]]
name = "filetime"
version = "0.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd"
dependencies = [
"cfg-if",
"libc",
"redox_syscall 0.4.1",
"windows-sys 0.52.0",
]
[[package]]
name = "flate2"
version = "1.0.27"
@ -1068,9 +1069,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.147"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
[[package]]
name = "line-wrap"
@ -1089,9 +1090,9 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.4.5"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "lock_api"
@ -1350,6 +1351,12 @@ version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "path-clean"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17359afc20d7ab31fdb42bb844c8b3bb1dabd7dcf7e68428492da7f16966fcef"
[[package]]
name = "pear"
version = "0.2.7"
@ -1644,6 +1651,15 @@ dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "redox_users"
version = "0.4.3"
@ -1815,15 +1831,15 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustix"
version = "0.38.8"
version = "0.38.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f"
checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
dependencies = [
"bitflags 2.4.0",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.48.0",
"windows-sys 0.52.0",
]
[[package]]
@ -1898,6 +1914,7 @@ dependencies = [
"insta",
"lazy_static",
"paste",
"path-clean",
"predicates",
"protobuf",
"scip",
@ -1905,6 +1922,8 @@ dependencies = [
"serde_json",
"string-interner",
"syntax-analysis",
"tar",
"tempfile",
"tree-sitter-all-languages",
"walkdir",
]
@ -2161,16 +2180,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "tempfile"
version = "3.8.0"
name = "tar"
version = "0.4.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
dependencies = [
"filetime",
"libc",
"xattr",
]
[[package]]
name = "tempfile"
version = "3.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall 0.3.5",
"rustix",
"windows-sys 0.48.0",
"windows-sys 0.52.0",
]
[[package]]
@ -2892,6 +2921,15 @@ dependencies = [
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.5",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
@ -2922,6 +2960,22 @@ dependencies = [
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
dependencies = [
"windows_aarch64_gnullvm 0.52.5",
"windows_aarch64_msvc 0.52.5",
"windows_i686_gnu 0.52.5",
"windows_i686_gnullvm",
"windows_i686_msvc 0.52.5",
"windows_x86_64_gnu 0.52.5",
"windows_x86_64_gnullvm 0.52.5",
"windows_x86_64_msvc 0.52.5",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
@ -2934,6 +2988,12 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
@ -2946,6 +3006,12 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
@ -2958,6 +3024,18 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
@ -2970,6 +3048,12 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
@ -2982,6 +3066,12 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
@ -2994,6 +3084,12 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
@ -3006,6 +3102,12 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
[[package]]
name = "winnow"
version = "0.5.14"
@ -3024,6 +3126,17 @@ dependencies = [
"tap",
]
[[package]]
name = "xattr"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f"
dependencies = [
"libc",
"linux-raw-sys",
"rustix",
]
[[package]]
name = "yaml-rust"
version = "0.4.5"

View File

@ -59,6 +59,7 @@ syntect = { git = "https://github.com/sourcegraph/syntect", rev = "7e02c5b4085e6
tree-sitter = "0.20.9"
tree-sitter-highlight = "0.20.1"
walkdir = "2"
path-clean = "1"
scip = "0.3.2"
protobuf = "3"

View File

@ -94,6 +94,7 @@ rust_test(
tags = [TAG_PLATFORM_GRAPH],
deps = all_crate_deps(
normal = True,
normal_dev = True,
) + [
":scip-syntax",
":scip_syntax_lib",

View File

@ -23,6 +23,11 @@ serde = { workspace = true }
serde_json = { workspace = true }
string-interner = { workspace = true }
walkdir = { workspace = true }
path-clean = { workspace = true }
syntax-analysis = { path = "../syntax-analysis" }
tree-sitter-all-languages = { path = "../tree-sitter-all-languages" }
tar = "0.4.40"
[dev-dependencies]
tempfile="3.10.1"

View File

@ -1,7 +1,13 @@
use std::path::{Path, PathBuf};
use std::{
env,
fs::File,
io::{self, prelude::*},
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use anyhow::{anyhow, bail, Context, Result};
use clap::ValueEnum;
use path_clean;
use scip::{types::Document, write_message_to_file};
use syntax_analysis::{get_globals, get_locals};
use tree_sitter_all_languages::ParserId;
@ -20,7 +26,7 @@ pub struct IndexOptions {
pub fail_fast: bool,
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
pub enum AnalysisMode {
/// Only extract occurrences of local definitions
Locals,
@ -39,12 +45,32 @@ impl AnalysisMode {
}
}
/// Source of the TAR data consumed by `IndexMode::TarArchive`.
pub enum TarMode {
/// Data is streamed from STDIN
Stdin,
/// Data is read from a .tar file
// `location` is the path of the archive that gets opened for reading.
File { location: PathBuf },
}
/// Selects where `index_command` takes the source files to index from.
pub enum IndexMode {
/// Index only this list of files, without checking file extensions
Files { list: Vec<String> },
/// Discover all files that can be handled by the chosen language
/// in the passed location (which has to be a directory)
// In this mode `location` is also used as the project root of the index,
// in place of the project root argument given to `index_command`.
Workspace { location: PathBuf },
/// Discover all files that can be handled by the chosen language
/// in either a .tar file, or from STDIN to which TAR data is streamed
TarArchive { input: TarMode },
}
/// Turns `path` into an absolute path without touching the filesystem.
///
/// A relative `path` is joined onto `cwd` and the result is lexically cleaned
/// with `path_clean::clean`. An already absolute `path` is returned as-is,
/// without any cleaning.
fn make_absolute(cwd: &Path, path: &Path) -> PathBuf {
    if path.is_relative() {
        return path_clean::clean(cwd.join(path));
    }
    path.to_owned()
}
pub fn index_command(
@ -55,20 +81,17 @@ pub fn index_command(
evaluate_against: Option<PathBuf>,
options: IndexOptions,
) -> Result<()> {
let p = ParserId::from_name(&language).unwrap();
let project_root = {
match index_mode {
IndexMode::Files { .. } => project_root,
IndexMode::Workspace { ref location } => location.clone(),
}
};
let parser_id = ParserId::from_name(&language)
.context(format!("No parser found for language {language}"))?;
let canonical_project_root = project_root.canonicalize().with_context(|| {
format!(
"Failed to canonicalize project root: {}",
project_root.display()
)
})?;
let cwd = env::current_dir().context("Failed to get the current working directory")?;
let absolute_project_root = make_absolute(
&cwd,
match &index_mode {
IndexMode::Workspace { location } => location,
_ => &project_root,
},
);
let mut index = scip::types::Index {
metadata: Some(scip::types::Metadata {
@ -79,68 +102,44 @@ pub fn index_command(
..Default::default()
})
.into(),
project_root: format!("file://{}", canonical_project_root.display()),
project_root: format!("file://{}", absolute_project_root.display()),
..Default::default()
})
.into(),
..Default::default()
};
let mut index_file = |filepath: &Path| -> Result<()> {
let contents = std::fs::read_to_string(filepath)
.with_context(|| format!("Failed to read file at {}", filepath.display()))?;
let filepath = if filepath.is_absolute() {
filepath.to_owned()
} else {
filepath.canonicalize().with_context(|| {
format!("Failed to canonicalize file path: {}", filepath.display())
})?
};
let relative_path = filepath
.strip_prefix(canonical_project_root.clone())
.with_context(|| {
format!(
"Failed to strip project root prefix: root={} file={}",
canonical_project_root.display(),
filepath.display()
)
})?;
match index_content(&contents, p, &options) {
Ok(mut document) => {
document.relative_path = relative_path.display().to_string();
index.documents.push(document);
Ok(())
}
Err(error) => {
if options.fail_fast {
Err(anyhow!(
"Failed to index {}: {:?}",
filepath.display(),
error
))
} else {
eprintln!("Failed to index {}: {:?}", filepath.display(), error);
Ok(())
}
}
}
};
let extensions = ParserId::language_extensions(&parser_id);
match index_mode {
IndexMode::Files { list } => {
let bar = create_progress_bar(list.len() as u64);
for filename in list {
let filepath = PathBuf::from(filename).canonicalize().unwrap();
bar.set_message(filepath.display().to_string());
index_file(&filepath)?;
bar.set_message(filename.clone());
let filepath = make_absolute(&cwd, &PathBuf::from(filename));
let document = index_file(&filepath, parser_id, &absolute_project_root, &options)?;
index.documents.push(document);
bar.inc(1);
}
bar.finish();
}
IndexMode::TarArchive { input } => match input {
TarMode::File { location } => {
let mut ar = tar::Archive::new(File::open(location)?);
let entries = ar.entries()?;
let documents = index_tar_entries(entries, parser_id, &options)?;
index.documents.extend(documents);
}
TarMode::Stdin => {
let stdin = io::stdin();
let mut ar: tar::Archive<_> = tar::Archive::new(stdin);
let entries = ar.entries()?;
let documents = index_tar_entries(entries, parser_id, &options)?;
index.documents.extend(documents);
}
},
IndexMode::Workspace { location } => {
let extensions = ParserId::language_extensions(&p);
let bar = create_spinner();
for entry in walkdir::WalkDir::new(location) {
@ -153,17 +152,21 @@ pub fn index_command(
};
if extensions.contains(extension) {
bar.set_message(entry.path().display().to_string());
index_file(&entry.into_path())?;
let document = index_file(
&entry.into_path(),
parser_id,
&absolute_project_root,
&options,
)?;
index.documents.push(document);
bar.tick();
}
}
}
}
eprintln!();
eprintln!(
"Writing index for {} documents into {}",
"\nWriting index for {} documents into {}",
index.documents.len(),
out.display()
);
@ -179,11 +182,102 @@ pub fn index_command(
.write_summary(&mut std::io::stdout(), Default::default())?
}
write_message_to_file(out.clone(), index)
write_message_to_file(&out, index)
.map_err(|err| anyhow!("{err:?}"))
.with_context(|| format!("When writing index to {}", out.display()))
}
fn index_file(
filepath: &Path,
parser_id: ParserId,
absolute_project_root: &Path,
options: &IndexOptions,
) -> Result<Document> {
let contents = std::fs::read_to_string(filepath)
.with_context(|| format!("Failed to read file at {}", filepath.display()))?;
let relative_path = filepath
.strip_prefix(absolute_project_root)
.with_context(|| {
format!(
"Failed to strip project root prefix: root={} file={}",
absolute_project_root.display(),
filepath.display()
)
})?;
match index_content(&contents, parser_id, options) {
Ok(mut document) => {
document.relative_path = relative_path.display().to_string();
Ok(document)
}
Err(error) => {
bail!("Failed to index {}: {:?}", filepath.display(), error)
}
}
}
fn index_tar_entries<R: Read>(
entries: tar::Entries<'_, R>,
parser: ParserId,
options: &IndexOptions,
) -> anyhow::Result<Vec<Document>> {
let extensions = ParserId::language_extensions(&parser);
let mut contents = String::new();
let mut documents: Vec<Document> = vec![];
let mut progress = 0;
let spinner = create_spinner();
for entry in entries {
let mut e = entry?;
let path = PathBuf::from(e.path()?);
if matches!(path.extension().and_then(|e| e.to_str()), Some(ext) if extensions.contains(ext))
{
match e.read_to_string(&mut contents) {
Ok(size) => {
match index_content(&contents, parser, options) {
Ok(mut document) => {
document.relative_path = path.display().to_string();
documents.push(document);
}
Err(error) => {
if options.fail_fast {
anyhow::bail!("Failed to index {}: {:?}", path.display(), error);
} else {
eprintln!("Failed to index {}: {:?}", path.display(), error);
}
}
}
if size > 0 {
contents.clear();
}
}
Err(error) => {
if options.fail_fast {
anyhow::bail!(
"Failed to read contents of path {}: {:?}",
path.display(),
error
)
} else {
eprintln!(
"Failed to read contents of path {}: {:?}",
path.display(),
error
);
}
}
}
progress += 1;
spinner.set_message(format!("[{}]: {}", progress, path.display()));
spinner.tick();
}
}
Ok(documents)
}
fn index_content(contents: &str, parser: ParserId, options: &IndexOptions) -> Result<Document> {
let mut document: Document;

View File

@ -1,7 +1,7 @@
use std::{path::PathBuf, process};
use clap::{Parser, Subcommand};
use scip_syntax::index::{index_command, AnalysisMode, IndexMode, IndexOptions};
use scip_syntax::index::{index_command, AnalysisMode, IndexMode, IndexOptions, TarMode};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
@ -11,44 +11,78 @@ struct Cli {
command: Commands,
}
// Options shared by all `index` subcommands; each `IndexCommand` variant
// embeds this struct via `#[command(flatten)]`.
// NOTE: the `///` comments below are rendered by clap as CLI help text.
#[derive(Parser, Clone, Debug)]
struct IndexCommandOptions {
/// Which language parser to use to process the files
#[arg(short, long)]
language: String,
/// Path where the SCIP index will be written
#[arg(short, long, default_value = "./index.scip")]
out: String,
/// Analysis mode
#[arg(short, long, default_value = "full")]
mode: AnalysisMode,
/// Fail on first error
#[arg(long, default_value_t = false)]
fail_fast: bool,
/// Project root to write to SCIP index
#[arg(short, long, default_value = "./")]
project_root: String,
/// Evaluate the build index against an index from a file
#[arg(long)]
evaluate: Option<String>,
}
// Subcommands of `index`, one per kind of input source. The `///` comments
// are rendered by clap as CLI help text, so they are written as full sentences.
#[derive(Subcommand, Debug)]
enum IndexCommand {
    /// Index a folder, automatically detecting files
    /// to be processed by the chosen language
    Workspace {
        /// Folder to index - will be chosen as project root,
        /// and files will be discovered according to
        /// configured extensions for the selected language.
        /// Has to be an absolute path.
        dir: String,

        #[command(flatten)]
        options: IndexCommandOptions,
    },

    /// Index a list of files
    Files {
        /// List of files to analyse
        filenames: Vec<String>,

        #[command(flatten)]
        options: IndexCommandOptions,
    },

    /// Index a .tar archive, either from a file or streaming from STDIN
    Tar {
        /// Either a path to .tar file, or "-" to read .tar data from STDIN
        tar: String,

        #[command(flatten)]
        options: IndexCommandOptions,
    },
}
#[derive(Parser, Debug)]
struct IndexCommandParser {
#[structopt(subcommand)]
index_command: IndexCommand,
}
#[derive(Subcommand)]
enum Commands {
/// Index source files using Tree Sitter parser for a given language
/// and produce a SCIP file
Index {
/// Which language parser to use to process the files
#[arg(short, long)]
language: String,
/// Path where the SCIP index will be written
#[arg(short, long, default_value = "./index.scip")]
out: String,
/// Folder to index - will be chosen as project root,
/// and files will be discovered according to
/// configured extensions for the selected language
#[arg(long)]
workspace: Option<String>,
/// List of files to analyse
filenames: Vec<String>,
/// Analysis mode
#[arg(short, long, default_value = "full")]
mode: AnalysisMode,
/// Fail on first error
#[arg(long, default_value_t = false)]
fail_fast: bool,
/// Project root to write to SCIP index
#[arg(short, long, default_value = "./")]
project_root: String,
/// Evaluate the build index against an index from a file
#[arg(long)]
evaluate: Option<String>,
},
#[clap(name = "index")]
Index(IndexCommandParser),
/// Fuzzily evaluate candidate SCIP index against known ground truth
ScipEvaluate {
@ -99,49 +133,44 @@ pub fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
match cli.command {
Commands::Index {
language,
out,
filenames,
workspace,
mode,
fail_fast,
project_root,
evaluate,
} => {
let index_mode = {
match workspace {
None => {
if filenames.is_empty() {
eprintln!("either specify --workspace or provide a list of files");
process::exit(1)
}
IndexMode::Files { list: filenames }
Commands::Index(index1) => {
let result = match index1.index_command {
IndexCommand::Files { filenames, options } => {
if filenames.is_empty() {
eprintln!("List of files cannot be empty");
process::exit(1)
}
Some(location) => {
if !filenames.is_empty() {
eprintln!("--workspace option cannot be combined with a list of files");
process::exit(1)
} else {
IndexMode::Workspace {
location: location.into(),
}
}
run_index_command(options, IndexMode::Files { list: filenames })
}
IndexCommand::Workspace { dir, options } => run_index_command(
options,
IndexMode::Workspace {
location: dir.into(),
},
),
IndexCommand::Tar { tar, options } => {
if tar == "-" {
run_index_command(
options,
IndexMode::TarArchive {
input: scip_syntax::index::TarMode::Stdin,
},
)
} else {
run_index_command(
options,
IndexMode::TarArchive {
input: TarMode::File {
location: PathBuf::from(tar),
},
},
)
}
}
};
index_command(
language,
index_mode,
PathBuf::from(out),
PathBuf::from(project_root),
evaluate.map(PathBuf::from),
IndexOptions {
analysis_mode: mode,
fail_fast,
},
)?
result.unwrap()
}
Commands::ScipEvaluate {
@ -166,3 +195,17 @@ pub fn main() -> anyhow::Result<()> {
}
Ok(())
}
fn run_index_command(options: IndexCommandOptions, mode: IndexMode) -> anyhow::Result<()> {
index_command(
options.language,
mode,
PathBuf::from(options.out),
PathBuf::from(options.project_root),
options.evaluate.map(PathBuf::from),
IndexOptions {
analysis_mode: options.mode,
fail_fast: options.fail_fast,
},
)
}

View File

@ -0,0 +1,4 @@
@Deprecated
package foo.bar;
class Baz {}

View File

@ -1,11 +1,13 @@
use std::{
collections::HashMap,
env::temp_dir,
collections::{HashMap, HashSet},
io::Write,
path::{Path, PathBuf},
process::Command,
process::{Command, Stdio},
};
use anyhow::{anyhow, bail, Context, Result};
use assert_cmd::{cargo::cargo_bin, prelude::*};
use scip::types::Document;
use scip_syntax::{
evaluate::Evaluator,
index::{index_command, AnalysisMode, IndexMode, IndexOptions},
@ -37,6 +39,7 @@ lazy_static::lazy_static! {
}
use syntax_analysis::snapshot::{dump_document_with_config, EmitSymbol, SnapshotOptions};
use tar::{Builder, Header};
fn snapshot_syntax_document(doc: &scip::types::Document, source: &str) -> String {
dump_document_with_config(
@ -54,7 +57,7 @@ fn snapshot_syntax_document(doc: &scip::types::Document, source: &str) -> String
fn java_e2e_evaluation() {
let dir = BASE.join("testdata/java");
let out_dir = temp_dir();
let out_dir = tempdir();
let candidate = out_dir.join("index-tree-sitter.scip");
@ -94,55 +97,273 @@ fn java_e2e_evaluation() {
}
#[test]
fn java_e2e_indexing() {
let out_dir = temp_dir();
let setup = HashMap::from([(
PathBuf::from("globals.java"),
include_str!("../testdata/globals.java").to_string(),
)]);
fn java_files_indexing() {
let out_dir = tempdir();
let setup = indexing_data();
run_index(&out_dir, &setup, vec!["--language", "java"]);
let mut cmd = command("index");
let output_location = out_dir.join("index.scip");
let paths = extract_paths(&setup);
let index = read_index_from_file(&out_dir.join("index.scip")).unwrap();
prepare(&out_dir, &setup).unwrap();
for doc in &index.documents {
let path = &doc.relative_path;
let dumped = snapshot_syntax_document(doc, setup.get(&PathBuf::from(&path)).expect("??"));
cmd.args(vec![
"files",
"--language",
"java",
"--out",
output_location.to_str().unwrap(),
])
.current_dir(&out_dir)
.args(paths)
.assert()
.success();
insta::assert_snapshot!(path.clone(), dumped);
}
let index = read_index_from_file(&output_location).unwrap();
assert_eq!(extract_paths(&setup), extract_indexed_paths(&index));
let index_snapshot = snapshot_from_files(&index.documents, &out_dir);
insta::assert_snapshot!(index_snapshot);
}
fn prepare(temp: &Path, files: &HashMap<PathBuf, String>) {
/// Runs `index workspace <dir>` over a directory populated with the test
/// files and snapshots the resulting index.
#[test]
fn java_workspace_indexing() {
    let setup = indexing_data();
    let out_dir = tempdir();
    let output_location = out_dir.join("index.scip");

    prepare(&out_dir, &setup).unwrap();

    let args = vec![
        "workspace",
        out_dir.to_str().unwrap(),
        "--language",
        "java",
        "--out",
        output_location.to_str().unwrap(),
    ];
    let mut cmd = command("index");
    cmd.args(args).assert().success();

    let index = read_index_from_file(&output_location).unwrap();

    // Every prepared file must be indexed, and nothing else.
    assert_eq!(extract_paths(&setup), extract_indexed_paths(&index));

    let index_snapshot = snapshot_from_files(&index.documents, &out_dir);
    insta::assert_snapshot!(index_snapshot);
}
/// Runs `index tar <file>` over a tar archive written to disk and snapshots
/// the resulting index.
#[test]
fn java_tar_file_indexing() {
    let setup = indexing_data();
    let data = create_tar(&setup).unwrap();

    let out_dir = tempdir();
    let tar_file = out_dir.join("test.tar");
    let output_location = out_dir.join("index.scip");

    write_file_bytes(&tar_file, &data).unwrap();

    let args = vec![
        "tar",
        tar_file.to_str().unwrap(),
        "--language",
        "java",
        "--out",
        output_location.to_str().unwrap(),
    ];
    let mut cmd = command("index");
    cmd.args(args).assert().success();

    let index = read_index_from_file(&output_location).unwrap();

    // Every archived file must be indexed, and nothing else.
    assert_eq!(extract_paths(&setup), extract_indexed_paths(&index));

    let index_snapshot = snapshot_from_data(&index.documents, &setup);
    insta::assert_snapshot!(index_snapshot);
}
/// Runs `index tar -`, streaming the tar archive to the process over STDIN,
/// and snapshots the resulting index.
#[test]
fn java_tar_stream_indexing() {
    let out_dir = tempdir();
    let setup = indexing_data();
    // The archive only ever lives in memory: it is piped to the child process,
    // so nothing needs to be written to disk for this test.
    let data = create_tar(&setup).unwrap();

    let mut cmd = command("index");
    let output_location = out_dir.join("index.scip");

    let mut spawned = cmd
        .args(vec![
            "tar",
            "-",
            "--language",
            "java",
            "--out",
            output_location.to_str().unwrap(),
        ])
        .stdin(Stdio::piped())
        .spawn()
        .unwrap();

    {
        // Dropping the handle at the end of this scope closes the pipe, which
        // signals end-of-input to the child before we wait on it.
        let mut stdin = spawned.stdin.take().unwrap();
        stdin.write_all(&data).unwrap();
    }

    let exit_status = spawned.wait().unwrap();
    assert_eq!(exit_status.code(), Some(0));

    let index = read_index_from_file(&output_location).unwrap();

    assert_eq!(extract_paths(&setup), extract_indexed_paths(&index));

    let index_snapshot = snapshot_from_data(&index.documents, &setup);

    insta::assert_snapshot!(index_snapshot);
}
fn prepare(temp: &Path, files: &HashMap<PathBuf, String>) -> Result<()> {
for (path, contents) in files.iter() {
let file_path = temp.join(path);
write_file(&file_path, contents);
write_file_string(&file_path, contents)?;
}
Ok(())
}
fn run_index(location: &PathBuf, files: &HashMap<PathBuf, String>, extra_arguments: Vec<&str>) {
prepare(location, files);
let mut base_args = vec!["index"];
base_args.extend(extra_arguments);
fn command(sub: &str) -> Command {
let mut cmd = Command::new(BINARY_LOCATION.to_str().unwrap());
cmd.args(base_args);
cmd.arg(sub);
for (path, _) in files.iter() {
cmd.arg(path.to_str().unwrap());
}
cmd.current_dir(location);
cmd.assert().success();
cmd
}
fn write_file(path: &PathBuf, contents: &String) {
/// Writes the bytes of `contents` to `path` by delegating to `write_file_bytes`.
fn write_file_string(path: &PathBuf, contents: &String) -> Result<()> {
    let bytes: &[u8] = contents.as_bytes();
    write_file_bytes(path, bytes)
}
fn write_file_bytes(path: &PathBuf, contents: &[u8]) -> Result<()> {
use std::io::Write;
let output = std::fs::File::create(path).unwrap();
let Some(parent) = path.parent() else {
bail!("failed to find parent dir for {}", path.display())
};
std::fs::create_dir_all(parent)
.with_context(|| anyhow!("Failed to create all parent folders for {}", path.display()))?;
let output = std::fs::File::create(path)
.with_context(|| anyhow!("Failed to open file {} for writing", path.to_str().unwrap()))?;
let mut writer = std::io::BufWriter::new(output);
writer.write_all(contents.as_bytes()).unwrap();
writer.write_all(contents)?;
Ok(())
}
/// Creates a fresh temporary directory and returns its path.
///
/// Panics if the directory cannot be created. The directory handle is
/// converted into a plain path, so it is not removed automatically.
fn tempdir() -> PathBuf {
    let dir = tempfile::tempdir().unwrap();
    dir.into_path()
}
/// Builds an in-memory TAR archive with one entry per `(path, text)` pair in
/// `files` and returns the archive bytes.
///
/// # Errors
/// Returns the underlying I/O error when a path cannot be stored in an entry
/// header, when an entry cannot be appended, or when the archive cannot be
/// finished.
fn create_tar(files: &HashMap<PathBuf, String>) -> Result<Vec<u8>, std::io::Error> {
    let mut ar = Builder::new(Vec::new());

    for (path, text) in files.iter() {
        let bytes = text.as_bytes();

        let mut header = Header::new_gnu();
        // Errors are propagated to the caller rather than panicking here, in
        // line with the function's `Result` return type.
        header.set_path(path)?;
        header.set_size(bytes.len() as u64);
        // The checksum covers the header fields, so it is computed last.
        header.set_cksum();

        ar.append(&header, bytes)?;
    }

    ar.into_inner()
}
/// Returns the test files shared by the indexing tests, keyed by the path
/// each file should have relative to the project root.
fn indexing_data() -> HashMap<PathBuf, String> {
    let mut files = HashMap::new();
    files.insert(
        PathBuf::from("src/main/java/globals.java"),
        include_str!("../testdata/globals.java").to_string(),
    );
    files.insert(
        PathBuf::from("package-info.java"),
        include_str!("../testdata/package-info.java").to_string(),
    );
    files
}
/// Collects the keys of `setup` as a set of strings.
///
/// Panics if a path is not valid UTF-8.
fn extract_paths(setup: &HashMap<PathBuf, String>) -> HashSet<String> {
    let mut paths = HashSet::with_capacity(setup.len());
    for path in setup.keys() {
        let utf8 = path.to_str().unwrap();
        paths.insert(utf8.to_string());
    }
    paths
}
/// Collects the `relative_path` of every document in `index` into a set.
fn extract_indexed_paths(index: &scip::types::Index) -> HashSet<String> {
    let mut paths = HashSet::new();
    for document in index.documents.iter() {
        paths.insert(document.relative_path.clone());
    }
    paths
}
/// Renders a combined snapshot of `docs`, ordered by `relative_path`, reading
/// each document's source text from disk under `project_root`.
///
/// Panics if a source file cannot be read.
fn snapshot_from_files(docs: &[Document], project_root: &Path) -> String {
    let mut ordered: Vec<&Document> = docs.iter().collect();
    ordered.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));

    ordered
        .into_iter()
        .map(|doc| {
            let path = project_root.join(&doc.relative_path);
            let contents = std::fs::read_to_string(&path)
                .with_context(|| anyhow!("Failed to read path {}", path.display()))
                .unwrap();
            format_snapshot_document(doc, &contents)
        })
        .collect()
}
/// Renders one document for a snapshot: a `//----FILE=<path>` header line,
/// the annotated source produced by `snapshot_syntax_document`, and two
/// trailing newlines that separate it from the next document.
fn format_snapshot_document(doc: &scip::types::Document, contents: &str) -> String {
    format!(
        "//----FILE={}\n{}\n\n",
        doc.relative_path,
        snapshot_syntax_document(doc, contents)
    )
}
fn snapshot_from_data(docs: &[Document], data: &HashMap<PathBuf, String>) -> String {
let mut str = String::new();
let mut docs = docs.to_owned();
docs.sort_by_key(|doc| doc.relative_path.clone());
for doc in docs {
let contents = data
.get(&PathBuf::from(&doc.relative_path))
.context(format!("Failed to find {} in data", &doc.relative_path))
.unwrap();
str.push_str(&format_snapshot_document(&doc, contents));
}
str
}

View File

@ -1,7 +1,17 @@
---
source: crates/scip-syntax/tests/integration_test.rs
expression: dumped
expression: index_snapshot
---
//----FILE=package-info.java
@Deprecated
package foo.bar;
// ^^^^^^^ definition(Package) scip-ctags `foo.bar`/
class Baz {}
// ^^^ definition scip-ctags `foo.bar`/Baz#
//----FILE=src/main/java/globals.java
package MyPackage;
// ^^^^^^^^^ definition(Package) scip-ctags MyPackage/
@ -72,3 +82,5 @@ expression: dumped
}
}

View File

@ -0,0 +1,86 @@
---
source: crates/scip-syntax/tests/integration_test.rs
expression: index_snapshot
---
//----FILE=package-info.java
@Deprecated
package foo.bar;
// ^^^^^^^ definition(Package) scip-ctags `foo.bar`/
class Baz {}
// ^^^ definition scip-ctags `foo.bar`/Baz#
//----FILE=src/main/java/globals.java
package MyPackage;
// ^^^^^^^^^ definition(Package) scip-ctags MyPackage/
public class globals {
// ^^^^^^^ definition scip-ctags MyPackage/globals#
private static int field1;
// ^^^^^^ definition scip-ctags MyPackage/globals#field1.
protected static int field2;
// ^^^^^^ definition scip-ctags MyPackage/globals#field2.
public static int field3;
// ^^^^^^ definition scip-ctags MyPackage/globals#field3.
private int field4;
// ^^^^^^ definition scip-ctags MyPackage/globals#field4.
protected int field5;
// ^^^^^^ definition scip-ctags MyPackage/globals#field5.
public int field6;
// ^^^^^^ definition scip-ctags MyPackage/globals#field6.
private static void method1() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method1().
protected static void method2() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method2().
public static void method3() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method3().
private void method4() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method4().
protected void method5() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method5().
public void method6() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method6().
public static final String COOLEST_STRING = "probably this one";
// ^^^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#COOLEST_STRING.
public class ClassInAClass {
// ^^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#
boolean classy = true;
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#classy.
public static enum Enum {
// ^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#
these,
// ^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#these.
should,
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#should.
be,
// ^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#be.
recognized,
// ^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#recognized.
as,
// ^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#as.
terms
// ^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#terms.
}
public interface Goated {
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Goated#
boolean withTheSauce();
// ^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Goated#withTheSauce().
}
public void myCoolMethod() {
// ^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#myCoolMethod().
class WhatIsGoingOn {}
boolean iThinkThisIsAllowedButWeDontReallyCare = true;
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ definition local 1
}
}
}

View File

@ -0,0 +1,86 @@
---
source: crates/scip-syntax/tests/integration_test.rs
expression: index_snapshot
---
//----FILE=package-info.java
@Deprecated
package foo.bar;
// ^^^^^^^ definition(Package) scip-ctags `foo.bar`/
class Baz {}
// ^^^ definition scip-ctags `foo.bar`/Baz#
//----FILE=src/main/java/globals.java
package MyPackage;
// ^^^^^^^^^ definition(Package) scip-ctags MyPackage/
public class globals {
// ^^^^^^^ definition scip-ctags MyPackage/globals#
private static int field1;
// ^^^^^^ definition scip-ctags MyPackage/globals#field1.
protected static int field2;
// ^^^^^^ definition scip-ctags MyPackage/globals#field2.
public static int field3;
// ^^^^^^ definition scip-ctags MyPackage/globals#field3.
private int field4;
// ^^^^^^ definition scip-ctags MyPackage/globals#field4.
protected int field5;
// ^^^^^^ definition scip-ctags MyPackage/globals#field5.
public int field6;
// ^^^^^^ definition scip-ctags MyPackage/globals#field6.
private static void method1() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method1().
protected static void method2() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method2().
public static void method3() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method3().
private void method4() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method4().
protected void method5() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method5().
public void method6() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method6().
public static final String COOLEST_STRING = "probably this one";
// ^^^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#COOLEST_STRING.
public class ClassInAClass {
// ^^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#
boolean classy = true;
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#classy.
public static enum Enum {
// ^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#
these,
// ^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#these.
should,
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#should.
be,
// ^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#be.
recognized,
// ^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#recognized.
as,
// ^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#as.
terms
// ^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#terms.
}
public interface Goated {
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Goated#
boolean withTheSauce();
// ^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Goated#withTheSauce().
}
public void myCoolMethod() {
// ^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#myCoolMethod().
class WhatIsGoingOn {}
boolean iThinkThisIsAllowedButWeDontReallyCare = true;
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ definition local 1
}
}
}

View File

@ -0,0 +1,86 @@
---
source: crates/scip-syntax/tests/integration_test.rs
expression: index_snapshot
---
//----FILE=package-info.java
@Deprecated
package foo.bar;
// ^^^^^^^ definition(Package) scip-ctags `foo.bar`/
class Baz {}
// ^^^ definition scip-ctags `foo.bar`/Baz#
//----FILE=src/main/java/globals.java
package MyPackage;
// ^^^^^^^^^ definition(Package) scip-ctags MyPackage/
public class globals {
// ^^^^^^^ definition scip-ctags MyPackage/globals#
private static int field1;
// ^^^^^^ definition scip-ctags MyPackage/globals#field1.
protected static int field2;
// ^^^^^^ definition scip-ctags MyPackage/globals#field2.
public static int field3;
// ^^^^^^ definition scip-ctags MyPackage/globals#field3.
private int field4;
// ^^^^^^ definition scip-ctags MyPackage/globals#field4.
protected int field5;
// ^^^^^^ definition scip-ctags MyPackage/globals#field5.
public int field6;
// ^^^^^^ definition scip-ctags MyPackage/globals#field6.
private static void method1() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method1().
protected static void method2() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method2().
public static void method3() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method3().
private void method4() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method4().
protected void method5() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method5().
public void method6() {}
// ^^^^^^^ definition scip-ctags MyPackage/globals#method6().
public static final String COOLEST_STRING = "probably this one";
// ^^^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#COOLEST_STRING.
public class ClassInAClass {
// ^^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#
boolean classy = true;
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#classy.
public static enum Enum {
// ^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#
these,
// ^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#these.
should,
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#should.
be,
// ^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#be.
recognized,
// ^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#recognized.
as,
// ^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#as.
terms
// ^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Enum#terms.
}
public interface Goated {
// ^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Goated#
boolean withTheSauce();
// ^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#Goated#withTheSauce().
}
public void myCoolMethod() {
// ^^^^^^^^^^^^ definition scip-ctags MyPackage/globals#ClassInAClass#myCoolMethod().
class WhatIsGoingOn {}
boolean iThinkThisIsAllowedButWeDontReallyCare = true;
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ definition local 1
}
}
}