mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:51:59 +00:00
Backend: add line index (#63726)
This adds a line index utility. Frequently, I want to be able to efficiently index a file to extract a specific line or range of lines, but it's surprisingly tricky to get exactly right given weird definitions of "what even is a line" and edge conditions around out-of-bounds and such. So this adds a general-purpose utility to pre-calculate the locations of lines in the file, making extracting a line range a zero-allocation, `O(1)` operation. Not implemented: the same index can also be used to find the line that contains an offset, which I've also needed to do before. But I'll save that for when I actually have an immediate use for it.
This commit is contained in:
parent
0fc4d2811a
commit
5d8286b90f
@ -4,6 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
|
||||
go_library(
|
||||
name = "byteutils",
|
||||
srcs = [
|
||||
"lineindex.go",
|
||||
"linereader.go",
|
||||
"nullscanner.go",
|
||||
],
|
||||
@ -14,7 +15,10 @@ go_library(
|
||||
|
||||
go_test(
|
||||
name = "byteutils_test",
|
||||
srcs = ["linereader_test.go"],
|
||||
srcs = [
|
||||
"lineindex_test.go",
|
||||
"linereader_test.go",
|
||||
],
|
||||
embed = [":byteutils"],
|
||||
tags = [TAG_PLATFORM_SOURCE],
|
||||
deps = [":byteutils"],
|
||||
)
|
||||
|
||||
108
internal/byteutils/lineindex.go
Normal file
108
internal/byteutils/lineindex.go
Normal file
@ -0,0 +1,108 @@
|
||||
package byteutils
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"math"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// NewLineIndex creates a new LineIndex from some file content.
|
||||
func NewLineIndex[T ~string | ~[]byte](content T) LineIndex {
|
||||
if len(content) > math.MaxUint32 {
|
||||
panic("content too large")
|
||||
}
|
||||
|
||||
// PERF: count the newlines in advance to allocate the index slice exactly
|
||||
// Explicitly case on the type rather than casting because the generics
|
||||
// seem to break the optimization that allows the allocation to be elided.
|
||||
var newlineCount int
|
||||
switch v := any(content).(type) {
|
||||
case string:
|
||||
newlineCount = strings.Count(v, "\n")
|
||||
case []byte:
|
||||
newlineCount = bytes.Count(v, []byte("\n"))
|
||||
}
|
||||
|
||||
index := make(LineIndex, 0, newlineCount+2)
|
||||
index = append(index, 0)
|
||||
offset := 0
|
||||
for {
|
||||
var loc int
|
||||
switch v := any(content).(type) {
|
||||
case string:
|
||||
loc = strings.IndexByte(v[offset:], '\n')
|
||||
case []byte:
|
||||
loc = bytes.IndexByte(v[offset:], '\n')
|
||||
}
|
||||
if loc == -1 {
|
||||
break
|
||||
}
|
||||
index = append(index, uint32(offset+loc+1))
|
||||
offset += loc + 1
|
||||
}
|
||||
index = append(index, uint32(len(content)))
|
||||
return index
|
||||
}
|
||||
|
||||
// LineIndex contains the line boundaries of the indexed content.
// Its structure is:
//   - A leading 0
//   - A sorted list of every byte offset _after_ a newline byte
//   - A trailing len(content)
//
// This means:
//   - LineIndex[N] is the offset of the first byte of line N
//   - LineIndex[N+1] is the offset of the first byte after line N
//   - content[LineIndex[N]:LineIndex[N+1]] is the contents of line N
//
// A nil or otherwise empty LineIndex is treated as an index of empty content.
type LineIndex []uint32

// LineRange returns a range that can be used to slice the indexed content to obtain
// the line for the given number. The range is guaranteed to be a valid slice
// into the content if the content is unchanged. If the line number refers to a
// line that does not exist, a zero-length range will be returned pointing to
// the beginning (for underflow) or end (for overflow) of the file.
//
// lineNumber is 0-indexed, and the returned range includes the terminating
// newline (if it exists). Equivalent to LinesRange(lineNumber, lineNumber + 1).
func (l LineIndex) LineRange(lineNumber int) (int, int) {
	return l.LinesRange(lineNumber, lineNumber+1)
}

// LinesRange returns a range that can be used to slice the indexed content to
// obtain the lines for the given half-open range. The range is guaranteed to
// be a valid slice into the content if the content is unchanged. If the
// requested range of lines does not exist, it will be truncated to return the
// set of lines in that range that does exist.
//
// Line numbers are 0-indexed, and the returned range includes the terminating
// newline (if it exists).
func (l LineIndex) LinesRange(startLine, endLine int) (int, int) {
	// An empty index has no entries to clamp to; report an empty range
	// instead of indexing out of bounds.
	if len(l) == 0 {
		return 0, 0
	}
	last := len(l) - 1
	// Clamp both ends into [0, last] and never let the end precede the start.
	startLine = min(max(0, startLine), last)
	endLine = min(max(startLine, endLine), last)
	return int(l[startLine]), int(l[endLine])
}

// LineCount returns the number of lines in the indexed content.
//
// For the purpose of this package, a line is defined as:
//   - zero or more non-newline bytes terminated by a newline byte
//   - OR one or more non-newline bytes terminated by the end of the file.
//
// Equivalently, the regex `[^\n]*\n|[^\n]+$`
//
// Equivalently, a newline at the last byte of the file does not
// start an empty last line.
//
// Notably, this is at odds with the POSIX standard:
// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_206
func (l LineIndex) LineCount() int {
	if len(l) < 2 {
		return 0
	}
	contentEnd := l[len(l)-1]
	lastLineStart := l[len(l)-2]
	// When the final line would start at the end of the content it is empty
	// (the content is empty or ends in a newline), so it is not counted.
	if contentEnd == lastLineStart {
		return len(l) - 2
	}
	return len(l) - 1
}

// NewlineCount is simply the number of newline bytes in the content.
func (l LineIndex) NewlineCount() int {
	// The index holds one entry per newline plus the leading 0 and the
	// trailing len(content); never report a negative count for an empty index.
	return max(len(l)-2, 0)
}
|
||||
133
internal/byteutils/lineindex_test.go
Normal file
133
internal/byteutils/lineindex_test.go
Normal file
@ -0,0 +1,133 @@
|
||||
package byteutils
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"testing"
|
||||
"testing/quick"
|
||||
)
|
||||
|
||||
// naiveGetLines is the reference implementation the tests compare LineIndex
// against. It splits contents after every newline, drops the empty piece left
// behind by a trailing newline (or by empty contents), clamps the half-open
// line range [lineStart, lineEnd) to the lines that exist, and returns those
// lines concatenated.
func naiveGetLines(contents string, lineStart, lineEnd int) string {
	pieces := strings.SplitAfter(contents, "\n")
	if last := len(pieces) - 1; pieces[last] == "" {
		pieces = pieces[:last]
	}

	total := len(pieces)
	from := min(total, max(lineStart, 0))
	to := min(total, max(lineEnd, from))

	var sb strings.Builder
	for _, piece := range pieces[from:to] {
		sb.WriteString(piece)
	}
	return sb.String()
}
|
||||
|
||||
// testCases pairs file contents with a half-open line range. The table is
// used both by the table-driven test and as the seed corpus for the fuzz test.
var testCases = []struct {
	contents  string
	startLine int
	endLine   int
}{
	{contents: "no trailing newline", startLine: 0, endLine: 1},
	{contents: "trailing newline\n", startLine: 0, endLine: 1},
	{contents: "trailing newline\nfollowed by no trailing newline", startLine: 0, endLine: 2},
	{contents: "", startLine: 0, endLine: 0},
	{contents: "\n", startLine: 0, endLine: 1},
	{contents: "\n\n\n", startLine: 0, endLine: 3},

	// Out of bounds
	{contents: "\n\n\n", startLine: -1, endLine: 4},
	{contents: "\n\n\n", startLine: -1, endLine: -1},
	{contents: "\n\n\n", startLine: 4, endLine: 4},
}
|
||||
|
||||
func TestNewlineIndex(t *testing.T) {
|
||||
lineIndexGetLines := func(contents string, startLine, endLine int) string {
|
||||
index := NewLineIndex(contents)
|
||||
start, end := index.LinesRange(startLine, endLine)
|
||||
return contents[start:end]
|
||||
}
|
||||
|
||||
t.Run("cases", func(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
got := lineIndexGetLines(tc.contents, tc.startLine, tc.endLine)
|
||||
want := naiveGetLines(tc.contents, tc.startLine, tc.endLine)
|
||||
if want != got {
|
||||
t.Log(tc)
|
||||
t.Fatalf("got: %q, want: %q", got, want)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("quick", func(t *testing.T) {
|
||||
quick.CheckEqual(lineIndexGetLines, naiveGetLines, nil)
|
||||
})
|
||||
|
||||
t.Run("line count", func(t *testing.T) {
|
||||
cases := []struct {
|
||||
content string
|
||||
lineCount int
|
||||
}{
|
||||
{"", 0},
|
||||
{"test", 1},
|
||||
{"test\n", 1},
|
||||
{"test\ntest", 2},
|
||||
{"test\ntest\n", 2},
|
||||
{"\n", 1},
|
||||
{"\n\n", 2},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
index := NewLineIndex(tc.content)
|
||||
if index.LineCount() != tc.lineCount {
|
||||
t.Fatalf("got %q, want %q", index.LineCount(), tc.lineCount)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("string allocs", func(t *testing.T) {
|
||||
contents := strings.Repeat("testline\n", 1000)
|
||||
allocs := testing.AllocsPerRun(10, func() {
|
||||
_ = NewLineIndex(contents)
|
||||
})
|
||||
if allocs != 1 {
|
||||
t.Fatalf("expected one alloc got %f", allocs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("byte allocs", func(t *testing.T) {
|
||||
contents := bytes.Repeat([]byte("testline\n"), 1000)
|
||||
allocs := testing.AllocsPerRun(10, func() {
|
||||
_ = NewLineIndex(contents)
|
||||
})
|
||||
if allocs != 1 {
|
||||
t.Fatalf("expected one alloc, got %f", allocs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func FuzzNewlineIndex(f *testing.F) {
|
||||
for _, tc := range testCases {
|
||||
f.Add(tc.contents, tc.startLine, tc.endLine)
|
||||
}
|
||||
f.Fuzz(func(t *testing.T, contents string, startLine, endLine int) {
|
||||
index := NewLineIndex(contents)
|
||||
start, end := index.LinesRange(startLine, endLine)
|
||||
got := contents[start:end]
|
||||
want := naiveGetLines(contents, startLine, endLine)
|
||||
if want != got {
|
||||
t.Fatalf("got: %q, want: %q", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkLineIndex(b *testing.B) {
|
||||
b.Run("construct string", func(b *testing.B) {
|
||||
contents := strings.Repeat("testline\n", 1000)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = NewLineIndex(contents)
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("construct bytes", func(b *testing.B) {
|
||||
contents := bytes.Repeat([]byte("testline\n"), 1000)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = NewLineIndex(contents)
|
||||
}
|
||||
})
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user