Backend: add line index (#63726)

This adds a line index utility. Frequently, I want to be able to efficiently index a file to extract a specific line or range of lines, but it's surprisingly tricky to get exactly right given weird definitions of "what even is a line" and edge conditions around out-of-bounds and such. So this adds a general-purpose utility to pre-calculate the locations of lines in the file, making extracting a line range a zero-allocation, `O(1)` operation. Not implemented: the same index can also be used to find the line that contains an offset, which I've also needed to do before. But I'll save that for when I actually have an immediate use for it.
2026-02-06 18:51:59 +00:00 · 2024-07-09 13:59:42 -06:00 · 2024-07-09 13:59:42 -06:00 · 5d8286b90f
commit 5d8286b90f
parent 0fc4d2811a
3 changed files with 247 additions and 2 deletions
--- a/internal/byteutils/BUILD.bazel
+++ b/internal/byteutils/BUILD.bazel
@ -4,6 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 go_library(
    name = "byteutils",
    srcs = [
+        "lineindex.go",
        "linereader.go",
        "nullscanner.go",
    ],
@ -14,7 +15,10 @@ go_library(

 go_test(
    name = "byteutils_test",
-    srcs = ["linereader_test.go"],
+    srcs = [
+        "lineindex_test.go",
+        "linereader_test.go",
+    ],
+    embed = [":byteutils"],
    tags = [TAG_PLATFORM_SOURCE],
-    deps = [":byteutils"],
 )
--- a/internal/byteutils/lineindex.go
+++ b/internal/byteutils/lineindex.go
@ -0,0 +1,108 @@
+package byteutils
+
+import (
+	"bytes"
+	"math"
+	"strings"
+)
+
+// NewLineIndex creates a new LineIndex from some file content.
+func NewLineIndex[T ~string | ~[]byte](content T) LineIndex {
+	if len(content) > math.MaxUint32 {
+		panic("content too large")
+	}
+
+	// PERF: count the newlines in advance to allocate the index slice exactly
+	// Explicitly case on the type rather than casting because the generics
+	// seem to break the optimization that allows the allocation to be elided.
+	var newlineCount int
+	switch v := any(content).(type) {
+	case string:
+		newlineCount = strings.Count(v, "\n")
+	case []byte:
+		newlineCount = bytes.Count(v, []byte("\n"))
+	}
+
+	index := make(LineIndex, 0, newlineCount+2)
+	index = append(index, 0)
+	offset := 0
+	for {
+		var loc int
+		switch v := any(content).(type) {
+		case string:
+			loc = strings.IndexByte(v[offset:], '\n')
+		case []byte:
+			loc = bytes.IndexByte(v[offset:], '\n')
+		}
+		if loc == -1 {
+			break
+		}
+		index = append(index, uint32(offset+loc+1))
+		offset += loc + 1
+	}
+	index = append(index, uint32(len(content)))
+	return index
+}
+
+// LineIndex contains the line boundaries of the indexed content.
+// Its structure is:
+// - A leading 0
+// - A sorted list of every byte offset _after_ a newline byte
+// - A trailing len(content)
+//
+// This means:
+// - LineIndex[N] is the offset of the first byte of line N
+// - LineIndex[N+1] is the offset of the first byte after line N
+// - content[LineIndex[N]:LineIndex[N+1]] is the contents of line N
+type LineIndex []uint32
+
+// LineRange returns a range that can be used to slice the indexed content to obtain
+// the line for the given number. The range is guaranteed to be a valid slice
+// into the content if the content is unchanged. If the line number refers to a
+// line that does not exist, a zero-length range will be returned pointing to
+// the beginning (for underflow) or end (for overflow) of the file.
+//
+// lineNumber is 0-indexed, and the returned range includes the terminating
+// newline (if it exists). Equivalent to Lines(lineNumber, lineNumber + 1).
+func (l LineIndex) LineRange(lineNumber int) (int, int) {
+	return l.LinesRange(lineNumber, lineNumber+1)
+}
+
+// LinesRange returns a range that can be used to slice the indexed content to
+// obtain the lines for the given half-open range. The range is guaranteed to
+// be a valid slice into the content if the content is unchanged. If the
+// requested range of lines does not exist, it will be truncated to return the
+// set of lines in that range that does exist.
+//
+// line numbers are 0-indexed, and the returned range includes the terminating
+// newline (if it exists).
+func (l LineIndex) LinesRange(startLine, endLine int) (int, int) {
+	startLine = min(max(0, startLine), len(l)-1)
+	endLine = min(max(startLine, endLine), len(l)-1)
+	return int(l[startLine]), int(l[endLine])
+}
+
+// For the purpose of this package, a line is defined as:
+// - zero or more non-newline bytes terminated by a newline byte
+// - OR one more non-newline terminated by the end of the file.
+//
+// Equivalently, the regex `[^\n]*\n|[^\n]+$`
+//
+// Equivalently, a newline at the last byte of the file does not
+// start an empty last line.
+//
+// Notably, this is at odds with the POSIX standard:
+// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_206
+func (l LineIndex) LineCount() int {
+	lastLineEnd := l[len(l)-1]
+	contentEnd := l[len(l)-2]
+	if lastLineEnd == contentEnd {
+		return len(l) - 2
+	}
+	return len(l) - 1
+}
+
+// NewlineCount is simply the number of newline bytes in the content
+func (l LineIndex) NewlineCount() int {
+	return len(l) - 2
+}
--- a/internal/byteutils/lineindex_test.go
+++ b/internal/byteutils/lineindex_test.go
@ -0,0 +1,133 @@
+package byteutils
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+	"testing/quick"
+)
+
+func naiveGetLines(contents string, lineStart, lineEnd int) string {
+	lines := strings.SplitAfter(contents, "\n")
+	if len(lines[len(lines)-1]) == 0 {
+		lines = lines[:len(lines)-1]
+	}
+	clampedStart := min(max(0, lineStart), len(lines))
+	clampedEnd := min(max(clampedStart, lineEnd), len(lines))
+	return strings.Join(lines[clampedStart:clampedEnd], "")
+}
+
+var testCases = []struct {
+	contents           string
+	startLine, endLine int
+}{
+	{"no trailing newline", 0, 1},
+	{"trailing newline\n", 0, 1},
+	{"trailing newline\nfollowed by no trailing newline", 0, 2},
+	{"", 0, 0},
+	{"\n", 0, 1},
+	{"\n\n\n", 0, 3},
+
+	// Out of bounds
+	{"\n\n\n", -1, 4},
+	{"\n\n\n", -1, -1},
+	{"\n\n\n", 4, 4},
+}
+
+func TestNewlineIndex(t *testing.T) {
+	lineIndexGetLines := func(contents string, startLine, endLine int) string {
+		index := NewLineIndex(contents)
+		start, end := index.LinesRange(startLine, endLine)
+		return contents[start:end]
+	}
+
+	t.Run("cases", func(t *testing.T) {
+		for _, tc := range testCases {
+			got := lineIndexGetLines(tc.contents, tc.startLine, tc.endLine)
+			want := naiveGetLines(tc.contents, tc.startLine, tc.endLine)
+			if want != got {
+				t.Log(tc)
+				t.Fatalf("got: %q, want: %q", got, want)
+			}
+		}
+	})
+
+	t.Run("quick", func(t *testing.T) {
+		quick.CheckEqual(lineIndexGetLines, naiveGetLines, nil)
+	})
+
+	t.Run("line count", func(t *testing.T) {
+		cases := []struct {
+			content   string
+			lineCount int
+		}{
+			{"", 0},
+			{"test", 1},
+			{"test\n", 1},
+			{"test\ntest", 2},
+			{"test\ntest\n", 2},
+			{"\n", 1},
+			{"\n\n", 2},
+		}
+
+		for _, tc := range cases {
+			index := NewLineIndex(tc.content)
+			if index.LineCount() != tc.lineCount {
+				t.Fatalf("got %q, want %q", index.LineCount(), tc.lineCount)
+			}
+		}
+	})
+
+	t.Run("string allocs", func(t *testing.T) {
+		contents := strings.Repeat("testline\n", 1000)
+		allocs := testing.AllocsPerRun(10, func() {
+			_ = NewLineIndex(contents)
+		})
+		if allocs != 1 {
+			t.Fatalf("expected one alloc got %f", allocs)
+		}
+	})
+
+	t.Run("byte allocs", func(t *testing.T) {
+		contents := bytes.Repeat([]byte("testline\n"), 1000)
+		allocs := testing.AllocsPerRun(10, func() {
+			_ = NewLineIndex(contents)
+		})
+		if allocs != 1 {
+			t.Fatalf("expected one alloc, got %f", allocs)
+		}
+	})
+}
+
+func FuzzNewlineIndex(f *testing.F) {
+	for _, tc := range testCases {
+		f.Add(tc.contents, tc.startLine, tc.endLine)
+	}
+	f.Fuzz(func(t *testing.T, contents string, startLine, endLine int) {
+		index := NewLineIndex(contents)
+		start, end := index.LinesRange(startLine, endLine)
+		got := contents[start:end]
+		want := naiveGetLines(contents, startLine, endLine)
+		if want != got {
+			t.Fatalf("got: %q, want: %q", got, want)
+		}
+	})
+}
+
+func BenchmarkLineIndex(b *testing.B) {
+	b.Run("construct string", func(b *testing.B) {
+		contents := strings.Repeat("testline\n", 1000)
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			_ = NewLineIndex(contents)
+		}
+	})
+
+	b.Run("construct bytes", func(b *testing.B) {
+		contents := bytes.Repeat([]byte("testline\n"), 1000)
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			_ = NewLineIndex(contents)
+		}
+	})
+}