diff --git a/internal/byteutils/linereader.go b/internal/byteutils/linereader.go
new file mode 100644
index 00000000000..3ecfcc848e1
--- /dev/null
+++ b/internal/byteutils/linereader.go
@@ -0,0 +1,69 @@
+package byteutils
+
+import "bytes"
+
+// NewLineReader creates a new lineReader instance that reads lines from data.
+// It is more memory efficient than bytes.Split, because it does not require 24 bytes
+// for each subslice it generates, and instead returns one subslice at a time.
+// Benchmarks prove it is faster _and_ more memory efficient than bytes.Split, see
+// the test file for details.
+// Note: This behaves slightly differently to bytes.Split!
+// For an empty input, it does NOT read a single line, like bytes.Split would.
+// Also, it does NOT return a final empty line if the input is terminated with
+// a final newline.
+//
+// data is the byte slice to read lines from.
+//
+// A lineReader can be used to iterate over lines in a byte slice.
+//
+// For example:
+//
+//	data := []byte("hello\nworld\n")
+//	reader := byteutils.NewLineReader(data)
+//
+//	for reader.Scan() {
+//		line := reader.Line()
+//		// Use line...
+//	}
+func NewLineReader(data []byte) lineReader {
+	return lineReader{data: data}
+}
+
+// lineReader is a struct that can be used to iterate over lines in a byte slice.
+type lineReader struct {
+	i       int    // offset into data at which the next Scan starts
+	data    []byte // input being split into lines
+	current []byte // line found by the most recent Scan; a subslice of data
+}
+
+// Scan advances the lineReader to the next line and returns true, or returns false if there are no more lines.
+// The lineReader's current field will be updated to contain the next line.
+// Scan must be called before calling Line.
+func (r *lineReader) Scan() bool {
+	// All input consumed: report that no further line is available.
+	if r.i >= len(r.data) {
+		return false
+	}
+	// The next line begins at the current read offset.
+	start := r.i
+	// Locate the next newline relative to start (-1 when none remains).
+	i := bytes.IndexByte(r.data[start:], '\n')
+	if i >= 0 {
+		// The line is everything up to, but not including, the newline.
+		r.current = r.data[start : start+i]
+		// Resume right after the newline on the next call.
+		r.i += i + 1
+		return true
+	}
+	// No newline left: the remainder of the data is the final line.
+	r.current = r.data[start:]
+	r.i = len(r.data)
+	return true
+}
+
+// Line returns the current line.
+// The returned slice aliases the input data; it is not a copy.
+// Scan must be called before calling Line.
+func (r *lineReader) Line() []byte {
+	return r.current
+}
diff --git a/internal/byteutils/linereader_test.go b/internal/byteutils/linereader_test.go
new file mode 100644
index 00000000000..b0c1a6b0268
--- /dev/null
+++ b/internal/byteutils/linereader_test.go
@@ -0,0 +1,224 @@
+package byteutils_test
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/sourcegraph/sourcegraph/internal/byteutils"
+)
+
+// TestNewLineReader checks that newline-terminated input yields each line once and no trailing empty line.
+func TestNewLineReader(t *testing.T) {
+	data := []byte("hello\nworld\n")
+	reader := byteutils.NewLineReader(data)
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte("hello"); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte("world"); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if reader.Scan() {
+		t.Error("expected scan to fail, no more lines")
+	}
+}
+
+// TestNewLineReaderNoFinalNewline checks that a last line without a trailing newline is still returned.
+func TestNewLineReaderNoFinalNewline(t *testing.T) {
+	data := []byte("hello world\nhello sourcegraph")
+	reader := byteutils.NewLineReader(data)
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte("hello world"); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte("hello sourcegraph"); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if reader.Scan() {
+		t.Error("expected scan to fail, no more lines")
+	}
+}
+
+// TestNewLineReaderEmptyLines checks that consecutive newlines yield one empty line each.
+func TestNewLineReaderEmptyLines(t *testing.T) {
+	data := []byte("\n\n\n")
+	reader := byteutils.NewLineReader(data)
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte(""); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte(""); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	if got, want := reader.Line(), []byte(""); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if reader.Scan() {
+		t.Error("expected scan to fail, no more lines")
+	}
+}
+
+// TestLineReaderNoCopy checks that Line returns a subslice of the input rather than a copy.
+func TestLineReaderNoCopy(t *testing.T) {
+	data := []byte("hello world\n")
+	reader := byteutils.NewLineReader(data)
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	got, want := reader.Line(), []byte("hello world")
+	if !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if reader.Scan() {
+		t.Error("expected scan to fail, no more lines")
+	}
+
+	// Test that modifying the data in the array backing the scanned line does
+	// modify the original data, just like with bytes.Split. We do _not_ copy
+	// the data, just create a subslice.
+	got[1] = 'a'
+
+	if got, want := data, []byte("hallo world\n"); !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+}
+
+// TestLineReaderConsecutiveScans checks that a line obtained earlier is not changed by a later Scan.
+func TestLineReaderConsecutiveScans(t *testing.T) {
+	data := []byte("hello\nworld\n")
+	reader := byteutils.NewLineReader(data)
+
+	if !reader.Scan() {
+		t.Error("expected scan to succeed")
+	}
+	got, want := reader.Line(), []byte("hello")
+	if !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+
+	if !reader.Scan() {
+		t.Error("expected scan to pass")
+	}
+
+	// Check that got is unmodified, it should still point to the old line.
+	if !bytes.Equal(got, want) {
+		t.Errorf("got %q, want %q", got, want)
+	}
+}
+
+// BenchmarkNewLineReader measures iterating a short multi-line input with the line reader.
+func BenchmarkNewLineReader(b *testing.B) {
+	data := []byte("hello\nworld\nhello\nworld\nhello\nworld\n")
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		reader := byteutils.NewLineReader(data)
+		for reader.Scan() {
+			l := reader.Line()
+			_ = l
+		}
+	}
+}
+
+// BenchmarkBytesSplit measures bytes.Split on the same short multi-line input, for comparison.
+func BenchmarkBytesSplit(b *testing.B) {
+	data := []byte("hello\nworld\nhello\nworld\nhello\nworld\n")
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		parts := bytes.Split(data, []byte("\n"))
+		_ = parts
+	}
+}
+
+// BenchmarkNewLineReaderLongLine measures the line reader on one long line without any newline.
+func BenchmarkNewLineReaderLongLine(b *testing.B) {
+	data := make([]byte, 0, 10000*12)
+	for i := 0; i < 10000; i++ {
+		data = append(data, []byte("hello world")...)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		reader := byteutils.NewLineReader(data)
+		for reader.Scan() {
+			l := reader.Line()
+			_ = l
+		}
+	}
+}
+
+// BenchmarkBytesSplitLongLine measures bytes.Split on one long line without any newline.
+func BenchmarkBytesSplitLongLine(b *testing.B) {
+	data := make([]byte, 0, 10000*12)
+	for i := 0; i < 10000; i++ {
+		data = append(data, []byte("hello world")...)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		parts := bytes.Split(data, []byte("\n"))
+		_ = parts
+	}
+}
+
+// BenchmarkNewLineReaderManyLines measures the line reader on 10000 newline-terminated lines.
+func BenchmarkNewLineReaderManyLines(b *testing.B) {
+	data := make([]byte, 0, 10000*12)
+	for i := 0; i < 10000; i++ {
+		data = append(data, []byte("hello world")...)
+		data = append(data, []byte("\n")...)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		reader := byteutils.NewLineReader(data)
+		for reader.Scan() {
+			l := reader.Line()
+			_ = l
+		}
+	}
+}
+
+// BenchmarkBytesSplitManyLines measures bytes.Split on 10000 newline-terminated lines.
+func BenchmarkBytesSplitManyLines(b *testing.B) {
+	data := make([]byte, 0, 10000*12)
+	for i := 0; i < 10000; i++ {
+		data = append(data, []byte("hello world")...)
+		data = append(data, []byte("\n")...)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		parts := bytes.Split(data, []byte("\n"))
+		_ = parts
+	}
+}