Add linereader helper (#49965)

This adds a new helper to read byte slices line by line without the
overhead of requiring 24 bytes per subslice it creates from
`bytes.Split`. This is also different to bufio.Scanner in that it
doesn't need to allocate a large buffer to support long line lengths.
I've added a bunch of benchmarks for this; it seems to perform generally
better, with less allocated memory. In a follow-up PR, I will make use
of this helper.

```
goos: darwin
goarch: arm64
pkg: github.com/sourcegraph/sourcegraph/internal/byteutils
BenchmarkNewLineReader-10               12786025                87.10 ns/op            0 B/op          0 allocs/op
BenchmarkBytesSplit-10                   9051704               125.6 ns/op           176 B/op          1 allocs/op
BenchmarkNewLineReaderLongLine-10         689241              1673 ns/op               0 B/op          0 allocs/op
BenchmarkBytesSplitLongLine-10            518218              2198 ns/op              24 B/op          1 allocs/op
BenchmarkNewLineReaderManyLines-10          8460            137936 ns/op               0 B/op          0 allocs/op
BenchmarkBytesSplitManyLines-10             7690            149148 ns/op          245760 B/op          1 allocs/op
PASS
```
This commit is contained in:
Erik Seliger 2023-03-24 17:39:49 +01:00 committed by GitHub
parent 3509ad3282
commit 211d5430c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 282 additions and 0 deletions

View File

@ -0,0 +1,69 @@
package byteutils
import "bytes"
// NewLineReader returns a lineReader positioned at the start of data, ready
// to iterate over its newline-separated lines.
//
// Unlike bytes.Split, which needs 24 bytes for every subslice it generates,
// the reader hands out a single subslice of data at a time. The benchmarks in
// the test file compare the two approaches.
//
// Note that the behaviour differs from bytes.Split in two ways:
//   - an empty input yields no lines at all, rather than one empty line;
//   - a trailing newline does NOT produce a final empty line.
//
// The returned lines are subslices of data, not copies.
//
// For example:
//
//	data := []byte("hello\nworld\n")
//	reader := byteutils.NewLineReader(data)
//
//	for reader.Scan() {
//		line := reader.Line()
//		// Use line...
//	}
func NewLineReader(data []byte) lineReader {
	var r lineReader
	r.data = data
	return r
}
// lineReader walks over a byte slice one line at a time, handing out
// subslices of the input instead of splitting it up front.
type lineReader struct {
	// i is the offset into data at which the next line begins.
	i int
	// data is the complete input being read.
	data []byte
	// current is the line found by the most recent successful Scan.
	current []byte
}

// Scan moves on to the next line. It returns true if a line was found, in
// which case Line returns it, and false once all of the input has been
// consumed. Lines are delimited by '\n'; the delimiter is never part of the
// returned line, and input ending in '\n' does not yield an extra empty line.
// Scan must be called before calling Line.
func (r *lineReader) Scan() bool {
	// Nothing left to read.
	if len(r.data) <= r.i {
		return false
	}

	rest := r.data[r.i:]
	nl := bytes.IndexByte(rest, '\n')
	if nl < 0 {
		// No delimiter remains: everything that is left is the final line.
		r.current = rest
		r.i = len(r.data)
		return true
	}

	// Hand out the bytes before the newline, then step over the newline.
	r.current = rest[:nl]
	r.i += nl + 1
	return true
}

// Line returns the line found by the most recent call to Scan. The result is
// a subslice of the input, not a copy.
// Scan must be called before calling Line.
func (r *lineReader) Line() []byte {
	return r.current
}

View File

@ -0,0 +1,213 @@
package byteutils_test
import (
"bytes"
"testing"
"github.com/sourcegraph/sourcegraph/internal/byteutils"
)
// TestNewLineReader checks that newline-terminated input yields every line
// exactly once, without its terminator, and no trailing empty line.
func TestNewLineReader(t *testing.T) {
	reader := byteutils.NewLineReader([]byte("hello\nworld\n"))

	for _, want := range [][]byte{[]byte("hello"), []byte("world")} {
		if ok := reader.Scan(); !ok {
			t.Error("expected scan to succeed")
		}
		if got := reader.Line(); !bytes.Equal(got, want) {
			t.Errorf("got %q, want %q", got, want)
		}
	}

	if more := reader.Scan(); more {
		t.Error("expected scan to fail, no more lines")
	}
}
// TestNewLineReaderNoFinalNewline checks that the last line is still returned
// when the input does not end with a newline.
func TestNewLineReaderNoFinalNewline(t *testing.T) {
	reader := byteutils.NewLineReader([]byte("hello world\nhello sourcegraph"))

	expected := [][]byte{
		[]byte("hello world"),
		[]byte("hello sourcegraph"),
	}
	for _, want := range expected {
		if ok := reader.Scan(); !ok {
			t.Error("expected scan to succeed")
		}
		if got := reader.Line(); !bytes.Equal(got, want) {
			t.Errorf("got %q, want %q", got, want)
		}
	}

	if more := reader.Scan(); more {
		t.Error("expected scan to fail, no more lines")
	}
}
// TestNewLineReaderEmptyLines checks that an input made only of newlines
// yields one empty line per newline and nothing after the last one.
func TestNewLineReaderEmptyLines(t *testing.T) {
	reader := byteutils.NewLineReader([]byte("\n\n\n"))

	// Three newlines: three empty lines are expected.
	for n := 0; n < 3; n++ {
		if ok := reader.Scan(); !ok {
			t.Error("expected scan to succeed")
		}
		want := []byte("")
		if got := reader.Line(); !bytes.Equal(got, want) {
			t.Errorf("got %q, want %q", got, want)
		}
	}

	if more := reader.Scan(); more {
		t.Error("expected scan to fail, no more lines")
	}
}
// TestLineReaderNoCopy checks that a scanned line shares its backing array
// with the input: writing through the line must be visible in the input.
func TestLineReaderNoCopy(t *testing.T) {
	data := []byte("hello world\n")
	reader := byteutils.NewLineReader(data)

	if ok := reader.Scan(); !ok {
		t.Error("expected scan to succeed")
	}
	line := reader.Line()
	if expected := []byte("hello world"); !bytes.Equal(line, expected) {
		t.Errorf("got %q, want %q", line, expected)
	}
	if more := reader.Scan(); more {
		t.Error("expected scan to fail, no more lines")
	}

	// The line is a subslice rather than a copy, so, just like with
	// bytes.Split, mutating it changes the original data as well.
	line[1] = 'a'
	if expected := []byte("hallo world\n"); !bytes.Equal(data, expected) {
		t.Errorf("got %q, want %q", data, expected)
	}
}
// TestLineReaderConsecutiveScans checks that a line obtained from the reader
// keeps its contents after the reader has advanced to the following line.
func TestLineReaderConsecutiveScans(t *testing.T) {
	reader := byteutils.NewLineReader([]byte("hello\nworld\n"))

	if ok := reader.Scan(); !ok {
		t.Error("expected scan to succeed")
	}
	first := reader.Line()
	expected := []byte("hello")
	if !bytes.Equal(first, expected) {
		t.Errorf("got %q, want %q", first, expected)
	}

	if ok := reader.Scan(); !ok {
		t.Error("expected scan to pass")
	}

	// Advancing must not have touched the slice handed out earlier; it
	// should still hold the old line.
	if !bytes.Equal(first, expected) {
		t.Errorf("got %q, want %q", first, expected)
	}
}
// BenchmarkNewLineReader measures iterating over a short, six-line input with
// the line reader.
func BenchmarkNewLineReader(b *testing.B) {
	b.ReportAllocs()

	input := []byte("hello\nworld\nhello\nworld\nhello\nworld\n")
	for n := 0; n < b.N; n++ {
		reader := byteutils.NewLineReader(input)
		for reader.Scan() {
			_ = reader.Line()
		}
	}
}
// BenchmarkBytesSplit measures bytes.Split on the same short, six-line input,
// as the baseline for BenchmarkNewLineReader.
func BenchmarkBytesSplit(b *testing.B) {
	b.ReportAllocs()

	input := []byte("hello\nworld\nhello\nworld\nhello\nworld\n")
	for n := 0; n < b.N; n++ {
		lines := bytes.Split(input, []byte("\n"))
		_ = lines
	}
}
// BenchmarkNewLineReaderLongLine measures the line reader on an input that is
// one single long line: "hello world" repeated 10000 times with no newline.
func BenchmarkNewLineReaderLongLine(b *testing.B) {
	b.ReportAllocs()

	input := make([]byte, 0, 10000*12)
	for n := 0; n < 10000; n++ {
		input = append(input, "hello world"...)
	}

	// Building the input is setup and must not be timed.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		reader := byteutils.NewLineReader(input)
		for reader.Scan() {
			_ = reader.Line()
		}
	}
}
// BenchmarkBytesSplitLongLine measures bytes.Split on an input that is one
// single long line: "hello world" repeated 10000 times with no newline.
func BenchmarkBytesSplitLongLine(b *testing.B) {
	b.ReportAllocs()

	input := make([]byte, 0, 10000*12)
	for n := 0; n < 10000; n++ {
		input = append(input, "hello world"...)
	}

	// Building the input is setup and must not be timed.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		lines := bytes.Split(input, []byte("\n"))
		_ = lines
	}
}
// BenchmarkNewLineReaderManyLines measures the line reader on an input of
// 10000 lines, each being "hello world" followed by a newline.
func BenchmarkNewLineReaderManyLines(b *testing.B) {
	b.ReportAllocs()

	input := make([]byte, 0, 10000*12)
	for n := 0; n < 10000; n++ {
		input = append(input, "hello world"...)
		input = append(input, "\n"...)
	}

	// Building the input is setup and must not be timed.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		reader := byteutils.NewLineReader(input)
		for reader.Scan() {
			_ = reader.Line()
		}
	}
}
// BenchmarkBytesSplitManyLines measures bytes.Split on an input of 10000
// lines, each being "hello world" followed by a newline.
func BenchmarkBytesSplitManyLines(b *testing.B) {
	b.ReportAllocs()

	input := make([]byte, 0, 10000*12)
	for n := 0; n < 10000; n++ {
		input = append(input, "hello world"...)
		input = append(input, "\n"...)
	}

	// Building the input is setup and must not be timed.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		lines := bytes.Split(input, []byte("\n"))
		_ = lines
	}
}