mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 15:51:43 +00:00
Add linereader helper (#49965)
This adds a new helper to read byte slices line by line without the overhead of the 24 bytes that `bytes.Split` requires for each subslice it creates. It also differs from bufio.Scanner in that it doesn't need to allocate a large buffer to support long line lengths. I've added a bunch of benchmarks for this; it generally performs better, with less memory allocated. In a follow-up PR, I will make use of this helper.
```
goos: darwin
goarch: arm64
pkg: github.com/sourcegraph/sourcegraph/internal/byteutils
BenchmarkNewLineReader-10 12786025 87.10 ns/op 0 B/op 0 allocs/op
BenchmarkBytesSplit-10 9051704 125.6 ns/op 176 B/op 1 allocs/op
BenchmarkNewLineReaderLongLine-10 689241 1673 ns/op 0 B/op 0 allocs/op
BenchmarkBytesSplitLongLine-10 518218 2198 ns/op 24 B/op 1 allocs/op
BenchmarkNewLineReaderManyLines-10 8460 137936 ns/op 0 B/op 0 allocs/op
BenchmarkBytesSplitManyLines-10 7690 149148 ns/op 245760 B/op 1 allocs/op
PASS
```
This commit is contained in:
parent
3509ad3282
commit
211d5430c6
69
internal/byteutils/linereader.go
Normal file
69
internal/byteutils/linereader.go
Normal file
@ -0,0 +1,69 @@
|
||||
package byteutils
|
||||
|
||||
import "bytes"
|
||||
|
||||
// NewLineReader creates a new lineReader instance that reads lines from data.
// It is more memory efficient than bytes.Split, because it does not require 24
// bytes for each subslice it generates, and instead returns one subslice at a
// time. Benchmarks show it is faster _and_ more memory efficient than
// bytes.Split, see the test file for details.
//
// Note: This behaves slightly differently to bytes.Split!
// For an empty input, it does NOT read a single line, like bytes.Split would.
// Also, it does NOT return a final empty line if the input is terminated with
// a final newline.
//
// data is the byte slice to read lines from. It is not copied: every line
// handed out by the reader is a subslice of data.
//
// A lineReader can be used to iterate over lines in a byte slice.
//
// For example:
//
//	data := []byte("hello\nworld\n")
//	reader := byteutils.NewLineReader(data)
//
//	for reader.Scan() {
//		line := reader.Line()
//		// Use line...
//	}
func NewLineReader(data []byte) lineReader {
	return lineReader{data: data}
}

// lineReader is a struct that can be used to iterate over lines in a byte slice.
type lineReader struct {
	// i is the offset into data at which the next line starts.
	i int
	// data is the full input being split into lines.
	data []byte
	// current is the line found by the most recent successful Scan. It is nil
	// until Scan has succeeded at least once.
	current []byte
}

// Scan advances the lineReader to the next line and returns true, or returns
// false if there are no more lines.
// The lineReader's current field will be updated to contain the next line.
// Scan must be called before calling Line.
//
// Only '\n' is treated as a line terminator and it is not part of the line; a
// '\r' in front of it is left untouched. Input that does not end in a newline
// yields its trailing bytes as the last line. When Scan returns false the
// current line is left unchanged.
func (r *lineReader) Scan() bool {
	// If we are at the end of the data, stop.
	if r.i >= len(r.data) {
		return false
	}

	// Everything that has not been consumed yet; the next line starts at its
	// first byte.
	rest := r.data[r.i:]

	if nl := bytes.IndexByte(rest, '\n'); nl >= 0 {
		// Exclude the newline from the line, then advance past it.
		r.current = rest[:nl]
		r.i += nl + 1
		return true
	}

	// No newline left: the remainder is the final, unterminated line.
	r.current = rest
	r.i = len(r.data)
	return true
}

// Line returns the current line.
// The line is valid until the next call to Scan.
// Scan must be called before calling Line; before the first successful Scan
// the returned slice is nil.
//
// The returned slice shares its backing array with the data passed to
// NewLineReader, so writes through it modify that data.
func (r *lineReader) Line() []byte {
	return r.current
}
|
||||
213
internal/byteutils/linereader_test.go
Normal file
213
internal/byteutils/linereader_test.go
Normal file
@ -0,0 +1,213 @@
|
||||
package byteutils_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/byteutils"
|
||||
)
|
||||
|
||||
func TestNewLineReader(t *testing.T) {
|
||||
data := []byte("hello\nworld\n")
|
||||
reader := byteutils.NewLineReader(data)
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte("hello"); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte("world"); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if reader.Scan() {
|
||||
t.Error("expected scan to fail, no more lines")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewLineReaderNoFinalNewline(t *testing.T) {
|
||||
data := []byte("hello world\nhello sourcegraph")
|
||||
reader := byteutils.NewLineReader(data)
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte("hello world"); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte("hello sourcegraph"); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if reader.Scan() {
|
||||
t.Error("expected scan to fail, no more lines")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewLineReaderEmptyLines(t *testing.T) {
|
||||
data := []byte("\n\n\n")
|
||||
reader := byteutils.NewLineReader(data)
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte(""); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte(""); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
if got, want := reader.Line(), []byte(""); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if reader.Scan() {
|
||||
t.Error("expected scan to fail, no more lines")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLineReaderNoCopy(t *testing.T) {
|
||||
data := []byte("hello world\n")
|
||||
reader := byteutils.NewLineReader(data)
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
got, want := reader.Line(), []byte("hello world")
|
||||
if !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if reader.Scan() {
|
||||
t.Error("expected scan to fail, no more lines")
|
||||
}
|
||||
|
||||
// Test that modifying the data in the array backing the scanned line does
|
||||
// modify the original data, just like with bytes.Split. We do _not_ copy
|
||||
// the data, just create a subslice.
|
||||
got[1] = 'a'
|
||||
|
||||
if got, want := data, []byte("hallo world\n"); !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLineReaderConsecutiveScans(t *testing.T) {
|
||||
data := []byte("hello\nworld\n")
|
||||
reader := byteutils.NewLineReader(data)
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to succeed")
|
||||
}
|
||||
got, want := reader.Line(), []byte("hello")
|
||||
if !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if !reader.Scan() {
|
||||
t.Error("expected scan to pass")
|
||||
}
|
||||
|
||||
// Check that got is unmodified, it should still point to the old line.
|
||||
if !bytes.Equal(got, want) {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNewLineReader(b *testing.B) {
|
||||
data := []byte("hello\nworld\nhello\nworld\nhello\nworld\n")
|
||||
for i := 0; i < b.N; i++ {
|
||||
reader := byteutils.NewLineReader(data)
|
||||
for reader.Scan() {
|
||||
l := reader.Line()
|
||||
_ = l
|
||||
}
|
||||
}
|
||||
b.ReportAllocs()
|
||||
}
|
||||
|
||||
// BenchmarkBytesSplit measures splitting a short six-line input with
// bytes.Split, as the baseline for BenchmarkNewLineReader.
func BenchmarkBytesSplit(b *testing.B) {
	data := []byte("hello\nworld\nhello\nworld\nhello\nworld\n")

	// Enable allocation reporting and reset the timer before the measured
	// loop, so that only the split itself is timed.
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Named lines rather than b so the *testing.B is not shadowed.
		lines := bytes.Split(data, []byte("\n"))
		_ = lines
	}
}
|
||||
|
||||
func BenchmarkNewLineReaderLongLine(b *testing.B) {
|
||||
data := make([]byte, 0, 10000*12)
|
||||
for i := 0; i < 10000; i++ {
|
||||
data = append(data, []byte("hello world")...)
|
||||
}
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
reader := byteutils.NewLineReader(data)
|
||||
for reader.Scan() {
|
||||
l := reader.Line()
|
||||
_ = l
|
||||
}
|
||||
}
|
||||
b.ReportAllocs()
|
||||
}
|
||||
|
||||
// BenchmarkBytesSplitLongLine measures bytes.Split on a single long line:
// "hello world" repeated 10000 times with no newline at all. It is the
// baseline for BenchmarkNewLineReaderLongLine.
func BenchmarkBytesSplitLongLine(b *testing.B) {
	data := bytes.Repeat([]byte("hello world"), 10000)

	// Enable allocation reporting and reset the timer before the measured
	// loop, so that building data is excluded.
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Named lines rather than b so the *testing.B is not shadowed.
		lines := bytes.Split(data, []byte("\n"))
		_ = lines
	}
}
|
||||
|
||||
func BenchmarkNewLineReaderManyLines(b *testing.B) {
|
||||
data := make([]byte, 0, 10000*12)
|
||||
for i := 0; i < 10000; i++ {
|
||||
data = append(data, []byte("hello world")...)
|
||||
data = append(data, []byte("\n")...)
|
||||
}
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
reader := byteutils.NewLineReader(data)
|
||||
for reader.Scan() {
|
||||
l := reader.Line()
|
||||
_ = l
|
||||
}
|
||||
}
|
||||
b.ReportAllocs()
|
||||
}
|
||||
|
||||
// BenchmarkBytesSplitManyLines measures bytes.Split on 10000 lines of
// "hello world", each terminated by a newline. It is the baseline for
// BenchmarkNewLineReaderManyLines.
func BenchmarkBytesSplitManyLines(b *testing.B) {
	data := bytes.Repeat([]byte("hello world\n"), 10000)

	// Enable allocation reporting and reset the timer before the measured
	// loop, so that building data is excluded.
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Named lines rather than b so the *testing.B is not shadowed.
		lines := bytes.Split(data, []byte("\n"))
		_ = lines
	}
}
|
||||
Loading…
Reference in New Issue
Block a user