mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:11:48 +00:00
This adds a new helper to read byte slices line by line without the overhead of the 24 bytes per subslice that `bytes.Split` requires. It also differs from `bufio.Scanner` in that it doesn't need to allocate a large buffer to support long line lengths. I've added a bunch of benchmarks for this; it generally performs better and allocates less memory. In a follow-up PR, I will make use of this helper.

```
goos: darwin
goarch: arm64
pkg: github.com/sourcegraph/sourcegraph/internal/byteutils
BenchmarkNewLineReader-10             12786025      87.10 ns/op        0 B/op    0 allocs/op
BenchmarkBytesSplit-10                 9051704     125.6 ns/op       176 B/op    1 allocs/op
BenchmarkNewLineReaderLongLine-10       689241      1673 ns/op         0 B/op    0 allocs/op
BenchmarkBytesSplitLongLine-10          518218      2198 ns/op        24 B/op    1 allocs/op
BenchmarkNewLineReaderManyLines-10        8460    137936 ns/op         0 B/op    0 allocs/op
BenchmarkBytesSplitManyLines-10           7690    149148 ns/op    245760 B/op    1 allocs/op
PASS
```
70 lines
2.0 KiB
Go
70 lines
2.0 KiB
Go
package byteutils
|
|
|
|
import "bytes"
|
|
|
|
// NewLineReader creates a new lineReader instance that reads lines from data.
|
|
// It is more memory effective than bytes.Split, because it does not require 24 bytes
|
|
// for each subslice it generates, and instead returns one subslice at a time.
|
|
// Benchmarks prove it is faster _and_ more memory efficient than bytes.Split, see
|
|
// the test file for details.
|
|
// Note: This behaves slightly differently to bytes.Split!
|
|
// For an empty input, it does NOT read a single line, like bytes.Split would.
|
|
// Also, it does NOT return a final empty line if the input is terminated with
|
|
// a final newline.
|
|
//
|
|
// data is the byte slice to read lines from.
|
|
//
|
|
// A lineReader can be used to iterate over lines in a byte slice.
|
|
//
|
|
// For example:
|
|
//
|
|
// data := []byte("hello\nworld\n")
|
|
// reader := bytes.NewLineReader(data)
|
|
//
|
|
// for reader.Scan() {
|
|
// line := reader.Line()
|
|
// // Use line...
|
|
// }
|
|
func NewLineReader(data []byte) lineReader {
|
|
return lineReader{data: data}
|
|
}
|
|
|
|
// lineReader is a struct that can be used to iterate over lines in a byte slice.
type lineReader struct {
	// i is the offset into data at which the next line starts.
	i int
	// data is the complete input the lines are read from.
	data []byte
	// current is the line found by the most recent successful Scan, without
	// its terminating newline. It is nil until Scan has found a line.
	current []byte
}
|
|
|
|
// Scan advances the lineReader to the next line and returns true, or returns false if there are no more lines.
|
|
// The lineReader's current field will be updated to contain the next line.
|
|
// Scan must be called before calling Line.
|
|
func (r *lineReader) Scan() bool {
|
|
// If we are at the end of the data, stop
|
|
if r.i >= len(r.data) {
|
|
return false
|
|
}
|
|
// Mark the start of the line
|
|
start := r.i
|
|
// Find the next newline
|
|
i := bytes.IndexByte(r.data[start:], '\n')
|
|
if i >= 0 {
|
|
// Exclude the newline from the line
|
|
r.current = r.data[start : start+i]
|
|
// Advance past the newline
|
|
r.i += i + 1
|
|
return true
|
|
}
|
|
// Otherwise include the last byte
|
|
r.current = r.data[start:]
|
|
r.i = len(r.data)
|
|
return true
|
|
}
|
|
|
|
// Line returns the current line.
|
|
// The line is valid until the next call to Scan.
|
|
// Scan must be called before calling Line.
|
|
func (r *lineReader) Line() []byte {
|
|
return r.current
|
|
}
|