gcs: Support independent fp rate and bin size.

This modifies the code to support an independent false positive rate and
Golomb coding bin size.  Among other things, this permits specifying
parameters that minimize the filter size more effectively.

This capability will be used in the upcoming version 2 filters that will
ultimately be included in header commitments.

For a concrete example, the current version 1 filter for block 89341 on
mainnet contains 2470 items resulting in a full serialized size of 6,669
bytes.  In contrast, if the optimal parameters were specified as
described by the comments in this commit, with no other changes to the
items included in the filter, that same filter would be 6,505 bytes,
which is a size reduction of about 2.46%.  This might not seem like a
significant amount, but consider that there is a filter for every block,
so it really adds up.

Since the internal filter no longer directly has a P parameter, this
moves the method to obtain it to the FilterV1 type and adds a new test
to ensure it is returned properly.

Additionally, all of the tests are converted to use the new parameters
while retaining the same effective values to help prove correctness of
the new code.

Finally, it also significantly reduces the number of allocations required
to construct a filter, which results in faster filter construction and
reduced pressure on the GC, and performs some other minor consistency
cleanup while here.

In terms of the reduction in allocations, the following is a before and
after comparison of building filters with 50k and 100k elements:

benchmark                    old ns/op    new ns/op     delta
--------------------------------------------------------------
BenchmarkFilterBuild50000    18095111     15680001     -13.35%
BenchmarkFilterBuild100000   31980156     31389892     -1.85%

benchmark                    old allocs   new allocs   delta
--------------------------------------------------------------
BenchmarkFilterBuild50000    31           6            -80.65%
BenchmarkFilterBuild100000   34           6            -82.35%

benchmark                    old bytes    new bytes    delta
--------------------------------------------------------------
BenchmarkFilterBuild50000    1202343      688271       -42.76%
BenchmarkFilterBuild100000   2488472      1360000      -45.35%
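
For context, these figures come from standard Go benchmarks.  The following
is a rough sketch of how such a benchmark might be structured; the 32-byte
element size, the P value of 20, and the random data generation are
assumptions for illustration rather than the actual benchmark code from this
change.

package gcs

import (
    "crypto/rand"
    "testing"
)

// benchmarkFilterBuild measures construction of a version 1 filter over
// numElements pseudo-random 32-byte items.
func benchmarkFilterBuild(b *testing.B, numElements int) {
    // Generate the key and input data outside the timed loop so that only
    // filter construction is measured.
    var key [KeySize]byte
    data := make([][]byte, numElements)
    for i := range data {
        data[i] = make([]byte, 32)
        if _, err := rand.Read(data[i]); err != nil {
            b.Fatal(err)
        }
    }
    b.ReportAllocs()
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        if _, err := NewFilterV1(20, key, data); err != nil {
            b.Fatal(err)
        }
    }
}

func BenchmarkFilterBuild50000(b *testing.B)  { benchmarkFilterBuild(b, 50000) }
func BenchmarkFilterBuild100000(b *testing.B) { benchmarkFilterBuild(b, 100000) }
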
Dave Collins 2019-08-21 01:27:46 -05:00
parent 3305fcb3fa
commit 952bd7bba3
2 changed files with 185 additions and 85 deletions

@ -18,8 +18,6 @@ import (
"github.com/decred/dcrd/crypto/blake256"
)
// Inspired by https://github.com/rasky/gcs
// KeySize is the size of the byte array required for key material for the
// SipHash keyed hash function.
const KeySize = 16
@ -40,34 +38,62 @@ func (s *uint64s) Swap(i, j int) { (*s)[i], (*s)[j] = (*s)[j], (*s)[i] }
type filter struct {
version uint16
n uint32
p uint8
modulusNP uint64
b uint8
modulusNM uint64
filterNData []byte
filterData []byte // Slice into filterNData with raw filter bytes.
}
// newFilter builds a new GCS filter of the specified version with the collision
// probability of `1/(2**P)`, key `key`, and including every `[]byte` in `data`
// as a member of the set.
func newFilter(version uint16, P uint8, key [KeySize]byte, data [][]byte) (*filter, error) {
if len(data) > math.MaxInt32 {
// newFilter builds a new GCS filter with the specified version and provided
// tunable parameters that contains every item of the passed data as a member of
// the set.
//
// B is the tunable bits parameter for constructing the filter that is used as
// the bin size in the underlying Golomb coding with a value of 2^B. The
// optimal value of B to minimize the size of the filter for a given false
// positive rate 1/M is floor(log_2(M) - 0.055256). The maximum allowed value
// for B is 32.
//
// M is the inverse of the target false positive rate for the filter. The
// optimal value of M to minimize the size of the filter for a given B is
// ceil(1.497137 * 2^B).
//
// key is the key used in the SipHash function that hashes each data element
// prior to inclusion in the filter. This helps thwart would-be attackers
// attempting to choose elements that intentionally cause false positives.
//
// The general process for determining optimal parameters for B and M to
// minimize the size of the filter is to start with the desired false positive
// rate and calculate B per the aforementioned formula accordingly. Then, if
// the application permits the false positive rate to be varied, calculate the
// optimal value of M via the formula provided under the description of M.
//
// NOTE: Since this function must only be used internally, it will panic if
// called with a value of B greater than 32.
func newFilter(version uint16, B uint8, M uint64, key [KeySize]byte, data [][]byte) (*filter, error) {
if B > 32 {
panic(fmt.Sprintf("B value of %d is greater than max allowed 32", B))
}
switch version {
case 1:
default:
panic(fmt.Sprintf("version %d filters are not supported", version))
}
numEntries := uint64(len(data))
if numEntries > math.MaxInt32 {
str := fmt.Sprintf("unable to create filter with %d entries greater "+
"than max allowed %d", len(data), math.MaxInt32)
return nil, makeError(ErrNTooBig, str)
}
if P > 32 {
str := fmt.Sprintf("P value of %d is greater than max allowed 32", P)
return nil, makeError(ErrPTooBig, str)
}
// Create the filter object and insert metadata.
modP := uint64(1 << P)
modPMask := modP - 1
modBMask := uint64(1<<B) - 1
f := filter{
version: version,
n: uint32(len(data)),
p: P,
modulusNP: uint64(len(data)) * modP,
n: uint32(numEntries),
b: B,
modulusNM: numEntries * M,
}
// Nothing to do for an empty filter.
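
As a concrete illustration of the parameter guidance in the comment above,
the following standalone sketch (not part of this change; the function name
is hypothetical) derives B and M from a rough target inverse false positive
rate using the quoted formulas.

package main

import (
    "fmt"
    "math"
)

// optimalParams derives the Golomb coding bin size exponent B and the inverse
// false positive rate M from a rough target inverse false positive rate,
// following the formulas quoted above: B = floor(log2(M) - 0.055256) and
// M = ceil(1.497137 * 2^B).
func optimalParams(targetM float64) (B uint8, M uint64) {
    B = uint8(math.Floor(math.Log2(targetM) - 0.055256))
    M = uint64(math.Ceil(1.497137 * math.Exp2(float64(B))))
    return B, M
}

func main() {
    // A target false positive rate of roughly 1 in 1,000,000 yields B=19 and
    // M=784931, which match the "normal" constants used in the tests below.
    fmt.Println(optimalParams(1e6)) // 19 784931
}
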
@ -75,45 +101,62 @@ func newFilter(version uint16, P uint8, key [KeySize]byte, data [][]byte) (*filt
return &f, nil
}
// Allocate filter data.
values := make([]uint64, 0, len(data))
// Insert the hash (modulo N*P) of each data element into a slice and
// sort the slice.
// Insert the hash of each data element reduced to the range [0,N*M) into a
// slice and sort it.
values := make([]uint64, 0, numEntries)
k0 := binary.LittleEndian.Uint64(key[0:8])
k1 := binary.LittleEndian.Uint64(key[8:16])
for _, d := range data {
v := siphash.Hash(k0, k1, d) % f.modulusNP
v := siphash.Hash(k0, k1, d) % f.modulusNM
values = append(values, v)
}
sort.Sort((*uint64s)(&values))
// Every entry will have f.b bits for the remainder portion and a quotient
// that is expected to be 1 on average, with an exponentially decreasing
// probability for larger quotients given reasonably optimal parameters.
// A quotient of 1 takes 2 bits in unary to encode and a quotient of 2 takes
// 3 bits. Since the first two terms dominate, a reasonable expected size
// in bytes is:
// (NB + 2N/2 + 3N/2) / 8
var b bitWriter
sizeHint := (numEntries*uint64(f.b) + numEntries + 3*numEntries>>1) >> 3
b.bytes = make([]byte, 0, sizeHint)
// Write the sorted list of values into the filter bitstream,
// compressing it using Golomb coding.
var value, lastValue, remainder uint64
// Write the sorted list of values into the filter bitstream using Golomb
// coding.
var quotient, prevValue, remainder uint64
for _, v := range values {
// Calculate the difference between this value and the last,
// modulo P.
remainder = (v - lastValue) & modPMask
delta := v - prevValue
prevValue = v
// Calculate the difference between this value and the last,
// divided by P.
value = (v - lastValue - remainder) >> f.p
lastValue = v
// Calculate the remainder of the difference between this value and the
// previous when dividing by 2^B.
//
// r = d % 2^B
remainder = delta & modBMask
// Write the P multiple into the bitstream in unary; the
// average should be around 1 (2 bits - 0b10).
for value > 0 {
// Calculate the quotient of the difference between this value and the
// previous when dividing by 2^B.
//
// q = floor(d / 2^B)
quotient = (delta - remainder) >> f.b
// Write the quotient into the bitstream in unary. The average value
// will be around 1 for reasonably optimal parameters (which is encoded
// as 2 bits - 0b10).
for quotient > 0 {
b.writeOne()
value--
quotient--
}
b.writeZero()
// Write the remainder as a big-endian integer with enough bits
// to represent the appropriate collision probability.
b.writeNBits(remainder, uint(f.p))
// Write the remainder into the bitstream as a big-endian integer with
// B bits. Note that Golomb coding typically uses truncated binary
// encoding in order to support arbitrary bin sizes; however, since the
// bin size here is fixed to a power of 2 (2^B), it is equivalent to a regular
// binary code.
b.writeNBits(remainder, uint(f.b))
}
// Save the filter data internally as n + filter bytes
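
To make the quotient/remainder encoding above concrete, here is a small
standalone sketch (separate from the code in this change; the '0'/'1'
bit-string representation is purely illustrative) that renders the
Golomb-Rice code of a sorted value list.

package main

import (
    "fmt"
    "strings"
)

// golombRiceBits renders the Golomb-Rice code for a sorted list of values as
// a human-readable string of bits: for each delta, the quotient
// floor(d / 2^B) in unary (q ones followed by a zero) and then the remainder
// d % 2^B as a big-endian B-bit value.
func golombRiceBits(sorted []uint64, B uint) string {
    var sb strings.Builder
    var prev uint64
    for _, v := range sorted {
        delta := v - prev
        prev = v
        q := delta >> B
        r := delta & (1<<B - 1)
        // Quotient in unary.
        for ; q > 0; q-- {
            sb.WriteByte('1')
        }
        sb.WriteByte('0')
        // Remainder as a fixed-width big-endian B-bit value.
        for i := int(B) - 1; i >= 0; i-- {
            sb.WriteByte('0' + byte(r>>uint(i)&1))
        }
    }
    return sb.String()
}

func main() {
    // Values 5 and 14 with B=2: deltas 5 and 9 encode as quotient/remainder
    // pairs (1,1) and (2,1), i.e. "10"+"01" and "110"+"01".
    fmt.Println(golombRiceBits([]uint64{5, 14}, 2)) // 100111001
}
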
@ -127,38 +170,31 @@ func newFilter(version uint16, P uint8, key [KeySize]byte, data [][]byte) (*filt
}
// Bytes returns the serialized format of the GCS filter which includes N, but
// does not include P (returned by a separate method) or the key used by
// SipHash.
// does not include other parameters such as the false positive rate or the key.
func (f *filter) Bytes() []byte {
return f.filterNData
}
// P returns the filter's collision probability as a negative power of 2 (that
// is, a collision probability of `1/2**20` is represented as 20).
func (f *filter) P() uint8 {
return f.p
}
// N returns the size of the data set used to build the filter.
func (f *filter) N() uint32 {
return f.n
}
// readFullUint64 reads a value represented by the sum of a unary multiple of
// the filter's P modulus (`2**P`) and a big-endian P-bit remainder.
// the Golomb coding bin size (2^B) and a big-endian B-bit remainder.
func (f *filter) readFullUint64(b *bitReader) (uint64, error) {
v, err := b.readUnary()
if err != nil {
return 0, err
}
rem, err := b.readNBits(uint(f.p))
rem, err := b.readNBits(uint(f.b))
if err != nil {
return 0, err
}
// Add the multiple and the remainder.
return v<<f.p + rem, nil
return v<<f.b + rem, nil
}
// Match checks whether a []byte value is likely (within collision probability)
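
The decode direction mirrors readFullUint64 above: read the unary quotient,
read B remainder bits, and combine them as q<<B + r.  The following
standalone sketch (again illustrative only, operating on the same '0'/'1'
string form as the encoding sketch earlier) demonstrates this.

package main

import "fmt"

// readGolombRice decodes one Golomb-Rice coded delta from a string of '0'/'1'
// characters starting at offset off, returning the delta and the next offset.
func readGolombRice(bits string, B uint, off int) (delta uint64, next int) {
    // Unary quotient: count ones up to the terminating zero.
    var q uint64
    for bits[off] == '1' {
        q++
        off++
    }
    off++ // skip the terminating zero

    // Big-endian B-bit remainder.
    var r uint64
    for i := uint(0); i < B; i++ {
        r = r<<1 | uint64(bits[off]-'0')
        off++
    }
    return q<<B + r, off
}

func main() {
    // Decode the bitstream produced for values 5 and 14 with B=2 and rebuild
    // the original values by accumulating the deltas.
    const bits = "100111001"
    var value uint64
    for off := 0; off < len(bits); {
        var delta uint64
        delta, off = readGolombRice(bits, 2, off)
        value += delta
        fmt.Println(value) // 5, then 14
    }
}
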
@ -175,7 +211,7 @@ func (f *filter) Match(key [KeySize]byte, data []byte) bool {
// Hash our search term with the same parameters as the filter.
k0 := binary.LittleEndian.Uint64(key[0:8])
k1 := binary.LittleEndian.Uint64(key[8:16])
term := siphash.Hash(k0, k1, data) % f.modulusNP
term := siphash.Hash(k0, k1, data) % f.modulusNM
// Go through the search filter and look for the desired value.
var lastValue uint64
@ -227,7 +263,7 @@ func (f *filter) MatchAny(key [KeySize]byte, data [][]byte) bool {
k0 := binary.LittleEndian.Uint64(key[0:8])
k1 := binary.LittleEndian.Uint64(key[8:16])
for _, d := range data {
v := siphash.Hash(k0, k1, d) % f.modulusNP
v := siphash.Hash(k0, k1, d) % f.modulusNM
*values = append(*values, v)
}
sort.Sort((*uint64s)(values))
@ -279,22 +315,32 @@ func (f *filter) Hash() chainhash.Hash {
return chainhash.Hash(blake256.Sum256(f.filterNData))
}
// Filter describes an immutable filter that can be built from a set of data
// elements, serialized, deserialized, and queried in a thread-safe manner. The
// serialized form is compressed as a Golomb Coded Set (GCS), but does not
// include N or P to allow the user to encode the metadata separately if
// necessary. The hash function used is SipHash, a keyed function; the key used
// in building the filter is required in order to match filter values and is
// not included in the serialized form.
// FilterV1 describes an immutable filter that can be built from a set of data
// elements, serialized, deserialized, and queried in a thread-safe manner. The
// serialized form is compressed as a Golomb Coded Set (GCS) along with the
// number of members of the set. The hash function used is SipHash, a keyed
// function. The key used in building the filter is required in order to match
// filter values and is not included in the serialized form.
type FilterV1 struct {
filter
}
// NewFilter builds a new version 1 GCS filter with the collision probability of
// `1/(2**P)`, key `key`, and including every `[]byte` in `data` as a member of
// the set.
// P returns the filter's collision probability as a negative power of 2. For
// example, a collision probability of 1 / 2^20 is represented as 20.
func (f *FilterV1) P() uint8 {
return f.b
}
// NewFilterV1 builds a new version 1 GCS filter with a collision probability of
// 1 / 2^P for the given key and data.
func NewFilterV1(P uint8, key [KeySize]byte, data [][]byte) (*FilterV1, error) {
filter, err := newFilter(1, P, key, data)
// Basic sanity check.
if P > 32 {
str := fmt.Sprintf("P value of %d is greater than max allowed 32", P)
return nil, makeError(ErrPTooBig, str)
}
filter, err := newFilter(1, P, 1<<P, key, data)
if err != nil {
return nil, err
}
@ -323,8 +369,8 @@ func FromBytesV1(P uint8, d []byte) (*FilterV1, error) {
f := filter{
version: 1,
n: n,
p: P,
modulusNP: uint64(n) * uint64(1<<P),
b: P,
modulusNM: uint64(n) * uint64(1<<P),
filterNData: d,
filterData: filterData,
}
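
Putting the exported v1 API from this file together, the following is a
hedged usage sketch; the import path, the fixed key, and the example items
are assumptions for illustration only.

package main

import (
    "fmt"

    "github.com/decred/dcrd/gcs" // assumed import path for this package
)

func main() {
    // The key would normally be derived from chain data; a zero key is used
    // here purely for illustration.
    var key [gcs.KeySize]byte
    items := [][]byte{[]byte("alpha"), []byte("beta"), []byte("gamma")}

    // Build a version 1 filter with a 1/2^20 collision probability.
    f, err := gcs.NewFilterV1(20, key, items)
    if err != nil {
        panic(err)
    }
    fmt.Println(f.N(), f.P()) // 3 20

    // Membership checks require the same key used to build the filter.
    fmt.Println(f.Match(key, []byte("beta")))    // true
    fmt.Println(f.Match(key, []byte("unknown"))) // false (with high probability)

    // The serialized form includes N but not P or the key, so P must be
    // supplied again when deserializing.
    f2, err := gcs.FromBytesV1(20, f.Bytes())
    if err != nil {
        panic(err)
    }
    fmt.Println(f2.Match(key, []byte("gamma"))) // true
}
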

@ -52,7 +52,8 @@ func TestFilter(t *testing.T) {
tests := []struct {
name string // test description
version uint16 // filter version
p uint8 // collision probability
b uint8 // Golomb coding bin size
m uint64 // inverse of false positive rate
matchKey [KeySize]byte // random filter key for matches
contents [][]byte // data to include in the filter
wantMatches [][]byte // expected matches
@ -62,7 +63,8 @@ func TestFilter(t *testing.T) {
}{{
name: "empty filter",
version: 1,
p: 20,
b: 20,
m: 1 << 20,
matchKey: randKey,
contents: nil,
wantMatches: nil,
@ -70,9 +72,10 @@ func TestFilter(t *testing.T) {
wantBytes: "",
wantHash: "0000000000000000000000000000000000000000000000000000000000000000",
}, {
name: "contents1 with P=20",
name: "contents1 with B=20, M=1<<20",
version: 1,
p: 20,
b: 20,
m: 1 << 20,
matchKey: randKey,
contents: contents1,
wantMatches: contents1,
@ -80,9 +83,10 @@ func TestFilter(t *testing.T) {
wantBytes: "00000011ce76b76760b54096a233d504ce55b80600fb072c74893cf306eb0c050f0b3c32e8c23436f8f5e67a986a46470790",
wantHash: "a802fbe6f06991877cde8f3d770d8da8cf195816f04874cab045ffccaddd880d",
}, {
name: "contents1 with P=19",
name: "contents1 with B=19, M=1<<19",
version: 1,
p: 19,
b: 19,
m: 1 << 19,
matchKey: randKey,
contents: contents1,
wantMatches: contents1,
@ -90,9 +94,10 @@ func TestFilter(t *testing.T) {
wantBytes: "000000112375937586050f0e9e19689983a3ab9b6f8f0cbc2f204b5233d5099ca0c9fbe9ec6a1f60e76fba3ad6835a28",
wantHash: "be9ba34f03ced957e6f5c4d583ddfd34c136b486fbec2a42b4c7588a2d7813c1",
}, {
name: "contents2 with P=19",
name: "contents2 with B=19, M=1<<19",
version: 1,
p: 19,
b: 19,
m: 1 << 19,
matchKey: randKey,
contents: contents2,
wantMatches: contents2,
@ -100,9 +105,10 @@ func TestFilter(t *testing.T) {
wantBytes: "000000114306259e36131a6c9bbd968a6c61dc110804d5ac91d20d6e9314a50332bffed877657c004e2366fcd34cda60",
wantHash: "dcbaf452f6de4c82ea506fa551d75876c4979ef388f785509b130de62eeaec23",
}, {
name: "contents2 with P=10",
name: "contents2 with B=10, M=1<<10",
version: 1,
p: 10,
b: 10,
m: 1 << 10,
matchKey: randKey,
contents: contents2,
wantMatches: contents2,
@ -114,17 +120,18 @@ func TestFilter(t *testing.T) {
for _, test := range tests {
// Create a filter with the match key for all tests not related to
// testing serialization.
f, err := newFilter(test.version, test.p, test.matchKey, test.contents)
f, err := newFilter(test.version, test.b, test.m, test.matchKey,
test.contents)
if err != nil {
t.Errorf("%q: unexpected err: %v", test.name, err)
continue
}
// Ensure the parameter values are returned properly.
resultP := f.P()
if resultP != test.p {
t.Errorf("%q: unexpected P -- got %d, want %d", test.name,
resultP, test.p)
resultB := f.b
if resultB != test.b {
t.Errorf("%q: unexpected B -- got %d, want %d", test.name,
resultB, test.b)
continue
}
resultN := f.N()
@ -133,6 +140,15 @@ func TestFilter(t *testing.T) {
resultN, uint32(len(test.contents)))
continue
}
if test.version == 1 {
v1Filter := &FilterV1{filter: *f}
resultP := v1Filter.P()
if resultP != test.b {
t.Errorf("%q: unexpected P -- got %d, want %d", test.name,
resultP, test.b)
continue
}
}
// Ensure empty data never matches.
if f.Match(test.matchKey, nil) {
@ -199,8 +215,8 @@ func TestFilter(t *testing.T) {
}
// Recreate the filter with a fixed key for serialization testing.
fixedFilter, err := newFilter(test.version, test.p, test.fixedKey,
test.contents)
fixedFilter, err := newFilter(test.version, test.b, test.m,
test.fixedKey, test.contents)
if err != nil {
t.Errorf("%q: unexpected err: %v", test.name, err)
continue
@ -244,7 +260,7 @@ func TestFilter(t *testing.T) {
var f2 filterMatcher
switch test.version {
case 1:
tf2, err := FromBytesV1(test.p, wantBytes)
tf2, err := FromBytesV1(test.b, wantBytes)
if err != nil {
t.Errorf("%q: unexpected err: %v", test.name, err)
continue
@ -361,3 +377,41 @@ func TestZeroHashMatches(t *testing.T) {
t.Fatalf("failed to match key with 0 siphash")
}
}
// TestPanics ensures various internal functions panic when called improperly.
func TestPanics(t *testing.T) {
testPanic := func(fn func()) (paniced bool) {
// Set up a defer to catch the expected panic and update the
// return variable.
defer func() {
if err := recover(); err != nil {
paniced = true
}
}()
fn()
return false
}
// Ensure attempting to create a filter with parameters too large panics.
paniced := testPanic(func() {
const largeB = 33
const smallM = 1 << 10
var key [KeySize]byte
newFilter(1, largeB, smallM, key, nil)
})
if !paniced {
t.Fatal("newFilter did not panic with too large parameter")
}
// Ensure attempting to create an unsupported filter version panics.
paniced = testPanic(func() {
const normalB = 19
const normalM = 784931
var key [KeySize]byte
newFilter(65535, normalB, normalM, key, nil)
})
if !paniced {
t.Fatal("newFilter did not panic with unsupported version")
}
}