// Reconstructed from the diff hunk that adds
// extern/rleplus/internal/bitvector.go (package bitvector, which imports
// "errors" and "log").

var (
	// ErrOutOfRange - the index passed is out of range for the BitVector
	ErrOutOfRange = errors.New("index out of range")
)

// BitNumbering indicates the ordering of bits, either
// least-significant bit in position 0, or most-significant bit
// in position 0.
//
// It is used in 3 ways with BitVector:
// 1. Ordering of bits within the Buf []byte structure
// 2. What order to add bits when using Extend()
// 3. What order to read bits when using Take()
//
// https://en.wikipedia.org/wiki/Bit_numbering
type BitNumbering int

const (
	// LSB0 - bit ordering starts with the low-order bit
	LSB0 BitNumbering = iota

	// MSB0 - bit ordering starts with the high-order bit
	MSB0
)

// BitVector is used to manipulate ordered collections of bits
type BitVector struct {
	// Buf holds the packed bits, eight per byte
	Buf []byte

	// BytePacking is the bit ordering within bytes
	BytePacking BitNumbering

	// Len is the logical number of bits in the vector.
	// The last byte in Buf may have undefined bits if Len is not a multiple of 8
	Len uint
}

// NewBitVector constructs a new BitVector from a slice of bytes.
//
// The bytePacking parameter is required to know how to interpret the bit ordering within the bytes.
// Every bit of buf is considered part of the vector, so Len is len(buf)*8.
// buf is used directly, not copied.
func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
	return &BitVector{
		BytePacking: bytePacking,
		Buf:         buf,
		Len:         uint(len(buf) * 8),
	}
}

// Push adds a single bit to the BitVector.
//
// Although it takes a byte, only the low-order bit is used, so just use 0 or 1.
//
// The target bit is always overwritten, so bits beyond Len that were left
// undefined in the last byte of Buf never leak into the vector.
func (v *BitVector) Push(val byte) {
	byteIdx := v.Len / 8

	// Make sure the byte that receives the bit exists. Normally this appends
	// exactly one byte each time Len crosses a byte boundary; the loop also
	// covers a hand-built vector whose Buf is shorter than Len implies.
	for uint(len(v.Buf)) <= byteIdx {
		v.Buf = append(v.Buf, 0)
	}

	var mask byte
	switch v.BytePacking {
	case LSB0:
		mask = 1 << (v.Len % 8)
	default:
		mask = 1 << (7 - v.Len%8)
	}

	// Set or clear explicitly: OR-ing alone would keep a stale 1 when pushing 0.
	if val&1 == 1 {
		v.Buf[byteIdx] |= mask
	} else {
		v.Buf[byteIdx] &^= mask
	}

	v.Len++
}

// Get returns a single bit as a byte -- either 0 or 1
//
// It returns ErrOutOfRange when idx is not below Len, or when Buf is too
// short to contain the requested bit.
func (v *BitVector) Get(idx uint) (byte, error) {
	blockIdx := idx / 8
	if idx >= v.Len || blockIdx >= uint(len(v.Buf)) {
		return 0, ErrOutOfRange
	}

	switch v.BytePacking {
	case LSB0:
		return v.Buf[blockIdx] >> (idx % 8) & 1, nil
	default:
		return v.Buf[blockIdx] >> (7 - idx%8) & 1, nil
	}
}

// Extend adds up to 8 bits to the receiver
//
// Given a byte b == 0b11010101
// v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
// v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
//
// Panics if count is greater than 8
func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
	if count > 8 {
		log.Panicf("invalid count")
	}

	for i := uint(0); i < count; i++ {
		switch order {
		case LSB0:
			v.Push((val >> i) & 1)
		default:
			v.Push((val >> (7 - i)) & 1)
		}
	}
}

// Take reads up to 8 bits at the given index.
//
// Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
// v.Take(0, 4, LSB0) would return 0b00001011
// v.Take(0, 4, MSB0) would return 0b11010000
//
// Bits past the end of the vector read as 0.
//
// Panics if count is greater than 8
func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
	if count > 8 {
		log.Panicf("invalid count")
	}

	for i := uint(0); i < count; i++ {
		pos := index + i
		if pos < index {
			// index+i wrapped around uint; the remaining positions are past
			// the end, not back at bit 0.
			break
		}

		// The error is ignored on purpose: out-of-range reads yield 0.
		val, _ := v.Get(pos)

		switch order {
		case LSB0:
			out |= val << i
		default:
			out |= val << (7 - i)
		}
	}
	return out
}

// Iterator returns a function, which when invoked, returns the number
// of bits requested, and increments an internal cursor.
+// +// When the end of the BitVector is reached, it returns zeroes indefinitely +// +// Each call to Iterator creates an independent cursor that starts at bit 0. +// +// The returned function panics if count is greater than 8 +func (v *BitVector) Iterator(order BitNumbering) func(uint) byte { + cursor := uint(0) + return func(count uint) (out byte) { + if count > 8 { + log.Panicf("invalid count") + } + + out = v.Take(cursor, count, order) + cursor += count + return + } +} diff --git a/extern/rleplus/internal/bitvector_test.go b/extern/rleplus/internal/bitvector_test.go new file mode 100644 index 000000000..a98c00a8e --- /dev/null +++ b/extern/rleplus/internal/bitvector_test.go @@ -0,0 +1,136 @@ +package bitvector_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal" +) + +func TestBitVector(t *testing.T) { + t.Run("zero value", func(t *testing.T) { + var v bitvector.BitVector + + assert.Equal(t, bitvector.LSB0, v.BytePacking) + }) + + t.Run("Push", func(t *testing.T) { + // MSB0 bit numbering + v := bitvector.BitVector{BytePacking: bitvector.MSB0} + v.Push(1) + v.Push(0) + v.Push(1) + v.Push(1) + + assert.Equal(t, byte(176), v.Buf[0]) + + // LSB0 bit numbering + v = bitvector.BitVector{BytePacking: bitvector.LSB0} + v.Push(1) + v.Push(0) + v.Push(1) + v.Push(1) + + assert.Equal(t, byte(13), v.Buf[0]) + }) + + t.Run("Get", func(t *testing.T) { + bits := []byte{1, 0, 1, 1, 0, 0, 1, 0} + + for _, numbering := range []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0} { + v := bitvector.BitVector{BytePacking: numbering} + + for _, bit := range bits { + v.Push(bit) + } + + for idx, expected := range bits { + actual, _ := v.Get(uint(idx)) + assert.Equal(t, expected, actual) + } + } + }) + + t.Run("Extend", func(t *testing.T) { + val := byte(171) // 0b10101011 + + var v bitvector.BitVector + + // MSB0 bit numbering + v = bitvector.BitVector{} + v.Extend(val, 4, bitvector.MSB0) + assertBitVector(t, []byte{1, 0, 1, 0}, v) + v.Extend(val, 5, bitvector.MSB0) + assertBitVector(t, 
[]byte{1, 0, 1, 0, 1, 0, 1, 0, 1}, v) + + // LSB0 bit numbering + v = bitvector.BitVector{} + v.Extend(val, 4, bitvector.LSB0) + assertBitVector(t, []byte{1, 1, 0, 1}, v) + v.Extend(val, 5, bitvector.LSB0) + assertBitVector(t, []byte{1, 1, 0, 1, 1, 1, 0, 1, 0}, v) + }) + + t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) { + v := bitvector.BitVector{BytePacking: bitvector.LSB0} + + assert.Panics(t, func() { v.Extend(0xff, 9, bitvector.LSB0) }) + + assert.Panics(t, func() { v.Take(0, 9, bitvector.LSB0) }) + + next := v.Iterator(bitvector.LSB0) + assert.Panics(t, func() { next(9) }) + }) + + t.Run("Take", func(t *testing.T) { + var v bitvector.BitVector + + bits := []byte{1, 0, 1, 0, 1, 0, 1, 1} + for _, bit := range bits { + v.Push(bit) + } + + assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0)) + assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0)) + }) + + t.Run("Iterator", func(t *testing.T) { + var buf []byte + + // make a bitvector of 256 sample bits (32 bytes, each 0b10100000) + for i := 0; i < 32; i++ { + buf = append(buf, 128+32) + } + + v := bitvector.NewBitVector(buf, bitvector.LSB0) + + next := v.Iterator(bitvector.LSB0) + + // compare to Get() + for i := uint(0); i < v.Len; i++ { + expected, _ := v.Get(i) + assert.Equal(t, expected, next(1)) + } + + // out of range should return zero + assert.Equal(t, byte(0), next(1)) + assert.Equal(t, byte(0), next(8)) + + // compare to Take() + next = v.Iterator(bitvector.LSB0) + assert.Equal(t, next(5), v.Take(0, 5, bitvector.LSB0)) + assert.Equal(t, next(8), v.Take(5, 8, bitvector.LSB0)) + }) +} + +// Note: When using this helper assertion, expectedBits should *only* be 0s and 1s. 
+func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) { + assert.Equal(t, uint(len(expectedBits)), actual.Len) + + for idx, bit := range expectedBits { + actualBit, err := actual.Get(uint(idx)) + assert.NoError(t, err) + assert.Equal(t, bit, actualBit) + } +} diff --git a/extern/rleplus/rleplus.go b/extern/rleplus/rleplus.go new file mode 100644 index 000000000..6a9d8dacb --- /dev/null +++ b/extern/rleplus/rleplus.go @@ -0,0 +1,204 @@ +package rleplus + +import ( + "encoding/binary" + "errors" + "fmt" + "sort" + + bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal" +) + +// Version is the 2 lowest bits of this constant +const Version = 0 + +var ( + // ErrRunLengthTooLarge - data implies a run-length which isn't supported + ErrRunLengthTooLarge = fmt.Errorf("run length too large for RLE+ version %d", Version) + + // ErrDecode - invalid encoding for this version + ErrDecode = fmt.Errorf("invalid encoding for RLE+ version %d", Version) + + // ErrWrongVersion - wrong version of RLE+ + ErrWrongVersion = errors.New("invalid RLE+ version") +) + +// Encode returns the RLE+ representation of the provided integers. +// Also returned is the number of bits required by this encoding, +// which is not necessarily on a byte boundary. +// +// The RLE+ spec is here: https://github.com/filecoin-project/specs/blob/master/data-structures.md#rle-bitset-encoding +// and is described by the BNF Grammar: +// +// <encoding> ::= <header> <blocks> +// <header> ::= <version> <bit> +// <version> ::= "00" +// <blocks> ::= <block> <blocks> | "" +// <block> ::= <block_single> | <block_short> | <block_long> +// <block_single> ::= "1" +// <block_short> ::= "01" <bit> <bit> <bit> <bit> +// <block_long> ::= "00" <unsigned_varint> +// <bit> ::= "0" | "1" +// +// Filecoin specific: +// The encoding is returned as a []byte, each byte packed starting with the low-order bit (LSB0) +// +// Note: ints is handed to RunLengths, which sorts it in place. +func Encode(ints []uint64) ([]byte, uint, error) { + v := bitvector.BitVector{BytePacking: bitvector.LSB0} + firstBit, runs := RunLengths(ints) + + // Add version header + v.Extend(Version, 2, bitvector.LSB0) + + v.Push(firstBit) + + for _, run := range runs { + switch { + case run == 1: + // single block: "1" + v.Push(1) + case run < 16: + // short block: "01" followed by the run length in 4 bits + v.Push(0) + v.Push(1) + v.Extend(byte(run), 4, bitvector.LSB0) + case run >= 16: + // long block: "00" followed by the run length as an unsigned varint + v.Push(0) + v.Push(0) + // 10 bytes needed to encode MaxUint64 + buf := make([]byte, 10) + numBytes := binary.PutUvarint(buf, run) + for i := 0; i < numBytes; i++ { + v.Extend(buf[i], 8, bitvector.LSB0) + } + default: + // NOTE(review): unreachable - the cases above cover every uint64 value + return nil, 0, ErrRunLengthTooLarge + } + } + + return v.Buf, v.Len, nil +} + +// Decode returns integers represented by the given RLE+ encoding +// +// The length of the encoding is not specified. It is inferred by +// reading zeroes from the (possibly depleted) BitVector, by virtue +// of the behavior of BitVector.Take() returning 0 when the end of +// the BitVector has been reached. This has the downside of not +// being able to detect corrupt encodings. 
+// +// The passed []byte should be packed in LSB0 bit numbering +func Decode(buf []byte) (ints []uint64, err error) { + if len(buf) == 0 { + return + } + + v := bitvector.NewBitVector(buf, bitvector.LSB0) + take := v.Iterator(bitvector.LSB0) + + // Read version and check + // Version check + ver := take(2) + if ver != Version { + return nil, ErrWrongVersion + } + + curIdx := uint64(0) + curBit := take(1) + var runLength int + done := false + + for done == false { + y := take(1) + switch y { + case 1: + runLength = 1 + case 0: + val := take(1) + + if val == 1 { + // short block + runLength = int(take(4)) + } else { + // long block + var buf []byte + for { + b := take(8) + buf = append(buf, b) + + if b&0x80 == 0 { + break + } + + // 10 bytes is required to store math.MaxUint64 in a uvarint + if len(buf) > 10 { + return nil, ErrDecode + } + } + x, _ := binary.Uvarint(buf) + + if x == 0 { + done = true + } + runLength = int(x) + } + } + + if curBit == 1 { + for j := 0; j < runLength; j++ { + ints = append(ints, curIdx+uint64(j)) + } + } + curIdx += uint64(runLength) + curBit = 1 - curBit + } + + return +} + +// RunLengths transforms integers into its bit-set-run-length representation. +// +// A set of unsigned integers { 0, 2, 4, 5, 6 } can be thought of as +// indices into a bitset { 1, 0, 1, 0, 1, 1, 1 } where bitset[index] == 1. +// +// The bit set run lengths of this set would then be { 1, 1, 1, 1, 3 }, +// representing lengths of runs alternating between 1 and 0, starting +// with a first bit of 1. +// +// Duplicated numbers are ignored. 
+// +// This is a helper function for Encode() +// +// Note: ints is sorted in place, so the caller's slice is reordered. +// For empty input it returns firstBit == 0 and no runs. +func RunLengths(ints []uint64) (firstBit byte, runs []uint64) { + if len(ints) == 0 { + return + } + + // Sort our incoming numbers (in place) + sort.Slice(ints, func(i, j int) bool { return ints[i] < ints[j] }) + + prev := ints[0] + + // Initialize our return value: firstBit is 1 only when the smallest number is 0 + if prev == 0 { + firstBit = 1 + } + + if firstBit == 0 { + // first run of zeroes + runs = append(runs, prev) + } + // run of ones that starts at prev + runs = append(runs, 1) + + for _, cur := range ints[1:] { + delta := cur - prev + switch { + case delta == 1: + // cur extends the current run of ones + runs[len(runs)-1]++ + case delta > 1: + // add run of zeroes if there is a gap + runs = append(runs, delta-1) + runs = append(runs, 1) + default: + // delta == 0: repeated number, nothing to add + } + prev = cur + } + return +} diff --git a/extern/rleplus/rleplus_test.go b/extern/rleplus/rleplus_test.go new file mode 100644 index 000000000..f740a5597 --- /dev/null +++ b/extern/rleplus/rleplus_test.go @@ -0,0 +1,181 @@ +package rleplus_test + +import ( + "fmt" + "math" + "sort" + "testing" + + "github.com/filecoin-project/go-lotus/extern/rleplus" + bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal" + "gotest.tools/assert" +) + +func TestRleplus(t *testing.T) { + + t.Run("Encode", func(t *testing.T) { + // Encode an intset + ints := []uint64{ + // run of 1 + 0, + // gap of 1 + // run of 1 + 2, + // gap of 1 + // run of 3 + 4, 5, 6, + // gap of 4 + // run of 17 + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + } + + expectedBits := []byte{ + 0, 0, // version + 1, // first bit + 1, // run of 1 + 1, // gap of 1 + 1, // run of 1 + 1, // gap of 1 + 0, 1, 1, 1, 0, 0, // run of 3 + 0, 1, 0, 0, 1, 0, // gap of 4 + + // run of 17 < 0 0 (varint) > + 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + } + + v := bitvector.BitVector{} + for _, bit := range expectedBits { + v.Push(bit) + } + actualBytes, _, err := rleplus.Encode(ints) + assert.NilError(t, err) + + assert.Equal(t, len(v.Buf), len(actualBytes)) + for idx, expected := range v.Buf { + 
assert.Equal( + t, + fmt.Sprintf("%08b", expected), + fmt.Sprintf("%08b", actualBytes[idx]), + ) + } + }) + + t.Run("Encode allows all runs sizes possible uint64", func(t *testing.T) { + // create a run of math.MaxUint64 + ints := []uint64{math.MaxUint64} + _, _, err := rleplus.Encode(ints) + assert.NilError(t, err) + }) + + t.Run("Decode", func(t *testing.T) { + testCases := [][]uint64{ + {}, + {1}, + {0}, + {0, 1, 2, 3}, + { + // run of 1 + 0, + // gap of 1 + // run of 1 + 2, + // gap of 1 + // run of 3 + 4, 5, 6, + // gap of 4 + // run of 17 + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + }, + } + + for _, tc := range testCases { + encoded, _, err := rleplus.Encode(tc) + assert.NilError(t, err) + + result, err := rleplus.Decode(encoded) + assert.NilError(t, err) + + sort.Slice(tc, func(i, j int) bool { return tc[i] < tc[j] }) + sort.Slice(result, func(i, j int) bool { return result[i] < result[j] }) + + assert.Equal(t, len(tc), len(result)) + + for idx, expected := range tc { + assert.Equal(t, expected, result[idx]) + } + } + }) + + t.Run("Decode version check", func(t *testing.T) { + // the two low-order bits of 0xff decode as version 3 + _, err := rleplus.Decode([]byte{0xff}) + assert.Error(t, err, "invalid RLE+ version") + }) + + t.Run("Decode returns an error with a bad encoding", func(t *testing.T) { + // create an encoding with a buffer with a run which is too long + _, err := rleplus.Decode([]byte{0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) + assert.Error(t, err, "invalid encoding for RLE+ version 0") + }) + + t.Run("outputs same as reference implementation", func(t *testing.T) { + // Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + // in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field) + // The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 } + + // This is the 
above reference output with a version header "00" manually added + referenceEncoding := []byte{124, 71, 34, 2} + + expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27} + + encoded, _, err := rleplus.Encode(expectedNumbers) + assert.NilError(t, err) + + // Our encoded bytes are the same as the ref bytes + assert.Equal(t, len(referenceEncoding), len(encoded)) + for idx, expected := range referenceEncoding { + assert.Equal(t, expected, encoded[idx]) + } + + decoded, err := rleplus.Decode(referenceEncoding) + assert.NilError(t, err) + + // Our decoded integers are the same as expected + sort.Slice(decoded, func(i, j int) bool { return decoded[i] < decoded[j] }) + assert.Equal(t, len(expectedNumbers), len(decoded)) + for idx, expected := range expectedNumbers { + assert.Equal(t, expected, decoded[idx]) + } + }) + + t.Run("RunLengths", func(t *testing.T) { + testCases := []struct { + ints []uint64 + first byte + runs []uint64 + }{ + // empty + {}, + + // leading with ones + {[]uint64{0}, 1, []uint64{1}}, + {[]uint64{0, 1}, 1, []uint64{2}}, + {[]uint64{0, 0xffffffff, 0xffffffff + 1}, 1, []uint64{1, 0xffffffff - 1, 2}}, + + // leading with zeroes + {[]uint64{1}, 0, []uint64{1, 1}}, + {[]uint64{2}, 0, []uint64{2, 1}}, + {[]uint64{10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}}, + {[]uint64{10, 11, 11, 13, 20, 10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}}, + } + + for _, testCase := range testCases { + first, runs := rleplus.RunLengths(testCase.ints) + assert.Equal(t, testCase.first, first) + assert.Equal(t, len(testCase.runs), len(runs)) + for idx, runLength := range testCase.runs { + assert.Equal(t, runLength, runs[idx]) + } + } + }) +} diff --git a/go.mod b/go.mod index c9094f618..05fa6342b 100644 --- a/go.mod +++ b/go.mod @@ -81,6 +81,7 @@ require ( golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7 google.golang.org/api v0.9.0 // indirect gopkg.in/urfave/cli.v2 v2.0.0-20180128182452-d3ae77c26ac8 + 
gotest.tools v2.2.0+incompatible launchpad.net/gocheck v0.0.0-20140225173054-000000000087 // indirect ) diff --git a/go.sum b/go.sum index 215ad6d95..9325c510f 100644 --- a/go.sum +++ b/go.sum @@ -694,6 +694,8 @@ gopkg.in/urfave/cli.v2 v2.0.0-20180128182452-d3ae77c26ac8/go.mod h1:cKXr3E0k4aos gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= +gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=