Squashed 'extern/rleplus/' content from commit 59d0714

git-subtree-dir: extern/rleplus git-subtree-split: 59d0714e9be58cf96d82cdce18fe727041f9001d
2019-09-16 14:07:48 +02:00 · 2019-09-16 14:07:48 +02:00 · c57c47ffb5
commit c57c47ffb5
4 changed files with 679 additions and 0 deletions
--- a/internal/bitvector.go
+++ b/internal/bitvector.go
@ -0,0 +1,154 @@
+package bitvector
+
+import (
+	"errors"
+	"log"
+)
+
+var (
+	// ErrOutOfRange is returned by BitVector.Get when the requested index
+	// is greater than or equal to the vector's Len.
+	ErrOutOfRange = errors.New("index out of range")
+)
+
+// BitNumbering indicates the ordering of bits, either
+// least-significant bit in position 0, or most-significant bit
+// in position 0.
+//
+// It is used in 3 ways with BitVector:
+// 1. Ordering of bits within the Buf []byte structure
+// 2. What order to add bits when using Extend()
+// 3. What order to read bits when using Take()
+//
+// https://en.wikipedia.org/wiki/Bit_numbering
+type BitNumbering int
+
+const (
+	// LSB0 - bit ordering starts with the low-order bit.
+	// It is the zero value of BitNumbering.
+	LSB0 BitNumbering = iota
+
+	// MSB0 - bit ordering starts with the high-order bit
+	MSB0
+)
+
+// BitVector is used to manipulate ordered collections of bits.
+//
+// The zero value is an empty vector that uses LSB0 byte packing.
+type BitVector struct {
+	// Buf is the backing storage; bit i of the vector lives in Buf[i/8]
+	Buf []byte
+
+	// BytePacking is the bit ordering within bytes
+	BytePacking BitNumbering
+
+	// Len is the logical number of bits in the vector.
+	// The last byte in Buf may have undefined bits if Len is not a multiple of 8
+	Len uint
+}
+
+// NewBitVector wraps buf in a BitVector whose length covers every bit of buf.
+//
+// bytePacking tells the vector how bits are ordered inside each byte of buf.
+// The slice is stored as-is (not copied), so the vector and the caller share
+// the same backing array.
+func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
+	vec := new(BitVector)
+	vec.Buf = buf
+	vec.BytePacking = bytePacking
+	vec.Len = uint(len(buf) * 8)
+	return vec
+}
+
+// Push adds a single bit to the BitVector.
+//
+// Although it takes a byte, only the low-order bit is used, so just use 0 or 1.
+//
+// The target bit is cleared before it is written, so stale ("undefined")
+// bits in a partially used last byte of Buf cannot leak into the vector.
+func (v *BitVector) Push(val byte) {
+	byteIdx := v.Len / 8
+
+	// Grow the buffer only when the byte that will hold the new bit is missing.
+	if uint(len(v.Buf)) <= byteIdx {
+		v.Buf = append(v.Buf, 0)
+	}
+
+	// Position of the new bit inside its byte, according to the packing order.
+	shift := v.Len % 8
+	if v.BytePacking != LSB0 {
+		shift = 7 - shift
+	}
+
+	mask := byte(1) << shift
+	v.Buf[byteIdx] &^= mask
+	v.Buf[byteIdx] |= (val & 1) << shift
+
+	v.Len++
+}
+
+// Get reads the bit at position idx and returns it as a byte (0 or 1).
+//
+// It returns ErrOutOfRange when idx is not smaller than Len.
+func (v *BitVector) Get(idx uint) (byte, error) {
+	if idx >= v.Len {
+		return 0, ErrOutOfRange
+	}
+
+	// Offset of the bit inside its byte; any packing other than LSB0 counts
+	// from the high-order end.
+	shift := idx % 8
+	if v.BytePacking != LSB0 {
+		shift = 7 - shift
+	}
+
+	return (v.Buf[idx/8] >> shift) & 1, nil
+}
+
+// Extend appends the first count bits of val (at most 8) to the receiver,
+// reading val in the given order.
+//
+// Given a byte b == 0b11010101
+// v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
+// v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
+//
+// Panics if count is greater than 8
+func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
+	if count > 8 {
+		log.Panicf("invalid count")
+	}
+
+	for n := uint(0); n < count; n++ {
+		// LSB0 walks val upwards from bit 0, anything else downwards from bit 7.
+		shift := n
+		if order != LSB0 {
+			shift = 7 - n
+		}
+		v.Push((val >> shift) & 1)
+	}
+}
+
+// Take reads count bits (at most 8) starting at index and packs them into
+// a byte using the given order. Bits past the end of the vector read as 0.
+//
+// Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
+// v.Take(0, 4, LSB0) would return 0b00001011
+// v.Take(0, 4, MSB0) would return 0b11010000
+//
+// Panics if count is greater than 8
+func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
+	if count > 8 {
+		log.Panicf("invalid count")
+	}
+
+	for offset := uint(0); offset < count; offset++ {
+		// The error is dropped on purpose: Get yields a 0 bit together with
+		// ErrOutOfRange past the end, which is exactly the padding we want.
+		bit, _ := v.Get(index + offset)
+
+		if order == LSB0 {
+			out |= bit << offset
+		} else {
+			out |= bit << (7 - offset)
+		}
+	}
+	return out
+}
+
+// Iterator returns a reader function over the vector. Each call reads the
+// requested number of bits (at most 8) in the given order, starting where
+// the previous call stopped.
+//
+// Once the end of the BitVector is passed, the reader keeps returning zeroes.
+//
+// The returned function panics if its count argument is greater than 8
+func (v *BitVector) Iterator(order BitNumbering) func(uint) byte {
+	var pos uint
+
+	next := func(count uint) byte {
+		if count > 8 {
+			log.Panicf("invalid count")
+		}
+
+		bits := v.Take(pos, count, order)
+		pos += count
+		return bits
+	}
+	return next
+}
--- a/internal/bitvector_test.go
+++ b/internal/bitvector_test.go
@ -0,0 +1,138 @@
+package bitvector_test
+
+import (
+	"testing"
+
+	"github.com/filecoin-project/go-filecoin/rleplus/internal"
+	tf "github.com/filecoin-project/go-filecoin/testhelpers/testflags"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBitVector exercises the BitVector API: the zero value, Push, Get,
+// Extend, Take, Iterator, and the panics raised for counts above 8.
+func TestBitVector(t *testing.T) {
+	tf.UnitTest(t)
+
+	t.Run("zero value", func(t *testing.T) {
+		var v bitvector.BitVector
+
+		assert.Equal(t, bitvector.LSB0, v.BytePacking)
+	})
+
+	t.Run("Push", func(t *testing.T) {
+		// MSB0 bit numbering
+		v := bitvector.BitVector{BytePacking: bitvector.MSB0}
+		v.Push(1)
+		v.Push(0)
+		v.Push(1)
+		v.Push(1)
+
+		// 176 == 0b10110000: the four bits fill the byte from the high-order end
+		assert.Equal(t, byte(176), v.Buf[0])
+
+		// LSB0 bit numbering
+		v = bitvector.BitVector{BytePacking: bitvector.LSB0}
+		v.Push(1)
+		v.Push(0)
+		v.Push(1)
+		v.Push(1)
+
+		// 13 == 0b00001101: the four bits fill the byte from the low-order end
+		assert.Equal(t, byte(13), v.Buf[0])
+	})
+
+	t.Run("Get", func(t *testing.T) {
+		bits := []byte{1, 0, 1, 1, 0, 0, 1, 0}
+
+		for _, numbering := range []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0} {
+			v := bitvector.BitVector{BytePacking: numbering}
+
+			for _, bit := range bits {
+				v.Push(bit)
+			}
+
+			for idx, expected := range bits {
+				actual, _ := v.Get(uint(idx))
+				assert.Equal(t, expected, actual)
+			}
+		}
+	})
+
+	t.Run("Extend", func(t *testing.T) {
+		val := byte(171) // 0b10101011
+
+		var v bitvector.BitVector
+
+		// MSB0 bit numbering
+		v = bitvector.BitVector{}
+		v.Extend(val, 4, bitvector.MSB0)
+		assertBitVector(t, []byte{1, 0, 1, 0}, v)
+		v.Extend(val, 5, bitvector.MSB0)
+		assertBitVector(t, []byte{1, 0, 1, 0, 1, 0, 1, 0, 1}, v)
+
+		// LSB0 bit numbering
+		v = bitvector.BitVector{}
+		v.Extend(val, 4, bitvector.LSB0)
+		assertBitVector(t, []byte{1, 1, 0, 1}, v)
+		v.Extend(val, 5, bitvector.LSB0)
+		assertBitVector(t, []byte{1, 1, 0, 1, 1, 1, 0, 1, 0}, v)
+	})
+
+	t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) {
+		v := bitvector.BitVector{BytePacking: bitvector.LSB0}
+
+		assert.Panics(t, func() { v.Extend(0xff, 9, bitvector.LSB0) })
+
+		assert.Panics(t, func() { v.Take(0, 9, bitvector.LSB0) })
+
+		next := v.Iterator(bitvector.LSB0)
+		assert.Panics(t, func() { next(9) })
+	})
+
+	t.Run("Take", func(t *testing.T) {
+		var v bitvector.BitVector
+
+		bits := []byte{1, 0, 1, 0, 1, 0, 1, 1}
+		for _, bit := range bits {
+			v.Push(bit)
+		}
+
+		// bits 4..7 are < 1, 0, 1, 1 >: 0b10110000 in MSB0 order, 0b00001101 in LSB0 order
+		assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0))
+		assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0))
+	})
+
+	t.Run("Iterator", func(t *testing.T) {
+		var buf []byte
+
+		// make a bitvector of 256 sample bits
+		for i := 0; i < 32; i++ {
+			buf = append(buf, 128+32)
+		}
+
+		v := bitvector.NewBitVector(buf, bitvector.LSB0)
+
+		next := v.Iterator(bitvector.LSB0)
+
+		// compare to Get()
+		for i := uint(0); i < v.Len; i++ {
+			expected, _ := v.Get(i)
+			assert.Equal(t, expected, next(1))
+		}
+
+		// out of range should return zero
+		assert.Equal(t, byte(0), next(1))
+		assert.Equal(t, byte(0), next(8))
+
+		// compare to Take()
+		next = v.Iterator(bitvector.LSB0)
+		assert.Equal(t, next(5), v.Take(0, 5, bitvector.LSB0))
+		assert.Equal(t, next(8), v.Take(5, 8, bitvector.LSB0))
+	})
+}
+
+// assertBitVector checks that actual holds exactly the bits in expectedBits:
+// same length, and the same value at every index.
+//
+// Note: expectedBits must contain *only* 0s and 1s.
+func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) {
+	assert.Equal(t, uint(len(expectedBits)), actual.Len)
+
+	for i := 0; i < len(expectedBits); i++ {
+		got, err := actual.Get(uint(i))
+		assert.NoError(t, err)
+		assert.Equal(t, expectedBits[i], got)
+	}
+}
--- a/rleplus.go
+++ b/rleplus.go
@ -0,0 +1,204 @@
+package rleplus
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"sort"
+
+	"github.com/filecoin-project/go-filecoin/rleplus/internal"
+)
+
+// Version is the RLE+ version implemented by this package. Only its 2 lowest
+// bits are used: Encode writes them as the header and Decode compares the
+// first 2 bits of its input against them.
+const Version = 0
+
+var (
+	// ErrRunLengthTooLarge - data implies a run-length which isn't supported
+	ErrRunLengthTooLarge = fmt.Errorf("run length too large for RLE+ version %d", Version)
+
+	// ErrDecode - invalid encoding for this version. Decode returns it when
+	// the varint of a long block is longer than a uint64 varint can be.
+	ErrDecode = fmt.Errorf("invalid encoding for RLE+ version %d", Version)
+
+	// ErrWrongVersion - wrong version of RLE+. Decode returns it when the
+	// version bits of the input differ from Version.
+	ErrWrongVersion = errors.New("invalid RLE+ version")
+)
+
+// Encode returns the RLE+ representation of the provided integers.
+// Also returned is the number of bits required by this encoding,
+// which is not necessarily on a byte boundary.
+//
+// The RLE+ spec is here: https://github.com/filecoin-project/specs/blob/master/data-structures.md#rle-bitset-encoding
+// and is described by the BNF Grammar:
+//
+//    <encoding> ::= <header> <blocks>
+//    <header> ::= <version> <bit>
+//    <version> ::= "00"
+//    <blocks> ::= <block> <blocks> | ""
+//    <block> ::= <block_single> | <block_short> | <block_long>
+//    <block_single> ::= "1"
+//    <block_short> ::= "01" <bit> <bit> <bit> <bit>
+//    <block_long> ::= "00" <unsigned_varint>
+//    <bit> ::= "0" | "1"
+//
+// Filecoin specific:
+// The encoding is returned as a []byte, each byte packed starting with the low-order bit (LSB0)
+//
+// Every uint64 run length can be represented, so the returned error is
+// always nil; it is kept so the signature stays stable for callers.
+func Encode(ints []uint64) ([]byte, uint, error) {
+	v := bitvector.BitVector{BytePacking: bitvector.LSB0}
+	firstBit, runs := RunLengths(ints)
+
+	// Header: 2 version bits followed by the value of the first bit.
+	v.Extend(Version, 2, bitvector.LSB0)
+	v.Push(firstBit)
+
+	// Scratch space for long-block varints, allocated once and reused.
+	var varint [binary.MaxVarintLen64]byte
+
+	for _, run := range runs {
+		switch {
+		case run == 1:
+			// block_single: "1"
+			v.Push(1)
+		case run < 16:
+			// block_short: "01" followed by the 4-bit run length
+			v.Push(0)
+			v.Push(1)
+			v.Extend(byte(run), 4, bitvector.LSB0)
+		default:
+			// block_long: "00" followed by the run length as an unsigned varint
+			v.Push(0)
+			v.Push(0)
+			n := binary.PutUvarint(varint[:], run)
+			for _, b := range varint[:n] {
+				v.Extend(b, 8, bitvector.LSB0)
+			}
+		}
+	}
+
+	return v.Buf, v.Len, nil
+}
+
+// Decode returns integers represented by the given RLE+ encoding
+//
+// The length of the encoding is not specified.  It is inferred by
+// reading zeroes from the (possibly depleted) BitVector, by virtue
+// of the behavior of BitVector.Take() returning 0 when the end of
+// the BitVector has been reached: a long block with a run length of
+// zero ends decoding. This has the downside of not being able to
+// detect every corrupt encoding.
+//
+// The passed []byte should be packed in LSB0 bit numbering
+//
+// Errors:
+//   - ErrWrongVersion when the 2 header bits differ from Version
+//   - ErrDecode when the varint of a long block does not terminate within
+//     binary.MaxVarintLen64 bytes or overflows a uint64
+//
+// NOTE(review): one integer is materialised per set bit, so a hostile input
+// with a very long run of ones can request a very large allocation.
+func Decode(buf []byte) (ints []uint64, err error) {
+	if len(buf) == 0 {
+		return nil, nil
+	}
+
+	v := bitvector.NewBitVector(buf, bitvector.LSB0)
+	take := v.Iterator(bitvector.LSB0)
+
+	// Header: 2 version bits, then the value of the bits in the first run.
+	if ver := take(2); ver != Version {
+		return nil, ErrWrongVersion
+	}
+	curBit := take(1)
+	curIdx := uint64(0)
+
+	for {
+		// uint64 so that every run length Encode can produce survives decoding.
+		var runLength uint64
+
+		if take(1) == 1 {
+			// block_single
+			runLength = 1
+		} else if take(1) == 1 {
+			// block_short: 4-bit run length
+			runLength = uint64(take(4))
+		} else {
+			// block_long: unsigned varint run length
+			var varint [binary.MaxVarintLen64]byte
+			n := 0
+			for {
+				if n == len(varint) {
+					// continuation bit still set after the longest valid uvarint
+					return nil, ErrDecode
+				}
+				b := take(8)
+				varint[n] = b
+				n++
+				if b&0x80 == 0 {
+					break
+				}
+			}
+
+			x, read := binary.Uvarint(varint[:n])
+			if read <= 0 {
+				// value does not fit in a uint64
+				return nil, ErrDecode
+			}
+			if x == 0 {
+				// A zero-length long block is what reading past the end of
+				// the input looks like: decoding is complete.
+				return ints, nil
+			}
+			runLength = x
+		}
+
+		if curBit == 1 {
+			for j := uint64(0); j < runLength; j++ {
+				ints = append(ints, curIdx+j)
+			}
+		}
+		curIdx += runLength
+		curBit = 1 - curBit
+	}
+}
+
+// RunLengths transforms integers into its bit-set-run-length representation.
+//
+// A set of unsigned integers { 0, 2, 4, 5, 6 } can be thought of as
+// indices into a bitset { 1, 0, 1, 0, 1, 1, 1 } where bitset[index] == 1.
+//
+// The bit set run lengths of this set would then be { 1, 1, 1, 1, 3 },
+// representing lengths of runs alternating between 1 and 0, starting
+// with a first bit of 1.
+//
+// The input does not need to be sorted and is left untouched: sorting is
+// done on a private copy. Duplicated numbers are ignored. An empty input
+// yields a first bit of 0 and no runs.
+//
+// This is a helper function for Encode()
+func RunLengths(ints []uint64) (firstBit byte, runs []uint64) {
+	if len(ints) == 0 {
+		return 0, nil
+	}
+
+	// Sort a copy so the caller's slice keeps its original order.
+	sorted := make([]uint64, len(ints))
+	copy(sorted, ints)
+	sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })
+
+	prev := sorted[0]
+	if prev == 0 {
+		firstBit = 1
+	} else {
+		// Indices 0..prev-1 are unset: the encoding starts with a run of zeroes.
+		runs = append(runs, prev)
+	}
+	// The smallest number opens the first run of ones.
+	runs = append(runs, 1)
+
+	for _, cur := range sorted[1:] {
+		switch gap := cur - prev; {
+		case gap == 0:
+			// duplicate of the previous number, nothing to add
+		case gap == 1:
+			// extends the current run of ones
+			runs[len(runs)-1]++
+		default:
+			// a run of zeroes fills the gap, then a new run of ones starts
+			runs = append(runs, gap-1, 1)
+		}
+		prev = cur
+	}
+	return firstBit, runs
+}
--- a/rleplus_test.go
+++ b/rleplus_test.go
@ -0,0 +1,183 @@
+package rleplus_test
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"testing"
+
+	"github.com/filecoin-project/go-filecoin/rleplus"
+	"github.com/filecoin-project/go-filecoin/rleplus/internal"
+	tf "github.com/filecoin-project/go-filecoin/testhelpers/testflags"
+	"gotest.tools/assert"
+)
+
+// TestRleplus covers Encode, Decode (round trip, version check, corrupt
+// input), agreement with the output of the reference implementation, and
+// RunLengths.
+func TestRleplus(t *testing.T) {
+	tf.UnitTest(t)
+
+	t.Run("Encode", func(t *testing.T) {
+		// Encode an intset
+		ints := []uint64{
+			// run of 1
+			0,
+			// gap of 1
+			// run of 1
+			2,
+			// gap of 1
+			// run of 3
+			4, 5, 6,
+			// gap of 4
+			// run of 17
+			11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+		}
+
+		expectedBits := []byte{
+			0, 0, // version
+			1,                // first bit
+			1,                // run of 1
+			1,                // gap of 1
+			1,                // run of 1
+			1,                // gap of 1
+			0, 1, 1, 1, 0, 0, // run of 3
+			0, 1, 0, 0, 1, 0, // gap of 4
+
+			// run of 17 < 0 0 (varint) >
+			0, 0,
+			1, 0, 0, 0, 1, 0, 0, 0,
+		}
+
+		// Pack the expected bits with the zero-value (LSB0) BitVector so they
+		// can be compared byte by byte with the encoder's output.
+		v := bitvector.BitVector{}
+		for _, bit := range expectedBits {
+			v.Push(bit)
+		}
+		actualBytes, _, err := rleplus.Encode(ints)
+		assert.NilError(t, err)
+
+		assert.Equal(t, len(v.Buf), len(actualBytes))
+		for idx, expected := range v.Buf {
+			assert.Equal(
+				t,
+				fmt.Sprintf("%08b", expected),
+				fmt.Sprintf("%08b", actualBytes[idx]),
+			)
+		}
+	})
+
+	t.Run("Encode allows all runs sizes possible uint64", func(t *testing.T) {
+		// create a run of math.MaxUint64
+		ints := []uint64{math.MaxUint64}
+		_, _, err := rleplus.Encode(ints)
+		assert.NilError(t, err)
+	})
+
+	t.Run("Decode", func(t *testing.T) {
+		testCases := [][]uint64{
+			{},
+			{1},
+			{0},
+			{0, 1, 2, 3},
+			{
+				// run of 1
+				0,
+				// gap of 1
+				// run of 1
+				2,
+				// gap of 1
+				// run of 3
+				4, 5, 6,
+				// gap of 4
+				// run of 17
+				11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+			},
+		}
+
+		// Round trip: decoding an encoding must give back the same set.
+		for _, tc := range testCases {
+			encoded, _, err := rleplus.Encode(tc)
+			assert.NilError(t, err)
+
+			result, err := rleplus.Decode(encoded)
+			assert.NilError(t, err)
+
+			sort.Slice(tc, func(i, j int) bool { return tc[i] < tc[j] })
+			sort.Slice(result, func(i, j int) bool { return result[i] < result[j] })
+
+			assert.Equal(t, len(tc), len(result))
+
+			for idx, expected := range tc {
+				assert.Equal(t, expected, result[idx])
+			}
+		}
+	})
+
+	t.Run("Decode version check", func(t *testing.T) {
+		_, err := rleplus.Decode([]byte{0xff})
+		assert.Error(t, err, "invalid RLE+ version")
+	})
+
+	t.Run("Decode returns an error with a bad encoding", func(t *testing.T) {
+		// create an encoding with a buffer with a run which is too long
+		// 0xe0 read in LSB0 order is < 0, 0, 0, 0, 0, 1, 1, 1 >: version "00",
+		// a first bit of 0 and the long-block prefix "00"; every varint byte
+		// read after that is 0xff, so the varint never terminates.
+		_, err := rleplus.Decode([]byte{0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
+		assert.Error(t, err, "invalid encoding for RLE+ version 0")
+	})
+
+	t.Run("outputs same as reference implementation", func(t *testing.T) {
+		// Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+		// in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field)
+		// The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }
+
+		// This is the above reference output with a version header "00" manually added
+		referenceEncoding := []byte{124, 71, 34, 2}
+
+		expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}
+
+		encoded, _, err := rleplus.Encode(expectedNumbers)
+		assert.NilError(t, err)
+
+		// Our encoded bytes are the same as the ref bytes
+		assert.Equal(t, len(referenceEncoding), len(encoded))
+		for idx, expected := range referenceEncoding {
+			assert.Equal(t, expected, encoded[idx])
+		}
+
+		decoded, err := rleplus.Decode(referenceEncoding)
+		assert.NilError(t, err)
+
+		// Our decoded integers are the same as expected
+		sort.Slice(decoded, func(i, j int) bool { return decoded[i] < decoded[j] })
+		assert.Equal(t, len(expectedNumbers), len(decoded))
+		for idx, expected := range expectedNumbers {
+			assert.Equal(t, expected, decoded[idx])
+		}
+	})
+
+	t.Run("RunLengths", func(t *testing.T) {
+		testCases := []struct {
+			ints  []uint64
+			first byte
+			runs  []uint64
+		}{
+			// empty
+			{},
+
+			// leading with ones
+			{[]uint64{0}, 1, []uint64{1}},
+			{[]uint64{0, 1}, 1, []uint64{2}},
+			{[]uint64{0, 0xffffffff, 0xffffffff + 1}, 1, []uint64{1, 0xffffffff - 1, 2}},
+
+			// leading with zeroes
+			{[]uint64{1}, 0, []uint64{1, 1}},
+			{[]uint64{2}, 0, []uint64{2, 1}},
+			{[]uint64{10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}},
+			// unsorted input with duplicates gives the same runs as the case above
+			{[]uint64{10, 11, 11, 13, 20, 10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}},
+		}
+
+		for _, testCase := range testCases {
+			first, runs := rleplus.RunLengths(testCase.ints)
+			assert.Equal(t, testCase.first, first)
+			assert.Equal(t, len(testCase.runs), len(runs))
+			for idx, runLength := range testCase.runs {
+				assert.Equal(t, runLength, runs[idx])
+			}
+		}
+	})
+}