Squashed 'extern/rleplus/' content from commit 59d0714

git-subtree-dir: extern/rleplus git-subtree-split: 59d0714e9be58cf96d82cdce18fe727041f9001d
2019-09-16 14:07:48 +02:00 · 2019-09-16 14:07:48 +02:00 · c57c47ffb5
commit c57c47ffb5
4 changed files with 679 additions and 0 deletions
--- a/internal/bitvector.go
+++ b/internal/bitvector.go
@ -0,0 +1,154 @@
 package bitvector
 import (
 	"errors"
 	"log"
 )
 var (
 	// ErrOutOfRange - the index passed is out of range for the BitVector.
 	// BitVector.Get returns it when the index is not below the vector's Len.
 	ErrOutOfRange = errors.New("index out of range")
 )
 // BitNumbering indicates the ordering of bits, either
 // least-significant bit in position 0, or most-significant bit
 // in position 0.
 //
 // It is used in 3 ways with BitVector:
 // 1. Ordering of bits within the Buf []byte structure
 // 2. What order to add bits when using Extend()
 // 3. What order to read bits when using Take()
 //
 // https://en.wikipedia.org/wiki/Bit_numbering
 type BitNumbering int
 const (
 	// LSB0 - bit ordering starts with the low-order bit.
 	// It is the zero value of BitNumbering.
 	LSB0 BitNumbering = iota
 	// MSB0 - bit ordering starts with the high-order bit.
 	// The BitVector methods in this file treat any value other than LSB0 as MSB0.
 	MSB0
 )
 // BitVector is used to manipulate ordered collections of bits.
 //
 // The zero value is an empty vector that uses LSB0 byte packing.
 type BitVector struct {
 	// Buf is the backing storage; the bit at index i is kept in Buf[i/8].
 	Buf []byte
 	// BytePacking is the bit ordering within bytes
 	BytePacking BitNumbering
 	// Len is the logical number of bits in the vector.
 	// The last byte in Buf may have undefined bits if Len is not a multiple of 8
 	Len uint
 }
 // NewBitVector wraps buf in a BitVector whose length is every bit of buf.
 //
 // bytePacking tells the vector how the bits are ordered inside each byte
 // of buf. The slice is used as-is; it is not copied.
 func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
 	v := new(BitVector)
 	v.Buf = buf
 	v.BytePacking = bytePacking
 	v.Len = uint(len(buf) * 8)
 	return v
 }
 // Push appends one bit to the end of the BitVector.
 //
 // Only the low-order bit of val is stored, so callers should pass 0 or 1.
 func (v *BitVector) Push(val byte) {
 	pos := v.Len % 8
 	if pos == 0 {
 		// Starting a new byte: grow the buffer before writing into it.
 		v.Buf = append(v.Buf, 0)
 	}
 	shift := pos
 	if v.BytePacking != LSB0 {
 		// MSB0: each byte is filled from the high-order bit downwards.
 		shift = 7 - pos
 	}
 	v.Buf[v.Len/8] |= (val & 1) << shift
 	v.Len++
 }
 // Get reads the bit at idx and returns it as a byte holding 0 or 1.
 //
 // It returns ErrOutOfRange when idx is not below v.Len.
 func (v *BitVector) Get(idx uint) (byte, error) {
 	if idx >= v.Len {
 		return 0, ErrOutOfRange
 	}
 	shift := idx % 8
 	if v.BytePacking != LSB0 {
 		// MSB0: position 0 of each byte is its high-order bit.
 		shift = 7 - shift
 	}
 	return (v.Buf[idx/8] >> shift) & 1, nil
 }
 // Extend pushes count bits of val (at most 8) onto the receiver.
 //
 // order selects which end of val the bits are taken from.
 // Given a byte b == 0b11010101
 // v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
 // v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
 //
 // Panics if count is greater than 8
 func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
 	if count > 8 {
 		log.Panicf("invalid count")
 	}
 	for n := uint(0); n < count; n++ {
 		shift := n
 		if order != LSB0 {
 			// MSB0: walk val from its high-order bit downwards.
 			shift = 7 - n
 		}
 		v.Push((val >> shift) & 1)
 	}
 }
 // Take gathers count bits (at most 8) starting at index into one byte.
 //
 // order selects which end of the result the first bit read is placed at.
 // Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
 // v.Take(0, 4, LSB0) would return 0b00001011
 // v.Take(0, 4, MSB0) would return 0b11010000
 //
 // Positions beyond the end of the vector read as 0.
 //
 // Panics if count is greater than 8
 func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
 	if count > 8 {
 		log.Panicf("invalid count")
 	}
 	for n := uint(0); n < count; n++ {
 		// The error is dropped on purpose: Get yields 0 for an out-of-range
 		// index, which is exactly the padding Take promises.
 		bit, _ := v.Get(index + n)
 		if order == LSB0 {
 			out |= bit << n
 		} else {
 			out |= bit << (7 - n)
 		}
 	}
 	return out
 }
 // Iterator returns a reader function over the BitVector. Each call reads
 // the requested number of bits (at most 8) with Take, using the given
 // order, and advances a cursor private to that reader by the same amount.
 //
 // When the end of the BitVector is reached, it returns zeroes indefinitely
 //
 // The returned function panics if its count is greater than 8
 func (v *BitVector) Iterator(order BitNumbering) func(uint) byte {
 	var pos uint
 	next := func(count uint) byte {
 		if count > 8 {
 			log.Panicf("invalid count")
 		}
 		bits := v.Take(pos, count, order)
 		pos += count
 		return bits
 	}
 	return next
 }
--- a/internal/bitvector_test.go
+++ b/internal/bitvector_test.go
@ -0,0 +1,138 @@
 package bitvector_test
 import (
 	"testing"
 	"github.com/filecoin-project/go-filecoin/rleplus/internal"
 	tf "github.com/filecoin-project/go-filecoin/testhelpers/testflags"
 	"github.com/stretchr/testify/assert"
 )
 // TestBitVector covers the BitVector API: the zero value, Push, Get, Extend,
 // Take and Iterator, including the panics raised for counts above 8.
 func TestBitVector(t *testing.T) {
 	tf.UnitTest(t)
 	t.Run("zero value", func(t *testing.T) {
 		// LSB0 is the first iota constant, so an uninitialised vector uses it.
 		var v bitvector.BitVector
 		assert.Equal(t, bitvector.LSB0, v.BytePacking)
 	})
 	t.Run("Push", func(t *testing.T) {
 		// MSB0 bit numbering
 		v := bitvector.BitVector{BytePacking: bitvector.MSB0}
 		v.Push(1)
 		v.Push(0)
 		v.Push(1)
 		v.Push(1)
 		// 1, 0, 1, 1 filled from the high-order bit: 0b10110000
 		assert.Equal(t, byte(176), v.Buf[0])
 		// LSB0 bit numbering
 		v = bitvector.BitVector{BytePacking: bitvector.LSB0}
 		v.Push(1)
 		v.Push(0)
 		v.Push(1)
 		v.Push(1)
 		// 1, 0, 1, 1 filled from the low-order bit: 0b00001101
 		assert.Equal(t, byte(13), v.Buf[0])
 	})
 	t.Run("Get", func(t *testing.T) {
 		// Whatever the byte packing, Get must return bits in the order they were pushed.
 		bits := []byte{1, 0, 1, 1, 0, 0, 1, 0}
 		for _, numbering := range []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0} {
 			v := bitvector.BitVector{BytePacking: numbering}
 			for _, bit := range bits {
 				v.Push(bit)
 			}
 			for idx, expected := range bits {
 				actual, _ := v.Get(uint(idx))
 				assert.Equal(t, expected, actual)
 			}
 		}
 	})
 	t.Run("Extend", func(t *testing.T) {
 		val := byte(171) // 0b10101011
 		var v bitvector.BitVector
 		// MSB0 bit numbering
 		v = bitvector.BitVector{}
 		v.Extend(val, 4, bitvector.MSB0)
 		assertBitVector(t, []byte{1, 0, 1, 0}, v)
 		// the second call crosses the byte boundary (9 bits in total)
 		v.Extend(val, 5, bitvector.MSB0)
 		assertBitVector(t, []byte{1, 0, 1, 0, 1, 0, 1, 0, 1}, v)
 		// LSB0 bit numbering
 		v = bitvector.BitVector{}
 		v.Extend(val, 4, bitvector.LSB0)
 		assertBitVector(t, []byte{1, 1, 0, 1}, v)
 		v.Extend(val, 5, bitvector.LSB0)
 		assertBitVector(t, []byte{1, 1, 0, 1, 1, 1, 0, 1, 0}, v)
 	})
 	t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) {
 		// 9 is the smallest count above the 8-bit limit
 		v := bitvector.BitVector{BytePacking: bitvector.LSB0}
 		assert.Panics(t, func() { v.Extend(0xff, 9, bitvector.LSB0) })
 		assert.Panics(t, func() { v.Take(0, 9, bitvector.LSB0) })
 		next := v.Iterator(bitvector.LSB0)
 		assert.Panics(t, func() { next(9) })
 	})
 	t.Run("Take", func(t *testing.T) {
 		var v bitvector.BitVector
 		bits := []byte{1, 0, 1, 0, 1, 0, 1, 1}
 		for _, bit := range bits {
 			v.Push(bit)
 		}
 		// bits 4..7 are 1, 0, 1, 1: 0b10110000 read as MSB0, 0b00001101 read as LSB0
 		assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0))
 		assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0))
 	})
 	t.Run("Iterator", func(t *testing.T) {
 		var buf []byte
 		// make a bitvector of 256 sample bits
 		for i := 0; i < 32; i++ {
 			buf = append(buf, 128+32)
 		}
 		v := bitvector.NewBitVector(buf, bitvector.LSB0)
 		next := v.Iterator(bitvector.LSB0)
 		// compare to Get()
 		for i := uint(0); i < v.Len; i++ {
 			expected, _ := v.Get(i)
 			assert.Equal(t, expected, next(1))
 		}
 		// out of range should return zero
 		assert.Equal(t, byte(0), next(1))
 		assert.Equal(t, byte(0), next(8))
 		// compare to Take()
 		// a fresh iterator starts again from bit 0
 		next = v.Iterator(bitvector.LSB0)
 		assert.Equal(t, next(5), v.Take(0, 5, bitvector.LSB0))
 		assert.Equal(t, next(8), v.Take(5, 8, bitvector.LSB0))
 	})
 }
 // assertBitVector checks that actual holds exactly the bits in expectedBits,
 // comparing the length first and then every position through Get.
 //
 // Note: expectedBits should *only* contain 0s and 1s.
 func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) {
 	assert.Equal(t, uint(len(expectedBits)), actual.Len)
 	for i := 0; i < len(expectedBits); i++ {
 		got, err := actual.Get(uint(i))
 		assert.NoError(t, err)
 		assert.Equal(t, expectedBits[i], got)
 	}
 }
--- a/rleplus.go
+++ b/rleplus.go
@ -0,0 +1,204 @@
 package rleplus
 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"sort"
 	"github.com/filecoin-project/go-filecoin/rleplus/internal"
 )
 // Version is the RLE+ version implemented by this package.
 // Only the 2 lowest bits of this constant are written to the encoding header.
 const Version = 0
 var (
 	// ErrRunLengthTooLarge - data implies a run-length which isn't supported
 	ErrRunLengthTooLarge = fmt.Errorf("run length too large for RLE+ version %d", Version)
 	// ErrDecode - invalid encoding for this version, e.g. a long block whose
 	// varint keeps going past the bytes a uint64 can need
 	ErrDecode = fmt.Errorf("invalid encoding for RLE+ version %d", Version)
 	// ErrWrongVersion - wrong version of RLE+; returned by Decode when the
 	// version bits of the header differ from Version
 	ErrWrongVersion = errors.New("invalid RLE+ version")
 )
 // Encode returns the RLE+ representation of the provided integers.
 // Also returned is the number of bits required by this encoding,
 // which is not necessarily on a byte boundary.
 //
 // The integers are turned into alternating run lengths by RunLengths;
 // they need not be sorted and duplicates are ignored.
 //
 // The RLE+ spec is here: https://github.com/filecoin-project/specs/blob/master/data-structures.md#rle-bitset-encoding
 // and is described by the BNF Grammar:
 //
 //    <encoding> ::= <header> <blocks>
 //    <header> ::= <version> <bit>
 //    <version> ::= "00"
 //    <blocks> ::= <block> <blocks> | ""
 //    <block> ::= <block_single> | <block_short> | <block_long>
 //    <block_single> ::= "1"
 //    <block_short> ::= "01" <bit> <bit> <bit> <bit>
 //    <block_long> ::= "00" <unsigned_varint>
 //    <bit> ::= "0" | "1"
 //
 // Every uint64 run length fits one of the three block kinds, so the
 // returned error is always nil for this version; it is kept in the
 // signature for callers that already check it.
 //
 // Filecoin specific:
 // The encoding is returned as a []byte, each byte packed starting with the low-order bit (LSB0)
 func Encode(ints []uint64) ([]byte, uint, error) {
 	v := bitvector.BitVector{BytePacking: bitvector.LSB0}
 	firstBit, runs := RunLengths(ints)
 	// Header: 2 version bits, then the bit value of the first run.
 	v.Extend(Version, 2, bitvector.LSB0)
 	v.Push(firstBit)
 	// Scratch space for long blocks, allocated once and reused for every run.
 	var varint [binary.MaxVarintLen64]byte
 	for _, run := range runs {
 		switch {
 		case run == 1:
 			// single block
 			v.Push(1)
 		case run < 16:
 			// short block: "01" followed by the run length in 4 bits
 			v.Push(0)
 			v.Push(1)
 			v.Extend(byte(run), 4, bitvector.LSB0)
 		default:
 			// long block: "00" followed by the run length as an unsigned varint
 			v.Push(0)
 			v.Push(0)
 			n := binary.PutUvarint(varint[:], run)
 			for _, b := range varint[:n] {
 				v.Extend(b, 8, bitvector.LSB0)
 			}
 		}
 	}
 	return v.Buf, v.Len, nil
 }
 // Decode returns integers represented by the given RLE+ encoding
 //
 // The length of the encoding is not specified.  It is inferred by
 // reading zeroes from the (possibly depleted) BitVector, by virtue
 // of the behavior of BitVector.Take() returning 0 when the end of
 // the BitVector has been reached: a long block with a run length of
 // zero ends decoding. This has the downside of not being able to
 // detect truncated encodings.
 //
 // An empty buf decodes to no integers and a nil error.
 // ErrWrongVersion is returned when the 2 header bits do not match Version.
 // ErrDecode is returned when a long block's varint is longer than
 // binary.MaxVarintLen64 bytes or overflows a uint64, or when a run of
 // set bits is too long for its indices to be held in a slice.
 //
 // The result holds one element per set bit, so its size is driven by
 // the encoded run lengths rather than by len(buf).
 //
 // The passed []byte should be packed in LSB0 bit numbering
 func Decode(buf []byte) (ints []uint64, err error) {
 	if len(buf) == 0 {
 		return nil, nil
 	}
 	v := bitvector.NewBitVector(buf, bitvector.LSB0)
 	take := v.Iterator(bitvector.LSB0)
 	// Header: 2 version bits, then the bit value of the first run.
 	if ver := take(2); ver != Version {
 		return nil, ErrWrongVersion
 	}
 	// Largest value of int on this platform; a slice cannot be longer.
 	const maxInt = uint64(^uint(0) >> 1)
 	curIdx := uint64(0)
 	curBit := take(1)
 	for done := false; !done; {
 		// Run lengths are kept as uint64 so that large runs are neither
 		// truncated on 32-bit platforms nor turned into negative numbers.
 		var runLength uint64
 		if take(1) == 1 {
 			// single block
 			runLength = 1
 		} else if take(1) == 1 {
 			// short block: run length in the next 4 bits
 			runLength = uint64(take(4))
 		} else {
 			// long block: run length as an unsigned varint
 			var varint [binary.MaxVarintLen64]byte
 			n := 0
 			for {
 				if n == len(varint) {
 					// still no terminating byte after the longest valid varint
 					return nil, ErrDecode
 				}
 				b := take(8)
 				varint[n] = b
 				n++
 				if b&0x80 == 0 {
 					break
 				}
 			}
 			x, read := binary.Uvarint(varint[:n])
 			if read <= 0 {
 				// The value overflows a uint64. Uvarint reports 0 in that
 				// case, which must not be mistaken for the end marker.
 				return nil, ErrDecode
 			}
 			if x == 0 {
 				done = true
 			}
 			runLength = x
 		}
 		if curBit == 1 {
 			if runLength > maxInt {
 				return nil, ErrDecode
 			}
 			for j := uint64(0); j < runLength; j++ {
 				ints = append(ints, curIdx+j)
 			}
 		}
 		curIdx += runLength
 		curBit = 1 - curBit
 	}
 	return ints, nil
 }
 // RunLengths transforms integers into its bit-set-run-length representation.
 //
 // A set of unsigned integers { 0, 2, 4, 5, 6 } can be thought of as
 // indices into a bitset { 1, 0, 1, 0, 1, 1, 1 } where bitset[index] == 1.
 //
 // The bit set run lengths of this set would then be { 1, 1, 1, 1, 3 },
 // representing lengths of runs alternating between 1 and 0, starting
 // with a first bit of 1.
 //
 // firstBit is the value of the bit at index 0, i.e. 1 when 0 is in the
 // set. An empty input yields a firstBit of 0 and no runs.
 //
 // The input does not need to be sorted and is not modified.
 // Duplicated numbers are ignored.
 //
 // This is a helper function for Encode()
 func RunLengths(ints []uint64) (firstBit byte, runs []uint64) {
 	if len(ints) == 0 {
 		return 0, nil
 	}
 	sorted := ints
 	if !sort.SliceIsSorted(ints, func(i, j int) bool { return ints[i] < ints[j] }) {
 		// Sort a private copy so the caller's slice keeps its order.
 		sorted = append([]uint64(nil), ints...)
 		sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })
 	}
 	prev := sorted[0]
 	if prev == 0 {
 		firstBit = 1
 	} else {
 		// first run of zeroes, covering the indices 0 .. prev-1
 		runs = append(runs, prev)
 	}
 	runs = append(runs, 1)
 	for _, cur := range sorted[1:] {
 		switch delta := cur - prev; {
 		case delta == 1:
 			// cur extends the current run of ones
 			runs[len(runs)-1]++
 		case delta > 1:
 			// a gap: close it with a run of zeroes, then start a new run of ones
 			runs = append(runs, delta-1, 1)
 		}
 		// delta == 0 is a repeated number and adds nothing
 		prev = cur
 	}
 	return firstBit, runs
 }
--- a/rleplus_test.go
+++ b/rleplus_test.go
@ -0,0 +1,183 @@
 package rleplus_test
 import (
 	"fmt"
 	"math"
 	"sort"
 	"testing"
 	"github.com/filecoin-project/go-filecoin/rleplus"
 	"github.com/filecoin-project/go-filecoin/rleplus/internal"
 	tf "github.com/filecoin-project/go-filecoin/testhelpers/testflags"
 	"gotest.tools/assert"
 )
 // TestRleplus covers Encode, Decode and RunLengths: bit-exact encoder
 // output, encode/decode round trips, decoder error cases and agreement
 // with the bytes produced by the reference implementation.
 func TestRleplus(t *testing.T) {
 	tf.UnitTest(t)
 	t.Run("Encode", func(t *testing.T) {
 		// Encode an intset
 		ints := []uint64{
 			// run of 1
 			0,
 			// gap of 1
 			// run of 1
 			2,
 			// gap of 1
 			// run of 3
 			4, 5, 6,
 			// gap of 4
 			// run of 17
 			11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
 		}
 		// The expected encoding, one bit per element, in the order the bits are written
 		expectedBits := []byte{
 			0, 0, // version
 			1,                // first bit
 			1,                // run of 1
 			1,                // gap of 1
 			1,                // run of 1
 			1,                // gap of 1
 			0, 1, 1, 1, 0, 0, // run of 3
 			0, 1, 0, 0, 1, 0, // gap of 4
 			// run of 17 < 0 0 (varint) >
 			0, 0,
 			1, 0, 0, 0, 1, 0, 0, 0,
 		}
 		// Pack the expected bits with the zero-value (LSB0) packing, as Encode does
 		v := bitvector.BitVector{}
 		for _, bit := range expectedBits {
 			v.Push(bit)
 		}
 		actualBytes, _, err := rleplus.Encode(ints)
 		assert.NilError(t, err)
 		assert.Equal(t, len(v.Buf), len(actualBytes))
 		for idx, expected := range v.Buf {
 			// compare as binary strings so a failure shows the differing bits
 			assert.Equal(
 				t,
 				fmt.Sprintf("%08b", expected),
 				fmt.Sprintf("%08b", actualBytes[idx]),
 			)
 		}
 	})
 	t.Run("Encode allows all runs sizes possible uint64", func(t *testing.T) {
 		// create a run of math.MaxUint64
 		ints := []uint64{math.MaxUint64}
 		_, _, err := rleplus.Encode(ints)
 		assert.NilError(t, err)
 	})
 	t.Run("Decode", func(t *testing.T) {
 		// Each case is encoded and decoded again; the result must equal the input
 		testCases := [][]uint64{
 			{},
 			{1},
 			{0},
 			{0, 1, 2, 3},
 			{
 				// run of 1
 				0,
 				// gap of 1
 				// run of 1
 				2,
 				// gap of 1
 				// run of 3
 				4, 5, 6,
 				// gap of 4
 				// run of 17
 				11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
 			},
 		}
 		for _, tc := range testCases {
 			encoded, _, err := rleplus.Encode(tc)
 			assert.NilError(t, err)
 			result, err := rleplus.Decode(encoded)
 			assert.NilError(t, err)
 			// sort both sides so the comparison does not depend on ordering
 			sort.Slice(tc, func(i, j int) bool { return tc[i] < tc[j] })
 			sort.Slice(result, func(i, j int) bool { return result[i] < result[j] })
 			assert.Equal(t, len(tc), len(result))
 			for idx, expected := range tc {
 				assert.Equal(t, expected, result[idx])
 			}
 		}
 	})
 	t.Run("Decode version check", func(t *testing.T) {
 		// the 2 low-order bits of 0xff are 11, which is not version 00
 		_, err := rleplus.Decode([]byte{0xff})
 		assert.Error(t, err, "invalid RLE+ version")
 	})
 	t.Run("Decode returns an error with a bad encoding", func(t *testing.T) {
 		// create an encoding with a buffer with a run which is too long
 		// 0xe0 holds, from the low-order bit: version 00, first bit 0 and the
 		// long block prefix 00; every varint byte after that has its continuation bit set
 		_, err := rleplus.Decode([]byte{0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
 		assert.Error(t, err, "invalid encoding for RLE+ version 0")
 	})
 	t.Run("outputs same as reference implementation", func(t *testing.T) {
 		// Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 		// in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field)
 		// The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }
 		// This is the above reference output with a version header "00" manually added
 		referenceEncoding := []byte{124, 71, 34, 2}
 		expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}
 		encoded, _, err := rleplus.Encode(expectedNumbers)
 		assert.NilError(t, err)
 		// Our encoded bytes are the same as the ref bytes
 		assert.Equal(t, len(referenceEncoding), len(encoded))
 		for idx, expected := range referenceEncoding {
 			assert.Equal(t, expected, encoded[idx])
 		}
 		decoded, err := rleplus.Decode(referenceEncoding)
 		assert.NilError(t, err)
 		// Our decoded integers are the same as expected
 		sort.Slice(decoded, func(i, j int) bool { return decoded[i] < decoded[j] })
 		assert.Equal(t, len(expectedNumbers), len(decoded))
 		for idx, expected := range expectedNumbers {
 			assert.Equal(t, expected, decoded[idx])
 		}
 	})
 	t.Run("RunLengths", func(t *testing.T) {
 		// ints is the input set, first the expected first bit and runs the
 		// expected alternating run lengths
 		testCases := []struct {
 			ints  []uint64
 			first byte
 			runs  []uint64
 		}{
 			// empty
 			{},
 			// leading with ones
 			{[]uint64{0}, 1, []uint64{1}},
 			{[]uint64{0, 1}, 1, []uint64{2}},
 			{[]uint64{0, 0xffffffff, 0xffffffff + 1}, 1, []uint64{1, 0xffffffff - 1, 2}},
 			// leading with zeroes
 			{[]uint64{1}, 0, []uint64{1, 1}},
 			{[]uint64{2}, 0, []uint64{2, 1}},
 			{[]uint64{10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}},
 			// unsorted input with duplicates gives the same runs as the case above
 			{[]uint64{10, 11, 11, 13, 20, 10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}},
 		}
 		for _, testCase := range testCases {
 			first, runs := rleplus.RunLengths(testCase.ints)
 			assert.Equal(t, testCase.first, first)
 			assert.Equal(t, len(testCase.runs), len(runs))
 			for idx, runLength := range testCase.runs {
 				assert.Equal(t, runLength, runs[idx])
 			}
 		}
 	})
 }