Merge pull request #194 from filecoin-project/feat/rleplus

Add rleplus
This commit is contained in:
Łukasz Magiera 2019-09-16 15:05:13 +02:00 committed by GitHub
commit a305dcab22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 678 additions and 0 deletions

154
extern/rleplus/internal/bitvector.go vendored Normal file
View File

@ -0,0 +1,154 @@
package bitvector
import (
"errors"
"log"
)
var (
	// ErrOutOfRange - the index passed is out of range for the BitVector.
	// BitVector.Get returns it when the index is not below the vector's Len.
	ErrOutOfRange = errors.New("index out of range")
)
// BitNumbering indicates the ordering of bits, either
// least-significant bit in position 0, or most-significant bit
// in position 0.
//
// It is used in 3 ways with BitVector:
//  1. Ordering of bits within the Buf []byte structure
//  2. What order to add bits when using Extend()
//  3. What order to read bits when using Take()
//
// https://en.wikipedia.org/wiki/Bit_numbering
type BitNumbering int

const (
	// LSB0 - bit ordering starts with the low-order bit.
	// As the first iota value it is also the zero value of BitNumbering.
	LSB0 BitNumbering = iota
	// MSB0 - bit ordering starts with the high-order bit
	MSB0
)
// BitVector is used to manipulate ordered collections of bits.
//
// The zero value is an empty vector that packs bits LSB0.
type BitVector struct {
	// Buf is the backing storage, eight bits per byte.
	Buf []byte

	// BytePacking is the bit ordering within bytes
	BytePacking BitNumbering

	// Len is the logical number of bits in the vector.
	// The last byte in Buf may have undefined bits if Len is not a multiple of 8
	Len uint
}
// NewBitVector wraps buf in a BitVector whose logical length spans every
// bit of the slice (len(buf) * 8).
//
// bytePacking tells the vector how bits are ordered inside each byte of buf.
// The slice is stored as given; it is not copied.
func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
	v := new(BitVector)
	v.Buf = buf
	v.BytePacking = bytePacking
	v.Len = uint(len(buf) * 8)
	return v
}
// Push adds a single bit to the BitVector.
//
// Although it takes a byte, only the low-order bit is used, so just use 0 or 1.
//
// A byte is appended to Buf only when Buf has no byte for the new bit yet.
// The target bit is cleared before it is written: bits of Buf beyond Len are
// documented as undefined, so a stale 1 left there must not survive a Push(0).
func (v *BitVector) Push(val byte) {
	byteIdx := v.Len / 8
	if uint(len(v.Buf)) <= byteIdx {
		v.Buf = append(v.Buf, 0)
	}

	// Position of the new bit inside its byte.
	shift := v.Len % 8
	if v.BytePacking != LSB0 {
		// MSB0: position 0 is the high-order bit of the byte.
		shift = 7 - shift
	}

	v.Buf[byteIdx] &^= 1 << shift
	v.Buf[byteIdx] |= (val & 1) << shift
	v.Len++
}
// Get reads the bit at position idx and returns it as a byte holding 0 or 1.
//
// ErrOutOfRange is returned when idx is not below v.Len.
func (v *BitVector) Get(idx uint) (byte, error) {
	if idx >= v.Len {
		return 0, ErrOutOfRange
	}

	// Offset of the requested bit inside its byte.
	offset := idx % 8
	if v.BytePacking != LSB0 {
		// MSB0: position 0 is the high-order bit of the byte.
		offset = 7 - offset
	}
	return (v.Buf[idx/8] >> offset) & 1, nil
}
// Extend appends count bits taken from val to the receiver, pushing them
// one at a time. count may be at most 8.
//
// order selects which end of val is read first.
// Given a byte b == 0b11010101
//
//	v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
//	v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
//
// Panics if count is out of range
func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
	if count > 8 {
		log.Panicf("invalid count")
	}

	for pos := uint(0); pos < count; pos++ {
		// LSB0 walks val upwards from bit 0, anything else downwards from bit 7.
		shift := pos
		if order != LSB0 {
			shift = 7 - pos
		}
		v.Push((val >> shift) & 1)
	}
}
// Take reads count bits starting at index and packs them into one byte.
// count may be at most 8. Positions at or beyond the end of the vector
// read as 0.
//
// order selects where the first bit read is placed in the result.
// Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
//
//	v.Take(0, 4, LSB0) would return 0b00001011
//	v.Take(0, 4, MSB0) would return 0b11010000
//
// Panics if count is out of range
func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
	if count > 8 {
		log.Panicf("invalid count")
	}

	for n := uint(0); n < count; n++ {
		// The error is dropped on purpose: an out-of-range Get yields 0,
		// which is exactly the "zeroes past the end" behaviour wanted here.
		bit, _ := v.Get(index + n)
		if order == LSB0 {
			out |= bit << n
		} else {
			out |= bit << (7 - n)
		}
	}
	return out
}
// Iterator returns a reader function bound to this BitVector. Each call
// takes the requested number of bits (at most 8) at a private cursor,
// using the given order, and then moves the cursor forward by that many bits.
//
// When the end of the BitVector is reached, it returns zeroes indefinitely
//
// Panics if count is out of range
func (v *BitVector) Iterator(order BitNumbering) func(uint) byte {
	// Each iterator owns its own position; iterators do not affect each other.
	var pos uint

	return func(count uint) byte {
		if count > 8 {
			log.Panicf("invalid count")
		}
		bits := v.Take(pos, count, order)
		pos += count
		return bits
	}
}

View File

@ -0,0 +1,136 @@
package bitvector_test
import (
"testing"
"github.com/stretchr/testify/assert"
bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal"
)
// TestBitVector exercises the BitVector API: the zero value, Push, Get,
// Extend, Take, Iterator, and the panics raised for bit counts above 8.
func TestBitVector(t *testing.T) {
	t.Run("zero value", func(t *testing.T) {
		// An unconfigured BitVector packs bits LSB0.
		var v bitvector.BitVector
		assert.Equal(t, bitvector.LSB0, v.BytePacking)
	})
	t.Run("Push", func(t *testing.T) {
		// MSB0 bit numbering: < 1, 0, 1, 1 > fills the byte from the top,
		// giving 0b10110000.
		v := bitvector.BitVector{BytePacking: bitvector.MSB0}
		v.Push(1)
		v.Push(0)
		v.Push(1)
		v.Push(1)
		assert.Equal(t, byte(176), v.Buf[0])
		// LSB0 bit numbering: the same bits fill the byte from the bottom,
		// giving 0b00001101.
		v = bitvector.BitVector{BytePacking: bitvector.LSB0}
		v.Push(1)
		v.Push(0)
		v.Push(1)
		v.Push(1)
		assert.Equal(t, byte(13), v.Buf[0])
	})
	t.Run("Get", func(t *testing.T) {
		// Whatever the packing, Get must return bits in the order they were pushed.
		bits := []byte{1, 0, 1, 1, 0, 0, 1, 0}
		for _, numbering := range []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0} {
			v := bitvector.BitVector{BytePacking: numbering}
			for _, bit := range bits {
				v.Push(bit)
			}
			for idx, expected := range bits {
				// The error is ignored: every idx is below the number of pushed bits.
				actual, _ := v.Get(uint(idx))
				assert.Equal(t, expected, actual)
			}
		}
	})
	t.Run("Extend", func(t *testing.T) {
		val := byte(171) // 0b10101011
		var v bitvector.BitVector
		// MSB0 bit numbering: bits are taken from the high end of val
		v = bitvector.BitVector{}
		v.Extend(val, 4, bitvector.MSB0)
		assertBitVector(t, []byte{1, 0, 1, 0}, v)
		v.Extend(val, 5, bitvector.MSB0)
		assertBitVector(t, []byte{1, 0, 1, 0, 1, 0, 1, 0, 1}, v)
		// LSB0 bit numbering: bits are taken from the low end of val
		v = bitvector.BitVector{}
		v.Extend(val, 4, bitvector.LSB0)
		assertBitVector(t, []byte{1, 1, 0, 1}, v)
		v.Extend(val, 5, bitvector.LSB0)
		assertBitVector(t, []byte{1, 1, 0, 1, 1, 1, 0, 1, 0}, v)
	})
	t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) {
		// 9 is the smallest count that does not fit in a byte.
		v := bitvector.BitVector{BytePacking: bitvector.LSB0}
		assert.Panics(t, func() { v.Extend(0xff, 9, bitvector.LSB0) })
		assert.Panics(t, func() { v.Take(0, 9, bitvector.LSB0) })
		next := v.Iterator(bitvector.LSB0)
		assert.Panics(t, func() { next(9) })
	})
	t.Run("Take", func(t *testing.T) {
		var v bitvector.BitVector
		bits := []byte{1, 0, 1, 0, 1, 0, 1, 1}
		for _, bit := range bits {
			v.Push(bit)
		}
		// Bits 4..7 are < 1, 0, 1, 1 >: 0b10110000 read MSB0, 0b00001101 read LSB0.
		assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0))
		assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0))
	})
	t.Run("Iterator", func(t *testing.T) {
		var buf []byte
		// make a bitvector of 256 sample bits (32 bytes of 0b10100000)
		for i := 0; i < 32; i++ {
			buf = append(buf, 128+32)
		}
		v := bitvector.NewBitVector(buf, bitvector.LSB0)
		next := v.Iterator(bitvector.LSB0)
		// compare to Get()
		for i := uint(0); i < v.Len; i++ {
			// The error is ignored: i stays below v.Len.
			expected, _ := v.Get(i)
			assert.Equal(t, expected, next(1))
		}
		// out of range should return zero
		assert.Equal(t, byte(0), next(1))
		assert.Equal(t, byte(0), next(8))
		// compare to Take(), using a fresh iterator so the cursor restarts at 0
		next = v.Iterator(bitvector.LSB0)
		assert.Equal(t, next(5), v.Take(0, 5, bitvector.LSB0))
		assert.Equal(t, next(8), v.Take(5, 8, bitvector.LSB0))
	})
}
// assertBitVector checks that actual holds exactly the bits listed in
// expectedBits: the same length, and the same value at every index.
//
// Note: When using this helper assertion, expectedBits should *only* be 0s and 1s.
func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) {
	// Attribute failures to the calling test line rather than to this helper.
	t.Helper()

	assert.Equal(t, uint(len(expectedBits)), actual.Len)
	for idx, bit := range expectedBits {
		actualBit, err := actual.Get(uint(idx))
		assert.NoError(t, err)
		assert.Equal(t, bit, actualBit)
	}
}

204
extern/rleplus/rleplus.go vendored Normal file
View File

@ -0,0 +1,204 @@
package rleplus
import (
"encoding/binary"
"errors"
"fmt"
"sort"
bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal"
)
// Version is the 2 lowest bits of this constant.
// Encode writes those two bits as the header of every encoding, and Decode
// rejects input whose first two bits differ from it.
const Version = 0

var (
	// ErrRunLengthTooLarge - data implies a run-length which isn't supported
	ErrRunLengthTooLarge = fmt.Errorf("run length too large for RLE+ version %d", Version)
	// ErrDecode - invalid encoding for this version.
	// Decode returns it when the varint of a long block does not terminate
	// within the size of an encoded uint64.
	ErrDecode = fmt.Errorf("invalid encoding for RLE+ version %d", Version)
	// ErrWrongVersion - wrong version of RLE+.
	// Decode returns it when the version header does not equal Version.
	ErrWrongVersion = errors.New("invalid RLE+ version")
)
// Encode returns the RLE+ representation of the provided integers.
// Also returned is the number of bits required by this encoding,
// which is not necessarily on a byte boundary.
//
// The integers are first turned into alternating run lengths by RunLengths;
// each run is then written as the shortest block able to hold it.
// The returned error is always nil: every uint64 run length fits in a long
// block. It is kept so the signature stays stable for callers.
//
// The RLE+ spec is here: https://github.com/filecoin-project/specs/blob/master/data-structures.md#rle-bitset-encoding
// and is described by the BNF Grammar:
//
//	<encoding> ::= <header> <blocks>
//	<header> ::= <version> <bit>
//	<version> ::= "00"
//	<blocks> ::= <block> <blocks> | ""
//	<block> ::= <block_single> | <block_short> | <block_long>
//	<block_single> ::= "1"
//	<block_short> ::= "01" <bit> <bit> <bit> <bit>
//	<block_long> ::= "00" <unsigned_varint>
//	<bit> ::= "0" | "1"
//
// Filecoin specific:
// The encoding is returned as a []byte, each byte packed starting with the low-order bit (LSB0)
func Encode(ints []uint64) ([]byte, uint, error) {
	v := bitvector.BitVector{BytePacking: bitvector.LSB0}

	firstBit, runs := RunLengths(ints)

	// Header: the 2-bit version, then the bit value of the first run.
	v.Extend(Version, 2, bitvector.LSB0)
	v.Push(firstBit)

	// Scratch space for long blocks, reused across runs.
	var varint [binary.MaxVarintLen64]byte

	for _, run := range runs {
		switch {
		case run == 1:
			// block_single
			v.Push(1)
		case run < 16:
			// block_short: "01" followed by the length in 4 bits
			v.Push(0)
			v.Push(1)
			v.Extend(byte(run), 4, bitvector.LSB0)
		default:
			// block_long: "00" followed by the length as an unsigned varint
			v.Push(0)
			v.Push(0)
			n := binary.PutUvarint(varint[:], run)
			for _, b := range varint[:n] {
				v.Extend(b, 8, bitvector.LSB0)
			}
		}
	}

	return v.Buf, v.Len, nil
}
// Decode returns integers represented by the given RLE+ encoding
//
// The length of the encoding is not specified. It is inferred by
// reading zeroes from the (possibly depleted) BitVector, by virtue
// of the behavior of BitVector.Take() returning 0 when the end of
// the BitVector has been reached: those zeroes read as a long block
// of length 0, which ends decoding. This has the downside of not
// being able to detect every corrupt encoding.
//
// The passed []byte should be packed in LSB0 bit numbering
//
// Errors:
//   - ErrWrongVersion when the 2-bit header does not equal Version.
//   - ErrDecode when the varint of a long block is longer than
//     binary.MaxVarintLen64 bytes or overflows a uint64.
//
// An empty buf decodes to no integers and no error.
//
// NOTE(review): every member of a set run is appended to the result, so the
// size of the output is dictated by the run lengths found in buf. Callers
// handling untrusted input may want to bound it.
func Decode(buf []byte) (ints []uint64, err error) {
	if len(buf) == 0 {
		return nil, nil
	}

	v := bitvector.NewBitVector(buf, bitvector.LSB0)
	take := v.Iterator(bitvector.LSB0)

	// Header: the 2-bit version, then the bit value of the first run.
	if ver := take(2); ver != Version {
		return nil, ErrWrongVersion
	}
	curBit := take(1)
	curIdx := uint64(0)

	for {
		// Kept as uint64 throughout so large runs are neither truncated nor
		// turned negative by a conversion to int.
		var runLength uint64

		if take(1) == 1 {
			// block_single
			runLength = 1
		} else if take(1) == 1 {
			// block_short: the length is the next 4 bits
			runLength = uint64(take(4))
		} else {
			// block_long: the length is an unsigned varint
			var varint [binary.MaxVarintLen64]byte
			n := 0
			for {
				if n == len(varint) {
					// Still no terminating byte after the maximum length.
					return nil, ErrDecode
				}
				b := take(8)
				varint[n] = b
				n++
				if b&0x80 == 0 {
					break
				}
			}

			x, read := binary.Uvarint(varint[:n])
			if read <= 0 {
				// The value does not fit in 64 bits.
				return nil, ErrDecode
			}
			if x == 0 {
				// A zero-length long block marks the end of the encoding.
				return ints, nil
			}
			runLength = x
		}

		if curBit == 1 {
			for j := uint64(0); j < runLength; j++ {
				ints = append(ints, curIdx+j)
			}
		}
		curIdx += runLength
		// Runs alternate between set and unset bits.
		curBit = 1 - curBit
	}
}
// RunLengths transforms integers into its bit-set-run-length representation.
//
// A set of unsigned integers { 0, 2, 4, 5, 6 } can be thought of as
// indices into a bitset { 1, 0, 1, 0, 1, 1, 1 } where bitset[index] == 1.
//
// The bit set run lengths of this set would then be { 1, 1, 1, 1, 3 },
// representing lengths of runs alternating between 1 and 0, starting
// with a first bit of 1.
//
// firstBit is the value of the bit at index 0, i.e. the bit value of the
// first run. An empty input yields firstBit 0 and no runs.
//
// The input may be in any order; it is not modified. Duplicated numbers
// are ignored.
//
// This is a helper function for Encode()
func RunLengths(ints []uint64) (firstBit byte, runs []uint64) {
	if len(ints) == 0 {
		return
	}

	// Work on ascending values. The caller's slice is left untouched: it is
	// used directly when already sorted, and copied before sorting otherwise.
	sorted := ints
	if !sort.SliceIsSorted(ints, func(i, j int) bool { return ints[i] < ints[j] }) {
		sorted = make([]uint64, len(ints))
		copy(sorted, ints)
		sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })
	}

	prev := sorted[0]
	if prev == 0 {
		firstBit = 1
	} else {
		// first run of zeroes, covering indices 0..prev-1
		runs = append(runs, prev)
	}
	// run of ones opened by the smallest number
	runs = append(runs, 1)

	for _, cur := range sorted[1:] {
		switch delta := cur - prev; {
		case delta == 1:
			// extends the current run of ones
			runs[len(runs)-1]++
		case delta > 1:
			// a gap: a run of zeroes, then a new run of ones
			runs = append(runs, delta-1, 1)
		}
		// delta == 0 is a repeated number and contributes nothing
		prev = cur
	}
	return firstBit, runs
}

181
extern/rleplus/rleplus_test.go vendored Normal file
View File

@ -0,0 +1,181 @@
package rleplus_test
import (
"fmt"
"math"
"sort"
"testing"
"github.com/filecoin-project/go-lotus/extern/rleplus"
bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal"
"gotest.tools/assert"
)
// TestRleplus covers Encode, Decode and RunLengths, including round trips,
// error cases, and a comparison with bytes derived from the Rust reference
// implementation.
func TestRleplus(t *testing.T) {
	t.Run("Encode", func(t *testing.T) {
		// Encode an intset
		ints := []uint64{
			// run of 1
			0,
			// gap of 1
			// run of 1
			2,
			// gap of 1
			// run of 3
			4, 5, 6,
			// gap of 4
			// run of 17
			11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
		}
		// The expected encoding, spelled out bit by bit in stream order.
		expectedBits := []byte{
			0, 0, // version
			1, // first bit
			1, // run of 1
			1, // gap of 1
			1, // run of 1
			1, // gap of 1
			0, 1, 1, 1, 0, 0, // run of 3
			0, 1, 0, 0, 1, 0, // gap of 4
			// run of 17 < 0 0 (varint) >
			0, 0,
			1, 0, 0, 0, 1, 0, 0, 0,
		}
		// Pack the expected bits with the zero-value BitVector (LSB0 packing).
		v := bitvector.BitVector{}
		for _, bit := range expectedBits {
			v.Push(bit)
		}
		actualBytes, _, err := rleplus.Encode(ints)
		assert.NilError(t, err)
		assert.Equal(t, len(v.Buf), len(actualBytes))
		for idx, expected := range v.Buf {
			// Compare as binary strings so a failure shows the differing bits.
			assert.Equal(
				t,
				fmt.Sprintf("%08b", expected),
				fmt.Sprintf("%08b", actualBytes[idx]),
			)
		}
	})
	t.Run("Encode allows all runs sizes possible uint64", func(t *testing.T) {
		// create a run of math.MaxUint64
		ints := []uint64{math.MaxUint64}
		_, _, err := rleplus.Encode(ints)
		assert.NilError(t, err)
	})
	t.Run("Decode", func(t *testing.T) {
		// Each case is encoded and decoded again; the result must equal the input.
		testCases := [][]uint64{
			{},
			{1},
			{0},
			{0, 1, 2, 3},
			{
				// run of 1
				0,
				// gap of 1
				// run of 1
				2,
				// gap of 1
				// run of 3
				4, 5, 6,
				// gap of 4
				// run of 17
				11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
			},
		}
		for _, tc := range testCases {
			encoded, _, err := rleplus.Encode(tc)
			assert.NilError(t, err)
			result, err := rleplus.Decode(encoded)
			assert.NilError(t, err)
			// Sort both sides so the comparison does not depend on ordering.
			sort.Slice(tc, func(i, j int) bool { return tc[i] < tc[j] })
			sort.Slice(result, func(i, j int) bool { return result[i] < result[j] })
			assert.Equal(t, len(tc), len(result))
			for idx, expected := range tc {
				assert.Equal(t, expected, result[idx])
			}
		}
	})
	t.Run("Decode version check", func(t *testing.T) {
		// The two lowest bits of 0xff are "11", which is not version 0.
		_, err := rleplus.Decode([]byte{0xff})
		assert.Error(t, err, "invalid RLE+ version")
	})
	t.Run("Decode returns an error with a bad encoding", func(t *testing.T) {
		// create an encoding with a buffer with a run which is too long:
		// 0xe0 read LSB0 is version "00", first bit 0 and the "00" long-block
		// prefix; every varint byte after that has its continuation bit set.
		_, err := rleplus.Decode([]byte{0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
		assert.Error(t, err, "invalid encoding for RLE+ version 0")
	})
	t.Run("outputs same as reference implementation", func(t *testing.T) {
		// Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
		// in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field)
		// The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }
		// This is the above reference output with a version header "00" manually added
		referenceEncoding := []byte{124, 71, 34, 2}
		expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}
		encoded, _, err := rleplus.Encode(expectedNumbers)
		assert.NilError(t, err)
		// Our encoded bytes are the same as the ref bytes
		assert.Equal(t, len(referenceEncoding), len(encoded))
		for idx, expected := range referenceEncoding {
			assert.Equal(t, expected, encoded[idx])
		}
		decoded, err := rleplus.Decode(referenceEncoding)
		assert.NilError(t, err)
		// Our decoded integers are the same as expected
		sort.Slice(decoded, func(i, j int) bool { return decoded[i] < decoded[j] })
		assert.Equal(t, len(expectedNumbers), len(decoded))
		for idx, expected := range expectedNumbers {
			assert.Equal(t, expected, decoded[idx])
		}
	})
	t.Run("RunLengths", func(t *testing.T) {
		// ints is the input set, first the expected first bit, runs the
		// expected alternating run lengths.
		testCases := []struct {
			ints  []uint64
			first byte
			runs  []uint64
		}{
			// empty
			{},
			// leading with ones
			{[]uint64{0}, 1, []uint64{1}},
			{[]uint64{0, 1}, 1, []uint64{2}},
			{[]uint64{0, 0xffffffff, 0xffffffff + 1}, 1, []uint64{1, 0xffffffff - 1, 2}},
			// leading with zeroes
			{[]uint64{1}, 0, []uint64{1, 1}},
			{[]uint64{2}, 0, []uint64{2, 1}},
			{[]uint64{10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}},
			// unsorted input with duplicates gives the same runs as the case above
			{[]uint64{10, 11, 11, 13, 20, 10, 11, 13, 20}, 0, []uint64{10, 2, 1, 1, 6, 1}},
		}
		for _, testCase := range testCases {
			first, runs := rleplus.RunLengths(testCase.ints)
			assert.Equal(t, testCase.first, first)
			assert.Equal(t, len(testCase.runs), len(runs))
			for idx, runLength := range testCase.runs {
				assert.Equal(t, runLength, runs[idx])
			}
		}
	})
}

1
go.mod
View File

@ -81,6 +81,7 @@ require (
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7
google.golang.org/api v0.9.0 // indirect
gopkg.in/urfave/cli.v2 v2.0.0-20180128182452-d3ae77c26ac8
gotest.tools v2.2.0+incompatible
launchpad.net/gocheck v0.0.0-20140225173054-000000000087 // indirect
)

2
go.sum
View File

@ -694,6 +694,8 @@ gopkg.in/urfave/cli.v2 v2.0.0-20180128182452-d3ae77c26ac8/go.mod h1:cKXr3E0k4aos
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo=
gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=