Add lazy RLE+ decoding

License: MIT
Signed-off-by: Jakub Sztandera <kubuxu@protonmail.ch>
This commit is contained in:
Jakub Sztandera 2019-09-20 21:28:22 +02:00 committed by Jakub Sztandera
parent bbc61a8f86
commit c557aa206f
No known key found for this signature in database
GPG Key ID: 9A9AF56F8B3879BA
4 changed files with 444 additions and 0 deletions

View File

@ -0,0 +1,154 @@
package bitvector
import (
"errors"
"log"
)
// ErrOutOfRange is returned when an index passed to a BitVector method
// is at or beyond the logical length of the vector.
var ErrOutOfRange = errors.New("index out of range")
// BitNumbering indicates the ordering of bits, either
// least-significant bit in position 0, or most-significant bit
// in position 0.
//
// It is used in 3 ways with BitVector:
// 1. Ordering of bits within the Buf []byte structure
// 2. What order to add bits when using Extend()
// 3. What order to read bits when using Take()
//
// https://en.wikipedia.org/wiki/Bit_numbering
type BitNumbering int
const (
// LSB0 - bit ordering starts with the low-order bit.
// It is the first iota value (0), and therefore the zero value of BitNumbering.
LSB0 BitNumbering = iota
// MSB0 - bit ordering starts with the high-order bit.
MSB0
)
// BitVector is used to manipulate ordered collections of bits.
//
// The zero value is an empty vector whose BytePacking is LSB0.
type BitVector struct {
// Buf is the backing storage; bit number i is stored in Buf[i/8].
Buf []byte
// BytePacking is the bit ordering within bytes
BytePacking BitNumbering
// Len is the logical number of bits in the vector.
// The last byte in Buf may have undefined bits if Len is not a multiple of 8
Len uint
}
// NewBitVector wraps buf in a BitVector without copying it; every bit of
// every byte in buf is considered part of the vector (Len == 8*len(buf)).
//
// bytePacking tells the vector how bits are laid out inside each byte.
func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
	vec := new(BitVector)
	vec.Buf = buf
	vec.BytePacking = bytePacking
	vec.Len = uint(len(buf) * 8)
	return vec
}
// Push adds a single bit to the BitVector.
//
// Although it takes a byte, only the low-order bit is used, so just use 0 or 1.
//
// The target bit is written explicitly (set or cleared) rather than OR-ed in:
// when Len is not a multiple of 8 the unused bits of the last byte are
// undefined, so a stale 1 must not survive a Push(0).
func (v *BitVector) Push(val byte) {
	// Start a fresh byte whenever the current one is full.
	if v.Len%8 == 0 {
		v.Buf = append(v.Buf, 0)
	}

	byteIdx := v.Len / 8
	shift := v.Len % 8
	if v.BytePacking != LSB0 {
		// MSB0: position 0 is the high-order bit of the byte.
		shift = 7 - shift
	}
	mask := byte(1) << shift

	if val&1 == 1 {
		v.Buf[byteIdx] |= mask
	} else {
		v.Buf[byteIdx] &^= mask
	}
	v.Len++
}
// Get reads the bit at position idx and returns it as a byte that is
// either 0 or 1. It returns ErrOutOfRange when idx is not below Len.
func (v *BitVector) Get(idx uint) (byte, error) {
	if idx >= v.Len {
		return 0, ErrOutOfRange
	}

	// Translate the logical position into a shift inside its byte.
	shift := idx % 8
	if v.BytePacking != LSB0 {
		shift = 7 - shift
	}
	return (v.Buf[idx/8] >> shift) & 1, nil
}
// Extend pushes count bits (at most 8) taken from val onto the receiver.
// order selects which end of val the bits are taken from first.
//
// Given a byte b == 0b11010101
// v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
// v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
//
// Panics if count is greater than 8.
func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
	if count > 8 {
		log.Panicf("invalid count")
	}
	for n := uint(0); n < count; n++ {
		// LSB0 walks val upwards from bit 0, anything else downwards from bit 7.
		shift := n
		if order != LSB0 {
			shift = 7 - n
		}
		v.Push((val >> shift) & 1)
	}
}
// Take reads count bits (at most 8) starting at index and packs them into a
// single byte. order selects which end of the result the first bit goes to.
// Positions at or beyond Len read as 0.
//
// Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
// v.Take(0, 4, LSB0) would return 0b00001011
// v.Take(0, 4, MSB0) would return 0b11010000
//
// Panics if count is greater than 8.
func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
	if count > 8 {
		log.Panicf("invalid count")
	}
	for off := uint(0); off < count; off++ {
		// The error is dropped on purpose: Get yields 0 for out-of-range
		// positions, which is exactly the padding wanted here.
		bit, _ := v.Get(index + off)

		shift := off
		if order != LSB0 {
			shift = 7 - off
		}
		out |= bit << shift
	}
	return out
}
// Iterator returns a reader function over the vector. Each call reads the
// requested number of bits (at most 8) with Take, using the given order,
// and then moves an internal cursor forward by that many bits.
//
// When the end of the BitVector is reached, it returns zeroes indefinitely.
//
// The returned function panics if count is greater than 8.
func (v *BitVector) Iterator(order BitNumbering) func(uint) byte {
	var pos uint
	return func(count uint) byte {
		if count > 8 {
			log.Panicf("invalid count")
		}
		bits := v.Take(pos, count, order)
		pos += count
		return bits
	}
}

View File

@ -0,0 +1,136 @@
package bitvector_test
import (
"testing"
"github.com/stretchr/testify/assert"
bitvector "github.com/filecoin-project/go-lotus/lib/rlepluslazy/internal"
)
// TestBitVector exercises the BitVector API: the zero value, Push, Get,
// Extend, Take, Iterator, and the panics raised for over-long bit counts.
func TestBitVector(t *testing.T) {
	// pushAll appends each entry of bits to v, one bit at a time.
	pushAll := func(v *bitvector.BitVector, bits []byte) {
		for i := range bits {
			v.Push(bits[i])
		}
	}

	t.Run("zero value", func(t *testing.T) {
		var zero bitvector.BitVector
		assert.Equal(t, bitvector.LSB0, zero.BytePacking)
	})

	t.Run("Push", func(t *testing.T) {
		pushed := []byte{1, 0, 1, 1}
		cases := []struct {
			packing bitvector.BitNumbering
			want    byte
		}{
			{packing: bitvector.MSB0, want: 176}, // 0b10110000
			{packing: bitvector.LSB0, want: 13},  // 0b00001101
		}
		for _, tc := range cases {
			v := bitvector.BitVector{BytePacking: tc.packing}
			pushAll(&v, pushed)
			assert.Equal(t, tc.want, v.Buf[0])
		}
	})

	t.Run("Get", func(t *testing.T) {
		bits := []byte{1, 0, 1, 1, 0, 0, 1, 0}
		packings := []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0}
		for _, packing := range packings {
			v := bitvector.BitVector{BytePacking: packing}
			pushAll(&v, bits)
			for i := 0; i < len(bits); i++ {
				got, _ := v.Get(uint(i))
				assert.Equal(t, bits[i], got)
			}
		}
	})

	t.Run("Extend", func(t *testing.T) {
		val := byte(171) // 0b10101011
		cases := []struct {
			order      bitvector.BitNumbering
			afterFour  []byte // contents after extending by 4 bits
			afterFiveN []byte // contents after extending by 5 more bits
		}{
			{
				order:      bitvector.MSB0,
				afterFour:  []byte{1, 0, 1, 0},
				afterFiveN: []byte{1, 0, 1, 0, 1, 0, 1, 0, 1},
			},
			{
				order:      bitvector.LSB0,
				afterFour:  []byte{1, 1, 0, 1},
				afterFiveN: []byte{1, 1, 0, 1, 1, 1, 0, 1, 0},
			},
		}
		for _, tc := range cases {
			v := bitvector.BitVector{}
			v.Extend(val, 4, tc.order)
			assertBitVector(t, tc.afterFour, v)
			v.Extend(val, 5, tc.order)
			assertBitVector(t, tc.afterFiveN, v)
		}
	})

	t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) {
		v := bitvector.BitVector{BytePacking: bitvector.LSB0}
		next := v.Iterator(bitvector.LSB0)
		tooManyBits := []func(){
			func() { v.Extend(0xff, 9, bitvector.LSB0) },
			func() { v.Take(0, 9, bitvector.LSB0) },
			func() { next(9) },
		}
		for _, call := range tooManyBits {
			assert.Panics(t, call)
		}
	})

	t.Run("Take", func(t *testing.T) {
		var v bitvector.BitVector
		pushAll(&v, []byte{1, 0, 1, 0, 1, 0, 1, 1})
		assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0))
		assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0))
	})

	t.Run("Iterator", func(t *testing.T) {
		// 32 bytes give a bitvector of 256 sample bits
		buf := make([]byte, 0, 32)
		for len(buf) < 32 {
			buf = append(buf, 128+32)
		}
		v := bitvector.NewBitVector(buf, bitvector.LSB0)

		// reading one bit at a time must agree with Get()
		next := v.Iterator(bitvector.LSB0)
		for i := uint(0); i < v.Len; i++ {
			want, _ := v.Get(i)
			assert.Equal(t, want, next(1))
		}

		// out of range should return zero
		assert.Equal(t, byte(0), next(1))
		assert.Equal(t, byte(0), next(8))

		// a fresh iterator must agree with Take()
		next = v.Iterator(bitvector.LSB0)
		firstFive := next(5)
		assert.Equal(t, firstFive, v.Take(0, 5, bitvector.LSB0))
		nextEight := next(8)
		assert.Equal(t, nextEight, v.Take(5, 8, bitvector.LSB0))
	})
}
// assertBitVector checks that actual holds exactly the bits listed in
// expectedBits: same length, and the same value at every position.
//
// Note: expectedBits should *only* contain 0s and 1s.
func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) {
	wantLen := uint(len(expectedBits))
	assert.Equal(t, wantLen, actual.Len)

	for i := 0; i < len(expectedBits); i++ {
		got, err := actual.Get(uint(i))
		assert.NoError(t, err)
		assert.Equal(t, expectedBits[i], got)
	}
}

110
lib/rlepluslazy/rleplus.go Normal file
View File

@ -0,0 +1,110 @@
package rlepluslazy
import (
"encoding/binary"
"errors"
"fmt"
bitvector "github.com/filecoin-project/go-lotus/lib/rlepluslazy/internal"
"golang.org/x/xerrors"
)
// Version is the RLE+ format version understood by this package.
// It is compared against the first two bits of a buffer when an RLE is created.
const Version = 0
var (
// ErrWrongVersion is returned when the version bits of a buffer do not equal Version.
ErrWrongVersion = errors.New("invalid RLE+ version")
// ErrDecode is the base error wrapped by decoding failures, such as a run length that is too long.
ErrDecode = fmt.Errorf("invalid encoding for RLE+ version %d", Version)
)
// RLE is an RLE+ encoded bit set that is decoded lazily through Iterator.
type RLE struct {
// vec is the encoded buffer viewed as an LSB0-packed bit vector (see FromBuf).
vec *bitvector.BitVector
}
// FromBuf wraps an RLE+ encoded buffer for lazy decoding. Only the version
// header is validated here; the runs themselves are decoded by the iterator.
//
// It returns an error wrapping ErrWrongVersion when the header does not match.
func FromBuf(buf []byte) (*RLE, error) {
	decoded := RLE{vec: bitvector.NewBitVector(buf, bitvector.LSB0)}

	err := decoded.check()
	if err != nil {
		return nil, xerrors.Errorf("could not create RLE+ for a buffer: %w", err)
	}
	return &decoded, nil
}
// check verifies that the first two bits of the buffer hold the supported
// version number, returning ErrWrongVersion otherwise.
func (rle *RLE) check() error {
	if rle.vec.Take(0, 2, bitvector.LSB0) == Version {
		return nil
	}
	return ErrWrongVersion
}
// Iterator returns a fresh iterator over the indices of the set bits,
// positioned on the first one (if any). Decoding errors hit while reading
// the first runs are returned.
func (rle *RLE) Iterator() (*iterator, error) {
	bits := rle.vec.Iterator(bitvector.LSB0)

	bits(2) // skip the 2-bit version header
	it := &iterator{next: bits}

	// The bit after the header is the value of the first run.
	firstRunBit := bits(1)
	if err := it.prep(firstRunBit); err != nil {
		return nil, err
	}
	return it, nil
}
// iterator walks the set bits of an RLE+ stream, decoding runs on demand.
type iterator struct {
// next reads the given number of bits from the underlying bit stream.
next func(uint) byte
// curIdx is the index that the next call to Next will return.
curIdx uint64
// rep is how many set bits remain in the current run; zero means there is nothing left to return.
rep uint64
}
// HasNext reports whether the current run still holds at least one set bit,
// i.e. whether Next has another index to return.
func (it *iterator) HasNext() bool {
	remaining := it.rep
	return remaining > 0
}
// prep advances the decoder until it.rep holds the length of the next run of
// set bits. If the stream ends first, it.rep is left at 0, which is what
// HasNext tests.
//
// curBit is the value of the run that is about to be read. A run of zeros
// (curBit == 0) is skipped by adding its length to it.curIdx, after which the
// following run is read as a run of ones. If it.rep is already non-zero,
// prep does nothing.
//
// It returns an error wrapping ErrDecode when a run length is malformed.
func (it *iterator) prep(curBit byte) error {
	for it.rep == 0 {
		runLen, err := it.readRunLength()
		if err != nil {
			return err
		}
		if runLen == 0 {
			// run with 0 length means end
			return nil
		}
		if curBit == 0 {
			// Zeros carry no elements: jump over them and read the ones next.
			it.curIdx += runLen
			curBit = 1
			continue
		}
		it.rep = runLen
	}
	return nil
}

// readRunLength decodes the length of a single run from the bit stream.
//
// Three block forms are recognised by their leading bits:
//   - "1":  a run of exactly one bit
//   - "01": the length is held in the next 4 bits
//   - "00": the length is a varint, read one byte at a time
func (it *iterator) readRunLength() (uint64, error) {
	if it.next(1) == 1 {
		return 1, nil
	}
	if it.next(1) == 1 {
		return uint64(it.next(4)), nil
	}

	buf := make([]byte, 0, binary.MaxVarintLen64)
	for {
		b := it.next(8)
		buf = append(buf, b)
		if b&0x80 == 0 {
			break
		}
		// A uint64 varint never spans more than MaxVarintLen64 bytes, so a
		// continuation bit on the last allowed byte is already invalid.
		if len(buf) == binary.MaxVarintLen64 {
			return 0, xerrors.Errorf("run too long: %w", ErrDecode)
		}
	}

	runLen, n := binary.Uvarint(buf)
	if n <= 0 {
		// Uvarint reports overflow through a non-positive byte count; without
		// this check the run would decode as 0 and look like end of stream.
		return 0, xerrors.Errorf("run length overflows 64 bits: %w", ErrDecode)
	}
	return runLen, nil
}
// ErrEndOfIterator is returned by Next when it is called on an iterator
// that has no elements left (HasNext reports false).
var ErrEndOfIterator = errors.New("end of iterator")

// Next returns the index of the current set bit and advances the iterator,
// decoding further runs when the current one is used up.
//
// Calling Next when HasNext is false returns ErrEndOfIterator and leaves the
// iterator untouched; without this guard the remaining-count would wrap
// around and the iterator would yield bogus indices indefinitely.
//
// A non-nil error other than ErrEndOfIterator comes from decoding the
// following runs; the returned index is still the current element.
func (it *iterator) Next() (uint64, error) {
	if it.rep == 0 {
		return 0, ErrEndOfIterator
	}
	it.rep--
	res := it.curIdx
	it.curIdx++
	return res, it.prep(0)
}

View File

@ -0,0 +1,44 @@
package rlepluslazy
import (
"testing"
"github.com/filecoin-project/go-lotus/extern/rleplus"
"github.com/stretchr/testify/assert"
)
// TestDecode checks the eager encoder against a reference encoding and then
// verifies that the lazy iterator decodes that reference back to the
// original set of integers.
//
// Each step is gated on the boolean result of the preceding assertion so a
// failure is reported as a test failure instead of an index-out-of-range or
// nil-pointer panic.
func TestDecode(t *testing.T) {
	// Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	// in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field)
	// The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }
	// This is the above reference output with a version header "00" manually added
	referenceEncoding := []byte{124, 71, 34, 2}
	expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}

	encoded, _, err := rleplus.Encode(expectedNumbers)
	// Our encoded bytes are the same as the ref bytes.
	// Only index into encoded once its length is known to match.
	if assert.NoError(t, err) && assert.Equal(t, len(referenceEncoding), len(encoded)) {
		for idx, expected := range referenceEncoding {
			assert.Equal(t, expected, encoded[idx])
		}
	}

	rle, err := FromBuf(referenceEncoding)
	if !assert.NoError(t, err) {
		return // rle is nil on error
	}

	it, err := rle.Iterator()
	if !assert.NoError(t, err) {
		return // it is nil on error
	}

	decoded := make([]uint64, 0, len(expectedNumbers))
	for it.HasNext() {
		bit, err := it.Next()
		if !assert.NoError(t, err) {
			break
		}
		decoded = append(decoded, bit)
	}

	// Our decoded integers are the same as expected
	assert.Equal(t, expectedNumbers, decoded)
}