feat(orm)!: ordered variable length encoding for uint32 and uint64 types (#11090)

## Description

`uint64` values are used in the ORM as auto-incrementing primary keys. Always using 8 bytes for these values is a bit of a waste of space. Unfortunately, varint encoding does not support ordered prefix iteration.

This PR introduces a compact, well-ordered variable length encoding for `uint32` and `uint64` types. `fixed32` and `fixed64` integers are still encoded as 4 and 8 byte fixed-length big-endian arrays. With this, users have a choice of encoding based on what type of data they are storing. An auto-incrementing primary key should prefer the variable length `uint64` whereas a fixed precision decimal might want to use `fixed64`.

See the golden test updates to see how this reduces key lengths.

This encoding works by using the first two bits to encode the buffer length (4 possible lengths). I'm not sure if my choice of 2, 4, 6 and 9 bytes is the right choice of 4 lengths for `uint64` - there are many alternative choices. I could have also chosen 3 bits and allowed for 8 possible lengths, but why waste an extra bit? Input on the right design parameters would be appreciated.



---

### Author Checklist

*All items are required. Please add a note to the item if the item is not applicable and
please add links to any relevant follow up issues.*

I have...

- [ ] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title
- [ ] added `!` to the type prefix if API or client breaking change
- [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting))
- [ ] provided a link to the relevant issue or specification
- [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules)
- [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing)
- [ ] added a changelog entry to `CHANGELOG.md`
- [ ] included comments for [documenting Go code](https://blog.golang.org/godoc)
- [ ] updated the relevant documentation or specification
- [ ] reviewed "Files changed" and left comments if necessary
- [ ] confirmed all CI checks have passed

### Reviewers Checklist

*All items are required. Please add a note if the item is not applicable and please add
your handle next to the items reviewed if you only reviewed selected items.*

I have...

- [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title
- [ ] confirmed `!` in the type prefix if API or client breaking change
- [ ] confirmed all author checklist items have been addressed 
- [ ] reviewed state machine logic
- [ ] reviewed API design and naming
- [ ] reviewed documentation is accurate
- [ ] reviewed tests and test coverage
- [ ] manually tested (if applicable)
This commit is contained in:
Aaron Craelius 2022-02-07 12:58:55 -05:00 committed by GitHub
parent a0a1197c27
commit 1944a0883e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 780 additions and 377 deletions

View File

@ -78,10 +78,14 @@ func GetCodec(field protoreflect.FieldDescriptor, nonTerminal bool) (Codec, erro
} else {
return StringCodec{}, nil
}
case protoreflect.Uint32Kind, protoreflect.Fixed32Kind:
return Uint32Codec{}, nil
case protoreflect.Uint64Kind, protoreflect.Fixed64Kind:
return Uint64Codec{}, nil
case protoreflect.Uint32Kind:
return CompactUint32Codec{}, nil
case protoreflect.Fixed32Kind:
return FixedUint32Codec{}, nil
case protoreflect.Uint64Kind:
return CompactUint64Codec{}, nil
case protoreflect.Fixed64Kind:
return FixedUint64Codec{}, nil
case protoreflect.Int32Kind, protoreflect.Sint32Kind, protoreflect.Sfixed32Kind:
return Int32Codec{}, nil
case protoreflect.Int64Kind, protoreflect.Sint64Kind, protoreflect.Sfixed64Kind:

View File

@ -85,3 +85,97 @@ func TestNTBytesTooLong(t *testing.T) {
_, err = cdc.ComputeBufferSize(bz)
assert.ErrorContains(t, err, ormerrors.BytesFieldTooLong.Error())
}
// TestCompactUInt32 checks the compact uint32 encoding in two ways: a table of
// boundary values verifies the encoded length, the decode round trip and that
// successive encodings sort strictly ascending; a randomized check then
// verifies that byte-wise comparison of two encodings agrees with numeric
// comparison of the values and that both values round-trip.
func TestCompactUInt32(t *testing.T) {
	var lastBz []byte
	testEncodeDecode := func(x uint32, expectedLen int) {
		bz := ormfield.EncodeCompactUint32(x)
		assert.Equal(t, expectedLen, len(bz))
		y, err := ormfield.DecodeCompactUint32(bytes.NewReader(bz))
		assert.NilError(t, err)
		assert.Equal(t, x, y)
		// Inputs below are listed in ascending order, so every encoding must
		// sort strictly after the previous one.
		assert.Assert(t, bytes.Compare(lastBz, bz) < 0)
		lastBz = bz
	}

	// Boundary values of every length class, including the smallest and the
	// largest representable inputs.
	testEncodeDecode(0, 2)
	testEncodeDecode(64, 2)
	testEncodeDecode(16383, 2)
	testEncodeDecode(16384, 3)
	testEncodeDecode(4194303, 3)
	testEncodeDecode(4194304, 4)
	testEncodeDecode(1073741823, 4)
	testEncodeDecode(1073741824, 5)
	testEncodeDecode(4294967295, 5) // largest uint32

	// randomized tests
	rapid.Check(t, func(t *rapid.T) {
		x := rapid.Uint32().Draw(t, "x").(uint32)
		y := rapid.Uint32().Draw(t, "y").(uint32)
		bx := ormfield.EncodeCompactUint32(x)
		by := ormfield.EncodeCompactUint32(y)

		cmp := bytes.Compare(bx, by)
		if x < y {
			assert.Equal(t, -1, cmp)
		} else if x == y {
			assert.Equal(t, 0, cmp)
		} else {
			assert.Equal(t, 1, cmp)
		}

		x2, err := ormfield.DecodeCompactUint32(bytes.NewReader(bx))
		assert.NilError(t, err)
		assert.Equal(t, x, x2)
		y2, err := ormfield.DecodeCompactUint32(bytes.NewReader(by))
		assert.NilError(t, err)
		assert.Equal(t, y, y2)
	})
}
// TestCompactUInt64 checks the compact uint64 encoding in two ways: a table of
// boundary values verifies the encoded length, the decode round trip and that
// successive encodings sort strictly ascending; a randomized check then
// verifies that byte-wise comparison of two encodings agrees with numeric
// comparison of the values and that both values round-trip.
func TestCompactUInt64(t *testing.T) {
	var lastBz []byte
	testEncodeDecode := func(x uint64, expectedLen int) {
		bz := ormfield.EncodeCompactUint64(x)
		assert.Equal(t, expectedLen, len(bz))
		y, err := ormfield.DecodeCompactUint64(bytes.NewReader(bz))
		assert.NilError(t, err)
		assert.Equal(t, x, y)
		// Inputs below are listed in ascending order, so every encoding must
		// sort strictly after the previous one.
		assert.Assert(t, bytes.Compare(lastBz, bz) < 0)
		lastBz = bz
	}

	// Boundary values of every length class, including the smallest and the
	// largest representable inputs.
	testEncodeDecode(0, 2)
	testEncodeDecode(64, 2)
	testEncodeDecode(16383, 2)
	testEncodeDecode(16384, 4)
	testEncodeDecode(4194303, 4)
	testEncodeDecode(4194304, 4)
	testEncodeDecode(1073741823, 4)
	testEncodeDecode(1073741824, 6)
	testEncodeDecode(70368744177663, 6)
	testEncodeDecode(70368744177664, 9)
	testEncodeDecode(18446744073709551615, 9) // largest uint64

	// randomized tests
	rapid.Check(t, func(t *rapid.T) {
		x := rapid.Uint64().Draw(t, "x").(uint64)
		y := rapid.Uint64().Draw(t, "y").(uint64)
		bx := ormfield.EncodeCompactUint64(x)
		by := ormfield.EncodeCompactUint64(y)

		cmp := bytes.Compare(bx, by)
		if x < y {
			assert.Equal(t, -1, cmp)
		} else if x == y {
			assert.Equal(t, 0, cmp)
		} else {
			assert.Equal(t, 1, cmp)
		}

		x2, err := ormfield.DecodeCompactUint64(bytes.NewReader(bx))
		assert.NilError(t, err)
		assert.Equal(t, x, x2)
		y2, err := ormfield.DecodeCompactUint64(bytes.NewReader(by))
		assert.NilError(t, err)
		assert.Equal(t, y, y2)
	})
}

View File

@ -2,36 +2,179 @@ package ormfield
import (
"encoding/binary"
"fmt"
"io"
"google.golang.org/protobuf/reflect/protoreflect"
)
// Uint32Codec encodes uint32 values as 4-byte big-endian integers.
type Uint32Codec struct{}
// FixedUint32Codec encodes uint32 values as 4-byte big-endian integers.
type FixedUint32Codec struct{}
func (u Uint32Codec) FixedBufferSize() int {
func (u FixedUint32Codec) FixedBufferSize() int {
return 4
}
func (u Uint32Codec) ComputeBufferSize(protoreflect.Value) (int, error) {
func (u FixedUint32Codec) ComputeBufferSize(protoreflect.Value) (int, error) {
return u.FixedBufferSize(), nil
}
func (u Uint32Codec) IsOrdered() bool {
func (u FixedUint32Codec) IsOrdered() bool {
return true
}
func (u Uint32Codec) Compare(v1, v2 protoreflect.Value) int {
func (u FixedUint32Codec) Compare(v1, v2 protoreflect.Value) int {
return compareUint(v1, v2)
}
func (u Uint32Codec) Decode(r Reader) (protoreflect.Value, error) {
func (u FixedUint32Codec) Decode(r Reader) (protoreflect.Value, error) {
var x uint32
err := binary.Read(r, binary.BigEndian, &x)
return protoreflect.ValueOfUint32(x), err
}
func (u Uint32Codec) Encode(value protoreflect.Value, w io.Writer) error {
func (u FixedUint32Codec) Encode(value protoreflect.Value, w io.Writer) error {
return binary.Write(w, binary.BigEndian, uint32(value.Uint()))
}
// CompactUint32Codec encodes uint32 values using EncodeCompactUint32, the
// variable-length (2 to 5 byte) encoding intended for ordered prefix scans.
type CompactUint32Codec struct{}
// Decode reads one compact-encoded uint32 from r with DecodeCompactUint32 and
// wraps the result in a protoreflect.Value. The decoder's error, if any, is
// returned unchanged alongside the wrapped value.
func (c CompactUint32Codec) Decode(r Reader) (protoreflect.Value, error) {
x, err := DecodeCompactUint32(r)
return protoreflect.ValueOfUint32(x), err
}
// Encode converts value to uint32, encodes it with EncodeCompactUint32 and
// writes the bytes to w, returning any write error.
func (c CompactUint32Codec) Encode(value protoreflect.Value, w io.Writer) error {
_, err := w.Write(EncodeCompactUint32(uint32(value.Uint())))
return err
}
// Compare orders v1 and v2 by delegating to compareUint.
func (c CompactUint32Codec) Compare(v1, v2 protoreflect.Value) int {
return compareUint(v1, v2)
}
// IsOrdered always reports true for this codec.
func (c CompactUint32Codec) IsOrdered() bool {
return true
}
// FixedBufferSize returns 5, the largest number of bytes EncodeCompactUint32
// produces.
// NOTE(review): the encoding itself is variable length (2-5 bytes); confirm
// that callers treat this value as an upper bound rather than an exact size.
func (c CompactUint32Codec) FixedBufferSize() int {
return 5
}
// ComputeBufferSize ignores its argument and returns FixedBufferSize with a
// nil error.
func (c CompactUint32Codec) ComputeBufferSize(protoreflect.Value) (int, error) {
return c.FixedBufferSize(), nil
}
// EncodeCompactUint32 returns a variable-length encoding of x that is 2, 3, 4
// or 5 bytes long.
//
// Unlike regular varints, this encoding is suitable for ordered prefix scans.
// The top two bits of the first byte hold the encoded length minus two
// (00 -> 2 bytes, 01 -> 3, 10 -> 4, 11 -> 5); the remaining bits carry the
// value, most significant bits first.
//
//   - x < 2^14 is written in 2 bytes
//   - x < 2^22 is written in 3 bytes
//   - x < 2^30 is written in 4 bytes
//   - anything larger is written in 5 bytes, with the two least significant
//     bits of x stored in the final byte
func EncodeCompactUint32(x uint32) []byte {
	const (
		twoByteLimit   = 1 << 14
		threeByteLimit = 1 << 22
		fourByteLimit  = 1 << 30
	)

	if x < twoByteLimit {
		// Length tag 00: the top two bits of x>>8 are already zero.
		return []byte{byte(x >> 8), byte(x)}
	}
	if x < threeByteLimit {
		return []byte{0x40 | byte(x>>16), byte(x >> 8), byte(x)}
	}
	if x < fourByteLimit {
		return []byte{0x80 | byte(x>>24), byte(x >> 16), byte(x >> 8), byte(x)}
	}

	// Only 6 value bits fit next to the tag in the first byte, so the 32-bit
	// value is shifted up and its lowest 2 bits land in the fifth byte.
	return []byte{
		0xC0 | byte(x>>26),
		byte(x >> 18),
		byte(x >> 10),
		byte(x >> 2),
		byte(x) & 0x3,
	}
}
// DecodeCompactUint32 decodes a uint32 encoded with EncodeCompactUint32.
//
// It consumes exactly one encoded value (2 to 5 bytes) from reader. The top
// two bits of the first byte select how many further bytes are read.
//
// Errors from the underlying reader are returned as reported by io.ReadFull:
// io.EOF when no bytes could be read for the requested span,
// io.ErrUnexpectedEOF when the input ends part-way through it, or the
// reader's own error otherwise. The returned value is 0 on error.
func DecodeCompactUint32(reader io.Reader) (uint32, error) {
	var buf [5]byte

	// io.ReadFull is used instead of a bare Read because an io.Reader may
	// legally return fewer bytes than requested without reporting an error,
	// and may return the final bytes together with io.EOF.
	if _, err := io.ReadFull(reader, buf[:1]); err != nil {
		return 0, err
	}

	switch buf[0] >> 6 {
	case 0:
		if _, err := io.ReadFull(reader, buf[1:2]); err != nil {
			return 0, err
		}
		// The tag bits are 00, so the first byte needs no masking.
		x := uint32(buf[0]) << 8
		x |= uint32(buf[1])
		return x, nil
	case 1:
		if _, err := io.ReadFull(reader, buf[1:3]); err != nil {
			return 0, err
		}
		x := (uint32(buf[0]) & 0x3F) << 16
		x |= uint32(buf[1]) << 8
		x |= uint32(buf[2])
		return x, nil
	case 2:
		if _, err := io.ReadFull(reader, buf[1:4]); err != nil {
			return 0, err
		}
		x := (uint32(buf[0]) & 0x3F) << 24
		x |= uint32(buf[1]) << 16
		x |= uint32(buf[2]) << 8
		x |= uint32(buf[3])
		return x, nil
	case 3:
		if _, err := io.ReadFull(reader, buf[1:5]); err != nil {
			return 0, err
		}
		// The 5-byte form stores the value shifted up: 6 bits in the first
		// byte, 8 in each of the next three and the lowest 2 bits in the last.
		x := (uint32(buf[0]) & 0x3F) << 26
		x |= uint32(buf[1]) << 18
		x |= uint32(buf[2]) << 10
		x |= uint32(buf[3]) << 2
		x |= uint32(buf[4])
		return x, nil
	default:
		// Unreachable: a byte shifted right by 6 is always in the range 0-3.
		return 0, fmt.Errorf("unexpected case")
	}
}

View File

@ -2,37 +2,38 @@ package ormfield
import (
"encoding/binary"
"fmt"
"io"
"google.golang.org/protobuf/reflect/protoreflect"
)
// Uint64Codec encodes uint64 values as 8-byte big-endian integers.
type Uint64Codec struct{}
// FixedUint64Codec encodes uint64 values as 8-byte big-endian integers.
type FixedUint64Codec struct{}
func (u Uint64Codec) FixedBufferSize() int {
func (u FixedUint64Codec) FixedBufferSize() int {
return 8
}
func (u Uint64Codec) ComputeBufferSize(protoreflect.Value) (int, error) {
func (u FixedUint64Codec) ComputeBufferSize(protoreflect.Value) (int, error) {
return u.FixedBufferSize(), nil
}
func (u Uint64Codec) IsOrdered() bool {
func (u FixedUint64Codec) IsOrdered() bool {
return true
}
func (u Uint64Codec) Compare(v1, v2 protoreflect.Value) int {
func (u FixedUint64Codec) Compare(v1, v2 protoreflect.Value) int {
return compareUint(v1, v2)
}
func (u Uint64Codec) Decode(r Reader) (protoreflect.Value, error) {
func (u FixedUint64Codec) Decode(r Reader) (protoreflect.Value, error) {
var x uint64
err := binary.Read(r, binary.BigEndian, &x)
return protoreflect.ValueOfUint64(x), err
}
func (u Uint64Codec) Encode(value protoreflect.Value, w io.Writer) error {
func (u FixedUint64Codec) Encode(value protoreflect.Value, w io.Writer) error {
return binary.Write(w, binary.BigEndian, value.Uint())
}
@ -47,3 +48,157 @@ func compareUint(v1, v2 protoreflect.Value) int {
return 1
}
}
// CompactUint64Codec encodes uint64 values using EncodeCompactUint64, the
// variable-length (2, 4, 6 or 9 byte) encoding intended for ordered prefix
// scans.
type CompactUint64Codec struct{}
// Decode reads one compact-encoded uint64 from r with DecodeCompactUint64 and
// wraps the result in a protoreflect.Value. The decoder's error, if any, is
// returned unchanged alongside the wrapped value.
func (c CompactUint64Codec) Decode(r Reader) (protoreflect.Value, error) {
x, err := DecodeCompactUint64(r)
return protoreflect.ValueOfUint64(x), err
}
// Encode encodes value's unsigned integer with EncodeCompactUint64 and writes
// the bytes to w, returning any write error.
func (c CompactUint64Codec) Encode(value protoreflect.Value, w io.Writer) error {
_, err := w.Write(EncodeCompactUint64(value.Uint()))
return err
}
// Compare orders v1 and v2 by delegating to compareUint.
func (c CompactUint64Codec) Compare(v1, v2 protoreflect.Value) int {
return compareUint(v1, v2)
}
// IsOrdered always reports true for this codec.
func (c CompactUint64Codec) IsOrdered() bool {
return true
}
// FixedBufferSize returns 9, the largest number of bytes EncodeCompactUint64
// produces.
// NOTE(review): the encoding itself is variable length (2, 4, 6 or 9 bytes);
// confirm that callers treat this value as an upper bound rather than an
// exact size.
func (c CompactUint64Codec) FixedBufferSize() int {
return 9
}
// ComputeBufferSize ignores its argument and returns FixedBufferSize with a
// nil error.
func (c CompactUint64Codec) ComputeBufferSize(protoreflect.Value) (int, error) {
return c.FixedBufferSize(), nil
}
// EncodeCompactUint64 returns a variable-length encoding of x that is 2, 4, 6
// or 9 bytes long.
//
// Unlike regular varints, this encoding is suitable for ordered prefix scans.
// The top two bits of the first byte identify the total length
// (00 -> 2 bytes, 01 -> 4, 10 -> 6, 11 -> 9); the remaining bits carry the
// value, most significant bits first.
//
//   - x < 2^14 is written in 2 bytes
//   - x < 2^30 is written in 4 bytes
//   - x < 2^46 is written in 6 bytes
//   - anything larger is written in 9 bytes, with the two least significant
//     bits of x stored in the final byte
func EncodeCompactUint64(x uint64) []byte {
	const (
		twoByteLimit  = 1 << 14
		fourByteLimit = 1 << 30
		sixByteLimit  = 1 << 46
	)

	if x < twoByteLimit {
		// Length tag 00: the top two bits of x>>8 are already zero.
		return []byte{byte(x >> 8), byte(x)}
	}
	if x < fourByteLimit {
		return []byte{0x40 | byte(x>>24), byte(x >> 16), byte(x >> 8), byte(x)}
	}
	if x < sixByteLimit {
		return []byte{
			0x80 | byte(x>>40),
			byte(x >> 32),
			byte(x >> 24),
			byte(x >> 16),
			byte(x >> 8),
			byte(x),
		}
	}

	// Only 6 value bits fit next to the tag in the first byte, so the 64-bit
	// value is shifted up and its lowest 2 bits land in the ninth byte.
	return []byte{
		0xC0 | byte(x>>58),
		byte(x >> 50),
		byte(x >> 42),
		byte(x >> 34),
		byte(x >> 26),
		byte(x >> 18),
		byte(x >> 10),
		byte(x >> 2),
		byte(x) & 0x3,
	}
}
// DecodeCompactUint64 decodes a uint64 encoded with EncodeCompactUint64.
//
// It consumes exactly one encoded value (2, 4, 6 or 9 bytes) from reader. The
// top two bits of the first byte select how many further bytes are read.
//
// Errors from the underlying reader are returned as reported by io.ReadFull:
// io.EOF when no bytes could be read for the requested span,
// io.ErrUnexpectedEOF when the input ends part-way through it, or the
// reader's own error otherwise. The returned value is 0 on error.
func DecodeCompactUint64(reader io.Reader) (uint64, error) {
	var buf [9]byte

	// io.ReadFull is used instead of a bare Read because an io.Reader may
	// legally return fewer bytes than requested without reporting an error,
	// and may return the final bytes together with io.EOF.
	if _, err := io.ReadFull(reader, buf[:1]); err != nil {
		return 0, err
	}

	switch buf[0] >> 6 {
	case 0:
		if _, err := io.ReadFull(reader, buf[1:2]); err != nil {
			return 0, err
		}
		// The tag bits are 00, so the first byte needs no masking.
		x := uint64(buf[0]) << 8
		x |= uint64(buf[1])
		return x, nil
	case 1:
		if _, err := io.ReadFull(reader, buf[1:4]); err != nil {
			return 0, err
		}
		x := (uint64(buf[0]) & 0x3F) << 24
		x |= uint64(buf[1]) << 16
		x |= uint64(buf[2]) << 8
		x |= uint64(buf[3])
		return x, nil
	case 2:
		if _, err := io.ReadFull(reader, buf[1:6]); err != nil {
			return 0, err
		}
		x := (uint64(buf[0]) & 0x3F) << 40
		x |= uint64(buf[1]) << 32
		x |= uint64(buf[2]) << 24
		x |= uint64(buf[3]) << 16
		x |= uint64(buf[4]) << 8
		x |= uint64(buf[5])
		return x, nil
	case 3:
		if _, err := io.ReadFull(reader, buf[1:9]); err != nil {
			return 0, err
		}
		// The 9-byte form stores the value shifted up: 6 bits in the first
		// byte, 8 in each of the next seven and the lowest 2 bits in the last.
		x := (uint64(buf[0]) & 0x3F) << 58
		x |= uint64(buf[1]) << 50
		x |= uint64(buf[2]) << 42
		x |= uint64(buf[3]) << 34
		x |= uint64(buf[4]) << 26
		x |= uint64(buf[5]) << 18
		x |= uint64(buf[6]) << 10
		x |= uint64(buf[7]) << 2
		x |= uint64(buf[8])
		return x, nil
	default:
		// Unreachable: a byte shifted right by 6 is always in the range 0-3.
		return 0, fmt.Errorf("unexpected case")
	}
}

View File

@ -1,54 +1,54 @@
GET 03000000000000000005
GET 03000005
PK testpb.ExampleAutoIncrementTable 5 -> {"id":5}
GET 03808002
SEQ testpb.ExampleAutoIncrementTable 0
GET 03000000000000000001
GET 03000001
PK testpb.ExampleAutoIncrementTable 1 -> {"id":1}
ORM INSERT testpb.ExampleAutoIncrementTable {"id":1,"x":"foo","y":5}
HAS 0301666f6f
ERR:EOF
SET 03000000000000000001 1203666f6f1805
SET 03000001 1203666f6f1805
PK testpb.ExampleAutoIncrementTable 1 -> {"id":1,"x":"foo","y":5}
SET 03808002 01
SEQ testpb.ExampleAutoIncrementTable 1
SET 0301666f6f 0000000000000001
SET 0301666f6f 0001
UNIQ testpb.ExampleAutoIncrementTable x : foo -> 1
GET 03808002 01
SEQ testpb.ExampleAutoIncrementTable 1
GET 03000000000000000002
GET 03000002
PK testpb.ExampleAutoIncrementTable 2 -> {"id":2}
ORM INSERT testpb.ExampleAutoIncrementTable {"id":2,"x":"bar","y":10}
HAS 0301626172
ERR:EOF
SET 03000000000000000002 1203626172180a
SET 03000002 1203626172180a
PK testpb.ExampleAutoIncrementTable 2 -> {"id":2,"x":"bar","y":10}
SET 03808002 02
SEQ testpb.ExampleAutoIncrementTable 2
SET 0301626172 0000000000000002
SET 0301626172 0002
UNIQ testpb.ExampleAutoIncrementTable x : bar -> 2
GET 03808002 02
SEQ testpb.ExampleAutoIncrementTable 2
ITERATOR 0300 -> 0301
VALID true
KEY 03000000000000000001 1203666f6f1805
KEY 03000001 1203666f6f1805
PK testpb.ExampleAutoIncrementTable 1 -> {"id":1,"x":"foo","y":5}
NEXT
VALID true
KEY 03000000000000000002 1203626172180a
KEY 03000002 1203626172180a
PK testpb.ExampleAutoIncrementTable 2 -> {"id":2,"x":"bar","y":10}
NEXT
VALID false
ITERATOR 0300 -> 0301
VALID true
KEY 03000000000000000001 1203666f6f1805
KEY 03000001 1203666f6f1805
PK testpb.ExampleAutoIncrementTable 1 -> {"id":1,"x":"foo","y":5}
KEY 03000000000000000001 1203666f6f1805
KEY 03000001 1203666f6f1805
PK testpb.ExampleAutoIncrementTable 1 -> {"id":1,"x":"foo","y":5}
NEXT
VALID true
KEY 03000000000000000002 1203626172180a
KEY 03000002 1203626172180a
PK testpb.ExampleAutoIncrementTable 2 -> {"id":2,"x":"bar","y":10}
KEY 03000000000000000002 1203626172180a
KEY 03000002 1203626172180a
PK testpb.ExampleAutoIncrementTable 2 -> {"id":2,"x":"bar","y":10}
NEXT
VALID false

File diff suppressed because it is too large Load Diff

View File

@ -36,8 +36,15 @@ message PrimaryKeyDescriptor {
// fields is a comma-separated list of fields in the primary key. Spaces are
// not allowed. Supported field types, their encodings, and any applicable constraints
// are described below.
// - uint32, uint64 are encoded as big-endian fixed width bytes and support
// sorted iteration.
// - uint32 are encoded as 2,3,4 or 5 bytes using a compact encoding that
// is suitable for sorted iteration (not varint encoding). This type is
// well-suited for small integers.
// - uint64 are encoded as 2,4,6 or 9 bytes using a compact encoding that
// is suitable for sorted iteration (not varint encoding). This type is
// well-suited for small integers such as auto-incrementing sequences.
// - fixed32, fixed64 are encoded as big-endian fixed width bytes and support
// sorted iteration. These types are well-suited for encoding fixed-precision
// decimals as integers.
// - string's are encoded as raw bytes in terminal key segments and null-terminated
// in non-terminal segments. Null characters are thus forbidden in strings.
// string fields support sorted iteration.
@ -46,7 +53,7 @@ message PrimaryKeyDescriptor {
// longer than 255 bytes are unsupported and bytes fields should not
// be assumed to be lexically sorted. If you have a byte array longer than
// 255 bytes that you'd like to index, you should consider hashing it first.
// - int32, sint32, int64, sint64 are encoding as fixed width bytes with
// - int32, sint32, int64, sint64, sfixed32, sfixed64 are encoded as fixed width bytes with
// an encoding that enables sorted iteration.
// - google.protobuf.Timestamp and google.protobuf.Duration are encoded
// as 12 bytes using an encoding that enables sorted iteration.