core/rawdb: implement sequential reads in freezer_table (#23117)

* core/rawdb: implement sequential reads in freezer_table

* core/rawdb, ethdb: add sequential reader to db interface

* core/rawdb: lint nitpicks

* core/rawdb: fix some nitpicks

* core/rawdb: fix flaw with deferred reads not being performed

* core/rawdb: better documentation
Author: Martin Holst Swende, 2021-08-13 10:51:01 +02:00, committed by GitHub
parent a580f7d6c5
commit 5f98020a21
6 changed files with 309 additions and 52 deletions

core/rawdb/database.go

@@ -89,6 +89,11 @@ func (db *nofreezedb) Ancient(kind string, number uint64) ([]byte, error) {
 	return nil, errNotSupported
 }
 
+// ReadAncients returns an error as we don't have a backing chain freezer.
+func (db *nofreezedb) ReadAncients(kind string, start, max, maxByteSize uint64) ([][]byte, error) {
+	return nil, errNotSupported
+}
+
 // Ancients returns an error as we don't have a backing chain freezer.
 func (db *nofreezedb) Ancients() (uint64, error) {
 	return 0, errNotSupported

core/rawdb/freezer.go

@@ -180,6 +180,18 @@ func (f *freezer) Ancient(kind string, number uint64) ([]byte, error) {
 	return nil, errUnknownTable
 }
 
+// ReadAncients retrieves multiple items in sequence, starting from the index 'start'.
+// It will return
+//  - at most 'count' items,
+//  - at least 1 item (even if exceeding maxBytes), but will otherwise
+//    return as many items as fit into maxBytes.
+func (f *freezer) ReadAncients(kind string, start, count, maxBytes uint64) ([][]byte, error) {
+	if table := f.tables[kind]; table != nil {
+		return table.RetrieveItems(start, count, maxBytes)
+	}
+	return nil, errUnknownTable
+}
+
 // Ancients returns the length of the frozen items.
 func (f *freezer) Ancients() (uint64, error) {
 	return atomic.LoadUint64(&f.frozen), nil
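
For orientation, here is a hypothetical caller-side sketch, not part of this diff, of how the new batch API might be consumed. The `"headers"` kind name and the helper function are assumptions for illustration only:

```go
package example

import "github.com/ethereum/go-ethereum/ethdb"

// iterateAncientHeaders is a hypothetical helper showing how a caller might
// page through an ancient table with the new batch API.
func iterateAncientHeaders(db ethdb.AncientReader, handle func([]byte)) error {
	frozen, err := db.Ancients() // total number of frozen items
	if err != nil {
		return err
	}
	for start := uint64(0); start < frozen; {
		// Ask for up to 1024 items per round, capped at roughly 1 MiB of data.
		batch, err := db.ReadAncients("headers", start, 1024, 1024*1024)
		if err != nil {
			return err
		}
		for _, blob := range batch {
			handle(blob) // blob is the raw (RLP-encoded) item
		}
		// ReadAncients returns at least one item, so this always progresses.
		start += uint64(len(batch))
	}
	return nil
}
```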

core/rawdb/freezer_table.go

@@ -70,6 +70,19 @@ func (i *indexEntry) marshallBinary() []byte {
 	return b
 }
 
+// bounds returns the start- and end- offsets, and the file number of where to
+// read the data item marked by the two index entries. The two entries are
+// assumed to be sequential.
+func (start *indexEntry) bounds(end *indexEntry) (startOffset, endOffset, fileId uint32) {
+	if start.filenum != end.filenum {
+		// If a piece of data 'crosses' a data-file,
+		// it's actually in one piece on the second data-file.
+		// We return a zero-indexEntry for the second file as start.
+		return 0, end.offset, end.filenum
+	}
+	return start.offset, end.offset, end.filenum
+}
+
 // freezerTable represents a single chained data table within the freezer (e.g. blocks).
 // It consists of a data file (snappy encoded arbitrary data blobs) and an indexEntry
 // file (uncompressed 64 bit indices into the data file).
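
The new bounds method encodes the key invariant of the freezer layout: an item is never split across data files; when an item would not fit, the writer rolls over and writes it whole at offset zero of the next file. A tiny self-contained illustration may help. The re-declared indexEntry below is illustrative only; the real type is unexported in core/rawdb:

```go
package main

import "fmt"

// Toy re-declaration of the unexported indexEntry, just to illustrate the
// boundary rule; the real type lives in core/rawdb.
type indexEntry struct {
	filenum uint32
	offset  uint32
}

func (start *indexEntry) bounds(end *indexEntry) (startOffset, endOffset, fileId uint32) {
	if start.filenum != end.filenum {
		// The item was started in a new file, so it lives wholly in end.filenum.
		return 0, end.offset, end.filenum
	}
	return start.offset, end.offset, end.filenum
}

func main() {
	// Same file: the item spans [100, 150) of file 7.
	a, b := &indexEntry{7, 100}, &indexEntry{7, 150}
	fmt.Println(a.bounds(b)) // 100 150 7

	// Boundary crossed: the writer never splits an item, it starts file 8
	// afresh, so the item is [0, 50) of file 8.
	c, d := &indexEntry{7, 100}, &indexEntry{8, 50}
	fmt.Println(c.bounds(d)) // 0 50 8
}
```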
@@ -546,84 +559,183 @@ func (t *freezerTable) append(item uint64, encodedBlob []byte, wlock bool) (bool
 	return false, nil
 }
 
-// getBounds returns the indexes for the item
-// returns start, end, filenumber and error
-func (t *freezerTable) getBounds(item uint64) (uint32, uint32, uint32, error) {
-	buffer := make([]byte, indexEntrySize)
-	var startIdx, endIdx indexEntry
-	// Read second index
-	if _, err := t.index.ReadAt(buffer, int64((item+1)*indexEntrySize)); err != nil {
-		return 0, 0, 0, err
-	}
-	endIdx.unmarshalBinary(buffer)
-	// Read first index (unless it's the very first item)
-	if item != 0 {
-		if _, err := t.index.ReadAt(buffer, int64(item*indexEntrySize)); err != nil {
-			return 0, 0, 0, err
-		}
-		startIdx.unmarshalBinary(buffer)
-	} else {
-		// Special case if we're reading the first item in the freezer. We assume that
-		// the first item always start from zero(regarding the deletion, we
-		// only support deletion by files, so that the assumption is held).
-		// This means we can use the first item metadata to carry information about
-		// the 'global' offset, for the deletion-case
-		return 0, endIdx.offset, endIdx.filenum, nil
-	}
-	if startIdx.filenum != endIdx.filenum {
-		// If a piece of data 'crosses' a data-file,
-		// it's actually in one piece on the second data-file.
-		// We return a zero-indexEntry for the second file as start
-		return 0, endIdx.offset, endIdx.filenum, nil
-	}
-	return startIdx.offset, endIdx.offset, endIdx.filenum, nil
+// getIndices returns the index entries for the given from-item, covering 'count' items.
+// N.B: The actual number of returned indices for N items will always be N+1 (unless an
+// error is returned).
+// OBS: This method assumes that the caller has already verified (and/or trimmed) the range
+// so that the items are within bounds. If this method is used to read out of bounds,
+// it will return an error.
+func (t *freezerTable) getIndices(from, count uint64) ([]*indexEntry, error) {
+	// Apply the table-offset
+	from = from - uint64(t.itemOffset)
+	// For reading N items, we need N+1 indices.
+	buffer := make([]byte, (count+1)*indexEntrySize)
+	if _, err := t.index.ReadAt(buffer, int64(from*indexEntrySize)); err != nil {
+		return nil, err
+	}
+	var (
+		indices []*indexEntry
+		offset  int
+	)
+	for i := from; i <= from+count; i++ {
+		index := new(indexEntry)
+		index.unmarshalBinary(buffer[offset:])
+		offset += indexEntrySize
+		indices = append(indices, index)
+	}
+	if from == 0 {
+		// Special case if we're reading the first item in the freezer. We assume that
+		// the first item always starts from zero (regarding deletion, we only support
+		// deletion by files, so the assumption holds). This means we can use the first
+		// item's metadata to carry information about the 'global' offset, for the
+		// deletion-case.
+		indices[0].offset = 0
+		indices[0].filenum = indices[1].filenum
+	}
+	return indices, nil
 }
 
 // Retrieve looks up the data offset of an item with the given number and retrieves
 // the raw binary blob from the data file.
 func (t *freezerTable) Retrieve(item uint64) ([]byte, error) {
-	blob, err := t.retrieve(item)
+	items, err := t.RetrieveItems(item, 1, 0)
 	if err != nil {
 		return nil, err
 	}
-	if t.noCompression {
-		return blob, nil
-	}
-	return snappy.Decode(nil, blob)
+	return items[0], nil
 }
 
-// retrieve looks up the data offset of an item with the given number and retrieves
-// the raw binary blob from the data file. OBS! This method does not decode
-// compressed data.
-func (t *freezerTable) retrieve(item uint64) ([]byte, error) {
+// RetrieveItems returns multiple items in sequence, starting from the index 'start'.
+// It will return at most 'count' items, but will abort earlier to respect the
+// 'maxBytes' argument. However, if 'maxBytes' is smaller than the size of one
+// item, it _will_ return one element and possibly overflow the maxBytes.
+func (t *freezerTable) RetrieveItems(start, count, maxBytes uint64) ([][]byte, error) {
+	// First we read the 'raw' data, which might be compressed.
+	diskData, sizes, err := t.retrieveItems(start, count, maxBytes)
+	if err != nil {
+		return nil, err
+	}
+	var (
+		output     = make([][]byte, 0, count)
+		offset     int // offset for reading
+		outputSize int // size of uncompressed data
+	)
+	// Now slice up the data and decompress.
+	for i, diskSize := range sizes {
+		item := diskData[offset : offset+diskSize]
+		offset += diskSize
+		decompressedSize := diskSize
+		if !t.noCompression {
+			decompressedSize, _ = snappy.DecodedLen(item)
+		}
+		if i > 0 && uint64(outputSize+decompressedSize) > maxBytes {
+			break
+		}
+		if !t.noCompression {
+			data, err := snappy.Decode(nil, item)
+			if err != nil {
+				return nil, err
+			}
+			output = append(output, data)
+		} else {
+			output = append(output, item)
+		}
+		outputSize += decompressedSize
+	}
+	return output, nil
+}
+
+// retrieveItems reads up to 'count' items from the table. It reads at least
+// one item, but otherwise avoids reading more than maxBytes bytes.
+// It returns the (potentially compressed) data, and the sizes.
+func (t *freezerTable) retrieveItems(start, count, maxBytes uint64) ([]byte, []int, error) {
 	t.lock.RLock()
 	defer t.lock.RUnlock()
 	// Ensure the table and the item is accessible
 	if t.index == nil || t.head == nil {
-		return nil, errClosed
+		return nil, nil, errClosed
 	}
-	if atomic.LoadUint64(&t.items) <= item {
-		return nil, errOutOfBounds
+	itemCount := atomic.LoadUint64(&t.items) // max number
+	// Ensure the start is written, not deleted from the tail, and that the
+	// caller actually wants something
+	if itemCount <= start || uint64(t.itemOffset) > start || count == 0 {
+		return nil, nil, errOutOfBounds
 	}
-	// Ensure the item was not deleted from the tail either
-	if uint64(t.itemOffset) > item {
-		return nil, errOutOfBounds
+	if start+count > itemCount {
+		count = itemCount - start
 	}
-	startOffset, endOffset, filenum, err := t.getBounds(item - uint64(t.itemOffset))
-	if err != nil {
-		return nil, err
+	var (
+		output     = make([]byte, maxBytes) // Buffer to read data into
+		outputSize int                      // Used size of that buffer
+	)
+	// readData is a helper method to read a single data item from disk.
+	readData := func(fileId, start uint32, length int) error {
+		// In case a small limit is used, and the elements are large, we may need
+		// to realloc the read-buffer when reading the first (and only) item.
+		if len(output) < length {
+			output = make([]byte, length)
+		}
+		dataFile, exist := t.files[fileId]
+		if !exist {
+			return fmt.Errorf("missing data file %d", fileId)
+		}
+		if _, err := dataFile.ReadAt(output[outputSize:outputSize+length], int64(start)); err != nil {
+			return err
+		}
+		outputSize += length
+		return nil
 	}
-	dataFile, exist := t.files[filenum]
-	if !exist {
-		return nil, fmt.Errorf("missing data file %d", filenum)
+	// Read all the indexes in one go
+	indices, err := t.getIndices(start, count)
+	if err != nil {
+		return nil, nil, err
 	}
-	// Retrieve the data itself, decompress and return
-	blob := make([]byte, endOffset-startOffset)
-	if _, err := dataFile.ReadAt(blob, int64(startOffset)); err != nil {
-		return nil, err
+	var (
+		sizes      []int               // The sizes for each element
+		totalSize  = 0                 // The total size of all data read so far
+		readStart  = indices[0].offset // Where, in the file, to start reading
+		unreadSize = 0                 // The size of the as-yet-unread data
+	)
+	for i, firstIndex := range indices[:len(indices)-1] {
+		secondIndex := indices[i+1]
+		// Determine the size of the item.
+		offset1, offset2, _ := firstIndex.bounds(secondIndex)
+		size := int(offset2 - offset1)
+		// Crossing a file boundary?
+		if secondIndex.filenum != firstIndex.filenum {
+			// If we have unread data in the first file, we need to do that read now.
+			if unreadSize > 0 {
+				if err := readData(firstIndex.filenum, readStart, unreadSize); err != nil {
+					return nil, nil, err
+				}
+				unreadSize = 0
+			}
+			readStart = 0
+		}
+		if i > 0 && uint64(totalSize+size) > maxBytes {
+			// About to break out due to the byte limit being exceeded. We don't
+			// read this last item, but we need to do the deferred reads now.
+			if unreadSize > 0 {
+				if err := readData(secondIndex.filenum, readStart, unreadSize); err != nil {
+					return nil, nil, err
+				}
+			}
+			break
+		}
+		// Defer the read for later
+		unreadSize += size
+		totalSize += size
+		sizes = append(sizes, size)
+		if i == len(indices)-2 || uint64(totalSize) > maxBytes {
+			// Last item, need to do the read now
+			if err := readData(secondIndex.filenum, readStart, unreadSize); err != nil {
+				return nil, nil, err
+			}
+			break
+		}
 	}
-	t.readMeter.Mark(int64(len(blob) + 2*indexEntrySize))
-	return blob, nil
+	return output[:outputSize], sizes, nil
 }
 
 // has returns an indicator whether the specified number data
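
The loop in retrieveItems is the heart of the change: for N items it fetches N+1 index entries (entry i is simultaneously the end of item i-1 and the start of item i), then defers and coalesces the actual data reads so that consecutive items living in the same data file are fetched with a single ReadAt call, flushing whenever a file boundary or the byte limit intervenes. Below is a stripped-down, hypothetical model of just that coalescing pattern; all names are illustrative and none of this is go-ethereum API:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
)

// itemRef describes one item as a (file, offset, size) triple.
type itemRef struct {
	file   int
	offset int64
	size   int
}

// readCoalesced merges reads of consecutive items that live in the same
// file into a single ReadAt call, flushing at each file boundary.
func readCoalesced(files map[int]io.ReaderAt, items []itemRef) ([]byte, error) {
	var out bytes.Buffer
	flush := func(file int, start int64, length int) error {
		if length == 0 {
			return nil
		}
		buf := make([]byte, length)
		if _, err := files[file].ReadAt(buf, start); err != nil {
			return err
		}
		out.Write(buf)
		return nil
	}
	var (
		curFile  = items[0].file
		start    = items[0].offset
		deferred = 0 // bytes queued but not yet read
	)
	for _, it := range items {
		if it.file != curFile {
			// File boundary: perform the read deferred so far, then restart.
			if err := flush(curFile, start, deferred); err != nil {
				return nil, err
			}
			curFile, start, deferred = it.file, it.offset, 0
		}
		deferred += it.size
	}
	if err := flush(curFile, start, deferred); err != nil {
		return nil, err
	}
	return out.Bytes(), nil
}

func main() {
	files := map[int]io.ReaderAt{
		0: bytes.NewReader([]byte("aaabbb")),
		1: bytes.NewReader([]byte("ccc")),
	}
	// Three items, the third one starting file 1: only two ReadAt calls happen.
	data, _ := readCoalesced(files, []itemRef{{0, 0, 3}, {0, 3, 3}, {1, 0, 3}})
	fmt.Printf("%s\n", data) // aaabbbccc
}
```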

core/rawdb/freezer_table_test.go

@@ -74,7 +74,7 @@ func TestFreezerBasics(t *testing.T) {
 		exp := getChunk(15, y)
 		got, err := f.Retrieve(uint64(y))
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("reading item %d: %v", y, err)
 		}
 		if !bytes.Equal(got, exp) {
 			t.Fatalf("test %d, got \n%x != \n%x", y, got, exp)
@@ -692,3 +692,118 @@ func TestAppendTruncateParallel(t *testing.T) {
 		}
 	}
 }
+
+// TestSequentialRead performs some basic tests on RetrieveItems.
+func TestSequentialRead(t *testing.T) {
+	rm, wm, sg := metrics.NewMeter(), metrics.NewMeter(), metrics.NewGauge()
+	fname := fmt.Sprintf("batchread-%d", rand.Uint64())
+	{ // Fill table
+		f, err := newCustomTable(os.TempDir(), fname, rm, wm, sg, 50, true)
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Write 15 bytes 30 times
+		for x := 0; x < 30; x++ {
+			data := getChunk(15, x)
+			f.Append(uint64(x), data)
+		}
+		f.DumpIndex(0, 30)
+		f.Close()
+	}
+	{ // Open it, iterate, verify iteration
+		f, err := newCustomTable(os.TempDir(), fname, rm, wm, sg, 50, true)
+		if err != nil {
+			t.Fatal(err)
+		}
+		items, err := f.RetrieveItems(0, 10000, 100000)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if have, want := len(items), 30; have != want {
+			t.Fatalf("want %d items, have %d", want, have)
+		}
+		for i, have := range items {
+			want := getChunk(15, i)
+			if !bytes.Equal(want, have) {
+				t.Fatalf("data corruption: have\n%x\n, want \n%x\n", have, want)
+			}
+		}
+		f.Close()
+	}
+	{ // Open it, iterate, verify the byte limit. The byte limit is less than
+		// the item size, so each lookup should only return one item.
+		f, err := newCustomTable(os.TempDir(), fname, rm, wm, sg, 40, true)
+		if err != nil {
+			t.Fatal(err)
+		}
+		items, err := f.RetrieveItems(0, 10000, 10)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if have, want := len(items), 1; have != want {
+			t.Fatalf("want %d items, have %d", want, have)
+		}
+		for i, have := range items {
+			want := getChunk(15, i)
+			if !bytes.Equal(want, have) {
+				t.Fatalf("data corruption: have\n%x\n, want \n%x\n", have, want)
+			}
+		}
+		f.Close()
+	}
+}
+
+// TestSequentialReadByteLimit does some more advanced tests on batch reads.
+// These tests check that when the byte limit is hit, we correctly abort in time,
+// but also properly do all the deferred reads for the previous data, regardless
+// of whether the data crosses a file boundary or not.
+func TestSequentialReadByteLimit(t *testing.T) {
+	rm, wm, sg := metrics.NewMeter(), metrics.NewMeter(), metrics.NewGauge()
+	fname := fmt.Sprintf("batchread-2-%d", rand.Uint64())
+	{ // Fill table
+		f, err := newCustomTable(os.TempDir(), fname, rm, wm, sg, 100, true)
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Write 10 bytes 30 times,
+		// splitting the data at every 100 bytes (10 items)
+		for x := 0; x < 30; x++ {
+			data := getChunk(10, x)
+			f.Append(uint64(x), data)
+		}
+		f.Close()
+	}
+	for i, tc := range []struct {
+		items uint64
+		limit uint64
+		want  int
+	}{
+		{9, 89, 8},
+		{10, 99, 9},
+		{11, 109, 10},
+		{100, 89, 8},
+		{100, 99, 9},
+		{100, 109, 10},
+	} {
+		{
+			f, err := newCustomTable(os.TempDir(), fname, rm, wm, sg, 100, true)
+			if err != nil {
+				t.Fatal(err)
+			}
+			items, err := f.RetrieveItems(0, tc.items, tc.limit)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if have, want := len(items), tc.want; have != want {
+				t.Fatalf("test %d: want %d items, have %d", i, want, have)
+			}
+			for ii, have := range items {
+				want := getChunk(10, ii)
+				if !bytes.Equal(want, have) {
+					t.Fatalf("test %d: data corruption item %d: have\n%x\n, want \n%x\n", i, ii, have, want)
+				}
+			}
+			f.Close()
+		}
+	}
+}
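
A quick sanity check of the expected counts in the table above: items are 10 bytes each, and retrieveItems only breaks when the next item would push the running total past the limit (never on the first item). With limit 89, eight items total 80 bytes and a ninth would reach 90, so 8 are returned; with 99, nine items (90 bytes) fit but a tenth would reach 100; with 109, ten items (100 bytes) fit, which is also exactly one 100-byte data file, exercising the boundary case. Requesting 100 items merely trims the count down to the 30 items actually stored.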

core/rawdb/table.go

@@ -62,6 +62,12 @@ func (t *table) Ancient(kind string, number uint64) ([]byte, error) {
 	return t.db.Ancient(kind, number)
 }
 
+// ReadAncients is a noop passthrough that just forwards the request to the underlying
+// database.
+func (t *table) ReadAncients(kind string, start, count, maxBytes uint64) ([][]byte, error) {
+	return t.db.ReadAncients(kind, start, count, maxBytes)
+}
+
 // Ancients is a noop passthrough that just forwards the request to the underlying
 // database.
 func (t *table) Ancients() (uint64, error) {

ethdb/database.go

@@ -76,6 +76,13 @@ type AncientReader interface {
 	// Ancient retrieves an ancient binary blob from the append-only immutable files.
 	Ancient(kind string, number uint64) ([]byte, error)
 
+	// ReadAncients retrieves multiple items in sequence, starting from the index 'start'.
+	// It will return
+	//  - at most 'count' items,
+	//  - at least 1 item (even if exceeding the maxBytes), but will otherwise
+	//    return as many items as fit into maxBytes.
+	ReadAncients(kind string, start, count, maxBytes uint64) ([][]byte, error)
+
 	// Ancients returns the ancient item numbers in the ancient store.
 	Ancients() (uint64, error)
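
To make the interface contract concrete, here is a minimal in-memory sketch satisfying the three bullet points above: at most 'count' items, at least one item even if it alone exceeds maxBytes, and otherwise stopping before the byte budget is blown. The memAncients type and everything in it is hypothetical, not go-ethereum code:

```go
package main

import (
	"errors"
	"fmt"
)

// memAncients is a toy in-memory store used only to illustrate the
// ReadAncients contract.
type memAncients struct {
	kinds map[string][][]byte
}

func (m *memAncients) ReadAncients(kind string, start, count, maxBytes uint64) ([][]byte, error) {
	items, ok := m.kinds[kind]
	if !ok {
		return nil, errors.New("unknown table")
	}
	if start >= uint64(len(items)) || count == 0 {
		return nil, errors.New("out of bounds")
	}
	var (
		out  [][]byte
		size uint64
	)
	for i := start; i < start+count && i < uint64(len(items)); i++ {
		blob := items[i]
		// The first item is always returned, even when it exceeds maxBytes.
		if len(out) > 0 && size+uint64(len(blob)) > maxBytes {
			break
		}
		out = append(out, blob)
		size += uint64(len(blob))
	}
	return out, nil
}

func main() {
	m := &memAncients{kinds: map[string][][]byte{
		"headers": {[]byte("aaaaa"), []byte("bbbbb"), []byte("ccccc")},
	}}
	got, _ := m.ReadAncients("headers", 0, 3, 12)
	fmt.Println(len(got)) // 2: two 5-byte items fit in 12 bytes, a third would not
}
```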