core/rawdb: fix datarace in freezer (#22728)
The Append / truncate operations were racy. When a datafile reaches 2Gb, a new file is needed. For this operation, we require a writelock, which is not needed in the 99.99% of all cases where the data does fit in the current head-file. This transition from readlock to writelock was incorrect, and as the readlock was released, a truncate operation could slip in between, and truncate the data. This would have been fine, however, the Append operation continued writing as if no truncation had occurred, e.g writing item 5 where item 0 should reside. This PR changes the behaviour, so that if when we run into the situation that a new file is needed, it aborts, and retries, this time with a writelock. The outcome of the situation described above, running on this PR, would instead be that the Append operation exits with a failure.
This commit is contained in:
parent
34f3c9539b
commit
9b99e3dfe0
@ -465,35 +465,59 @@ func (t *freezerTable) releaseFilesAfter(num uint32, remove bool) {
|
|||||||
// Note, this method will *not* flush any data to disk so be sure to explicitly
|
// Note, this method will *not* flush any data to disk so be sure to explicitly
|
||||||
// fsync before irreversibly deleting data from the database.
|
// fsync before irreversibly deleting data from the database.
|
||||||
func (t *freezerTable) Append(item uint64, blob []byte) error {
|
func (t *freezerTable) Append(item uint64, blob []byte) error {
|
||||||
// Read lock prevents competition with truncate
|
// Encode the blob before the lock portion
|
||||||
t.lock.RLock()
|
|
||||||
// Ensure the table is still accessible
|
|
||||||
if t.index == nil || t.head == nil {
|
|
||||||
t.lock.RUnlock()
|
|
||||||
return errClosed
|
|
||||||
}
|
|
||||||
// Ensure only the next item can be written, nothing else
|
|
||||||
if atomic.LoadUint64(&t.items) != item {
|
|
||||||
t.lock.RUnlock()
|
|
||||||
return fmt.Errorf("appending unexpected item: want %d, have %d", t.items, item)
|
|
||||||
}
|
|
||||||
// Encode the blob and write it into the data file
|
|
||||||
if !t.noCompression {
|
if !t.noCompression {
|
||||||
blob = snappy.Encode(nil, blob)
|
blob = snappy.Encode(nil, blob)
|
||||||
}
|
}
|
||||||
bLen := uint32(len(blob))
|
// Read lock prevents competition with truncate
|
||||||
|
retry, err := t.append(item, blob, false)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if retry {
|
||||||
|
// Read lock was insufficient, retry with a writelock
|
||||||
|
_, err = t.append(item, blob, true)
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// append injects a binary blob at the end of the freezer table.
|
||||||
|
// Normally, inserts do not require holding the write-lock, so it should be invoked with 'wlock' set to
|
||||||
|
// false.
|
||||||
|
// However, if the data will grown the current file out of bounds, then this
|
||||||
|
// method will return 'true, nil', indicating that the caller should retry, this time
|
||||||
|
// with 'wlock' set to true.
|
||||||
|
func (t *freezerTable) append(item uint64, encodedBlob []byte, wlock bool) (bool, error) {
|
||||||
|
if wlock {
|
||||||
|
t.lock.Lock()
|
||||||
|
defer t.lock.Unlock()
|
||||||
|
} else {
|
||||||
|
t.lock.RLock()
|
||||||
|
defer t.lock.RUnlock()
|
||||||
|
}
|
||||||
|
// Ensure the table is still accessible
|
||||||
|
if t.index == nil || t.head == nil {
|
||||||
|
return false, errClosed
|
||||||
|
}
|
||||||
|
// Ensure only the next item can be written, nothing else
|
||||||
|
if atomic.LoadUint64(&t.items) != item {
|
||||||
|
return false, fmt.Errorf("appending unexpected item: want %d, have %d", t.items, item)
|
||||||
|
}
|
||||||
|
bLen := uint32(len(encodedBlob))
|
||||||
if t.headBytes+bLen < bLen ||
|
if t.headBytes+bLen < bLen ||
|
||||||
t.headBytes+bLen > t.maxFileSize {
|
t.headBytes+bLen > t.maxFileSize {
|
||||||
// we need a new file, writing would overflow
|
// Writing would overflow, so we need to open a new data file.
|
||||||
t.lock.RUnlock()
|
// If we don't already hold the writelock, abort and let the caller
|
||||||
t.lock.Lock()
|
// invoke this method a second time.
|
||||||
|
if !wlock {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
nextID := atomic.LoadUint32(&t.headId) + 1
|
nextID := atomic.LoadUint32(&t.headId) + 1
|
||||||
// We open the next file in truncated mode -- if this file already
|
// We open the next file in truncated mode -- if this file already
|
||||||
// exists, we need to start over from scratch on it
|
// exists, we need to start over from scratch on it
|
||||||
newHead, err := t.openFile(nextID, openFreezerFileTruncated)
|
newHead, err := t.openFile(nextID, openFreezerFileTruncated)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.lock.Unlock()
|
return false, err
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
// Close old file, and reopen in RDONLY mode
|
// Close old file, and reopen in RDONLY mode
|
||||||
t.releaseFile(t.headId)
|
t.releaseFile(t.headId)
|
||||||
@ -503,13 +527,9 @@ func (t *freezerTable) Append(item uint64, blob []byte) error {
|
|||||||
t.head = newHead
|
t.head = newHead
|
||||||
atomic.StoreUint32(&t.headBytes, 0)
|
atomic.StoreUint32(&t.headBytes, 0)
|
||||||
atomic.StoreUint32(&t.headId, nextID)
|
atomic.StoreUint32(&t.headId, nextID)
|
||||||
t.lock.Unlock()
|
|
||||||
t.lock.RLock()
|
|
||||||
}
|
}
|
||||||
|
if _, err := t.head.Write(encodedBlob); err != nil {
|
||||||
defer t.lock.RUnlock()
|
return false, err
|
||||||
if _, err := t.head.Write(blob); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
newOffset := atomic.AddUint32(&t.headBytes, bLen)
|
newOffset := atomic.AddUint32(&t.headBytes, bLen)
|
||||||
idx := indexEntry{
|
idx := indexEntry{
|
||||||
@ -523,7 +543,7 @@ func (t *freezerTable) Append(item uint64, blob []byte) error {
|
|||||||
t.sizeGauge.Inc(int64(bLen + indexEntrySize))
|
t.sizeGauge.Inc(int64(bLen + indexEntrySize))
|
||||||
|
|
||||||
atomic.AddUint64(&t.items, 1)
|
atomic.AddUint64(&t.items, 1)
|
||||||
return nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// getBounds returns the indexes for the item
|
// getBounds returns the indexes for the item
|
||||||
@ -562,45 +582,49 @@ func (t *freezerTable) getBounds(item uint64) (uint32, uint32, uint32, error) {
|
|||||||
// Retrieve looks up the data offset of an item with the given number and retrieves
|
// Retrieve looks up the data offset of an item with the given number and retrieves
|
||||||
// the raw binary blob from the data file.
|
// the raw binary blob from the data file.
|
||||||
func (t *freezerTable) Retrieve(item uint64) ([]byte, error) {
|
func (t *freezerTable) Retrieve(item uint64) ([]byte, error) {
|
||||||
|
blob, err := t.retrieve(item)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if t.noCompression {
|
||||||
|
return blob, nil
|
||||||
|
}
|
||||||
|
return snappy.Decode(nil, blob)
|
||||||
|
}
|
||||||
|
|
||||||
|
// retrieve looks up the data offset of an item with the given number and retrieves
|
||||||
|
// the raw binary blob from the data file. OBS! This method does not decode
|
||||||
|
// compressed data.
|
||||||
|
func (t *freezerTable) retrieve(item uint64) ([]byte, error) {
|
||||||
t.lock.RLock()
|
t.lock.RLock()
|
||||||
|
defer t.lock.RUnlock()
|
||||||
// Ensure the table and the item is accessible
|
// Ensure the table and the item is accessible
|
||||||
if t.index == nil || t.head == nil {
|
if t.index == nil || t.head == nil {
|
||||||
t.lock.RUnlock()
|
|
||||||
return nil, errClosed
|
return nil, errClosed
|
||||||
}
|
}
|
||||||
if atomic.LoadUint64(&t.items) <= item {
|
if atomic.LoadUint64(&t.items) <= item {
|
||||||
t.lock.RUnlock()
|
|
||||||
return nil, errOutOfBounds
|
return nil, errOutOfBounds
|
||||||
}
|
}
|
||||||
// Ensure the item was not deleted from the tail either
|
// Ensure the item was not deleted from the tail either
|
||||||
if uint64(t.itemOffset) > item {
|
if uint64(t.itemOffset) > item {
|
||||||
t.lock.RUnlock()
|
|
||||||
return nil, errOutOfBounds
|
return nil, errOutOfBounds
|
||||||
}
|
}
|
||||||
startOffset, endOffset, filenum, err := t.getBounds(item - uint64(t.itemOffset))
|
startOffset, endOffset, filenum, err := t.getBounds(item - uint64(t.itemOffset))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.lock.RUnlock()
|
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
dataFile, exist := t.files[filenum]
|
dataFile, exist := t.files[filenum]
|
||||||
if !exist {
|
if !exist {
|
||||||
t.lock.RUnlock()
|
|
||||||
return nil, fmt.Errorf("missing data file %d", filenum)
|
return nil, fmt.Errorf("missing data file %d", filenum)
|
||||||
}
|
}
|
||||||
// Retrieve the data itself, decompress and return
|
// Retrieve the data itself, decompress and return
|
||||||
blob := make([]byte, endOffset-startOffset)
|
blob := make([]byte, endOffset-startOffset)
|
||||||
if _, err := dataFile.ReadAt(blob, int64(startOffset)); err != nil {
|
if _, err := dataFile.ReadAt(blob, int64(startOffset)); err != nil {
|
||||||
t.lock.RUnlock()
|
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
t.lock.RUnlock()
|
|
||||||
t.readMeter.Mark(int64(len(blob) + 2*indexEntrySize))
|
t.readMeter.Mark(int64(len(blob) + 2*indexEntrySize))
|
||||||
|
|
||||||
if t.noCompression {
|
|
||||||
return blob, nil
|
return blob, nil
|
||||||
}
|
}
|
||||||
return snappy.Decode(nil, blob)
|
|
||||||
}
|
|
||||||
|
|
||||||
// has returns an indicator whether the specified number data
|
// has returns an indicator whether the specified number data
|
||||||
// exists in the freezer table.
|
// exists in the freezer table.
|
||||||
|
@ -18,10 +18,13 @@ package rawdb
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io/ioutil"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -637,6 +640,55 @@ func TestOffset(t *testing.T) {
|
|||||||
// 1. have data files d0, d1, d2, d3
|
// 1. have data files d0, d1, d2, d3
|
||||||
// 2. remove d2,d3
|
// 2. remove d2,d3
|
||||||
//
|
//
|
||||||
// However, all 'normal' failure modes arising due to failing to sync() or save a file should be
|
// However, all 'normal' failure modes arising due to failing to sync() or save a file
|
||||||
// handled already, and the case described above can only (?) happen if an external process/user
|
// should be handled already, and the case described above can only (?) happen if an
|
||||||
// deletes files from the filesystem.
|
// external process/user deletes files from the filesystem.
|
||||||
|
|
||||||
|
// TestAppendTruncateParallel is a test to check if the Append/truncate operations are
|
||||||
|
// racy.
|
||||||
|
//
|
||||||
|
// The reason why it's not a regular fuzzer, within tests/fuzzers, is that it is dependent
|
||||||
|
// on timing rather than 'clever' input -- there's no determinism.
|
||||||
|
func TestAppendTruncateParallel(t *testing.T) {
|
||||||
|
dir, err := ioutil.TempDir("", "freezer")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
|
||||||
|
f, err := newCustomTable(dir, "tmp", metrics.NilMeter{}, metrics.NilMeter{}, metrics.NilGauge{}, 8, true)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fill := func(mark uint64) []byte {
|
||||||
|
data := make([]byte, 8)
|
||||||
|
binary.LittleEndian.PutUint64(data, mark)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < 5000; i++ {
|
||||||
|
f.truncate(0)
|
||||||
|
data0 := fill(0)
|
||||||
|
f.Append(0, data0)
|
||||||
|
data1 := fill(1)
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(2)
|
||||||
|
go func() {
|
||||||
|
f.truncate(0)
|
||||||
|
wg.Done()
|
||||||
|
}()
|
||||||
|
go func() {
|
||||||
|
f.Append(1, data1)
|
||||||
|
wg.Done()
|
||||||
|
}()
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if have, err := f.Retrieve(0); err == nil {
|
||||||
|
if !bytes.Equal(have, data0) {
|
||||||
|
t.Fatalf("have %x want %x", have, data0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user