/*
 * Copyright 2019 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"bytes"
	"math"

	"github.com/dgraph-io/badger/pb"
	"github.com/dgraph-io/badger/table"
	"github.com/dgraph-io/badger/y"
	humanize "github.com/dustin/go-humanize"
	"github.com/pkg/errors"
)

const headStreamId uint32 = math.MaxUint32

// StreamWriter is used to write data coming from multiple streams. The streams
// must not have any overlapping key ranges. Within each stream, the keys must
// be sorted. Badger's Stream framework is capable of generating such output.
// So, this StreamWriter can be used at the other end to build BadgerDB at a
// much faster pace by writing SSTables (and value logs) directly to LSM tree
// levels without causing any compactions at all. This is way faster than using
// a batched writer or transactions, but only applicable in situations where
// the keys are pre-sorted and the DB is being bootstrapped. Existing data
// would get deleted when using this writer. So, this is only useful when
// restoring from a backup or replicating the DB across servers.
//
// StreamWriter should not be called on in-use DB instances. It is designed
// only to bootstrap new DBs.
type StreamWriter struct {
	db         *DB
	done       func()
	throttle   *y.Throttle
	head       valuePointer
	maxVersion uint64
	writers    map[uint32]*sortedWriter
}

// NewStreamWriter creates a StreamWriter. Right after creating StreamWriter,
// Prepare must be called. The memory usage of a StreamWriter is directly
// proportional to the number of streams possible. So, efforts must be made to
// keep the number of streams low. The Stream framework typically uses 16
// goroutines and hence creates 16 streams.
func (db *DB) NewStreamWriter() *StreamWriter {
	return &StreamWriter{
		db: db,
		// throttle shouldn't make much difference. Memory consumption is based
		// on the number of concurrent streams being processed.
		throttle: y.NewThrottle(16),
		writers:  make(map[uint32]*sortedWriter),
	}
}

// Prepare should be called before writing any entry to StreamWriter. It
// deletes all data present in the existing DB, and stops compactions and any
// writes being done by other means. Be very careful when calling Prepare,
// because it could result in permanent data loss. Not calling Prepare would
// result in a corrupt Badger instance.
func (sw *StreamWriter) Prepare() error {
	var err error
	sw.done, err = sw.db.dropAll()
	return err
}
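// streamWriterUsage is an illustrative sketch, not part of the original API
// surface: it shows the expected StreamWriter lifecycle (Prepare, one or more
// Write calls, then Flush), assuming the caller already has batches that are
// partitioned into non-overlapping streams with keys sorted within each
// stream, as the Stream framework would produce.
func streamWriterUsage(db *DB, batches []*pb.KVList) error {
	sw := db.NewStreamWriter()
	// Prepare drops all existing data; only use on a DB being bootstrapped.
	if err := sw.Prepare(); err != nil {
		return err
	}
	for _, kvs := range batches {
		if err := sw.Write(kvs); err != nil {
			return err
		}
	}
	// Flush finishes the pending SSTables and syncs the DB directories.
	return sw.Flush()
}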
// Write writes KVList to DB. Each KV within the list contains the stream id,
// which StreamWriter uses to demux the writes.
func (sw *StreamWriter) Write(kvs *pb.KVList) error {
	var entries []*Entry
	for _, kv := range kvs.Kv {
		var meta, userMeta byte
		if len(kv.Meta) > 0 {
			meta = kv.Meta[0]
		}
		if len(kv.UserMeta) > 0 {
			userMeta = kv.UserMeta[0]
		}
		if sw.maxVersion < kv.Version {
			sw.maxVersion = kv.Version
		}
		e := &Entry{
			Key:       y.KeyWithTs(kv.Key, kv.Version),
			Value:     kv.Value,
			UserMeta:  userMeta,
			ExpiresAt: kv.ExpiresAt,
			meta:      meta,
		}
		// If the value can be colocated with the key in the LSM tree, we can
		// skip writing the value to the value log.
		e.skipVlog = sw.db.shouldWriteValueToLSM(*e)
		entries = append(entries, e)
	}
	req := &request{
		Entries: entries,
	}
	y.AssertTrue(len(kvs.Kv) == len(req.Entries))
	if err := sw.db.vlog.write([]*request{req}); err != nil {
		return err
	}

	for i, kv := range kvs.Kv {
		e := req.Entries[i]
		vptr := req.Ptrs[i]
		if !vptr.IsZero() {
			y.AssertTrue(sw.head.Less(vptr))
			sw.head = vptr
		}

		writer, ok := sw.writers[kv.StreamId]
		if !ok {
			writer = sw.newWriter(kv.StreamId)
			sw.writers[kv.StreamId] = writer
		}

		var vs y.ValueStruct
		if e.skipVlog {
			vs = y.ValueStruct{
				Value:     e.Value,
				Meta:      e.meta,
				UserMeta:  e.UserMeta,
				ExpiresAt: e.ExpiresAt,
			}
		} else {
			vbuf := make([]byte, vptrSize)
			vs = y.ValueStruct{
				Value:     vptr.Encode(vbuf),
				Meta:      e.meta | bitValuePointer,
				UserMeta:  e.UserMeta,
				ExpiresAt: e.ExpiresAt,
			}
		}
		if err := writer.Add(e.Key, vs); err != nil {
			return err
		}
	}
	return nil
}

// Flush is called once we are done writing all the entries. It syncs the DB
// directories. It also updates the Oracle with the maxVersion found across all
// entries (if the DB is not managed).
func (sw *StreamWriter) Flush() error {
	defer sw.done()
	for _, writer := range sw.writers {
		if err := writer.Done(); err != nil {
			return err
		}
	}

	// Encode and write the value log head into a new table.
	data := make([]byte, vptrSize)
	sw.head.Encode(data)
	headWriter := sw.newWriter(headStreamId)
	if err := headWriter.Add(
		y.KeyWithTs(head, sw.maxVersion),
		y.ValueStruct{Value: data}); err != nil {
		return err
	}
	if err := headWriter.Done(); err != nil {
		return err
	}

	if !sw.db.opt.managedTxns {
		sw.db.orc = newOracle(sw.db.opt)
		sw.db.orc.nextTxnTs = sw.maxVersion
		sw.db.orc.txnMark.Done(sw.maxVersion)
		sw.db.orc.readMark.Done(sw.maxVersion)
		sw.db.orc.incrementNextTs()
	}

	// Wait for all files to be written.
	if err := sw.throttle.Finish(); err != nil {
		return err
	}

	// Now sync the directories, so all the files are registered.
	if sw.db.opt.ValueDir != sw.db.opt.Dir {
		if err := syncDir(sw.db.opt.ValueDir); err != nil {
			return err
		}
	}
	return syncDir(sw.db.opt.Dir)
}

type sortedWriter struct {
	db       *DB
	throttle *y.Throttle

	builder  *table.Builder
	lastKey  []byte
	streamId uint32
}

func (sw *StreamWriter) newWriter(streamId uint32) *sortedWriter {
	return &sortedWriter{
		db:       sw.db,
		streamId: streamId,
		throttle: sw.throttle,
		builder:  table.NewTableBuilder(),
	}
}

// ErrUnsortedKey is returned when an out-of-order key arrives at sortedWriter
// during a call to Add.
var ErrUnsortedKey = errors.New("Keys not in sorted order")
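// kvForStream is a hypothetical helper, included only for illustration: it
// shows the pb.KV fields that Write consumes. StreamId selects the
// sortedWriter the entry is demuxed to, and Version becomes the key's
// timestamp via y.KeyWithTs; keys must still arrive sorted within each stream,
// or Add returns ErrUnsortedKey.
func kvForStream(streamId uint32, key, value []byte, version uint64) *pb.KV {
	return &pb.KV{
		Key:      key,
		Value:    value,
		Version:  version,
		StreamId: streamId,
	}
}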
// Add adds key and vs to sortedWriter.
func (w *sortedWriter) Add(key []byte, vs y.ValueStruct) error {
	if bytes.Compare(key, w.lastKey) <= 0 {
		return ErrUnsortedKey
	}

	sameKey := y.SameKey(key, w.lastKey)
	w.lastKey = y.SafeCopy(w.lastKey, key)

	if err := w.builder.Add(key, vs); err != nil {
		return err
	}
	// Same keys should go into the same SSTable.
	if !sameKey && w.builder.ReachedCapacity(w.db.opt.MaxTableSize) {
		return w.send()
	}
	return nil
}

func (w *sortedWriter) send() error {
	if err := w.throttle.Do(); err != nil {
		return err
	}
	go func(builder *table.Builder) {
		data := builder.Finish()
		err := w.createTable(data)
		w.throttle.Done(err)
	}(w.builder)
	w.builder = table.NewTableBuilder()
	return nil
}

// Done is called once we are done writing all keys and valueStructs to
// sortedWriter. It completes writing the current SST to disk.
func (w *sortedWriter) Done() error {
	if w.builder.Empty() {
		return nil
	}
	return w.send()
}

func (w *sortedWriter) createTable(data []byte) error {
	if len(data) == 0 {
		return nil
	}
	fileID := w.db.lc.reserveFileID()
	fd, err := y.CreateSyncedFile(table.NewFilename(fileID, w.db.opt.Dir), true)
	if err != nil {
		return err
	}
	if _, err := fd.Write(data); err != nil {
		return err
	}
	tbl, err := table.OpenTable(fd, w.db.opt.TableLoadingMode, nil)
	if err != nil {
		return err
	}
	lc := w.db.lc

	var lhandler *levelHandler
	// We should start the levels from 1, because we need level 0 to set the
	// !badger!head key. We cannot mix up this key with other keys from the DB,
	// otherwise we would introduce a range overlap violation.
	y.AssertTrue(len(lc.levels) > 1)
	for _, l := range lc.levels[1:] {
		ratio := float64(l.getTotalSize()) / float64(l.maxTotalSize)
		if ratio < 1.0 {
			lhandler = l
			break
		}
	}
	if lhandler == nil {
		// If we're exceeding the size of the lowest level, shove it in the
		// lowest level. Can't do better than that.
		lhandler = lc.levels[len(lc.levels)-1]
	}
	if w.streamId == headStreamId {
		// This is a special !badger!head key. We should store it at level 0,
		// separate from all the other keys, to avoid an overlap.
		lhandler = lc.levels[0]
	}
	// Now that the table can be opened successfully, let's add this to the
	// MANIFEST.
	change := &pb.ManifestChange{
		Id:       tbl.ID(),
		Op:       pb.ManifestChange_CREATE,
		Level:    uint32(lhandler.level),
		Checksum: tbl.Checksum,
	}
	if err := w.db.manifest.addChanges([]*pb.ManifestChange{change}); err != nil {
		return err
	}
	if err := lhandler.replaceTables([]*table.Table{}, []*table.Table{tbl}); err != nil {
		return err
	}
	w.db.opt.Infof("Table created: %d at level: %d for stream: %d. Size: %s\n",
		fileID, lhandler.level, w.streamId, humanize.Bytes(uint64(tbl.Size())))
	return nil
}
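// streamForKey is a hypothetical partitioning sketch, not part of the original
// file: StreamWriter requires streams to cover non-overlapping key ranges, so
// a producer could bucket keys by pre-computed, sorted range boundaries. Each
// stream then owns one contiguous key range, satisfying the no-overlap rule.
func streamForKey(key []byte, sortedBoundaries [][]byte) uint32 {
	for i, boundary := range sortedBoundaries {
		if bytes.Compare(key, boundary) < 0 {
			return uint32(i) // Key falls below this boundary: stream i.
		}
	}
	// Key is >= all boundaries, so it belongs to the last stream.
	return uint32(len(sortedBoundaries))
}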