func decodeBatchToMem(data []byte, expectSeq uint64, mdb *memdb.DB) (seq uint64, batchLen int, err error) { seq, batchLen, err = decodeBatchHeader(data) if err != nil { return 0, 0, err } if seq < expectSeq { return 0, 0, newErrBatchCorrupted("invalid sequence number") } data = data[batchHeaderLen:] var ik []byte var decodedLen int err = decodeBatch(data, func(i int, index batchIndex) error { if i >= batchLen { return newErrBatchCorrupted("invalid records length") } ik = makeInternalKey(ik, index.k(data), seq+uint64(i), index.keyType) if err := mdb.Put(ik, index.v(data)); err != nil { return err } decodedLen++ return nil }) if err == nil && decodedLen != batchLen { err = newErrBatchCorrupted(fmt.Sprintf("invalid records length: %d vs %d", batchLen, decodedLen)) } return }
func (b *Batch) revertMemReplay(to *memdb.DB) error { var ikScratch []byte return b.decodeRec(func(i int, kt keyType, key, value []byte) error { ikScratch := makeInternalKey(ikScratch, key, b.seq+uint64(i), kt) return to.Delete(ikScratch) }) }
func (b *Batch) memReplay(to *memdb.DB) error { var ikScratch []byte return b.decodeRec(func(i int, kt kType, key, value []byte) error { ikScratch = makeIkey(ikScratch, key, b.seq+uint64(i), kt) return to.Put(ikScratch, value) }) }
func (d *DB) memCompaction(mem *memdb.DB) { s := d.s c := newCMem(s) stats := new(cStatsStaging) s.printf("MemCompaction: started, size=%d entries=%d", mem.Size(), mem.Len()) d.transact(func() (err error) { stats.startTimer() defer stats.stopTimer() return c.flush(mem, -1) }) d.transact(func() (err error) { stats.startTimer() defer stats.stopTimer() return c.commit(d.journal.file.Num(), d.fseq) }) stats.write = c.t.size d.cstats[c.level].add(stats) // drop frozen mem d.dropFrozenMem() c = nil }
func (b *Batch) revertMem(seq uint64, mdb *memdb.DB) error { var ik []byte for i, index := range b.index { ik = makeInternalKey(ik, index.k(b.data), seq+uint64(i), index.keyType) if err := mdb.Delete(ik); err != nil { return err } } return nil }
func (db *DB) mpoolGet(n int) *memDB { var mdb *memdb.DB select { case mdb = <-db.memPool: default: } if mdb == nil || mdb.Capacity() < n { mdb = memdb.New(db.s.icmp, maxInt(db.s.o.GetWriteBuffer(), n)) } return &memDB{ db: db, DB: mdb, } }
func (s *session) flushMemdb(rec *sessionRecord, mdb *memdb.DB, level int) (level_ int, err error) { // Create sorted table. iter := mdb.NewIterator(nil) defer iter.Release() t, n, err := s.tops.createFrom(iter) if err != nil { return level, err } // Pick level and add to record. if level < 0 { level = s.pickMemdbLevel(t.imin.ukey(), t.imax.ukey()) } rec.addTableFile(level, t) s.logf("memdb@flush created L%d@%d N·%d S·%s %q:%q", level, t.file.Num(), n, shortenb(int(t.size)), t.imin, t.imax) return level, nil }
func (c *cMem) flush(mem *memdb.DB, level int) error { s := c.s // Write memdb to table t, n, err := s.tops.createFrom(mem.NewIterator(nil)) if err != nil { return err } if level < 0 { level = s.version_NB().pickLevel(t.min.ukey(), t.max.ukey()) } c.rec.addTableFile(level, t) s.logf("mem@flush created L%d@%d N·%d S·%s %q:%q", level, t.file.Num(), n, shortenb(int(t.size)), t.min, t.max) c.level = level return nil }
// memGet looks up ikey in mdb.
//
// ok reports whether an entry with a matching user key exists in this
// memdb; a deletion marker counts as a match and is reported as
// (true, nil, ErrNotFound). When ok is false the caller is expected to
// ignore mv and err — note that the final bare return deliberately
// propagates whatever Find left in the named results in that case.
func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) {
	// mv and err are the named results: this short declaration reuses
	// them, so the bare return at the bottom leaks Find's values through.
	mk, mv, err := mdb.Find(ikey)
	if err == nil {
		ukey, _, kt, kerr := parseInternalKey(mk)
		if kerr != nil {
			// Corrupted key inside our own memdb; should never happen.
			panic(kerr)
		}
		if icmp.uCompare(ukey, ikey.ukey()) == 0 {
			// Exact user-key match. A deletion marker means the key was
			// found but deleted at this snapshot.
			if kt == keyTypeDel {
				return true, nil, ErrNotFound
			}
			return true, mv, nil
		}
		// Nearest entry has a different user key: fall through (ok=false).
	} else if err != ErrNotFound {
		// Real lookup failure, not a simple miss.
		return true, nil, err
	}
	return
}
func (c *cMem) flush(mem *memdb.DB, level int) error { s := c.s // Write memdb to table t, n, err := s.tops.createFrom(mem.NewIterator()) if err != nil { return err } if level < 0 { level = s.version().pickLevel(t.min.ukey(), t.max.ukey()) } c.rec.addTableFile(level, t) s.printf("Compaction: table created, source=mem level=%d num=%d size=%d entries=%d min=%q max=%q", level, t.file.Num(), t.size, n, t.min, t.max) c.level = level c.t = t return nil }
func (s *session) flushMemdb(rec *sessionRecord, mdb *memdb.DB, maxLevel int) (int, error) { // Create sorted table. iter := mdb.NewIterator(nil) defer iter.Release() t, n, err := s.tops.createFrom(iter) if err != nil { return 0, err } // Pick level other than zero can cause compaction issue with large // bulk insert and delete on strictly incrementing key-space. The // problem is that the small deletion markers trapped at lower level, // while key/value entries keep growing at higher level. Since the // key-space is strictly incrementing it will not overlaps with // higher level, thus maximum possible level is always picked, while // overlapping deletion marker pushed into lower level. // See: https://github.com/syndtr/goleveldb/issues/127. flushLevel := s.pickMemdbLevel(t.imin.ukey(), t.imax.ukey(), maxLevel) rec.addTableFile(flushLevel, t) s.logf("memdb@flush created L%d@%d N·%d S·%s %q:%q", flushLevel, t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax) return flushLevel, nil }
func (c *cMem) flush(mem *memdb.DB, level int) error { s := c.s // Write memdb to table. iter := mem.NewIterator(nil) defer iter.Release() t, n, err := s.tops.createFrom(iter) if err != nil { return err } // Pick level. if level < 0 { v := s.version() level = v.pickLevel(t.imin.ukey(), t.imax.ukey()) v.release() } c.rec.addTableFile(level, t) s.logf("mem@flush created L%d@%d N·%d S·%s %q:%q", level, t.file.Num(), n, shortenb(int(t.size)), t.imin, t.imax) c.level = level return nil }
func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { iter := mem.NewIterator(nil) defer iter.Release() return (max == nil || (iter.First() && icmp.uCompare(max, internalKey(iter.Key()).ukey()) >= 0)) && (min == nil || (iter.Last() && icmp.uCompare(min, internalKey(iter.Key()).ukey()) <= 0)) }
// recoverJournal replays every journal file written since the last
// manifest checkpoint into a memdb, flushing level-0 tables whenever
// the write buffer fills, then commits the recovered state, creates a
// fresh journal and removes obsolete journal files.
func (db *DB) recoverJournal() error {
	// Get all tables and sort it by file number.
	journalFiles_, err := db.s.getFiles(storage.TypeJournal)
	if err != nil {
		return err
	}
	journalFiles := files(journalFiles_)
	journalFiles.sort()

	// Discard older journal. Files numbered >= stJournalNum are kept;
	// if the previous journal (stPrevJournalNum) was seen earlier in the
	// sorted list it is moved to just before the cut point so that it is
	// replayed first.
	prev := -1
	for i, file := range journalFiles {
		if file.Num() >= db.s.stJournalNum {
			if prev >= 0 {
				i--
				journalFiles[i] = journalFiles[prev]
			}
			journalFiles = journalFiles[i:]
			break
		} else if file.Num() == db.s.stPrevJournalNum {
			prev = i
		}
	}

	var jr *journal.Reader
	var of storage.File // previously replayed journal, pending removal
	var mem *memdb.DB
	batch := new(Batch)
	cm := newCMem(db.s)
	buf := new(util.Buffer)
	// Options.
	strict := db.s.o.GetStrict(opt.StrictJournal)
	checksum := db.s.o.GetStrict(opt.StrictJournalChecksum)
	writeBuffer := db.s.o.GetWriteBuffer()

	// recoverJournal replays one journal file into mem, first flushing
	// and committing the previous file's contents.
	recoverJournal := func(file storage.File) error {
		db.logf("journal@recovery recovering @%d", file.Num())
		reader, err := file.Open()
		if err != nil {
			return err
		}
		defer reader.Close()

		// Create/reset journal reader instance.
		if jr == nil {
			jr = journal.NewReader(reader, dropper{db.s, file}, strict, checksum)
		} else {
			jr.Reset(reader, dropper{db.s, file}, strict, checksum)
		}

		// Flush memdb and remove obsolete journal file.
		if of != nil {
			if mem.Len() > 0 {
				if err := cm.flush(mem, 0); err != nil {
					return err
				}
			}
			if err := cm.commit(file.Num(), db.seq); err != nil {
				return err
			}
			cm.reset()
			of.Remove()
			of = nil
		}

		// Replay journal to memdb.
		mem.Reset()
		for {
			r, err := jr.Next()
			if err != nil {
				if err == io.EOF {
					break
				}
				return err
			}

			buf.Reset()
			if _, err := buf.ReadFrom(r); err != nil {
				if err == io.ErrUnexpectedEOF {
					// Truncated record at the journal tail; skip it and
					// keep replaying the rest.
					continue
				} else {
					return err
				}
			}
			if err := batch.decode(buf.Bytes()); err != nil {
				return err
			}
			if err := batch.memReplay(mem); err != nil {
				return err
			}

			// Save sequence number.
			db.seq = batch.seq + uint64(batch.len())

			// Flush it if large enough.
			if mem.Size() >= writeBuffer {
				if err := cm.flush(mem, 0); err != nil {
					return err
				}
				mem.Reset()
			}
		}

		// Defer removal until the next file's replay commits.
		of = file
		return nil
	}

	// Recover all journals.
	if len(journalFiles) > 0 {
		db.logf("journal@recovery F·%d", len(journalFiles))

		// Mark file number as used.
		db.s.markFileNum(journalFiles[len(journalFiles)-1].Num())

		mem = memdb.New(db.s.icmp, writeBuffer)
		for _, file := range journalFiles {
			if err := recoverJournal(file); err != nil {
				return err
			}
		}

		// Flush the last journal.
		if mem.Len() > 0 {
			if err := cm.flush(mem, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := db.newMem(0); err != nil {
		return err
	}

	// Commit.
	if err := cm.commit(db.journalFile.Num(), db.seq); err != nil {
		// Close journal so the half-created files are not left open.
		if db.journal != nil {
			db.journal.Close()
			db.journalWriter.Close()
		}
		return err
	}

	// Remove the last obsolete journal file.
	if of != nil {
		of.Remove()
	}

	return nil
}
// recoverJournal replays the journal files recorded as unflushed in the
// session (numbers >= stJournalNum, plus stPrevJournalNum) into memdbs,
// flushing to tables when the write buffer fills, then commits the
// recovered sequence number and removes fully-replayed journal files.
func (d *DB) recoverJournal() (err error) {
	s := d.s
	icmp := s.cmp

	s.printf("JournalRecovery: started, min=%d", s.stJournalNum)

	var mem *memdb.DB
	batch := new(Batch)
	cm := newCMem(s)

	// Collect journal files sorted by number; keep those still relevant
	// and mark their numbers as used.
	journals := files(s.getFiles(storage.TypeJournal))
	journals.sort()
	rJournals := make([]storage.File, 0, len(journals))
	for _, journal := range journals {
		if journal.Num() >= s.stJournalNum || journal.Num() == s.stPrevJournalNum {
			s.markFileNum(journal.Num())
			rJournals = append(rJournals, journal)
		}
	}

	// r reads the current journal; fr is the fully-replayed previous
	// one, kept around until its contents have been committed.
	var r, fr *journalReader
	for _, journal := range rJournals {
		s.printf("JournalRecovery: recovering, num=%d", journal.Num())

		r, err = newJournalReader(journal, true, s.journalDropFunc("journal", journal.Num()))
		if err != nil {
			return
		}

		// Flush and commit the previous journal before starting this one.
		if mem != nil {
			if mem.Len() > 0 {
				err = cm.flush(mem, 0)
				if err != nil {
					return
				}
			}
			err = cm.commit(r.file.Num(), d.seq)
			if err != nil {
				return
			}
			cm.reset()
			fr.remove()
			fr = nil
		}

		mem = memdb.New(icmp)

		// Replay every record of this journal into the memdb.
		for r.journal.Next() {
			err = batch.decode(r.journal.Record())
			if err != nil {
				return
			}
			err = batch.memReplay(mem)
			if err != nil {
				return
			}
			// Track the highest sequence number seen so far.
			d.seq = batch.seq + uint64(batch.len())
			if mem.Size() > s.o.GetWriteBuffer() {
				// flush to table
				err = cm.flush(mem, 0)
				if err != nil {
					return
				}
				// create new memdb
				mem = memdb.New(icmp)
			}
		}
		err = r.journal.Error()
		if err != nil {
			return
		}
		r.close()
		fr = r
	}

	// create new journal
	_, err = d.newMem()
	if err != nil {
		return
	}

	// Flush any remaining recovered entries, then commit.
	if mem != nil && mem.Len() > 0 {
		err = cm.flush(mem, 0)
		if err != nil {
			return
		}
	}
	err = cm.commit(d.journal.file.Num(), d.seq)
	if err != nil {
		return
	}
	if fr != nil {
		fr.remove()
	}
	return
}
// revertMemReplay undoes a previous memReplay of this batch against to
// by deleting each internal key the replay inserted.
// NOTE(review): the decodeRec callback has no error result in this API
// revision, so a failing to.Delete is silently dropped — confirm
// callers tolerate that.
func (b *Batch) revertMemReplay(to *memdb.DB) error {
	return b.decodeRec(func(i int, kt kType, key, value []byte) {
		ikey := newIkey(key, b.seq+uint64(i), kt)
		to.Delete(ikey)
	})
}
// recoverJournal replays all journal files at or after the current
// journal number (plus the previous journal, if any) into a memdb,
// flushing to level-0 tables as the write buffer fills, then commits,
// creates a fresh journal and removes the last obsolete file.
func (d *DB) recoverJournal() error {
	s := d.s

	// Collect journal files, sorted by file number.
	ff0, err := s.getFiles(storage.TypeJournal)
	if err != nil {
		return err
	}
	ff1 := files(ff0)
	ff1.sort()

	// Keep only journals that still need replaying, marking their file
	// numbers as used.
	ff2 := make([]storage.File, 0, len(ff1))
	for _, file := range ff1 {
		if file.Num() >= s.stJournalNum || file.Num() == s.stPrevJournalNum {
			s.markFileNum(file.Num())
			ff2 = append(ff2, file)
		}
	}

	var jr *journal.Reader
	var of storage.File // previously replayed journal, pending removal
	var mem *memdb.DB
	batch := new(Batch)
	cm := newCMem(s)
	buf := new(util.Buffer)
	// Options.
	strict := s.o.GetStrict(opt.StrictJournal)
	checksum := s.o.GetStrict(opt.StrictJournalChecksum)
	writeBuffer := s.o.GetWriteBuffer()

	// recoverJournal replays one journal file into mem, first flushing
	// and committing the previous file's contents.
	recoverJournal := func(file storage.File) error {
		s.logf("journal@recovery recovering @%d", file.Num())
		reader, err := file.Open()
		if err != nil {
			return err
		}
		defer reader.Close()

		// Create or reset the shared journal reader.
		if jr == nil {
			jr = journal.NewReader(reader, dropper{s, file}, strict, checksum)
		} else {
			jr.Reset(reader, dropper{s, file}, strict, checksum)
		}

		// The previous journal replayed fully: flush its contents,
		// commit progress and remove the file.
		if of != nil {
			if mem.Len() > 0 {
				if err := cm.flush(mem, 0); err != nil {
					return err
				}
			}
			if err := cm.commit(file.Num(), d.seq); err != nil {
				return err
			}
			cm.reset()
			of.Remove()
			of = nil
		}

		// Reset memdb.
		mem.Reset()

		for {
			r, err := jr.Next()
			if err != nil {
				if err == io.EOF {
					break
				}
				return err
			}

			buf.Reset()
			if _, err := buf.ReadFrom(r); err != nil {
				// In non-strict mode a short/corrupt record is skipped.
				if strict {
					return err
				}
				continue
			}
			if err := batch.decode(buf.Bytes()); err != nil {
				return err
			}
			if err := batch.memReplay(mem); err != nil {
				return err
			}
			// Track the highest sequence number seen so far.
			d.seq = batch.seq + uint64(batch.len())

			if mem.Size() >= writeBuffer {
				// Large enough, flush it.
				if err := cm.flush(mem, 0); err != nil {
					return err
				}
				// Reset memdb.
				mem.Reset()
			}
		}

		// Defer removal until the next file's replay commits.
		of = file
		return nil
	}

	// Recover all journals.
	if len(ff2) > 0 {
		s.logf("journal@recovery F·%d", len(ff2))
		mem = memdb.New(s.icmp, writeBuffer)
		for _, file := range ff2 {
			if err := recoverJournal(file); err != nil {
				return err
			}
		}

		// Flush the last journal.
		if mem.Len() > 0 {
			if err := cm.flush(mem, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := d.newMem(0); err != nil {
		return err
	}

	// Commit.
	if err := cm.commit(d.journalFile.Num(), d.seq); err != nil {
		return err
	}

	// Remove the last journal.
	if of != nil {
		of.Remove()
	}

	return nil
}
// memReplay replays every record of the batch into the memdb, stamping
// each key with the batch's base sequence number plus the record index.
// NOTE(review): the decodeRec callback has no error result in this API
// revision, so a failing to.Put is silently dropped — confirm callers
// tolerate that.
func (b *Batch) memReplay(to *memdb.DB) error {
	return b.decodeRec(func(i int, t vType, key, value []byte) {
		ikey := newIKey(key, b.seq+uint64(i), t)
		to.Put(ikey, value)
	})
}
// recoverLog replays the log files recorded as unflushed in the session
// (numbers >= stLogNum) into memdbs, flushing to tables when the write
// buffer fills, then commits the recovered sequence number and removes
// fully-replayed log files.
func (d *DB) recoverLog() (err error) {
	s := d.s
	icmp := s.cmp

	s.printf("LogRecovery: started, min=%d", s.stLogNum)

	var mem *memdb.DB
	batch := new(Batch)
	cm := newCMem(s)

	// Sort log files; count leading files older than the checkpoint
	// (to skip) and mark the rest as used.
	logs, skip := files(s.getFiles(desc.TypeLog)), 0
	logs.sort()
	for _, log := range logs {
		if log.Num() < s.stLogNum {
			skip++
			continue
		}
		s.markFileNum(log.Num())
	}

	// r reads the current log; fr is the fully-replayed previous one,
	// kept around until its contents have been committed.
	var r, fr *logReader
	for _, log := range logs[skip:] {
		s.printf("LogRecovery: recovering, num=%d", log.Num())

		r, err = newLogReader(log, true, s.logDropFunc("log", log.Num()))
		if err != nil {
			return
		}

		// Flush and commit the previous log before starting this one.
		if mem != nil {
			if mem.Len() > 0 {
				err = cm.flush(mem, 0)
				if err != nil {
					return
				}
			}
			err = cm.commit(r.file.Num(), d.seq)
			if err != nil {
				return
			}
			cm.reset()
			fr.remove()
			fr = nil
		}

		mem = memdb.New(icmp)

		// Replay every record of this log into the memdb.
		for r.log.Next() {
			err = batch.decode(r.log.Record())
			if err != nil {
				return
			}
			err = batch.memReplay(mem)
			if err != nil {
				return
			}
			// Track the highest sequence number seen so far.
			d.seq = batch.seq + uint64(batch.len())
			if mem.Size() > s.o.GetWriteBuffer() {
				// flush to table
				err = cm.flush(mem, 0)
				if err != nil {
					return
				}
				// create new memdb
				mem = memdb.New(icmp)
			}
		}
		err = r.log.Error()
		if err != nil {
			return
		}
		r.close()
		fr = r
	}

	// create new log
	_, err = d.newMem()
	if err != nil {
		return
	}

	// Flush any remaining recovered entries, then commit.
	if mem != nil && mem.Len() > 0 {
		err = cm.flush(mem, 0)
		if err != nil {
			return
		}
	}
	err = cm.commit(d.log.file.Num(), d.seq)
	if err != nil {
		return
	}
	if fr != nil {
		fr.remove()
	}
	return
}