func (db *DB) compactionError() { var err error noerr: // No error. for { select { case err = <-db.compErrSetC: switch { case err == nil: case err == ErrReadOnly, errors.IsCorrupted(err): goto hasperr default: goto haserr } case _, _ = <-db.closeC: return } } haserr: // Transient error. for { select { case db.compErrC <- err: case err = <-db.compErrSetC: switch { case err == nil: goto noerr case err == ErrReadOnly, errors.IsCorrupted(err): goto hasperr default: } case _, _ = <-db.closeC: return } } hasperr: // Persistent error. for { select { case db.compErrC <- err: case db.compPerErrC <- err: case db.writeLockC <- struct{}{}: // Hold write lock, so that write won't pass-through. db.compWriteLocking = true case _, _ = <-db.closeC: if db.compWriteLocking { // We should release the lock or Close will hang. <-db.writeLockC } return } } }
func (i *mergedIterator) iterErr(iter Iterator) bool { if err := iter.Error(); err != nil { if i.errf != nil { i.errf(err) } if i.strict || !errors.IsCorrupted(err) { i.err = err return true } } return false }
func (i *indexedIterator) dataErr() bool { if err := i.data.Error(); err != nil { if i.errf != nil { i.errf(err) } if i.strict || !errors.IsCorrupted(err) { i.err = err return true } } return false }
// Recover a database session; need external synchronization.
// It locates the current manifest via the storage meta, replays every
// record in the manifest journal into a version staging area, validates
// the mandatory fields, and installs the resulting version as current.
func (s *session) recover() (err error) {
	defer func() {
		if os.IsNotExist(err) {
			// Don't return os.ErrNotExist if the underlying storage contains
			// other files that belong to LevelDB. So the DB won't get trashed.
			if fds, _ := s.stor.List(storage.TypeAll); len(fds) > 0 {
				err = &errors.ErrCorrupted{Fd: storage.FileDesc{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}}
			}
		}
	}()

	// GetMeta returns the file descriptor of the current manifest.
	fd, err := s.stor.GetMeta()
	if err != nil {
		return
	}

	reader, err := s.stor.Open(fd)
	if err != nil {
		return
	}
	defer reader.Close()

	var (
		// Options.
		strict = s.o.GetStrict(opt.StrictManifest)

		jr      = journal.NewReader(reader, dropper{s, fd}, strict, true)
		rec     = &sessionRecord{}
		staging = s.stVersion.newStaging()
	)
	// Replay every manifest journal record into the staging version.
	for {
		var r io.Reader
		r, err = jr.Next()
		if err != nil {
			if err == io.EOF {
				// End of manifest; replay finished successfully.
				err = nil
				break
			}
			return errors.SetFd(err, fd)
		}

		err = rec.decode(r)
		if err == nil {
			// save compact pointers
			for _, r := range rec.compPtrs {
				s.setCompPtr(r.level, internalKey(r.ikey))
			}
			// commit record to version staging
			staging.commit(rec)
		} else {
			err = errors.SetFd(err, fd)
			if strict || !errors.IsCorrupted(err) {
				return
			}
			// Non-strict mode: skip a corrupted record and keep replaying.
			s.logf("manifest error: %v (skipped)", errors.SetFd(err, fd))
		}
		// Reset per-record state; rec is reused across iterations.
		rec.resetCompPtrs()
		rec.resetAddedTables()
		rec.resetDeletedTables()
	}

	// The accumulated record must carry all mandatory manifest fields.
	switch {
	case !rec.has(recComparer):
		return newErrManifestCorrupted(fd, "comparer", "missing")
	case rec.comparer != s.icmp.uName():
		return newErrManifestCorrupted(fd, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer))
	case !rec.has(recNextFileNum):
		return newErrManifestCorrupted(fd, "next-file-num", "missing")
	case !rec.has(recJournalNum):
		return newErrManifestCorrupted(fd, "journal-file-num", "missing")
	case !rec.has(recSeqNum):
		return newErrManifestCorrupted(fd, "seq-num", "missing")
	}

	// Install the recovered state as the current session state.
	s.manifestFd = fd
	s.setVersion(staging.finish())
	s.setNextFileNum(rec.nextFileNum)
	s.recordCommited(rec)
	return nil
}
func (db *DB) compactionTransact(name string, t compactionTransactInterface) { defer func() { if x := recover(); x != nil { if x == errCompactionTransactExiting { if err := t.revert(); err != nil { db.logf("%s revert error %q", name, err) } } panic(x) } }() const ( backoffMin = 1 * time.Second backoffMax = 8 * time.Second backoffMul = 2 * time.Second ) var ( backoff = backoffMin backoffT = time.NewTimer(backoff) lastCnt = compactionTransactCounter(0) disableBackoff = db.s.o.GetDisableCompactionBackoff() ) for n := 0; ; n++ { // Check wether the DB is closed. if db.isClosed() { db.logf("%s exiting", name) db.compactionExitTransact() } else if n > 0 { db.logf("%s retrying N·%d", name, n) } // Execute. cnt := compactionTransactCounter(0) err := t.run(&cnt) if err != nil { db.logf("%s error I·%d %q", name, cnt, err) } // Set compaction error status. select { case db.compErrSetC <- err: case perr := <-db.compPerErrC: if err != nil { db.logf("%s exiting (persistent error %q)", name, perr) db.compactionExitTransact() } case _, _ = <-db.closeC: db.logf("%s exiting", name) db.compactionExitTransact() } if err == nil { return } if errors.IsCorrupted(err) { db.logf("%s exiting (corruption detected)", name) db.compactionExitTransact() } if !disableBackoff { // Reset backoff duration if counter is advancing. if cnt > lastCnt { backoff = backoffMin lastCnt = cnt } // Backoff. backoffT.Reset(backoff) if backoff < backoffMax { backoff *= backoffMul if backoff > backoffMax { backoff = backoffMax } } select { case <-backoffT.C: case _, _ = <-db.closeC: db.logf("%s exiting", name) db.compactionExitTransact() } } } }
func main() { flag.Parse() if enableBufferPool { bpool = util.NewBufferPool(opt.DefaultBlockSize + 128) } log.Printf("Test DB stored at %q", dbPath) if httpProf != "" { log.Printf("HTTP pprof listening at %q", httpProf) runtime.SetBlockProfileRate(1) go func() { if err := http.ListenAndServe(httpProf, nil); err != nil { log.Fatalf("HTTPPROF: %v", err) } }() } runtime.GOMAXPROCS(runtime.NumCPU()) os.RemoveAll(dbPath) stor, err := storage.OpenFile(dbPath, false) if err != nil { log.Fatal(err) } tstor := &testingStorage{stor} defer tstor.Close() fatalf := func(err error, format string, v ...interface{}) { atomic.StoreUint32(&fail, 1) atomic.StoreUint32(&done, 1) log.Printf("FATAL: "+format, v...) if err != nil && errors.IsCorrupted(err) { cerr := err.(*errors.ErrCorrupted) if !cerr.Fd.Nil() && cerr.Fd.Type == storage.TypeTable { log.Print("FATAL: corruption detected, scanning...") if !tstor.scanTable(storage.FileDesc{Type: storage.TypeTable, Num: cerr.Fd.Num}, false) { log.Printf("FATAL: unable to find corrupted key/value pair in table %v", cerr.Fd) } } } runtime.Goexit() } if openFilesCacheCapacity == 0 { openFilesCacheCapacity = -1 } o := &opt.Options{ OpenFilesCacheCapacity: openFilesCacheCapacity, DisableBufferPool: !enableBufferPool, DisableBlockCache: !enableBlockCache, ErrorIfExist: true, Compression: opt.NoCompression, } if enableCompression { o.Compression = opt.DefaultCompression } db, err := leveldb.Open(tstor, o) if err != nil { log.Fatal(err) } defer db.Close() var ( mu = &sync.Mutex{} gGetStat = &latencyStats{} gIterStat = &latencyStats{} gWriteStat = &latencyStats{} gTrasactionStat = &latencyStats{} startTime = time.Now() writeReq = make(chan *leveldb.Batch) writeAck = make(chan error) writeAckAck = make(chan struct{}) ) go func() { for b := range writeReq { var err error if mrand.Float64() < transactionProb { log.Print("> Write using transaction") gTrasactionStat.start() var tr *leveldb.Transaction if tr, err = db.OpenTransaction(); err == nil { if err 
= tr.Write(b, nil); err == nil { if err = tr.Commit(); err == nil { gTrasactionStat.record(b.Len()) } } else { tr.Discard() } } } else { gWriteStat.start() if err = db.Write(b, nil); err == nil { gWriteStat.record(b.Len()) } } writeAck <- err <-writeAckAck } }() go func() { for { time.Sleep(3 * time.Second) log.Print("------------------------") log.Printf("> Elapsed=%v", time.Now().Sub(startTime)) mu.Lock() log.Printf("> GetLatencyMin=%v GetLatencyMax=%v GetLatencyAvg=%v GetRatePerSec=%d", gGetStat.min, gGetStat.max, gGetStat.avg(), gGetStat.ratePerSec()) log.Printf("> IterLatencyMin=%v IterLatencyMax=%v IterLatencyAvg=%v IterRatePerSec=%d", gIterStat.min, gIterStat.max, gIterStat.avg(), gIterStat.ratePerSec()) log.Printf("> WriteLatencyMin=%v WriteLatencyMax=%v WriteLatencyAvg=%v WriteRatePerSec=%d", gWriteStat.min, gWriteStat.max, gWriteStat.avg(), gWriteStat.ratePerSec()) log.Printf("> TransactionLatencyMin=%v TransactionLatencyMax=%v TransactionLatencyAvg=%v TransactionRatePerSec=%d", gTrasactionStat.min, gTrasactionStat.max, gTrasactionStat.avg(), gTrasactionStat.ratePerSec()) mu.Unlock() cachedblock, _ := db.GetProperty("leveldb.cachedblock") openedtables, _ := db.GetProperty("leveldb.openedtables") alivesnaps, _ := db.GetProperty("leveldb.alivesnaps") aliveiters, _ := db.GetProperty("leveldb.aliveiters") blockpool, _ := db.GetProperty("leveldb.blockpool") log.Printf("> BlockCache=%s OpenedTables=%s AliveSnaps=%s AliveIter=%s BlockPool=%q", cachedblock, openedtables, alivesnaps, aliveiters, blockpool) log.Print("------------------------") } }() for ns, numKey := range numKeys { func(ns, numKey int) { log.Printf("[%02d] STARTING: numKey=%d", ns, numKey) keys := make([][]byte, numKey) for i := range keys { keys[i] = randomData(nil, byte(ns), 1, uint32(i), keyLen) } wg.Add(1) go func() { var wi uint32 defer func() { log.Printf("[%02d] WRITER DONE #%d", ns, wi) wg.Done() }() var ( b = new(leveldb.Batch) k2, v2 []byte nReader int32 ) for atomic.LoadUint32(&done) 
== 0 { log.Printf("[%02d] WRITER #%d", ns, wi) b.Reset() for _, k1 := range keys { k2 = randomData(k2, byte(ns), 2, wi, keyLen) v2 = randomData(v2, byte(ns), 3, wi, valueLen) b.Put(k2, v2) b.Put(k1, k2) } writeReq <- b if err := <-writeAck; err != nil { writeAckAck <- struct{}{} fatalf(err, "[%02d] WRITER #%d db.Write: %v", ns, wi, err) } snap, err := db.GetSnapshot() if err != nil { writeAckAck <- struct{}{} fatalf(err, "[%02d] WRITER #%d db.GetSnapshot: %v", ns, wi, err) } writeAckAck <- struct{}{} wg.Add(1) atomic.AddInt32(&nReader, 1) go func(snapwi uint32, snap *leveldb.Snapshot) { var ( ri int iterStat = &latencyStats{} getStat = &latencyStats{} ) defer func() { mu.Lock() gGetStat.add(getStat) gIterStat.add(iterStat) mu.Unlock() atomic.AddInt32(&nReader, -1) log.Printf("[%02d] READER #%d.%d DONE Snap=%v Alive=%d IterLatency=%v GetLatency=%v", ns, snapwi, ri, snap, atomic.LoadInt32(&nReader), iterStat.avg(), getStat.avg()) snap.Release() wg.Done() }() stopi := snapwi + 3 for (ri < 3 || atomic.LoadUint32(&wi) < stopi) && atomic.LoadUint32(&done) == 0 { var n int iter := snap.NewIterator(dataPrefixSlice(byte(ns), 1), nil) iterStat.start() for iter.Next() { k1 := iter.Key() k2 := iter.Value() iterStat.record(1) if dataNS(k2) != byte(ns) { fatalf(nil, "[%02d] READER #%d.%d K%d invalid in-key NS: want=%d got=%d", ns, snapwi, ri, n, ns, dataNS(k2)) } kwritei := dataI(k2) if kwritei != snapwi { fatalf(nil, "[%02d] READER #%d.%d K%d invalid in-key iter num: %d", ns, snapwi, ri, n, kwritei) } getStat.start() v2, err := snap.Get(k2, nil) if err != nil { fatalf(err, "[%02d] READER #%d.%d K%d snap.Get: %v\nk1: %x\n -> k2: %x", ns, snapwi, ri, n, err, k1, k2) } getStat.record(1) if checksum0, checksum1 := dataChecksum(v2); checksum0 != checksum1 { err := &errors.ErrCorrupted{Fd: storage.FileDesc{0xff, 0}, Err: fmt.Errorf("v2: %x: checksum mismatch: %v vs %v", v2, checksum0, checksum1)} fatalf(err, "[%02d] READER #%d.%d K%d snap.Get: %v\nk1: %x\n -> k2: %x", ns, snapwi, ri, 
n, err, k1, k2) } n++ iterStat.start() } iter.Release() if err := iter.Error(); err != nil { fatalf(err, "[%02d] READER #%d.%d K%d iter.Error: %v", ns, snapwi, ri, numKey, err) } if n != numKey { fatalf(nil, "[%02d] READER #%d.%d missing keys: want=%d got=%d", ns, snapwi, ri, numKey, n) } ri++ } }(wi, snap) atomic.AddUint32(&wi, 1) } }() delB := new(leveldb.Batch) wg.Add(1) go func() { var ( i int iterStat = &latencyStats{} ) defer func() { log.Printf("[%02d] SCANNER DONE #%d", ns, i) wg.Done() }() time.Sleep(2 * time.Second) for atomic.LoadUint32(&done) == 0 { var n int delB.Reset() iter := db.NewIterator(dataNsSlice(byte(ns)), nil) iterStat.start() for iter.Next() && atomic.LoadUint32(&done) == 0 { k := iter.Key() v := iter.Value() iterStat.record(1) for ci, x := range [...][]byte{k, v} { checksum0, checksum1 := dataChecksum(x) if checksum0 != checksum1 { if ci == 0 { fatalf(nil, "[%02d] SCANNER %d.%d invalid key checksum: want %d, got %d\n%x -> %x", ns, i, n, checksum0, checksum1, k, v) } else { fatalf(nil, "[%02d] SCANNER %d.%d invalid value checksum: want %d, got %d\n%x -> %x", ns, i, n, checksum0, checksum1, k, v) } } } if dataPrefix(k) == 2 || mrand.Int()%999 == 0 { delB.Delete(k) } n++ iterStat.start() } iter.Release() if err := iter.Error(); err != nil { fatalf(err, "[%02d] SCANNER #%d.%d iter.Error: %v", ns, i, n, err) } if n > 0 { log.Printf("[%02d] SCANNER #%d IterLatency=%v", ns, i, iterStat.avg()) } if delB.Len() > 0 && atomic.LoadUint32(&done) == 0 { t := time.Now() writeReq <- delB if err := <-writeAck; err != nil { writeAckAck <- struct{}{} fatalf(err, "[%02d] SCANNER #%d db.Write: %v", ns, i, err) } else { writeAckAck <- struct{}{} } log.Printf("[%02d] SCANNER #%d Deleted=%d Time=%v", ns, i, delB.Len(), time.Now().Sub(t)) } i++ } }() }(ns, numKey) } go func() { sig := make(chan os.Signal) signal.Notify(sig, os.Interrupt, os.Kill) log.Printf("Got signal: %v, exiting...", <-sig) atomic.StoreUint32(&done, 1) }() wg.Wait() }
// scanTable opens the given table file and scans every key/value pair,
// verifying the embedded data checksums (and, when checksum is true, the
// table block checksums too). It reports whether any corruption was found;
// on the first corrupted entry it logs details, marks the global fail/done
// flags, and stops scanning. Unexpected I/O errors abort the process.
func (ts *testingStorage) scanTable(fd storage.FileDesc, checksum bool) (corrupted bool) {
	r, err := ts.Open(fd)
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	// Seek to the end to learn the table size required by table.NewReader.
	// NOTE(review): os.SEEK_END is deprecated in favor of io.SeekEnd.
	size, err := r.Seek(0, os.SEEK_END)
	if err != nil {
		log.Fatal(err)
	}

	o := &opt.Options{
		DisableLargeBatchTransaction: true,
		Strict:                       opt.NoStrict,
	}
	if checksum {
		o.Strict = opt.StrictBlockChecksum | opt.StrictReader
	}
	tr, err := table.NewReader(r, size, fd, nil, bpool, o)
	if err != nil {
		log.Fatal(err)
	}
	defer tr.Release()

	// checkData verifies the payload checksum of a single key or value and
	// logs a detailed split-half diagnosis on mismatch.
	checkData := func(i int, t string, data []byte) bool {
		if len(data) == 0 {
			panic(fmt.Sprintf("[%v] nil data: i=%d t=%s", fd, i, t))
		}

		checksum0, checksum1 := dataChecksum(data)
		if checksum0 != checksum1 {
			atomic.StoreUint32(&fail, 1)
			atomic.StoreUint32(&done, 1)
			corrupted = true

			data0, data1 := dataSplit(data)
			data0c0, data0c1 := dataChecksum(data0)
			data1c0, data1c1 := dataChecksum(data1)
			log.Printf("FATAL: [%v] Corrupted data i=%d t=%s (%#x != %#x): %x(%v) vs %x(%v)",
				fd, i, t, checksum0, checksum1, data0, data0c0 == data0c1, data1, data1c0 == data1c1)
			return true
		}
		return false
	}

	iter := tr.NewIterator(nil, nil)
	defer iter.Release()
	for i := 0; iter.Next(); i++ {
		ukey, _, kt, kerr := parseIkey(iter.Key())
		if kerr != nil {
			atomic.StoreUint32(&fail, 1)
			atomic.StoreUint32(&done, 1)
			corrupted = true

			log.Printf("FATAL: [%v] Corrupted ikey i=%d: %v", fd, i, kerr)
			return
		}
		if checkData(i, "key", ukey) {
			return
		}
		// Only value-type entries carry a payload worth checking.
		if kt == ktVal && checkData(i, "value", iter.Value()) {
			return
		}
	}
	if err := iter.Error(); err != nil {
		if errors.IsCorrupted(err) {
			atomic.StoreUint32(&fail, 1)
			atomic.StoreUint32(&done, 1)
			corrupted = true

			log.Printf("FATAL: [%v] Corruption detected: %v", fd, err)
		} else {
			log.Fatal(err)
		}
	}

	return
}
// NewReader creates a new initialized table reader for the file.
// The fi, cache and bpool is optional and can be nil.
//
// The returned table reader instance is goroutine-safe.
//
// Corruption found while reading the footer, metaindex, index or filter
// blocks is recorded on r.err and a usable *Reader is still returned (with
// a nil error); only non-corruption I/O errors are returned as errors.
func NewReader(f io.ReaderAt, size int64, fd storage.FileDesc, cache *cache.NamespaceGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) {
	if f == nil {
		return nil, errors.New("leveldb/table: nil file")
	}

	r := &Reader{
		fd:             fd,
		reader:         f,
		cache:          cache,
		bpool:          bpool,
		o:              o,
		cmp:            o.GetComparer(),
		verifyChecksum: o.GetStrict(opt.StrictBlockChecksum),
	}

	// A valid table must at least contain the fixed-size footer.
	if size < footerLen {
		r.err = r.newErrCorrupted(0, size, "table", "too small")
		return r, nil
	}

	footerPos := size - footerLen
	var footer [footerLen]byte
	if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF {
		return nil, err
	}
	// The footer ends with a fixed magic number.
	if string(footer[footerLen-len(magic):footerLen]) != magic {
		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number")
		return r, nil
	}

	var n int
	// Decode the metaindex block handle.
	r.metaBH, n = decodeBlockHandle(footer[:])
	if n == 0 {
		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle")
		return r, nil
	}

	// Decode the index block handle.
	r.indexBH, n = decodeBlockHandle(footer[n:])
	if n == 0 {
		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle")
		return r, nil
	}

	// Read metaindex block.
	metaBlock, err := r.readBlock(r.metaBH, true)
	if err != nil {
		if errors.IsCorrupted(err) {
			r.err = err
			return r, nil
		} else {
			return nil, err
		}
	}

	// Set data end.
	r.dataEnd = int64(r.metaBH.offset)

	// Read metaindex.
	// Look for a "filter.<name>" entry matching either the configured
	// filter or one of the alternative filters.
	metaIter := r.newBlockIter(metaBlock, nil, nil, true)
	for metaIter.Next() {
		key := string(metaIter.Key())
		if !strings.HasPrefix(key, "filter.") {
			continue
		}
		fn := key[7:]
		if f0 := o.GetFilter(); f0 != nil && f0.Name() == fn {
			r.filter = f0
		} else {
			for _, f0 := range o.GetAltFilters() {
				if f0.Name() == fn {
					r.filter = f0
					break
				}
			}
		}
		if r.filter != nil {
			filterBH, n := decodeBlockHandle(metaIter.Value())
			if n == 0 {
				continue
			}
			r.filterBH = filterBH
			// Update data end.
			r.dataEnd = int64(filterBH.offset)
			break
		}
	}
	metaIter.Release()
	metaBlock.Release()

	// Cache index and filter block locally, since we don't have global cache.
	if cache == nil {
		r.indexBlock, err = r.readBlock(r.indexBH, true)
		if err != nil {
			if errors.IsCorrupted(err) {
				r.err = err
				return r, nil
			} else {
				return nil, err
			}
		}
		if r.filter != nil {
			r.filterBlock, err = r.readFilterBlock(r.filterBH)
			if err != nil {
				if !errors.IsCorrupted(err) {
					return nil, err
				}

				// Don't use filter then.
				r.filter = nil
			}
		}
	}

	return r, nil
}
func (r *Reader) find(key []byte, filtered bool, ro *opt.ReadOptions, noValue bool) (rkey, value []byte, err error) { r.mu.RLock() defer r.mu.RUnlock() if r.err != nil { err = r.err return } indexBlock, rel, err := r.getIndexBlock(true) if err != nil { return } defer rel.Release() index := r.newBlockIter(indexBlock, nil, nil, true) defer index.Release() if !index.Seek(key) { err = index.Error() if err == nil { err = ErrNotFound } return } dataBH, n := decodeBlockHandle(index.Value()) if n == 0 { r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") return } if filtered && r.filter != nil { filterBlock, frel, ferr := r.getFilterBlock(true) if ferr == nil { if !filterBlock.contains(r.filter, dataBH.offset, key) { frel.Release() return nil, nil, ErrNotFound } frel.Release() } else if !errors.IsCorrupted(ferr) { err = ferr return } } data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) defer data.Release() if !data.Seek(key) { err = data.Error() if err == nil { err = ErrNotFound } return } // Don't use block buffer, no need to copy the buffer. rkey = data.Key() if !noValue { if r.bpool == nil { value = data.Value() } else { // Use block buffer, and since the buffer will be recycled, the buffer // need to be copied. value = append([]byte{}, data.Value()...) } } return }
// recoverJournalRO replays all relevant journal files into a single in-memory
// memdb without writing anything back to storage (read-only mode recovery).
// Unlike recoverJournal, nothing is flushed to tables, committed to the
// manifest, or removed from storage.
func (db *DB) recoverJournalRO() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		// Options.
		strict      = db.s.o.GetStrict(opt.StrictJournal)
		checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
		writeBuffer = db.s.o.GetWriteBuffer()

		mdb = memdb.New(db.s.icmp, writeBuffer)
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery RO·Mode F·%d", len(fds))

		var (
			jr    *journal.Reader
			buf   = &util.Buffer{}
			batch = &Batch{}
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Replay journal to memdb.
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						// End of this journal file.
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batch.seq + uint64(batch.Len())
			}

			fr.Close()
		}
	}

	// Set memDB.
	db.mem = &memDB{db: db, DB: mdb, ref: 1}

	return nil
}
// recoverJournal replays all relevant journal files into a memdb, flushing
// to level-0 tables (and committing manifest records) whenever the memdb
// exceeds the write buffer or a journal file is fully replayed, then creates
// a fresh journal and removes the obsolete ones. Must only be called during
// DB open, before any concurrent access.
func (db *DB) recoverJournal() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		ofd storage.FileDesc // Obsolete file.
		rec = &sessionRecord{}
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery F·%d", len(fds))

		// Mark file number as used.
		db.s.markFileNum(fds[len(fds)-1].Num)

		var (
			// Options.
			strict      = db.s.o.GetStrict(opt.StrictJournal)
			checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
			writeBuffer = db.s.o.GetWriteBuffer()

			jr    *journal.Reader
			mdb   = memdb.New(db.s.icmp, writeBuffer)
			buf   = &util.Buffer{}
			batch = &Batch{}
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Flush memdb and remove obsolete journal file.
			// The previous journal is only removed after its contents are
			// safely flushed and committed, so a crash here cannot lose data.
			if !ofd.Nil() {
				if mdb.Len() > 0 {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}
				}

				rec.setJournalNum(fd.Num)
				rec.setSeqNum(db.seq)
				if err := db.s.commit(rec); err != nil {
					fr.Close()
					return err
				}
				rec.resetAddedTables()

				db.s.stor.Remove(ofd)
				ofd = storage.FileDesc{}
			}

			// Replay journal to memdb.
			mdb.Reset()
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						// End of this journal file.
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batch.seq + uint64(batch.Len())

				// Flush it if large enough.
				if mdb.Size() >= writeBuffer {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}

					mdb.Reset()
				}
			}

			fr.Close()
			ofd = fd
		}

		// Flush the last memdb.
		if mdb.Len() > 0 {
			if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := db.newMem(0); err != nil {
		return err
	}

	// Commit.
	rec.setJournalNum(db.journalFd.Num)
	rec.setSeqNum(db.seq)
	if err := db.s.commit(rec); err != nil {
		// Close journal on error.
		if db.journal != nil {
			db.journal.Close()
			db.journalWriter.Close()
		}
		return err
	}

	// Remove the last obsolete journal file.
	if !ofd.Nil() {
		db.s.stor.Remove(ofd)
	}

	return nil
}
// recoverTable rebuilds a database from its table files alone (used when the
// manifest is unusable). Every table is scanned; good keys are counted,
// corrupted tables are either dropped (strict recovery) or rebuilt from
// their salvageable entries, and all surviving tables are registered at
// level 0 in a freshly created manifest.
func recoverTable(s *session, o *opt.Options) error {
	o = dupOptions(o)
	// Mask StrictReader, lets StrictRecovery doing its job.
	o.Strict &= ^opt.StrictReader

	// Get all tables and sort it by file number.
	fds, err := s.stor.List(storage.TypeTable)
	if err != nil {
		return err
	}
	sortFds(fds)

	var (
		maxSeq                                                            uint64
		recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int

		// We will drop corrupted table.
		strict = o.GetStrict(opt.StrictRecovery)
		noSync = o.GetNoSync()

		rec   = &sessionRecord{}
		bpool = util.NewBufferPool(o.GetBlockSize() + 5)
	)
	// buildTable copies the valid entries from iter into a new temporary
	// table file; the temp file is removed again on any error.
	buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) {
		tmpFd = s.newTemp()
		writer, err := s.stor.Create(tmpFd)
		if err != nil {
			return
		}
		defer func() {
			writer.Close()
			if err != nil {
				s.stor.Remove(tmpFd)
				tmpFd = storage.FileDesc{}
			}
		}()

		// Copy entries.
		tw := table.NewWriter(writer, o)
		for iter.Next() {
			key := iter.Key()
			if validInternalKey(key) {
				err = tw.Append(key, iter.Value())
				if err != nil {
					return
				}
			}
		}
		err = iter.Error()
		if err != nil {
			return
		}
		err = tw.Close()
		if err != nil {
			return
		}
		if !noSync {
			err = writer.Sync()
			if err != nil {
				return
			}
		}
		size = int64(tw.BytesLen())
		return
	}
	// recoverTable scans a single table file, tallying good/corrupted keys
	// and blocks, rebuilding or dropping it as appropriate, and registering
	// the survivor at level 0.
	recoverTable := func(fd storage.FileDesc) error {
		s.logf("table@recovery recovering @%d", fd.Num)
		reader, err := s.stor.Open(fd)
		if err != nil {
			return err
		}
		var closed bool
		defer func() {
			if !closed {
				reader.Close()
			}
		}()

		// Get file size.
		// NOTE(review): whence 2 is io.SeekEnd (seek relative to end).
		size, err := reader.Seek(0, 2)
		if err != nil {
			return err
		}

		var (
			tSeq                                     uint64
			tgoodKey, tcorruptedKey, tcorruptedBlock int
			imin, imax                               []byte
		)
		tr, err := table.NewReader(reader, size, fd, nil, bpool, o)
		if err != nil {
			return err
		}
		iter := tr.NewIterator(nil, nil)
		if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok {
			// Count block-level corruption without aborting the scan.
			itererr.SetErrorCallback(func(err error) {
				if errors.IsCorrupted(err) {
					s.logf("table@recovery block corruption @%d %q", fd.Num, err)
					tcorruptedBlock++
				}
			})
		}

		// Scan the table.
		// Track the highest sequence number and the min/max internal keys.
		for iter.Next() {
			key := iter.Key()
			_, seq, _, kerr := parseInternalKey(key)
			if kerr != nil {
				tcorruptedKey++
				continue
			}
			tgoodKey++
			if seq > tSeq {
				tSeq = seq
			}
			if imin == nil {
				imin = append([]byte{}, key...)
			}
			imax = append(imax[:0], key...)
		}
		if err := iter.Error(); err != nil {
			iter.Release()
			return err
		}
		iter.Release()

		goodKey += tgoodKey
		corruptedKey += tcorruptedKey
		corruptedBlock += tcorruptedBlock

		if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
			// Strict recovery: drop any table with corruption.
			droppedTable++
			s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
			return nil
		}

		if tgoodKey > 0 {
			if tcorruptedKey > 0 || tcorruptedBlock > 0 {
				// Rebuild the table.
				s.logf("table@recovery rebuilding @%d", fd.Num)
				iter := tr.NewIterator(nil, nil)
				tmpFd, newSize, err := buildTable(iter)
				iter.Release()
				if err != nil {
					return err
				}
				closed = true
				reader.Close()
				if err := s.stor.Rename(tmpFd, fd); err != nil {
					return err
				}
				size = newSize
			}
			if tSeq > maxSeq {
				maxSeq = tSeq
			}
			recoveredKey += tgoodKey
			// Add table to level 0.
			rec.addTable(0, fd.Num, size, imin, imax)
			s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
		} else {
			droppedTable++
			s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size)
		}

		return nil
	}

	// Recover all tables.
	if len(fds) > 0 {
		s.logf("table@recovery F·%d", len(fds))

		// Mark file number as used.
		s.markFileNum(fds[len(fds)-1].Num)

		for _, fd := range fds {
			if err := recoverTable(fd); err != nil {
				return err
			}
		}

		s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq)
	}

	// Set sequence number.
	rec.setSeqNum(maxSeq)

	// Create new manifest.
	if err := s.create(); err != nil {
		return err
	}

	// Commit.
	return s.commit(rec)
}