Ejemplo n.º 1
0
func (db *DB) compactionError() {
	var err error
noerr:
	// No error.
	for {
		select {
		case err = <-db.compErrSetC:
			switch {
			case err == nil:
			case err == ErrReadOnly, errors.IsCorrupted(err):
				goto hasperr
			default:
				goto haserr
			}
		case _, _ = <-db.closeC:
			return
		}
	}
haserr:
	// Transient error.
	for {
		select {
		case db.compErrC <- err:
		case err = <-db.compErrSetC:
			switch {
			case err == nil:
				goto noerr
			case err == ErrReadOnly, errors.IsCorrupted(err):
				goto hasperr
			default:
			}
		case _, _ = <-db.closeC:
			return
		}
	}
hasperr:
	// Persistent error.
	for {
		select {
		case db.compErrC <- err:
		case db.compPerErrC <- err:
		case db.writeLockC <- struct{}{}:
			// Hold write lock, so that write won't pass-through.
			db.compWriteLocking = true
		case _, _ = <-db.closeC:
			if db.compWriteLocking {
				// We should release the lock or Close will hang.
				<-db.writeLockC
			}
			return
		}
	}
}
Ejemplo n.º 2
0
func (i *mergedIterator) iterErr(iter Iterator) bool {
	if err := iter.Error(); err != nil {
		if i.errf != nil {
			i.errf(err)
		}
		if i.strict || !errors.IsCorrupted(err) {
			i.err = err
			return true
		}
	}
	return false
}
Ejemplo n.º 3
0
func (i *indexedIterator) dataErr() bool {
	if err := i.data.Error(); err != nil {
		if i.errf != nil {
			i.errf(err)
		}
		if i.strict || !errors.IsCorrupted(err) {
			i.err = err
			return true
		}
	}
	return false
}
Ejemplo n.º 4
0
// Recover a database session; need external synchronization.
func (s *session) recover() (err error) {
	defer func() {
		if os.IsNotExist(err) {
			// Don't return os.ErrNotExist if the underlying storage contains
			// other files that belong to LevelDB. So the DB won't get trashed.
			if fds, _ := s.stor.List(storage.TypeAll); len(fds) > 0 {
				err = &errors.ErrCorrupted{Fd: storage.FileDesc{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}}
			}
		}
	}()

	fd, err := s.stor.GetMeta()
	if err != nil {
		return
	}

	reader, err := s.stor.Open(fd)
	if err != nil {
		return
	}
	defer reader.Close()

	var (
		// Options.
		strict = s.o.GetStrict(opt.StrictManifest)

		jr      = journal.NewReader(reader, dropper{s, fd}, strict, true)
		rec     = &sessionRecord{}
		staging = s.stVersion.newStaging()
	)
	for {
		var r io.Reader
		r, err = jr.Next()
		if err != nil {
			if err == io.EOF {
				err = nil
				break
			}
			return errors.SetFd(err, fd)
		}

		err = rec.decode(r)
		if err == nil {
			// save compact pointers
			for _, r := range rec.compPtrs {
				s.setCompPtr(r.level, internalKey(r.ikey))
			}
			// commit record to version staging
			staging.commit(rec)
		} else {
			err = errors.SetFd(err, fd)
			if strict || !errors.IsCorrupted(err) {
				return
			}
			s.logf("manifest error: %v (skipped)", errors.SetFd(err, fd))
		}
		rec.resetCompPtrs()
		rec.resetAddedTables()
		rec.resetDeletedTables()
	}

	switch {
	case !rec.has(recComparer):
		return newErrManifestCorrupted(fd, "comparer", "missing")
	case rec.comparer != s.icmp.uName():
		return newErrManifestCorrupted(fd, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer))
	case !rec.has(recNextFileNum):
		return newErrManifestCorrupted(fd, "next-file-num", "missing")
	case !rec.has(recJournalNum):
		return newErrManifestCorrupted(fd, "journal-file-num", "missing")
	case !rec.has(recSeqNum):
		return newErrManifestCorrupted(fd, "seq-num", "missing")
	}

	s.manifestFd = fd
	s.setVersion(staging.finish())
	s.setNextFileNum(rec.nextFileNum)
	s.recordCommited(rec)
	return nil
}
Ejemplo n.º 5
0
func (db *DB) compactionTransact(name string, t compactionTransactInterface) {
	defer func() {
		if x := recover(); x != nil {
			if x == errCompactionTransactExiting {
				if err := t.revert(); err != nil {
					db.logf("%s revert error %q", name, err)
				}
			}
			panic(x)
		}
	}()

	const (
		backoffMin = 1 * time.Second
		backoffMax = 8 * time.Second
		backoffMul = 2 * time.Second
	)
	var (
		backoff  = backoffMin
		backoffT = time.NewTimer(backoff)
		lastCnt  = compactionTransactCounter(0)

		disableBackoff = db.s.o.GetDisableCompactionBackoff()
	)
	for n := 0; ; n++ {
		// Check wether the DB is closed.
		if db.isClosed() {
			db.logf("%s exiting", name)
			db.compactionExitTransact()
		} else if n > 0 {
			db.logf("%s retrying N·%d", name, n)
		}

		// Execute.
		cnt := compactionTransactCounter(0)
		err := t.run(&cnt)
		if err != nil {
			db.logf("%s error I·%d %q", name, cnt, err)
		}

		// Set compaction error status.
		select {
		case db.compErrSetC <- err:
		case perr := <-db.compPerErrC:
			if err != nil {
				db.logf("%s exiting (persistent error %q)", name, perr)
				db.compactionExitTransact()
			}
		case _, _ = <-db.closeC:
			db.logf("%s exiting", name)
			db.compactionExitTransact()
		}
		if err == nil {
			return
		}
		if errors.IsCorrupted(err) {
			db.logf("%s exiting (corruption detected)", name)
			db.compactionExitTransact()
		}

		if !disableBackoff {
			// Reset backoff duration if counter is advancing.
			if cnt > lastCnt {
				backoff = backoffMin
				lastCnt = cnt
			}

			// Backoff.
			backoffT.Reset(backoff)
			if backoff < backoffMax {
				backoff *= backoffMul
				if backoff > backoffMax {
					backoff = backoffMax
				}
			}
			select {
			case <-backoffT.C:
			case _, _ = <-db.closeC:
				db.logf("%s exiting", name)
				db.compactionExitTransact()
			}
		}
	}
}
Ejemplo n.º 6
0
func main() {
	flag.Parse()

	if enableBufferPool {
		bpool = util.NewBufferPool(opt.DefaultBlockSize + 128)
	}

	log.Printf("Test DB stored at %q", dbPath)
	if httpProf != "" {
		log.Printf("HTTP pprof listening at %q", httpProf)
		runtime.SetBlockProfileRate(1)
		go func() {
			if err := http.ListenAndServe(httpProf, nil); err != nil {
				log.Fatalf("HTTPPROF: %v", err)
			}
		}()
	}

	runtime.GOMAXPROCS(runtime.NumCPU())

	os.RemoveAll(dbPath)
	stor, err := storage.OpenFile(dbPath, false)
	if err != nil {
		log.Fatal(err)
	}
	tstor := &testingStorage{stor}
	defer tstor.Close()

	fatalf := func(err error, format string, v ...interface{}) {
		atomic.StoreUint32(&fail, 1)
		atomic.StoreUint32(&done, 1)
		log.Printf("FATAL: "+format, v...)
		if err != nil && errors.IsCorrupted(err) {
			cerr := err.(*errors.ErrCorrupted)
			if !cerr.Fd.Nil() && cerr.Fd.Type == storage.TypeTable {
				log.Print("FATAL: corruption detected, scanning...")
				if !tstor.scanTable(storage.FileDesc{Type: storage.TypeTable, Num: cerr.Fd.Num}, false) {
					log.Printf("FATAL: unable to find corrupted key/value pair in table %v", cerr.Fd)
				}
			}
		}
		runtime.Goexit()
	}

	if openFilesCacheCapacity == 0 {
		openFilesCacheCapacity = -1
	}
	o := &opt.Options{
		OpenFilesCacheCapacity: openFilesCacheCapacity,
		DisableBufferPool:      !enableBufferPool,
		DisableBlockCache:      !enableBlockCache,
		ErrorIfExist:           true,
		Compression:            opt.NoCompression,
	}
	if enableCompression {
		o.Compression = opt.DefaultCompression
	}

	db, err := leveldb.Open(tstor, o)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var (
		mu              = &sync.Mutex{}
		gGetStat        = &latencyStats{}
		gIterStat       = &latencyStats{}
		gWriteStat      = &latencyStats{}
		gTrasactionStat = &latencyStats{}
		startTime       = time.Now()

		writeReq    = make(chan *leveldb.Batch)
		writeAck    = make(chan error)
		writeAckAck = make(chan struct{})
	)

	go func() {
		for b := range writeReq {

			var err error
			if mrand.Float64() < transactionProb {
				log.Print("> Write using transaction")
				gTrasactionStat.start()
				var tr *leveldb.Transaction
				if tr, err = db.OpenTransaction(); err == nil {
					if err = tr.Write(b, nil); err == nil {
						if err = tr.Commit(); err == nil {
							gTrasactionStat.record(b.Len())
						}
					} else {
						tr.Discard()
					}
				}
			} else {
				gWriteStat.start()
				if err = db.Write(b, nil); err == nil {
					gWriteStat.record(b.Len())
				}
			}
			writeAck <- err
			<-writeAckAck
		}
	}()

	go func() {
		for {
			time.Sleep(3 * time.Second)

			log.Print("------------------------")

			log.Printf("> Elapsed=%v", time.Now().Sub(startTime))
			mu.Lock()
			log.Printf("> GetLatencyMin=%v GetLatencyMax=%v GetLatencyAvg=%v GetRatePerSec=%d",
				gGetStat.min, gGetStat.max, gGetStat.avg(), gGetStat.ratePerSec())
			log.Printf("> IterLatencyMin=%v IterLatencyMax=%v IterLatencyAvg=%v IterRatePerSec=%d",
				gIterStat.min, gIterStat.max, gIterStat.avg(), gIterStat.ratePerSec())
			log.Printf("> WriteLatencyMin=%v WriteLatencyMax=%v WriteLatencyAvg=%v WriteRatePerSec=%d",
				gWriteStat.min, gWriteStat.max, gWriteStat.avg(), gWriteStat.ratePerSec())
			log.Printf("> TransactionLatencyMin=%v TransactionLatencyMax=%v TransactionLatencyAvg=%v TransactionRatePerSec=%d",
				gTrasactionStat.min, gTrasactionStat.max, gTrasactionStat.avg(), gTrasactionStat.ratePerSec())
			mu.Unlock()

			cachedblock, _ := db.GetProperty("leveldb.cachedblock")
			openedtables, _ := db.GetProperty("leveldb.openedtables")
			alivesnaps, _ := db.GetProperty("leveldb.alivesnaps")
			aliveiters, _ := db.GetProperty("leveldb.aliveiters")
			blockpool, _ := db.GetProperty("leveldb.blockpool")
			log.Printf("> BlockCache=%s OpenedTables=%s AliveSnaps=%s AliveIter=%s BlockPool=%q",
				cachedblock, openedtables, alivesnaps, aliveiters, blockpool)

			log.Print("------------------------")
		}
	}()

	for ns, numKey := range numKeys {
		func(ns, numKey int) {
			log.Printf("[%02d] STARTING: numKey=%d", ns, numKey)

			keys := make([][]byte, numKey)
			for i := range keys {
				keys[i] = randomData(nil, byte(ns), 1, uint32(i), keyLen)
			}

			wg.Add(1)
			go func() {
				var wi uint32
				defer func() {
					log.Printf("[%02d] WRITER DONE #%d", ns, wi)
					wg.Done()
				}()

				var (
					b       = new(leveldb.Batch)
					k2, v2  []byte
					nReader int32
				)
				for atomic.LoadUint32(&done) == 0 {
					log.Printf("[%02d] WRITER #%d", ns, wi)

					b.Reset()
					for _, k1 := range keys {
						k2 = randomData(k2, byte(ns), 2, wi, keyLen)
						v2 = randomData(v2, byte(ns), 3, wi, valueLen)
						b.Put(k2, v2)
						b.Put(k1, k2)
					}
					writeReq <- b
					if err := <-writeAck; err != nil {
						writeAckAck <- struct{}{}
						fatalf(err, "[%02d] WRITER #%d db.Write: %v", ns, wi, err)
					}

					snap, err := db.GetSnapshot()
					if err != nil {
						writeAckAck <- struct{}{}
						fatalf(err, "[%02d] WRITER #%d db.GetSnapshot: %v", ns, wi, err)
					}

					writeAckAck <- struct{}{}

					wg.Add(1)
					atomic.AddInt32(&nReader, 1)
					go func(snapwi uint32, snap *leveldb.Snapshot) {
						var (
							ri       int
							iterStat = &latencyStats{}
							getStat  = &latencyStats{}
						)
						defer func() {
							mu.Lock()
							gGetStat.add(getStat)
							gIterStat.add(iterStat)
							mu.Unlock()

							atomic.AddInt32(&nReader, -1)
							log.Printf("[%02d] READER #%d.%d DONE Snap=%v Alive=%d IterLatency=%v GetLatency=%v", ns, snapwi, ri, snap, atomic.LoadInt32(&nReader), iterStat.avg(), getStat.avg())
							snap.Release()
							wg.Done()
						}()

						stopi := snapwi + 3
						for (ri < 3 || atomic.LoadUint32(&wi) < stopi) && atomic.LoadUint32(&done) == 0 {
							var n int
							iter := snap.NewIterator(dataPrefixSlice(byte(ns), 1), nil)
							iterStat.start()
							for iter.Next() {
								k1 := iter.Key()
								k2 := iter.Value()
								iterStat.record(1)

								if dataNS(k2) != byte(ns) {
									fatalf(nil, "[%02d] READER #%d.%d K%d invalid in-key NS: want=%d got=%d", ns, snapwi, ri, n, ns, dataNS(k2))
								}

								kwritei := dataI(k2)
								if kwritei != snapwi {
									fatalf(nil, "[%02d] READER #%d.%d K%d invalid in-key iter num: %d", ns, snapwi, ri, n, kwritei)
								}

								getStat.start()
								v2, err := snap.Get(k2, nil)
								if err != nil {
									fatalf(err, "[%02d] READER #%d.%d K%d snap.Get: %v\nk1: %x\n -> k2: %x", ns, snapwi, ri, n, err, k1, k2)
								}
								getStat.record(1)

								if checksum0, checksum1 := dataChecksum(v2); checksum0 != checksum1 {
									err := &errors.ErrCorrupted{Fd: storage.FileDesc{0xff, 0}, Err: fmt.Errorf("v2: %x: checksum mismatch: %v vs %v", v2, checksum0, checksum1)}
									fatalf(err, "[%02d] READER #%d.%d K%d snap.Get: %v\nk1: %x\n -> k2: %x", ns, snapwi, ri, n, err, k1, k2)
								}

								n++
								iterStat.start()
							}
							iter.Release()
							if err := iter.Error(); err != nil {
								fatalf(err, "[%02d] READER #%d.%d K%d iter.Error: %v", ns, snapwi, ri, numKey, err)
							}
							if n != numKey {
								fatalf(nil, "[%02d] READER #%d.%d missing keys: want=%d got=%d", ns, snapwi, ri, numKey, n)
							}

							ri++
						}
					}(wi, snap)

					atomic.AddUint32(&wi, 1)
				}
			}()

			delB := new(leveldb.Batch)
			wg.Add(1)
			go func() {
				var (
					i        int
					iterStat = &latencyStats{}
				)
				defer func() {
					log.Printf("[%02d] SCANNER DONE #%d", ns, i)
					wg.Done()
				}()

				time.Sleep(2 * time.Second)

				for atomic.LoadUint32(&done) == 0 {
					var n int
					delB.Reset()
					iter := db.NewIterator(dataNsSlice(byte(ns)), nil)
					iterStat.start()
					for iter.Next() && atomic.LoadUint32(&done) == 0 {
						k := iter.Key()
						v := iter.Value()
						iterStat.record(1)

						for ci, x := range [...][]byte{k, v} {
							checksum0, checksum1 := dataChecksum(x)
							if checksum0 != checksum1 {
								if ci == 0 {
									fatalf(nil, "[%02d] SCANNER %d.%d invalid key checksum: want %d, got %d\n%x -> %x", ns, i, n, checksum0, checksum1, k, v)
								} else {
									fatalf(nil, "[%02d] SCANNER %d.%d invalid value checksum: want %d, got %d\n%x -> %x", ns, i, n, checksum0, checksum1, k, v)
								}
							}
						}

						if dataPrefix(k) == 2 || mrand.Int()%999 == 0 {
							delB.Delete(k)
						}

						n++
						iterStat.start()
					}
					iter.Release()
					if err := iter.Error(); err != nil {
						fatalf(err, "[%02d] SCANNER #%d.%d iter.Error: %v", ns, i, n, err)
					}

					if n > 0 {
						log.Printf("[%02d] SCANNER #%d IterLatency=%v", ns, i, iterStat.avg())
					}

					if delB.Len() > 0 && atomic.LoadUint32(&done) == 0 {
						t := time.Now()
						writeReq <- delB
						if err := <-writeAck; err != nil {
							writeAckAck <- struct{}{}
							fatalf(err, "[%02d] SCANNER #%d db.Write: %v", ns, i, err)
						} else {
							writeAckAck <- struct{}{}
						}
						log.Printf("[%02d] SCANNER #%d Deleted=%d Time=%v", ns, i, delB.Len(), time.Now().Sub(t))
					}

					i++
				}
			}()
		}(ns, numKey)
	}

	go func() {
		sig := make(chan os.Signal)
		signal.Notify(sig, os.Interrupt, os.Kill)
		log.Printf("Got signal: %v, exiting...", <-sig)
		atomic.StoreUint32(&done, 1)
	}()

	wg.Wait()
}
Ejemplo n.º 7
0
func (ts *testingStorage) scanTable(fd storage.FileDesc, checksum bool) (corrupted bool) {
	r, err := ts.Open(fd)
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	size, err := r.Seek(0, os.SEEK_END)
	if err != nil {
		log.Fatal(err)
	}

	o := &opt.Options{
		DisableLargeBatchTransaction: true,
		Strict: opt.NoStrict,
	}
	if checksum {
		o.Strict = opt.StrictBlockChecksum | opt.StrictReader
	}
	tr, err := table.NewReader(r, size, fd, nil, bpool, o)
	if err != nil {
		log.Fatal(err)
	}
	defer tr.Release()

	checkData := func(i int, t string, data []byte) bool {
		if len(data) == 0 {
			panic(fmt.Sprintf("[%v] nil data: i=%d t=%s", fd, i, t))
		}

		checksum0, checksum1 := dataChecksum(data)
		if checksum0 != checksum1 {
			atomic.StoreUint32(&fail, 1)
			atomic.StoreUint32(&done, 1)
			corrupted = true

			data0, data1 := dataSplit(data)
			data0c0, data0c1 := dataChecksum(data0)
			data1c0, data1c1 := dataChecksum(data1)
			log.Printf("FATAL: [%v] Corrupted data i=%d t=%s (%#x != %#x): %x(%v) vs %x(%v)",
				fd, i, t, checksum0, checksum1, data0, data0c0 == data0c1, data1, data1c0 == data1c1)
			return true
		}
		return false
	}

	iter := tr.NewIterator(nil, nil)
	defer iter.Release()
	for i := 0; iter.Next(); i++ {
		ukey, _, kt, kerr := parseIkey(iter.Key())
		if kerr != nil {
			atomic.StoreUint32(&fail, 1)
			atomic.StoreUint32(&done, 1)
			corrupted = true

			log.Printf("FATAL: [%v] Corrupted ikey i=%d: %v", fd, i, kerr)
			return
		}
		if checkData(i, "key", ukey) {
			return
		}
		if kt == ktVal && checkData(i, "value", iter.Value()) {
			return
		}
	}
	if err := iter.Error(); err != nil {
		if errors.IsCorrupted(err) {
			atomic.StoreUint32(&fail, 1)
			atomic.StoreUint32(&done, 1)
			corrupted = true

			log.Printf("FATAL: [%v] Corruption detected: %v", fd, err)
		} else {
			log.Fatal(err)
		}
	}

	return
}
Ejemplo n.º 8
0
// NewReader creates a new initialized table reader for the file.
// The fi, cache and bpool is optional and can be nil.
//
// The returned table reader instance is goroutine-safe.
func NewReader(f io.ReaderAt, size int64, fd storage.FileDesc, cache *cache.NamespaceGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) {
	if f == nil {
		return nil, errors.New("leveldb/table: nil file")
	}

	r := &Reader{
		fd:             fd,
		reader:         f,
		cache:          cache,
		bpool:          bpool,
		o:              o,
		cmp:            o.GetComparer(),
		verifyChecksum: o.GetStrict(opt.StrictBlockChecksum),
	}

	if size < footerLen {
		r.err = r.newErrCorrupted(0, size, "table", "too small")
		return r, nil
	}

	footerPos := size - footerLen
	var footer [footerLen]byte
	if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF {
		return nil, err
	}
	if string(footer[footerLen-len(magic):footerLen]) != magic {
		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number")
		return r, nil
	}

	var n int
	// Decode the metaindex block handle.
	r.metaBH, n = decodeBlockHandle(footer[:])
	if n == 0 {
		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle")
		return r, nil
	}

	// Decode the index block handle.
	r.indexBH, n = decodeBlockHandle(footer[n:])
	if n == 0 {
		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle")
		return r, nil
	}

	// Read metaindex block.
	metaBlock, err := r.readBlock(r.metaBH, true)
	if err != nil {
		if errors.IsCorrupted(err) {
			r.err = err
			return r, nil
		} else {
			return nil, err
		}
	}

	// Set data end.
	r.dataEnd = int64(r.metaBH.offset)

	// Read metaindex.
	metaIter := r.newBlockIter(metaBlock, nil, nil, true)
	for metaIter.Next() {
		key := string(metaIter.Key())
		if !strings.HasPrefix(key, "filter.") {
			continue
		}
		fn := key[7:]
		if f0 := o.GetFilter(); f0 != nil && f0.Name() == fn {
			r.filter = f0
		} else {
			for _, f0 := range o.GetAltFilters() {
				if f0.Name() == fn {
					r.filter = f0
					break
				}
			}
		}
		if r.filter != nil {
			filterBH, n := decodeBlockHandle(metaIter.Value())
			if n == 0 {
				continue
			}
			r.filterBH = filterBH
			// Update data end.
			r.dataEnd = int64(filterBH.offset)
			break
		}
	}
	metaIter.Release()
	metaBlock.Release()

	// Cache index and filter block locally, since we don't have global cache.
	if cache == nil {
		r.indexBlock, err = r.readBlock(r.indexBH, true)
		if err != nil {
			if errors.IsCorrupted(err) {
				r.err = err
				return r, nil
			} else {
				return nil, err
			}
		}
		if r.filter != nil {
			r.filterBlock, err = r.readFilterBlock(r.filterBH)
			if err != nil {
				if !errors.IsCorrupted(err) {
					return nil, err
				}

				// Don't use filter then.
				r.filter = nil
			}
		}
	}

	return r, nil
}
Ejemplo n.º 9
0
func (r *Reader) find(key []byte, filtered bool, ro *opt.ReadOptions, noValue bool) (rkey, value []byte, err error) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	if r.err != nil {
		err = r.err
		return
	}

	indexBlock, rel, err := r.getIndexBlock(true)
	if err != nil {
		return
	}
	defer rel.Release()

	index := r.newBlockIter(indexBlock, nil, nil, true)
	defer index.Release()
	if !index.Seek(key) {
		err = index.Error()
		if err == nil {
			err = ErrNotFound
		}
		return
	}
	dataBH, n := decodeBlockHandle(index.Value())
	if n == 0 {
		r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
		return
	}
	if filtered && r.filter != nil {
		filterBlock, frel, ferr := r.getFilterBlock(true)
		if ferr == nil {
			if !filterBlock.contains(r.filter, dataBH.offset, key) {
				frel.Release()
				return nil, nil, ErrNotFound
			}
			frel.Release()
		} else if !errors.IsCorrupted(ferr) {
			err = ferr
			return
		}
	}
	data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache())
	defer data.Release()
	if !data.Seek(key) {
		err = data.Error()
		if err == nil {
			err = ErrNotFound
		}
		return
	}
	// Don't use block buffer, no need to copy the buffer.
	rkey = data.Key()
	if !noValue {
		if r.bpool == nil {
			value = data.Value()
		} else {
			// Use block buffer, and since the buffer will be recycled, the buffer
			// need to be copied.
			value = append([]byte{}, data.Value()...)
		}
	}
	return
}
Ejemplo n.º 10
0
func (db *DB) recoverJournalRO() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		// Options.
		strict      = db.s.o.GetStrict(opt.StrictJournal)
		checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
		writeBuffer = db.s.o.GetWriteBuffer()

		mdb = memdb.New(db.s.icmp, writeBuffer)
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery RO·Mode F·%d", len(fds))

		var (
			jr    *journal.Reader
			buf   = &util.Buffer{}
			batch = &Batch{}
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Replay journal to memdb.
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batch.seq + uint64(batch.Len())
			}

			fr.Close()
		}
	}

	// Set memDB.
	db.mem = &memDB{db: db, DB: mdb, ref: 1}

	return nil
}
Ejemplo n.º 11
0
func (db *DB) recoverJournal() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		ofd storage.FileDesc // Obsolete file.
		rec = &sessionRecord{}
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery F·%d", len(fds))

		// Mark file number as used.
		db.s.markFileNum(fds[len(fds)-1].Num)

		var (
			// Options.
			strict      = db.s.o.GetStrict(opt.StrictJournal)
			checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
			writeBuffer = db.s.o.GetWriteBuffer()

			jr    *journal.Reader
			mdb   = memdb.New(db.s.icmp, writeBuffer)
			buf   = &util.Buffer{}
			batch = &Batch{}
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Flush memdb and remove obsolete journal file.
			if !ofd.Nil() {
				if mdb.Len() > 0 {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}
				}

				rec.setJournalNum(fd.Num)
				rec.setSeqNum(db.seq)
				if err := db.s.commit(rec); err != nil {
					fr.Close()
					return err
				}
				rec.resetAddedTables()

				db.s.stor.Remove(ofd)
				ofd = storage.FileDesc{}
			}

			// Replay journal to memdb.
			mdb.Reset()
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batch.seq + uint64(batch.Len())

				// Flush it if large enough.
				if mdb.Size() >= writeBuffer {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}

					mdb.Reset()
				}
			}

			fr.Close()
			ofd = fd
		}

		// Flush the last memdb.
		if mdb.Len() > 0 {
			if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := db.newMem(0); err != nil {
		return err
	}

	// Commit.
	rec.setJournalNum(db.journalFd.Num)
	rec.setSeqNum(db.seq)
	if err := db.s.commit(rec); err != nil {
		// Close journal on error.
		if db.journal != nil {
			db.journal.Close()
			db.journalWriter.Close()
		}
		return err
	}

	// Remove the last obsolete journal file.
	if !ofd.Nil() {
		db.s.stor.Remove(ofd)
	}

	return nil
}
Ejemplo n.º 12
0
func recoverTable(s *session, o *opt.Options) error {
	o = dupOptions(o)
	// Mask StrictReader, lets StrictRecovery doing its job.
	o.Strict &= ^opt.StrictReader

	// Get all tables and sort it by file number.
	fds, err := s.stor.List(storage.TypeTable)
	if err != nil {
		return err
	}
	sortFds(fds)

	var (
		maxSeq                                                            uint64
		recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int

		// We will drop corrupted table.
		strict = o.GetStrict(opt.StrictRecovery)
		noSync = o.GetNoSync()

		rec   = &sessionRecord{}
		bpool = util.NewBufferPool(o.GetBlockSize() + 5)
	)
	buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) {
		tmpFd = s.newTemp()
		writer, err := s.stor.Create(tmpFd)
		if err != nil {
			return
		}
		defer func() {
			writer.Close()
			if err != nil {
				s.stor.Remove(tmpFd)
				tmpFd = storage.FileDesc{}
			}
		}()

		// Copy entries.
		tw := table.NewWriter(writer, o)
		for iter.Next() {
			key := iter.Key()
			if validInternalKey(key) {
				err = tw.Append(key, iter.Value())
				if err != nil {
					return
				}
			}
		}
		err = iter.Error()
		if err != nil {
			return
		}
		err = tw.Close()
		if err != nil {
			return
		}
		if !noSync {
			err = writer.Sync()
			if err != nil {
				return
			}
		}
		size = int64(tw.BytesLen())
		return
	}
	recoverTable := func(fd storage.FileDesc) error {
		s.logf("table@recovery recovering @%d", fd.Num)
		reader, err := s.stor.Open(fd)
		if err != nil {
			return err
		}
		var closed bool
		defer func() {
			if !closed {
				reader.Close()
			}
		}()

		// Get file size.
		size, err := reader.Seek(0, 2)
		if err != nil {
			return err
		}

		var (
			tSeq                                     uint64
			tgoodKey, tcorruptedKey, tcorruptedBlock int
			imin, imax                               []byte
		)
		tr, err := table.NewReader(reader, size, fd, nil, bpool, o)
		if err != nil {
			return err
		}
		iter := tr.NewIterator(nil, nil)
		if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok {
			itererr.SetErrorCallback(func(err error) {
				if errors.IsCorrupted(err) {
					s.logf("table@recovery block corruption @%d %q", fd.Num, err)
					tcorruptedBlock++
				}
			})
		}

		// Scan the table.
		for iter.Next() {
			key := iter.Key()
			_, seq, _, kerr := parseInternalKey(key)
			if kerr != nil {
				tcorruptedKey++
				continue
			}
			tgoodKey++
			if seq > tSeq {
				tSeq = seq
			}
			if imin == nil {
				imin = append([]byte{}, key...)
			}
			imax = append(imax[:0], key...)
		}
		if err := iter.Error(); err != nil {
			iter.Release()
			return err
		}
		iter.Release()

		goodKey += tgoodKey
		corruptedKey += tcorruptedKey
		corruptedBlock += tcorruptedBlock

		if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
			droppedTable++
			s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
			return nil
		}

		if tgoodKey > 0 {
			if tcorruptedKey > 0 || tcorruptedBlock > 0 {
				// Rebuild the table.
				s.logf("table@recovery rebuilding @%d", fd.Num)
				iter := tr.NewIterator(nil, nil)
				tmpFd, newSize, err := buildTable(iter)
				iter.Release()
				if err != nil {
					return err
				}
				closed = true
				reader.Close()
				if err := s.stor.Rename(tmpFd, fd); err != nil {
					return err
				}
				size = newSize
			}
			if tSeq > maxSeq {
				maxSeq = tSeq
			}
			recoveredKey += tgoodKey
			// Add table to level 0.
			rec.addTable(0, fd.Num, size, imin, imax)
			s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
		} else {
			droppedTable++
			s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size)
		}

		return nil
	}

	// Recover all tables.
	if len(fds) > 0 {
		s.logf("table@recovery F·%d", len(fds))

		// Mark file number as used.
		s.markFileNum(fds[len(fds)-1].Num)

		for _, fd := range fds {
			if err := recoverTable(fd); err != nil {
				return err
			}
		}

		s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq)
	}

	// Set sequence number.
	rec.setSeqNum(maxSeq)

	// Create new manifest.
	if err := s.create(); err != nil {
		return err
	}

	// Commit.
	return s.commit(rec)
}