// Scans a file for checksum errors and returns all the corrupt ranges and
// errors encountered.
//
// The file layout is a sequence of blocks, each checksumInterval data bytes
// followed by a 4-byte big-endian murmur3 checksum of those bytes. The
// returned ranges are logical (checksum-stripped) offsets; a final range
// ending at math.MaxUint32 marks "everything from here on is suspect".
func groupChecksumVerify(fpr io.Reader) ([]*groupCorruptRange, []error) {
	header, checksumInterval, err := readGroupHeader(fpr)
	if err != nil {
		// Header unreadable: the whole file must be treated as corrupt.
		return []*groupCorruptRange{&groupCorruptRange{0, math.MaxUint32}}, []error{err}
	}
	// buf holds one block: checksumInterval data bytes + 4 checksum bytes.
	buf := make([]byte, checksumInterval+4)
	// The header bytes count toward the first block's checksummed data, so
	// seed the buffer with them and read only the remainder from disk.
	copy(buf, header)
	if _, err := io.ReadFull(fpr, buf[len(header):]); err != nil {
		return []*groupCorruptRange{&groupCorruptRange{0, math.MaxUint32}}, []error{err}
	}
	// start/stop track the inclusive logical offsets covered by the block
	// currently sitting in buf.
	start := uint32(0)
	stop := checksumInterval - 1
	var corruptions []*groupCorruptRange
	var errs []error
	for {
		if murmur3.Sum32(buf[:checksumInterval]) != binary.BigEndian.Uint32(buf[checksumInterval:]) {
			corruptions = append(corruptions, &groupCorruptRange{start, stop})
		}
		start = stop + 1
		stop = stop + checksumInterval
		if _, err := io.ReadFull(fpr, buf); err != nil {
			// Short or failed read: anything from start onward cannot be
			// verified. Note io.EOF lands here too, so even a clean scan
			// ends by recording this open-ended trailing range plus the
			// EOF "error".
			corruptions = append(corruptions, &groupCorruptRange{start, math.MaxUint32})
			errs = append(errs, err)
			break
		}
	}
	return corruptions, errs
}
func (fl *groupStoreFile) writingChecksummer() { for { buf := <-fl.writerChecksumBufChan if buf == nil { break } binary.BigEndian.PutUint32(buf.buf[fl.store.checksumInterval:], murmur3.Sum32(buf.buf[:fl.store.checksumInterval])) fl.writerToDiskBufChan <- buf } fl.writerDoneChan <- struct{}{} }
func (vf *valuesFile) checksummer() { for { buf := <-vf.checksumChan if buf == nil { break } binary.BigEndian.PutUint32(buf.buf[vf.vs.checksumInterval:], murmur3.Sum32(buf.buf[:vf.vs.checksumInterval])) vf.writeChan <- buf } vf.doneChan <- struct{}{} }
// closeWriting shuts down the write pipeline for this file: drains the
// checksummer and disk-writer goroutines, flushes a terminator block,
// closes the file, recycles memory blocks, and nils the channels so a
// second call is a no-op. Returns the first error encountered (later
// errors are dropped in favor of the first).
func (fl *groupStoreFile) closeWriting() error {
	// Already closed (or never opened for writing).
	if fl.writerChecksumBufChan == nil {
		return nil
	}
	var reterr error
	// Closing the channel makes every checksummer goroutine receive nil
	// and exit; wait for one done signal per goroutine (cap == count).
	close(fl.writerChecksumBufChan)
	for i := 0; i < cap(fl.writerChecksumBufChan); i++ {
		<-fl.writerDoneChan
	}
	// nil tells the disk-writer goroutine to stop; wait for it too.
	fl.writerToDiskBufChan <- nil
	<-fl.writerDoneChan
	// Pad out the final partial block with a full checksum interval of
	// zeros whose last 8 bytes carry the "TERM v0 " marker, so any
	// trailing data is still covered by a checksum. The TOC never
	// references these padding locations, so they are effectively ignored.
	term := make([]byte, fl.store.checksumInterval)
	copy(term[len(term)-8:], []byte("TERM v0 "))
	left := len(term)
	for left > 0 {
		// Fill the current buffer up to the checksum interval with as much
		// of the terminator as fits.
		n := copy(fl.writerCurrentBuf.buf[fl.writerCurrentBuf.offset:fl.store.checksumInterval], term[len(term)-left:])
		left -= n
		fl.writerCurrentBuf.offset += uint32(n)
		if left > 0 {
			// Buffer reached a full interval; append its checksum before
			// writing (the final, partial write carries no checksum).
			binary.BigEndian.PutUint32(fl.writerCurrentBuf.buf[fl.writerCurrentBuf.offset:], murmur3.Sum32(fl.writerCurrentBuf.buf[:fl.writerCurrentBuf.offset]))
			fl.writerCurrentBuf.offset += 4
		}
		if _, err := fl.writerFP.Write(fl.writerCurrentBuf.buf[:fl.writerCurrentBuf.offset]); err != nil {
			if reterr == nil {
				reterr = err
			}
			break
		}
		fl.writerCurrentBuf.offset = 0
	}
	if err := fl.writerFP.Close(); err != nil {
		if reterr == nil {
			reterr = err
		}
	}
	// Return the buffer's memory blocks to the store's freeable pools,
	// round-robining across the pool channels.
	for _, memBlock := range fl.writerCurrentBuf.memBlocks {
		fl.store.freeableMemBlockChans[fl.freeableMemBlockChanIndex] <- memBlock
		fl.freeableMemBlockChanIndex++
		if fl.freeableMemBlockChanIndex >= len(fl.store.freeableMemBlockChans) {
			fl.freeableMemBlockChanIndex = 0
		}
	}
	// Nil everything so the nil-check above makes repeat calls harmless.
	fl.writerFP = nil
	fl.writerFreeBufChan = nil
	fl.writerChecksumBufChan = nil
	fl.writerToDiskBufChan = nil
	fl.writerDoneChan = nil
	fl.writerCurrentBuf = nil
	return reterr
}
// close shuts down this value file's write pipeline, writes a 16-byte
// terminator (4 zero bytes, 8-byte big-endian final offset, "TERM"),
// closes the file, recycles the buffer's memory, and nils the channels.
// Any write/close error panics — callers have no error return here.
func (vf *valuesFile) close() {
	// Stop the checksummer goroutines (they exit on the nil produced by
	// the close) and wait for one done signal each (cap == count).
	close(vf.checksumChan)
	for i := 0; i < cap(vf.checksumChan); i++ {
		<-vf.doneChan
	}
	// nil stops the disk-writer goroutine; wait for it as well.
	vf.writeChan <- nil
	<-vf.doneChan
	// Terminator layout: [0:4] zeros, [4:12] final data offset, [12:16] "TERM".
	term := make([]byte, 16)
	binary.BigEndian.PutUint64(term[4:], uint64(atomic.LoadUint32(&vf.atOffset)))
	copy(term[12:], "TERM")
	left := len(term)
	for left > 0 {
		// Append as much of the terminator as fits before the checksum
		// boundary, then checksum and flush the buffer.
		n := copy(vf.buf.buf[vf.buf.offset:vf.vs.checksumInterval], term[len(term)-left:])
		vf.buf.offset += uint32(n)
		binary.BigEndian.PutUint32(vf.buf.buf[vf.buf.offset:], murmur3.Sum32(vf.buf.buf[:vf.buf.offset]))
		if _, err := vf.writerFP.Write(vf.buf.buf[:vf.buf.offset+4]); err != nil {
			panic(err)
		}
		vf.buf.offset = 0
		left -= n
	}
	if err := vf.writerFP.Close(); err != nil {
		panic(err)
	}
	// Return the buffer's memory to the store's freeable pools,
	// round-robining across the pool channels.
	for _, vm := range vf.buf.vms {
		vf.vs.freeableVMChans[vf.freeableVMChanIndex] <- vm
		vf.freeableVMChanIndex++
		if vf.freeableVMChanIndex >= len(vf.vs.freeableVMChans) {
			vf.freeableVMChanIndex = 0
		}
	}
	// Nil everything out so later use of a closed file fails fast.
	vf.writerFP = nil
	vf.freeChan = nil
	vf.checksumChan = nil
	vf.writeChan = nil
	vf.doneChan = nil
	vf.buf = nil
}
// recovery rebuilds the in-memory value location map by replaying every
// *.valuestoc file in vs.pathtoc (in sorted/name-timestamp order). TOC
// entries are 32 bytes each, stored in checksummed blocks; entries that
// straddle a block boundary are stitched together via fromDiskOverflow.
// Entries are batched and sharded by keyB across worker goroutines that
// apply them with vs.vlm.Set.
func (vs *DefaultValueStore) recovery() {
	start := time.Now()
	fromDiskCount := 0
	causedChangeCount := int64(0)
	// writeReq mirrors one 32-byte TOC entry plus the owning block ID.
	type writeReq struct {
		keyA          uint64
		keyB          uint64
		timestampbits uint64
		blockID       uint32
		offset        uint32
		length        uint32
	}
	workers := uint64(vs.workers)
	pendingBatchChans := make([]chan []writeReq, workers)
	freeBatchChans := make([]chan []writeReq, len(pendingBatchChans))
	for i := 0; i < len(pendingBatchChans); i++ {
		pendingBatchChans[i] = make(chan []writeReq, 4)
		freeBatchChans[i] = make(chan []writeReq, 4)
		// Pre-fill each free channel with reusable batch slices.
		for j := 0; j < cap(freeBatchChans[i]); j++ {
			freeBatchChans[i] <- make([]writeReq, vs.recoveryBatchSize)
		}
	}
	wg := &sync.WaitGroup{}
	wg.Add(len(pendingBatchChans))
	// One applier goroutine per shard; a nil batch is the stop signal.
	for i := 0; i < len(pendingBatchChans); i++ {
		go func(pendingBatchChan chan []writeReq, freeBatchChan chan []writeReq) {
			for {
				batch := <-pendingBatchChan
				if batch == nil {
					break
				}
				for j := 0; j < len(batch); j++ {
					wr := &batch[j]
					// Local-removal entries clear the location (blockID 0).
					if wr.timestampbits&_TSB_LOCAL_REMOVAL != 0 {
						wr.blockID = 0
					}
					if vs.logDebug != nil {
						// Debug builds also count how many entries actually
						// changed the map (Set returns the prior timestamp).
						if vs.vlm.Set(wr.keyA, wr.keyB, wr.timestampbits, wr.blockID, wr.offset, wr.length, true) < wr.timestampbits {
							atomic.AddInt64(&causedChangeCount, 1)
						}
					} else {
						vs.vlm.Set(wr.keyA, wr.keyB, wr.timestampbits, wr.blockID, wr.offset, wr.length, true)
					}
				}
				freeBatchChan <- batch
			}
			wg.Done()
		}(pendingBatchChans[i], freeBatchChans[i])
	}
	// One checksummed block: checksumInterval data bytes + 4-byte checksum.
	fromDiskBuf := make([]byte, vs.checksumInterval+4)
	fromDiskOverflow := make([]byte, 0, 32)
	batches := make([][]writeReq, len(freeBatchChans))
	batchesPos := make([]int, len(batches))
	fp, err := os.Open(vs.pathtoc)
	if err != nil {
		panic(err)
	}
	names, err := fp.Readdirnames(-1)
	fp.Close()
	if err != nil {
		panic(err)
	}
	sort.Strings(names)
	for i := 0; i < len(names); i++ {
		if !strings.HasSuffix(names[i], ".valuestoc") {
			continue
		}
		// The file name (minus suffix) is the file's creation timestamp.
		namets := int64(0)
		if namets, err = strconv.ParseInt(names[i][:len(names[i])-len(".valuestoc")], 10, 64); err != nil {
			vs.logError.Printf("bad timestamp in name: %#v\n", names[i])
			continue
		}
		if namets == 0 {
			vs.logError.Printf("bad timestamp in name: %#v\n", names[i])
			continue
		}
		vf := newValuesFile(vs, namets, osOpenReadSeeker)
		fp, err := os.Open(path.Join(vs.pathtoc, names[i]))
		if err != nil {
			vs.logError.Printf("error opening %s: %s\n", names[i], err)
			continue
		}
		checksumFailures := 0
		first := true
		terminated := false
		fromDiskOverflow = fromDiskOverflow[:0]
		for {
			n, err := io.ReadFull(fp, fromDiskBuf)
			// Fewer than 4 bytes can't even hold a checksum; treat as EOF.
			if n < 4 {
				if err != io.EOF && err != io.ErrUnexpectedEOF {
					vs.logError.Printf("error reading %s: %s\n", names[i], err)
				}
				break
			}
			n -= 4
			if murmur3.Sum32(fromDiskBuf[:n]) != binary.BigEndian.Uint32(fromDiskBuf[n:]) {
				// Corrupt block: skip its entries but keep scanning.
				checksumFailures++
			} else {
				j := 0
				if first {
					// First block begins with a 32-byte header: 28-byte
					// magic + 4-byte checksum interval.
					if !bytes.Equal(fromDiskBuf[:28], []byte("VALUESTORETOC v0 ")) {
						vs.logError.Printf("bad header: %s\n", names[i])
						break
					}
					if binary.BigEndian.Uint32(fromDiskBuf[28:]) != vs.checksumInterval {
						vs.logError.Printf("bad header checksum interval: %s\n", names[i])
						break
					}
					j += 32
					first = false
				}
				if n < int(vs.checksumInterval) {
					// Short block: must end with the 16-byte terminator
					// (4 zero bytes validated here, ..., "TERM").
					if binary.BigEndian.Uint32(fromDiskBuf[n-16:]) != 0 {
						vs.logError.Printf("bad terminator size marker: %s\n", names[i])
						break
					}
					if !bytes.Equal(fromDiskBuf[n-4:n], []byte("TERM")) {
						vs.logError.Printf("bad terminator: %s\n", names[i])
						break
					}
					n -= 16
					terminated = true
				}
				if len(fromDiskOverflow) > 0 {
					// Complete the entry that straddled the previous block.
					j += 32 - len(fromDiskOverflow)
					fromDiskOverflow = append(fromDiskOverflow, fromDiskBuf[j-32+len(fromDiskOverflow):j]...)
					keyB := binary.BigEndian.Uint64(fromDiskOverflow[8:])
					k := keyB % workers
					if batches[k] == nil {
						batches[k] = <-freeBatchChans[k]
						batchesPos[k] = 0
					}
					wr := &batches[k][batchesPos[k]]
					wr.keyA = binary.BigEndian.Uint64(fromDiskOverflow)
					wr.keyB = keyB
					wr.timestampbits = binary.BigEndian.Uint64(fromDiskOverflow[16:])
					wr.blockID = vf.id
					wr.offset = binary.BigEndian.Uint32(fromDiskOverflow[24:])
					wr.length = binary.BigEndian.Uint32(fromDiskOverflow[28:])
					batchesPos[k]++
					if batchesPos[k] >= vs.recoveryBatchSize {
						pendingBatchChans[k] <- batches[k]
						batches[k] = nil
					}
					fromDiskCount++
					fromDiskOverflow = fromDiskOverflow[:0]
				}
				// Whole 32-byte entries within this block.
				for ; j+32 <= n; j += 32 {
					keyB := binary.BigEndian.Uint64(fromDiskBuf[j+8:])
					k := keyB % workers
					if batches[k] == nil {
						batches[k] = <-freeBatchChans[k]
						batchesPos[k] = 0
					}
					wr := &batches[k][batchesPos[k]]
					wr.keyA = binary.BigEndian.Uint64(fromDiskBuf[j:])
					wr.keyB = keyB
					wr.timestampbits = binary.BigEndian.Uint64(fromDiskBuf[j+16:])
					wr.blockID = vf.id
					wr.offset = binary.BigEndian.Uint32(fromDiskBuf[j+24:])
					wr.length = binary.BigEndian.Uint32(fromDiskBuf[j+28:])
					batchesPos[k]++
					if batchesPos[k] >= vs.recoveryBatchSize {
						pendingBatchChans[k] <- batches[k]
						batches[k] = nil
					}
					fromDiskCount++
				}
				// Stash any partial trailing entry for the next block.
				if j != n {
					fromDiskOverflow = fromDiskOverflow[:n-j]
					copy(fromDiskOverflow, fromDiskBuf[j:])
				}
			}
			if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
				vs.logError.Printf("error reading %s: %s\n", names[i], err)
				break
			}
		}
		fp.Close()
		if !terminated {
			vs.logError.Printf("early end of file: %s\n", names[i])
		}
		if checksumFailures > 0 {
			vs.logWarning.Printf("%d checksum failures for %s\n", checksumFailures, names[i])
		}
	}
	// Flush partial batches, then send the nil stop signal to each worker.
	for i := 0; i < len(batches); i++ {
		if batches[i] != nil {
			pendingBatchChans[i] <- batches[i][:batchesPos[i]]
		}
		pendingBatchChans[i] <- nil
	}
	wg.Wait()
	if vs.logDebug != nil {
		dur := time.Now().Sub(start)
		valueCount, valueLength, _ := vs.GatherStats(false)
		vs.logInfo.Printf("%d key locations loaded in %s, %.0f/s; %d caused change; %d resulting locations referencing %d bytes.\n", fromDiskCount, dur, float64(fromDiskCount)/(float64(dur)/float64(time.Second)), causedChangeCount, valueCount, valueLength)
	}
}
func groupReadTOCEntriesBatched(fpr io.ReadSeeker, blockID uint32, freeBatchChans []chan []groupTOCEntry, pendingBatchChans []chan []groupTOCEntry, controlChan chan struct{}) (int, []error) { // There is an assumption that the checksum interval is greater than the // _GROUP_FILE_HEADER_SIZE and that the _GROUP_FILE_ENTRY_SIZE is // greater than the _GROUP_FILE_TRAILER_SIZE. var errs []error var checksumInterval int if _, ci, err := readGroupHeaderTOC(fpr); err != nil { return 0, append(errs, err) } else { checksumInterval = int(ci) } fpr.Seek(0, 0) buf := make([]byte, checksumInterval+4+_GROUP_FILE_ENTRY_SIZE) rpos := 0 checksumErrors := 0 workers := uint64(len(freeBatchChans)) batches := make([][]groupTOCEntry, workers) batches[0] = <-freeBatchChans[0] batches[0] = batches[0][:cap(batches[0])] batchSize := len(batches[0]) batchesPos := make([]int, len(batches)) fromDiskCount := 0 skipNext := _GROUP_FILE_HEADER_SIZE more := true L1: for more { select { case <-controlChan: break L1 default: } rbuf := buf[rpos : rpos+checksumInterval+4] if n, err := io.ReadFull(fpr, rbuf); err == io.ErrUnexpectedEOF || err == io.EOF { rbuf = rbuf[:n] more = false } else if err != nil { errs = append(errs, err) break } else { cbuf := rbuf[len(rbuf)-4:] rbuf = rbuf[:len(rbuf)-4] if binary.BigEndian.Uint32(cbuf) != murmur3.Sum32(rbuf) { checksumErrors++ rbuf = buf[:rpos+len(rbuf)] skipNext = _GROUP_FILE_ENTRY_SIZE - ((skipNext + len(rbuf)) % _GROUP_FILE_ENTRY_SIZE) rpos = 0 continue } } if skipNext != 0 { rbuf = rbuf[skipNext:] skipNext = 0 } else { rbuf = buf[:rpos+len(rbuf)] } if !more { if len(rbuf) >= _VALUE_FILE_TRAILER_SIZE && bytes.Equal(rbuf[len(rbuf)-_GROUP_FILE_TRAILER_SIZE:], []byte("TERM v0 ")) { rbuf = rbuf[:len(rbuf)-_GROUP_FILE_TRAILER_SIZE] } else { errs = append(errs, errors.New("no terminator found")) } } for len(rbuf) >= _GROUP_FILE_ENTRY_SIZE { offset := binary.BigEndian.Uint32(rbuf[40:]) if offset != 0 { fromDiskCount++ keyB := binary.BigEndian.Uint64(rbuf[8:]) k := 
keyB % workers if batches[k] == nil { batches[k] = <-freeBatchChans[k] batches[k] = batches[k][:cap(batches[k])] batchesPos[k] = 0 } wr := &batches[k][batchesPos[k]] wr.KeyA = binary.BigEndian.Uint64(rbuf) wr.KeyB = keyB wr.ChildKeyA = binary.BigEndian.Uint64(rbuf[16:]) wr.ChildKeyB = binary.BigEndian.Uint64(rbuf[24:]) wr.TimestampBits = binary.BigEndian.Uint64(rbuf[32:]) wr.BlockID = blockID wr.Offset = offset wr.Length = binary.BigEndian.Uint32(rbuf[44:]) batchesPos[k]++ if batchesPos[k] >= batchSize { pendingBatchChans[k] <- batches[k] batches[k] = nil } } rbuf = rbuf[_GROUP_FILE_ENTRY_SIZE:] } rpos = copy(buf, rbuf) } for i := 0; i < len(batches); i++ { if batches[i] != nil { pendingBatchChans[i] <- batches[i][:batchesPos[i]] } } if checksumErrors > 0 { errs = append(errs, fmt.Errorf("there were %d checksum errors", checksumErrors)) } return fromDiskCount, errs }
func (h *HashTable) hash(key *string, offset uint32) uint32 { data := []byte(*key) return (murmur3.Sum32(data) + offset) % h.capacity }
func (vs *DefaultValueStore) compactFile(name string, candidateBlockID uint32) (compactionResult, error) { var cr compactionResult fromDiskBuf := make([]byte, vs.checksumInterval+4) fromDiskOverflow := make([]byte, 0, 32) fp, err := os.Open(name) if err != nil { vs.logError.Printf("error opening %s: %s\n", name, err) return cr, errors.New("Error opening toc") } first := true terminated := false fromDiskOverflow = fromDiskOverflow[:0] for { n, err := io.ReadFull(fp, fromDiskBuf) if n < 4 { if err != io.EOF && err != io.ErrUnexpectedEOF { vs.logError.Printf("error reading %s: %s\n", name, err) return cr, errors.New("Error attempting to read toc") } break } n -= 4 if murmur3.Sum32(fromDiskBuf[:n]) != binary.BigEndian.Uint32(fromDiskBuf[n:]) { cr.checksumFailures++ } else { j := 0 if first { if !bytes.Equal(fromDiskBuf[:28], []byte("VALUESTORETOC v0 ")) { vs.logError.Printf("bad header: %s\n", name) return cr, errors.New("Bad header") } if binary.BigEndian.Uint32(fromDiskBuf[28:]) != vs.checksumInterval { vs.logError.Printf("bad header checksum interval: %s\n", name) return cr, errors.New("Bad header checksum interval") } j += 32 first = false } if n < int(vs.checksumInterval) { if binary.BigEndian.Uint32(fromDiskBuf[n-16:]) != 0 { vs.logError.Printf("bad terminator size marker: %s\n", name) return cr, errors.New("Error on toc term size marker") } if !bytes.Equal(fromDiskBuf[n-4:n], []byte("TERM")) { vs.logError.Printf("bad terminator: %s\n", name) return cr, errors.New("Error on toc term marker") } n -= 16 terminated = true } if len(fromDiskOverflow) > 0 { j += 32 - len(fromDiskOverflow) fromDiskOverflow = append(fromDiskOverflow, fromDiskBuf[j-32+len(fromDiskOverflow):j]...) 
keyB := binary.BigEndian.Uint64(fromDiskOverflow[8:]) keyA := binary.BigEndian.Uint64(fromDiskOverflow) timestampbits := binary.BigEndian.Uint64(fromDiskOverflow[16:]) fromDiskOverflow = fromDiskOverflow[:0] tsm, blockid, _, _ := vs.lookup(keyA, keyB) if tsm>>_TSB_UTIL_BITS != timestampbits>>_TSB_UTIL_BITS && blockid != candidateBlockID || tsm&_TSB_DELETION != 0 { cr.count++ cr.stale++ } else { var value []byte _, value, err := vs.read(keyA, keyB, value) if err != nil { vs.logCritical.Println("Error on rewrite read", err) return cr, errors.New("Error on read for compaction rewrite.") } _, err = vs.write(keyA, keyB, timestampbits|_TSB_COMPACTION_REWRITE, value) if err != nil { vs.logCritical.Println("Error on rewrite", err) return cr, errors.New("Write error on compaction rewrite.") } cr.count++ cr.rewrote++ } } for ; j+32 <= n; j += 32 { keyB := binary.BigEndian.Uint64(fromDiskBuf[j+8:]) keyA := binary.BigEndian.Uint64(fromDiskBuf[j:]) timestampbits := binary.BigEndian.Uint64(fromDiskBuf[j+16:]) tsm, blockid, _, _ := vs.lookup(keyA, keyB) if tsm>>_TSB_UTIL_BITS != timestampbits>>_TSB_UTIL_BITS && blockid != candidateBlockID || tsm&_TSB_DELETION != 0 { cr.count++ cr.stale++ } else { var value []byte _, value, err := vs.read(keyA, keyB, value) if err != nil { vs.logCritical.Println("Error on rewrite read", err) return cr, errors.New("Error on rewrite read") } _, err = vs.write(keyA, keyB, timestampbits|_TSB_COMPACTION_REWRITE, value) if err != nil { vs.logCritical.Println("Error on rewrite", err) return cr, errors.New("Error on rewrite") } cr.count++ cr.rewrote++ } } if j != n { fromDiskOverflow = fromDiskOverflow[:n-j] copy(fromDiskOverflow, fromDiskBuf[j:]) } } if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { vs.logError.Printf("error reading %s: %s\n", name, err) return cr, errors.New("EOF while reading toc during compaction") } } fp.Close() if !terminated { vs.logError.Printf("early end of file: %s\n", name) return cr, nil } if cr.checksumFailures > 
0 { vs.logWarning.Printf("%d checksum failures for %s\n", cr.checksumFailures, name) return cr, nil } return cr, nil }
// sampleTOC scans the TOC file `name`, sampling roughly one entry in every
// skipCount+1 (phase-shifted by skipOffset), and counts how many sampled
// entries are stale — superseded in the location map, deleted, or no longer
// pointing at candidateBlockID. Returns (total entries seen, stale sampled
// entries, error).
func (vs *DefaultValueStore) sampleTOC(name string, candidateBlockID uint32, skipOffset, skipCount int) (int, int, error) {
	count := 0
	stale := 0
	// One checksummed block: checksumInterval data bytes + 4-byte checksum.
	fromDiskBuf := make([]byte, vs.checksumInterval+4)
	// Holds a 32-byte TOC entry that straddles two checksum blocks.
	fromDiskOverflow := make([]byte, 0, 32)
	fp, err := os.Open(name)
	if err != nil {
		vs.logError.Printf("error opening %s: %s\n", name, err)
		return 0, 0, err
	}
	checksumFailures := 0
	first := true
	terminated := false
	fromDiskOverflow = fromDiskOverflow[:0]
	// Negative start offsets the sampling phase across callers.
	skipCounter := 0 - skipOffset
	for {
		n, err := io.ReadFull(fp, fromDiskBuf)
		// Fewer than 4 bytes can't even hold a checksum; treat as EOF.
		if n < 4 {
			if err != io.EOF && err != io.ErrUnexpectedEOF {
				vs.logError.Printf("error reading %s: %s\n", name, err)
			}
			break
		}
		n -= 4
		if murmur3.Sum32(fromDiskBuf[:n]) != binary.BigEndian.Uint32(fromDiskBuf[n:]) {
			// Corrupt block: skip its entries but keep scanning.
			checksumFailures++
		} else {
			j := 0
			if first {
				// First block carries a 32-byte header: magic + interval.
				if !bytes.Equal(fromDiskBuf[:28], []byte("VALUESTORETOC v0 ")) {
					vs.logError.Printf("bad header: %s\n", name)
					break
				}
				if binary.BigEndian.Uint32(fromDiskBuf[28:]) != vs.checksumInterval {
					vs.logError.Printf("bad header checksum interval: %s\n", name)
					break
				}
				j += 32
				first = false
			}
			if n < int(vs.checksumInterval) {
				// Short block: must end with the 16-byte terminator.
				if binary.BigEndian.Uint32(fromDiskBuf[n-16:]) != 0 {
					vs.logError.Printf("bad terminator size marker: %s\n", name)
					break
				}
				if !bytes.Equal(fromDiskBuf[n-4:n], []byte("TERM")) {
					vs.logError.Printf("bad terminator: %s\n", name)
					break
				}
				n -= 16
				terminated = true
			}
			if len(fromDiskOverflow) > 0 {
				// Complete the entry that straddled the previous block.
				j += 32 - len(fromDiskOverflow)
				fromDiskOverflow = append(fromDiskOverflow, fromDiskBuf[j-32+len(fromDiskOverflow):j]...)
				keyB := binary.BigEndian.Uint64(fromDiskOverflow[8:])
				keyA := binary.BigEndian.Uint64(fromDiskOverflow)
				timestampbits := binary.BigEndian.Uint64(fromDiskOverflow[16:])
				fromDiskOverflow = fromDiskOverflow[:0]
				count++
				if skipCounter == skipCount {
					tsm, blockid, _, _ := vs.lookup(keyA, keyB)
					if tsm>>_TSB_UTIL_BITS != timestampbits>>_TSB_UTIL_BITS && blockid != candidateBlockID || tsm&_TSB_DELETION != 0 {
						stale++
					}
					skipCounter = 0
				} else {
					skipCounter++
				}
			}
			// Whole 32-byte entries within this block.
			for ; j+32 <= n; j += 32 {
				keyB := binary.BigEndian.Uint64(fromDiskBuf[j+8:])
				keyA := binary.BigEndian.Uint64(fromDiskBuf[j:])
				timestampbits := binary.BigEndian.Uint64(fromDiskBuf[j+16:])
				// NOTE(review): here the lookup runs for every entry, even
				// skipped ones, unlike the overflow branch above where it
				// is gated by skipCounter — looks like wasted work on
				// skipped entries; confirm lookup has no needed side
				// effects before hoisting it inside the gate.
				tsm, blockid, _, _ := vs.lookup(keyA, keyB)
				count++
				if skipCounter == skipCount {
					if tsm>>_TSB_UTIL_BITS != timestampbits>>_TSB_UTIL_BITS && blockid != candidateBlockID || tsm&_TSB_DELETION != 0 {
						stale++
					}
					skipCounter = 0
				} else {
					skipCounter++
				}
			}
			// Stash any partial trailing entry for the next block.
			if j != n {
				fromDiskOverflow = fromDiskOverflow[:n-j]
				copy(fromDiskOverflow, fromDiskBuf[j:])
			}
		}
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			vs.logError.Printf("error reading %s: %s\n", name, err)
			break
		}
	}
	fp.Close()
	if !terminated {
		vs.logError.Printf("early end of file: %s\n", name)
	}
	if checksumFailures > 0 {
		vs.logWarning.Printf("%d checksum failures for %s\n", checksumFailures, name)
	}
	return count, stale, nil
}
// MurMurHash算法 :https://github.com/spaolacci/murmur3 func (c *Consistent) hashStr(key string) uint32 { // return crc32.ChecksumIEEE([]byte(key)) return murmur3.Sum32([]byte(key)) }