// Generates an index from a reader // This is mostly a utility function to avoid being overly verbose in tests that need // an index to work, but don't want to construct one by hand in order to avoid the dependencies // obviously this means that those tests are likely to fail if there are issues with any of the other // modules, which is not ideal. // TODO: move to util? func BuildChecksumIndex(check *filechecksum.FileChecksumGenerator, r io.Reader) ( fcheck []byte, i *index.ChecksumIndex, lookup filechecksum.ChecksumLookup, err error, ) { b := bytes.NewBuffer(nil) fcheck, err = check.GenerateChecksums(r, b) if err != nil { return } weakSize := check.WeakRollingHash.Size() strongSize := check.GetStrongHash().Size() readChunks, err := chunks.LoadChecksumsFromReader(b, weakSize, strongSize) if err != nil { return } i = index.MakeChecksumIndex(readChunks) lookup = chunks.StrongChecksumGetter(readChunks) return }
/*
TODO: When matching duplicated blocks, a channel of BlockMatchResult slices
would be more efficient
*/
// startFindMatchingBlocks_int scans the comparison stream for blocks that
// exist in the reference index, sending one BlockMatchResult per match (or
// one carrying Err on failure) to results. It rolls a weak checksum across
// the stream one byte at a time; on a weak hit it confirms with the strong
// hash, and after a confirmed match it skips ahead a whole block, since an
// overlapping match is impossible. baseOffset is added to every reported
// comparison offset. Closes results when done.
func (c *Comparer) startFindMatchingBlocks_int(
	results chan<- BlockMatchResult,
	comparison io.Reader,
	baseOffset int64,
	generator *filechecksum.FileChecksumGenerator,
	reference Index,
) {
	defer close(results)
	block := make([]byte, generator.BlockSize)

	var err error

	// ReportErr delivers a failure to the consumer in-band on the results
	// channel rather than via a separate error path.
	ReportErr := func(err error) {
		results <- BlockMatchResult{
			Err: err,
		}
	}

	// Prime the window: a full first block is required before any rolling
	// comparison can happen.
	_, err = io.ReadFull(comparison, block)

	if err != nil {
		// NOTE(review): capitalized error string; Go convention is lowercase,
		// but changing it would alter runtime-visible text.
		ReportErr(
			fmt.Errorf("Error reading first block in comparison: %v", err),
		)
		return
	}

	generator.WeakRollingHash.SetBlock(block)
	singleByte := make([]byte, 1)
	weaksum := make([]byte, generator.WeakRollingHash.Size())
	// strongSum is reused across iterations (length 0, capacity = hash size)
	// so strong.Sum appends without reallocating.
	strongSum := make([]byte, 0, generator.GetStrongHash().Size())
	// blockMemory holds the current window's bytes so the strong hash can be
	// computed on demand, and tracks bytes evicted as the window slides.
	blockMemory := circularbuffer.MakeC2Buffer(int(generator.BlockSize))
	blockMemory.Write(block)
	strong := generator.GetStrongHash()

	// All the bytes
	i := int64(0)          // offset of the current window within comparison
	next := READ_NEXT_BYTE // read-state machine: byte / block / drain (READ_NONE)

	//ReadLoop:
	for {
		atomic.AddInt64(&c.Comparisons, 1)

		// look for a weak match
		generator.WeakRollingHash.GetSum(weaksum)
		if weakMatchList := reference.FindWeakChecksum2(weaksum); weakMatchList != nil {
			atomic.AddInt64(&c.WeakHashHits, 1)
			block = blockMemory.GetBlock()

			// Confirm the weak hit with the strong hash over the current window.
			strong.Reset()
			strong.Write(block)
			strongSum = strong.Sum(strongSum)
			strongList := reference.FindStrongChecksum2(strongSum, weakMatchList)

			// clear the slice
			strongSum = strongSum[:0]

			// If there are many matches, it means that this block is
			// duplicated in the reference.
			// since we care about finding all the blocks in the reference,
			// we must report all of them
			off := i + baseOffset
			for _, strongMatch := range strongList {
				results <- BlockMatchResult{
					ComparisonOffset: off,
					BlockIdx:         strongMatch.ChunkOffset,
				}
			}

			if len(strongList) > 0 {
				atomic.AddInt64(&c.StrongHashHits, 1)
				if next == READ_NONE {
					// found the match at the end, so exit
					break
				}
				// No point looking for a match that overlaps this block
				next = READ_NEXT_BLOCK
			}
		}

		var n int
		var readBytes []byte

		// Advance the window: either slide by one byte (no match last round)
		// or jump a whole block (a match just ended at the window boundary).
		switch next {
		case READ_NEXT_BYTE:
			n, err = comparison.Read(singleByte)
			readBytes = singleByte
		case READ_NEXT_BLOCK:
			n, err = io.ReadFull(comparison, block)
			readBytes = block[:n]
			next = READ_NEXT_BYTE
		}

		if uint(n) == generator.BlockSize {
			// Got a full block: reset the rolling hash and window wholesale.
			generator.WeakRollingHash.SetBlock(block)
			blockMemory.Write(block)
			i += int64(n)
		} else if n > 0 {
			// Partial read: roll the hash by feeding the new bytes in and the
			// evicted bytes out, keeping the window length b_len consistent.
			b_len := blockMemory.Len()
			blockMemory.Write(readBytes)
			generator.WeakRollingHash.AddAndRemoveBytes(
				readBytes,
				blockMemory.Evicted(),
				b_len,
			)
			i += int64(n)
		}

		// End of input: normalize ErrUnexpectedEOF (short block read) to EOF
		// and switch to the drain phase.
		if next != READ_NONE && (err == io.EOF || err == io.ErrUnexpectedEOF) {
			err = io.EOF
			next = READ_NONE
		}

		// Drain phase: after EOF, shrink the window one byte per iteration so
		// trailing sub-block positions are still checked for matches.
		if next == READ_NONE {
			if blockMemory.Empty() {
				break
			}

			b_len := blockMemory.Len()
			removedByte := blockMemory.Truncate(1)
			generator.WeakRollingHash.RemoveBytes(removedByte, b_len)
			i += 1
		}
	}

	// io.EOF is the expected termination; anything else is a real read error.
	if err != io.EOF {
		ReportErr(err)
		return
	}
}