Exemplo n.º 1
0
// BuildChecksumIndex generates a checksum index from a reader.
//
// The checksums produced by check.GenerateChecksums for r are collected in an
// in-memory buffer, loaded back from that buffer as chunk checksums, and then
// used to build both the returned index and the strong-checksum lookup.
//
// fcheck is whatever GenerateChecksums returned; it is passed through even when
// a later step fails. When err is non-nil, i and lookup are nil.
//
// This is mostly a convenience for tests that need an index but do not want to
// assemble one by hand. The price is that such tests depend on the other
// modules used here and may fail when those modules are broken, which is not
// ideal.
// TODO: move to util?
func BuildChecksumIndex(check *filechecksum.FileChecksumGenerator, r io.Reader) (
	fcheck []byte,
	i *index.ChecksumIndex,
	lookup filechecksum.ChecksumLookup,
	err error,
) {
	// Holds the generated checksums so they can be read straight back in.
	var serialized bytes.Buffer

	if fcheck, err = check.GenerateChecksums(r, &serialized); err != nil {
		return fcheck, nil, nil, err
	}

	loaded, err := chunks.LoadChecksumsFromReader(
		&serialized,
		check.WeakRollingHash.Size(),
		check.GetStrongHash().Size(),
	)
	if err != nil {
		return fcheck, nil, nil, err
	}

	return fcheck, index.MakeChecksumIndex(loaded), chunks.StrongChecksumGetter(loaded), nil
}
Exemplo n.º 2
0
// startFindMatchingBlocks_int scans comparison for blocks whose checksums are
// present in reference and sends one BlockMatchResult per match on results.
//
// results is closed when the function returns. The ComparisonOffset of every
// match is the position inside comparison plus baseOffset. If the first full
// block cannot be read, or a later read fails with anything other than io.EOF
// or io.ErrUnexpectedEOF, a BlockMatchResult carrying only Err is sent and the
// scan stops.
//
// The window normally advances one byte at a time; the strong hash is only
// computed when the weak hash of the window is found in reference. After a
// strong match the next read is a whole block, since an overlapping match is
// of no use. Once the reader is exhausted the buffered window is shortened one
// byte at a time, and each shorter window is still checked, until it is empty
// or a match is found.
//
// c.Comparisons, c.WeakHashHits and c.StrongHashHits are updated atomically.
//
// TODO: When matching duplicated blocks, a channel of BlockMatchResult slices would be more efficient
func (c *Comparer) startFindMatchingBlocks_int(
	results chan<- BlockMatchResult,
	comparison io.Reader,
	baseOffset int64,
	generator *filechecksum.FileChecksumGenerator,
	reference Index,
) {
	defer close(results)

	reportErr := func(err error) {
		results <- BlockMatchResult{
			Err: err,
		}
	}

	block := make([]byte, generator.BlockSize)

	if _, err := io.ReadFull(comparison, block); err != nil {
		reportErr(
			fmt.Errorf("Error reading first block in comparison: %v", err),
		)
		return
	}

	generator.WeakRollingHash.SetBlock(block)
	singleByte := make([]byte, 1)
	weaksum := make([]byte, generator.WeakRollingHash.Size())
	strongSum := make([]byte, 0, generator.GetStrongHash().Size())

	blockMemory := circularbuffer.MakeC2Buffer(int(generator.BlockSize))
	blockMemory.Write(block)

	strong := generator.GetStrongHash()
	// Offset of the current window within comparison (before baseOffset).
	i := int64(0)
	next := READ_NEXT_BYTE

	for {
		atomic.AddInt64(&c.Comparisons, 1)

		// look for a weak match
		generator.WeakRollingHash.GetSum(weaksum)
		if weakMatchList := reference.FindWeakChecksum2(weaksum); weakMatchList != nil {
			atomic.AddInt64(&c.WeakHashHits, 1)

			block = blockMemory.GetBlock()

			strong.Reset()
			strong.Write(block)
			strongSum = strong.Sum(strongSum)
			strongList := reference.FindStrongChecksum2(strongSum, weakMatchList)

			// clear the slice, keeping its capacity for the next Sum
			strongSum = strongSum[:0]

			// If there are many matches, it means that this block is
			// duplicated in the reference.
			// since we care about finding all the blocks in the reference,
			// we must report all of them
			off := i + baseOffset
			for _, strongMatch := range strongList {
				results <- BlockMatchResult{
					ComparisonOffset: off,
					BlockIdx:         strongMatch.ChunkOffset,
				}
			}

			if len(strongList) > 0 {
				atomic.AddInt64(&c.StrongHashHits, 1)
				if next == READ_NONE {
					// found the match at the end, so exit
					return
				}
				// No point looking for a match that overlaps this block
				next = READ_NEXT_BLOCK
			}
		}

		// err is scoped to this iteration: when nothing is read (READ_NONE)
		// it stays nil, so a stale error can never be acted upon.
		var (
			n         int
			err       error
			readBytes []byte
		)

		switch next {
		case READ_NEXT_BYTE:
			n, err = comparison.Read(singleByte)
			readBytes = singleByte
		case READ_NEXT_BLOCK:
			n, err = io.ReadFull(comparison, block)
			readBytes = block[:n]
			next = READ_NEXT_BYTE
		}

		if uint(n) == generator.BlockSize {
			// A whole new block: restart the rolling hash from it.
			generator.WeakRollingHash.SetBlock(block)
			blockMemory.Write(block)
			i += int64(n)
		} else if n > 0 {
			// Fewer bytes than a block: roll the hash forward over them.
			bufLen := blockMemory.Len()
			blockMemory.Write(readBytes)
			generator.WeakRollingHash.AddAndRemoveBytes(
				readBytes,
				blockMemory.Evicted(),
				bufLen,
			)
			i += int64(n)
		}

		switch {
		case err == nil:
			// keep going
		case err == io.EOF || err == io.ErrUnexpectedEOF:
			// Input exhausted: stop reading and start draining the window.
			next = READ_NONE
		default:
			// A genuine read failure. Report it right away instead of
			// retrying the read, which could otherwise loop forever on a
			// reader that keeps failing.
			reportErr(err)
			return
		}

		if next == READ_NONE {
			if blockMemory.Empty() {
				return
			}

			// Shrink the window by one byte and keep the weak hash in step.
			bufLen := blockMemory.Len()
			removedByte := blockMemory.Truncate(1)
			generator.WeakRollingHash.RemoveBytes(removedByte, bufLen)
			i += 1
		}
	}
}