func TestGolden(t *testing.T) { for _, g := range golden { in := g.in // We test the vanilla implementation p := []byte(g.in) vanilla := adler32.New() vanilla.Write(p) if got := vanilla.Sum32(); got != g.out { t.Errorf("vanilla implentation: for %q, expected 0x%x, got 0x%x", in, g.out, got) continue } // We test the rolling implementation by prefixing the slice by a // space, writing it to our rolling hash, and then rolling once q := []byte(" ") q = append(q, p...) rolling := rollsum.New() rolling.Write(q[:len(q)-1]) rolling.Roll(q[len(q)-1]) if got := rolling.Sum32(); got != g.out { t.Errorf("rolling implentation: for %q, expected 0x%x, got 0x%x", in, g.out, got) continue } } }
func TestUninitialized(t *testing.T) { s := []byte(data) hash := rollsum.New() err := hash.Roll(s[0]) if err == nil { t.Fatal("Rolling with an uninitialized window should trigger an error") } }
func BenchmarkWeakHashAdler32(b *testing.B) { data := make([]byte, size) hf := adler32.New() for i := 0; i < b.N; i++ { hf.Write(data) } _ = hf.Sum32() b.SetBytes(size) }
func BenchmarkWeakHashAdler32Roll(b *testing.B) { data := make([]byte, size) hf := adler32.New() hf.Write(data) b.ResetTimer() for i := 0; i < b.N; i++ { for i := 0; i <= size; i++ { hf.Roll('a') } } b.SetBytes(size) }
func BenchmarkRolling128B(b *testing.B) { b.SetBytes(1024) window := make([]byte, 128) for i := range window { window[i] = byte(i) } h := rollsum.New() in := make([]byte, 0, h.Size()) b.ResetTimer() h.Write(window) for i := 0; i < b.N; i++ { h.Roll(byte(128 + i)) h.Sum(in) } }
func chop(r *bufio.Reader, level int) (rollsum uint32, id string, n int, err error) { m := boundaryMask(level) if level > 0 { entries := make(map[int]string) offset := 0 for { rollsum, id, n, err := chop(r, level-1) entries[offset] = id offset += n if (rollsum&m == m && level < levelmax) || err != nil { resb, _ := json.Marshal(entries) id = store(resb) return rollsum, id, offset, err } } } else { data := make([]byte, 128, 4*(1<<chunkbits)) hash := adler32.New() n, err := r.Read(data) if err != nil { if err == io.EOF && n > 0 { data = data[:n] hash.Write(data) return hash.Sum32(), store(data), n, err } return 0, "", 0, err } hash.Write(data) for hash.Sum32()&m != m { b, err := r.ReadByte() if err != nil { break } hash.Roll(b) data = append(data, b) } return hash.Sum32(), store(data), n, err } }
// Find finds all the blocks of the given size within io.Reader that matches // the hashes provided, and returns a hash -> slice of offsets within reader // map, that produces the same weak hash. func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, error) { if ir == nil { return nil, nil } r := bufio.NewReader(ir) hf := adler32.New() n, err := io.CopyN(hf, r, int64(size)) if err == io.EOF { return nil, nil } if err != nil { return nil, err } if n != int64(size) { return nil, io.ErrShortBuffer } offsets := make(map[uint32][]int64) for _, hashToFind := range hashesToFind { offsets[hashToFind] = nil } var i int64 var hash uint32 for { hash = hf.Sum32() if existing, ok := offsets[hash]; ok { offsets[hash] = append(existing, i) } i++ bt, err := r.ReadByte() if err == io.EOF { break } else if err != nil { return offsets, err } hf.Roll(bt) } return offsets, nil }
{0xd0201df6, "'Invariant assertions' is the most elegant programming technique! -Tom Szymanski"}, {0x211297c8, strings.Repeat("\xff", 5548) + "8"}, {0xbaa198c8, strings.Repeat("\xff", 5549) + "9"}, {0x553499be, strings.Repeat("\xff", 5550) + "0"}, {0xf0c19abe, strings.Repeat("\xff", 5551) + "1"}, {0x8d5c9bbe, strings.Repeat("\xff", 5552) + "2"}, {0x2af69cbe, strings.Repeat("\xff", 5553) + "3"}, {0xc9809dbe, strings.Repeat("\xff", 5554) + "4"}, {0x69189ebe, strings.Repeat("\xff", 5555) + "5"}, {0x86af0001, strings.Repeat("\x00", 1e5)}, {0x79660b4d, strings.Repeat("a", 1e5)}, {0x110588ee, strings.Repeat("ABCDEFGHIJKLMNOPQRSTUVWXYZ", 1e4)}, } // This is a no-op to prove that rollsum.Hash32 implements hash.Hash32 var _ = hash.Hash32(rollsum.New()) func TestGolden(t *testing.T) { for _, g := range golden { in := g.in // We test the vanilla implementation p := []byte(g.in) vanilla := adler32.New() vanilla.Write(p) if got := vanilla.Sum32(); got != g.out { t.Errorf("vanilla implentation: for %q, expected 0x%x, got 0x%x", in, g.out, got) continue } // We test the rolling implementation by prefixing the slice by a
func Example() { s := []byte(data) vanilla := adler32.New() rolling := rollsum.New() // arbitrary window len n := 16 // Load the window into the rolling hash rolling.Write(s[:n]) // Roll it and compare the result with full re-calculus every time for i := n; i < len(s); i++ { vanilla.Reset() vanilla.Write(s[i-n+1 : i+1]) err := rolling.Roll(s[i]) if err != nil { log.Fatal(err) } fmt.Printf("%v: checksum %x\n", string(s[i-n+1:i+1]), rolling.Sum32()) if vanilla.Sum32() != rolling.Sum32() { log.Fatalf("%v: expected %x, got %x", s[i-n+1:i+1], vanilla.Sum32(), rolling.Sum32()) } } // Output: // he quick brown f: checksum 31e905d9 // e quick brown fo: checksum 314805e0 // quick brown fox: checksum 30ea05f3 // quick brown fox : checksum 34dc05f3 // uick brown fox j: checksum 33b705ec // ick brown fox ju: checksum 325205ec // ck brown fox jum: checksum 31b105f0 // k brown fox jump: checksum 317d05fd // brown fox jumps: checksum 30d10605 // brown fox jumps : checksum 34d50605 // rown fox jumps o: checksum 34c60612 // own fox jumps ov: checksum 33bb0616 // wn fox jumps ove: checksum 32d6060c // n fox jumps over: checksum 316c0607 // fox jumps over : checksum 304405b9 // fox jumps over t: checksum 3450060d // ox jumps over th: checksum 33fe060f // x jumps over the: checksum 33120605 // jumps over the : checksum 313e05ad // jumps over the l: checksum 353605f9 // umps over the la: checksum 348505f0 // mps over the laz: checksum 332905f5 // ps over the lazy: checksum 32590601 // s over the lazy : checksum 310905b1 // over the lazy d: checksum 2f7a05a2 // over the lazy do: checksum 336a05f1 // ver the lazy dog: checksum 326205e9 }
// Blocks returns the blockwise hash of the reader. func Blocks(r io.Reader, blocksize int, sizehint int64, counter Counter) ([]protocol.BlockInfo, error) { hf := sha256.New() hashLength := hf.Size() whf := adler32.New() mhf := io.MultiWriter(hf, whf) var blocks []protocol.BlockInfo var hashes, thisHash []byte if sizehint >= 0 { // Allocate contiguous blocks for the BlockInfo structures and their // hashes once and for all, and stick to the specified size. r = io.LimitReader(r, sizehint) numBlocks := int(sizehint / int64(blocksize)) blocks = make([]protocol.BlockInfo, 0, numBlocks) hashes = make([]byte, 0, hashLength*numBlocks) } // A 32k buffer is used for copying into the hash function. buf := make([]byte, 32<<10) var offset int64 for { lr := io.LimitReader(r, int64(blocksize)) n, err := io.CopyBuffer(mhf, lr, buf) if err != nil { return nil, err } if n == 0 { break } if counter != nil { counter.Update(n) } // Carve out a hash-sized chunk of "hashes" to store the hash for this // block. hashes = hf.Sum(hashes) thisHash, hashes = hashes[:hashLength], hashes[hashLength:] b := protocol.BlockInfo{ Size: int32(n), Offset: offset, Hash: thisHash, WeakHash: whf.Sum32(), } blocks = append(blocks, b) offset += n hf.Reset() whf.Reset() } if len(blocks) == 0 { // Empty file blocks = append(blocks, protocol.BlockInfo{ Offset: 0, Size: 0, Hash: SHA256OfNothing, }) } return blocks, nil }