func SplitFun(r io.Reader, outFn func([]byte) error) error { br := bufio.NewReader(r) rs := rollsum.New() chunk := bytes.Buffer{} for { chunk.Reset() var err error for !(rs.OnSplit() && rs.Bits() >= minBits) || chunk.Len() < minChunkSize { var c byte c, err = br.ReadByte() if err != nil { break } rs.Roll(c) if e := chunk.WriteByte(c); e != nil { return e } } if err != nil && err != io.EOF { return err } if err1 := outFn(chunk.Bytes()); err1 != nil { return err1 } if err == io.EOF { return nil } } }
func (d *Dict) parse(content []byte) error { rs := rollsum.New() var match uint64 q := content buf := make([]byte, 0) hashes := make([][]byte, 0) offs := make([]int, 0) tx, err := d.db.Begin() if err != nil { return err } stmt, err := tx.Prepare(sqlUpSert) if err != nil { return err } off := 0 for len(q) > 0 { b := q[0] q = q[1:] rs.Roll(b) off++ d.totalBytesIn++ buf = append(buf, b) if rs.OnSplitWithBits(5) { h := sha1.Sum(buf) offs = append(offs, off) hashes = append(hashes, h[:]) _, err := stmt.Exec(buf, h[:], h[:]) if err != nil { return err } buf = buf[:0] } } d.totalBytesDup += uint64(match) if errStmt := stmt.Close(); errStmt != nil { return err } if errTx := tx.Commit(); errTx != nil { return err } err = d.makeDict() if err != nil { return err } return nil }
// chunksOf takes a (presumably large) file's uncompressed input, // rolling-checksum splits it into ~514 byte chunks, compresses each, // base64s each, and writes chunk files out, with each file just // defining an exported fileembed.Opener variable named C<xxxx> where // xxxx is the first 8 lowercase hex digits of the SHA-1 of the chunk // value pre-compression. The return value is a Go expression // referencing each of those chunks concatenated together. func chunksOf(in []byte) (stringExpression []byte) { var multiParts [][]byte rs := rollsum.New() const nBits = 9 // ~512 byte chunks last := 0 for i, b := range in { rs.Roll(b) if rs.OnSplitWithBits(nBits) || i == len(in)-1 { raw := in[last : i+1] // inclusive last = i + 1 s1 := sha1.New() s1.Write(raw) sha1hex := fmt.Sprintf("%x", s1.Sum(nil))[:8] writeChunkFile(sha1hex, raw) multiParts = append(multiParts, []byte(fmt.Sprintf("chunkpkg.C%s", sha1hex))) } } return bytes.Join(multiParts, []byte(",\n\t")) }
// writeFileChunks reads r until EOF, cutting it into blobs on rolling-checksum
// boundaries and uploading each blob to bs as it is cut, with up to
// chunksInFlight uploads running concurrently. It returns the total number of
// bytes read and the tree of spans describing the uploaded blobs. The first
// upload error (if any) aborts the loop and is returned as outerr.
func writeFileChunks(bs blobserver.StatReceiver, file *Builder, r io.Reader) (n int64, spans []span, outerr error) {
	src := &noteEOFReader{r: r}
	bufr := bufio.NewReaderSize(src, bufioReaderSize)
	spans = []span{} // the tree of spans, cut on interesting rollsum boundaries
	rs := rollsum.New()
	var last int64   // stream offset where the current span began
	var buf bytes.Buffer
	blobSize := 0 // of the next blob being built, should be same as buf.Len()
	const chunksInFlight = 32 // at ~64 KB chunks, this is ~2MB memory per file
	gatec := syncutil.NewGate(chunksInFlight)
	// firsterrc holds the first upload error; buffered (cap 1) so the
	// non-blocking sends below never block.
	firsterrc := make(chan error, 1)
	// uploadLastSpan runs in the same goroutine as the loop below and is responsible for
	// starting uploading the contents of the buf. It returns false if there's been
	// an error and the loop below should be stopped.
	uploadLastSpan := func() bool {
		chunk := buf.String()
		buf.Reset()
		br := blob.SHA1FromString(chunk)
		spans[len(spans)-1].br = br
		select {
		case outerr = <-firsterrc:
			// A previous upload failed; stop the loop.
			return false
		default:
			// No error seen so far, continue.
		}
		gatec.Start()
		go func() {
			defer gatec.Done()
			if _, err := uploadString(bs, br, chunk); err != nil {
				// Record only the first error; drop the rest.
				select {
				case firsterrc <- err:
				default:
				}
			}
		}()
		return true
	}
	for {
		c, err := bufr.ReadByte()
		if err == io.EOF {
			// Flush whatever is left as a final span, if non-empty.
			if n != last {
				spans = append(spans, span{from: last, to: n})
				if !uploadLastSpan() {
					return
				}
			}
			break
		}
		if err != nil {
			return 0, nil, err
		}
		buf.WriteByte(c)
		n++
		blobSize++
		rs.Roll(c)
		// Decide whether to cut a blob here, and the node weight (bits) of
		// the resulting span.
		var bits int
		onRollSplit := rs.OnSplit()
		switch {
		case blobSize == maxBlobSize:
			bits = 20 // arbitrary node weight; 1<<20 == 1MB
		case src.sawEOF:
			// Don't split. End is coming soon enough.
			continue
		case onRollSplit && n > firstChunkSize && blobSize > tooSmallThreshold:
			bits = rs.Bits()
		case n == firstChunkSize:
			bits = 18 // 1 << 18 == 256KB
		default:
			// Don't split.
			continue
		}
		blobSize = 0
		// Take any spans from the end of the spans slice that
		// have a smaller 'bits' score and make them children
		// of this node.
		var children []span
		childrenFrom := len(spans)
		for childrenFrom > 0 && spans[childrenFrom-1].bits < bits {
			childrenFrom--
		}
		if nCopy := len(spans) - childrenFrom; nCopy > 0 {
			children = make([]span, nCopy)
			copy(children, spans[childrenFrom:])
			spans = spans[:childrenFrom]
		}
		spans = append(spans, span{from: last, to: n, bits: bits, children: children})
		last = n
		if !uploadLastSpan() {
			return
		}
	}
	// Loop was already hit earlier. (outerr can only have been set by
	// uploadLastSpan draining firsterrc above.)
	if outerr != nil {
		return 0, nil, outerr
	}
	// Wait for all uploads to finish, one way or another, and then
	// see if any generated errors.
	// Once this loop is done, we own all the tokens in gatec, so nobody
	// else can have one outstanding.
	for i := 0; i < chunksInFlight; i++ {
		gatec.Start()
	}
	select {
	case err := <-firsterrc:
		return 0, nil, err
	default:
	}
	return n, spans, nil
}
func writeFileChunks(bs blobserver.StatReceiver, fileMap Map, r io.Reader) (n int64, spans []span, outerr error) { src := ¬eEOFReader{r: r} blobSize := 0 // of the next blob being built, should be same as buf.Len() bufr := bufio.NewReaderSize(src, bufioReaderSize) spans = []span{} // the tree of spans, cut on interesting rollsum boundaries rs := rollsum.New() last := n buf := new(bytes.Buffer) // TODO: keep multiple of these in-flight at a time. uploadLastSpan := func() bool { defer buf.Reset() br, err := uploadString(bs, buf.String()) if err != nil { outerr = err return false } spans[len(spans)-1].br = br return true } for { c, err := bufr.ReadByte() if err == io.EOF { if n != last { spans = append(spans, span{from: last, to: n}) if !uploadLastSpan() { return } } break } if err != nil { return 0, nil, err } buf.WriteByte(c) n++ blobSize++ rs.Roll(c) var bits int onRollSplit := rs.OnSplit() switch { case blobSize == maxBlobSize: bits = 20 // arbitrary node weight; 1<<20 == 1MB case src.sawEOF: // Don't split. End is coming soon enough. continue case onRollSplit && n > firstChunkSize && blobSize > tooSmallThreshold: bits = rs.Bits() case n == firstChunkSize: bits = 18 // 1 << 18 == 256KB default: // Don't split. continue } blobSize = 0 // Take any spans from the end of the spans slice that // have a smaller 'bits' score and make them children // of this node. var children []span childrenFrom := len(spans) for childrenFrom > 0 && spans[childrenFrom-1].bits < bits { childrenFrom-- } if nCopy := len(spans) - childrenFrom; nCopy > 0 { children = make([]span, nCopy) copy(children, spans[childrenFrom:]) spans = spans[:childrenFrom] } spans = append(spans, span{from: last, to: n, bits: bits, children: children}) last = n if !uploadLastSpan() { return } } return n, spans, nil }
func (bh *bodyHandler) parseResponse(body []byte) (changed bool, err error) { startParse := time.Now() rs := rollsum.New() rd := bytes.NewReader(body) buf := make([]byte, 0) tx, err := bh.db.Begin() if err != nil { return false, err } stmt, err := tx.Prepare(sqlUpSert) if err != nil { return false, err } known := 0 for { b, err := rd.ReadByte() if err != nil { if err == io.EOF { break } else { return false, err } } rs.Roll(b) buf = append(buf, b) if rs.OnSplitWithBits(5) { h := sha1.Sum(buf) var s int bh.db.QueryRow(`SELECT LENGTH(content) FROM chunks WHERE hash = ?`, h[:]).Scan(&s) known += s _, err = stmt.Exec(buf, h[:], h[:]) if err != nil { log.Println("HERE") return false, err } buf = buf[:0] } } if err := tx.Commit(); err != nil { return false, err } log.Printf("Best match: %d bytes on %d\n", known, len(body)) log.Printf("Parsed response in %v ms\n", time.Since(startParse).Seconds()*1000) // Heuristic for changed: if the old top chunk is not in the new // first 10, consider the current dictionary as old top10Rows, err := bh.db.Query(`SELECT hash FROM chunks ORDER BY count, hash DESC LIMIT 10`) if err != nil { return false, err } top10 := make([][]byte, 0, 10) for top10Rows.Next() { var hash []byte err := top10Rows.Scan(&hash) if err != nil { return false, err } top10 = append(top10, hash) } if err := top10Rows.Err(); err != nil { return false, err } if len(bh.topChunk) == 0 && len(top10) > 0 { bh.topChunk = top10[0] return true, nil } for _, newInTop := range top10 { if bytes.Compare(bh.topChunk, newInTop) == 0 { return false, nil } } bh.topChunk = top10[0] return true, nil }
// WriteFileMapRolling reads r until EOF, cutting it into blobs on
// rolling-checksum boundaries, uploading each blob to bs (skipping blobs the
// server already has), and then recursively uploading "bytes" schema blobs for
// span subtrees before finally populating fileMap with the top-level parts,
// uploading its JSON, and returning that blobref.
func WriteFileMapRolling(bs blobserver.StatReceiver, fileMap map[string]interface{}, r io.Reader) (outbr *blobref.BlobRef, outerr error) {
	blobSize := 0 // of the next blob being built, mirrors buf.Len()
	bufr := bufio.NewReader(r)
	spans := []span{} // the tree of spans, cut on interesting rollsum boundaries
	rs := rollsum.New()
	n := int64(0) // total bytes read so far
	last := n     // stream offset where the current span began
	buf := new(bytes.Buffer)
	// uploadString uploads s keyed by its SHA-1, first asking the server
	// whether it already has the blob and skipping the upload if so.
	uploadString := func(s string) (*blobref.BlobRef, error) {
		br := blobref.SHA1FromString(s)
		hasIt, err := serverHasBlob(bs, br)
		if err != nil {
			return nil, err
		}
		if hasIt {
			return br, nil
		}
		_, err = bs.ReceiveBlob(br, strings.NewReader(s))
		if err != nil {
			return nil, err
		}
		return br, nil
	}
	// TODO: keep multiple of these in-flight at a time.
	// uploadLastSpan uploads buf's contents as one blob, records the blobref
	// on the most recently appended span, and reports whether to continue;
	// on failure it stores the error in outerr.
	uploadLastSpan := func() bool {
		defer buf.Reset()
		br, err := uploadString(buf.String())
		if err != nil {
			outerr = err
			return false
		}
		spans[len(spans)-1].br = br
		return true
	}
	for {
		c, err := bufr.ReadByte()
		if err == io.EOF {
			// Flush any trailing partial span.
			if n != last {
				spans = append(spans, span{from: last, to: n})
				if !uploadLastSpan() {
					return
				}
			}
			break
		}
		if err != nil {
			return nil, err
		}
		buf.WriteByte(c)
		n++
		blobSize++
		rs.Roll(c)
		// Cut a blob on a rollsum split, or unconditionally once the blob
		// reaches MaxBlobSize.
		if !rs.OnSplit() {
			if blobSize < MaxBlobSize {
				continue
			}
		}
		blobSize = 0
		bits := rs.Bits()
		// Take any spans from the end of the spans slice that
		// have a smaller 'bits' score and make them children
		// of this node.
		var children []span
		childrenFrom := len(spans)
		for childrenFrom > 0 && spans[childrenFrom-1].bits < bits {
			childrenFrom--
		}
		if nCopy := len(spans) - childrenFrom; nCopy > 0 {
			children = make([]span, nCopy)
			copy(children, spans[childrenFrom:])
			spans = spans[:childrenFrom]
		}
		spans = append(spans, span{from: last, to: n, bits: bits, children: children})
		last = n
		if !uploadLastSpan() {
			return
		}
	}
	// uploadFile and addBytesParts are mutually recursive: span subtrees are
	// uploaded as "bytes" fragment maps, and their blobrefs become BytesRef
	// parts of the parent.
	var addBytesParts func(dst *[]BytesPart, s []span) error
	uploadFile := func(isFragment bool, fileSize int64, s []span) (*blobref.BlobRef, error) {
		parts := []BytesPart{}
		err := addBytesParts(&parts, s)
		if err != nil {
			return nil, err
		}
		m := fileMap
		if isFragment {
			// Fragments get their own "bytes" map rather than the caller's
			// top-level fileMap.
			m = NewBytes()
		}
		err = PopulateParts(m, fileSize, parts)
		if err != nil {
			return nil, err
		}
		json, err := MapToCamliJSON(m)
		if err != nil {
			return nil, err
		}
		return uploadString(json)
	}
	addBytesParts = func(dst *[]BytesPart, spansl []span) error {
		for _, sp := range spansl {
			if len(sp.children) > 0 {
				childrenSize := int64(0)
				for _, cs := range sp.children {
					childrenSize += cs.size()
				}
				br, err := uploadFile(true, childrenSize, sp.children)
				if err != nil {
					return err
				}
				*dst = append(*dst, BytesPart{
					BytesRef: br,
					Size:     uint64(childrenSize),
				})
			}
			// A span's own byte range (beyond its children) becomes a
			// direct BlobRef part.
			if sp.from != sp.to {
				*dst = append(*dst, BytesPart{
					BlobRef: sp.br,
					Size:    uint64(sp.to - sp.from),
				})
			}
		}
		return nil
	}
	// The top-level content parts
	return uploadFile(false, n, spans)
}
func showSplits(file string) { f, err := os.Open(file) if err != nil { panic(err.Error()) } bufr := bufio.NewReader(f) spans := []span{} rs := rollsum.New() n := int64(0) last := n for { c, err := bufr.ReadByte() if err != nil { if err == io.EOF { if n != last { spans = append(spans, span{from: last, to: n}) } break } panic(err.Error()) } n++ rs.Roll(c) if rs.OnSplit() { bits := rs.Bits() sliceFrom := len(spans) for sliceFrom > 0 && spans[sliceFrom-1].bits < bits { sliceFrom-- } nCopy := len(spans) - sliceFrom var children []span if nCopy > 0 { children = make([]span, nCopy) nCopied := copy(children, spans[sliceFrom:]) if nCopied != nCopy { panic("n wrong") } spans = spans[:sliceFrom] } spans = append(spans, span{from: last, to: n, bits: bits, children: children}) log.Printf("split at %d (after %d), bits=%d", n, n-last, bits) last = n } } var dumpSpans func(s []span, indent int) dumpSpans = func(s []span, indent int) { in := strings.Repeat(" ", indent) for _, sp := range s { fmt.Printf("%sfrom=%d, to=%d (len %d) bits=%d\n", in, sp.from, sp.to, sp.to-sp.from, sp.bits) if len(sp.children) > 0 { dumpSpans(sp.children, indent+4) } } } dumpSpans(spans, 0) fmt.Printf("\n\nNOTE NOTE NOTE: the camdebug tool hasn't been updated to use the splitting policy from pkg/schema/filewriter.go.") }