Example #1
// SplitFun streams r through a rolling checksum and calls outFn once
// per chunk; a chunk ends when the rollsum reports a split of at least
// minBits bits and the chunk has grown to at least minChunkSize bytes.
func SplitFun(r io.Reader, outFn func([]byte) error) error {
	br := bufio.NewReader(r)
	rs := rollsum.New()
	chunk := bytes.Buffer{}
	for {
		chunk.Reset()
		var err error
		for !(rs.OnSplit() && rs.Bits() >= minBits) || chunk.Len() < minChunkSize {
			var c byte
			c, err = br.ReadByte()
			if err != nil {
				break
			}
			rs.Roll(c)
			if e := chunk.WriteByte(c); e != nil {
				return e
			}
		}
		if err != nil && err != io.EOF {
			return err
		}
		if err1 := outFn(chunk.Bytes()); err1 != nil {
			return err1
		}
		if err == io.EOF {
			return nil
		}
	}
}
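
SplitFun depends on two constants, minBits and minChunkSize, that this snippet does not define. A minimal sketch of a caller, with invented values for both constants (not taken from the original project; assumes fmt and io are imported):

// Assumed tuning values; the original project's constants are not shown.
const (
	minBits      = 13   // average chunk size ~ 1<<13 = 8 KB
	minChunkSize = 1024 // never cut a chunk shorter than 1 KB
)

func printChunkSizes(r io.Reader) error {
	return SplitFun(r, func(chunk []byte) error {
		// chunk aliases SplitFun's internal buffer, which is reset on
		// the next iteration; copy it if it must outlive this call.
		fmt.Printf("chunk of %d bytes\n", len(chunk))
		return nil
	})
}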
Example #2
File: dict.go Project: rakoo/MMAS
// parse splits content into rollsum-delimited chunks and upserts each
// chunk, keyed by its SHA-1, into the dictionary database.
func (d *Dict) parse(content []byte) error {
	rs := rollsum.New()

	var match uint64
	q := content
	buf := make([]byte, 0)
	hashes := make([][]byte, 0)
	offs := make([]int, 0)

	tx, err := d.db.Begin()
	if err != nil {
		return err
	}

	stmt, err := tx.Prepare(sqlUpSert)
	if err != nil {
		return err
	}

	off := 0
	for len(q) > 0 {
		b := q[0]
		q = q[1:]

		rs.Roll(b)
		off++
		d.totalBytesIn++

		buf = append(buf, b)
		if rs.OnSplitWithBits(5) {
			h := sha1.Sum(buf)
			offs = append(offs, off)
			hashes = append(hashes, h[:])

			_, err := stmt.Exec(buf, h[:], h[:])
			if err != nil {
				return err
			}
			buf = buf[:0]
		}
	}

	d.totalBytesDup += match // NOTE: match is never updated above, so this adds 0

	if errStmt := stmt.Close(); errStmt != nil {
		return errStmt
	}

	if errTx := tx.Commit(); errTx != nil {
		return errTx
	}

	err = d.makeDict()
	if err != nil {
		return err
	}

	return nil
}
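
sqlUpSert itself is not part of the listing. From the three arguments in stmt.Exec(buf, h[:], h[:]) and the content/count columns queried in Example #6 below, a plausible SQLite reconstruction would be the following; this is a guess, not the project's actual statement or schema:

// Hypothetical reconstruction of rakoo/MMAS's statement and schema.
const sqlUpSert = `
INSERT OR REPLACE INTO chunks (content, hash, count)
VALUES (?, ?, COALESCE((SELECT count + 1 FROM chunks WHERE hash = ?), 1))`

// CREATE TABLE chunks (
// 	hash    BLOB PRIMARY KEY,
// 	content BLOB,
// 	count   INTEGER NOT NULL
// );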
Example #3
// chunksOf takes a (presumably large) file's uncompressed input,
// rolling-checksum splits it into ~512 byte chunks, compresses each,
// base64s each, and writes chunk files out, with each file just
// defining an exported fileembed.Opener variable named C<xxxx> where
// xxxx is the first 8 lowercase hex digits of the SHA-1 of the chunk
// value pre-compression.  The return value is a Go expression
// referencing each of those chunks concatenated together.
func chunksOf(in []byte) (stringExpression []byte) {
	var multiParts [][]byte
	rs := rollsum.New()
	const nBits = 9 // ~512 byte chunks
	last := 0
	for i, b := range in {
		rs.Roll(b)
		if rs.OnSplitWithBits(nBits) || i == len(in)-1 {
			raw := in[last : i+1] // inclusive
			last = i + 1
			s1 := sha1.New()
			s1.Write(raw)
			sha1hex := fmt.Sprintf("%x", s1.Sum(nil))[:8]
			writeChunkFile(sha1hex, raw)
			multiParts = append(multiParts, []byte(fmt.Sprintf("chunkpkg.C%s", sha1hex)))
		}
	}
	return bytes.Join(multiParts, []byte(",\n\t"))
}
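
The "~512 byte" figure follows from nBits = 9: on random input, OnSplitWithBits(n) fires on average once every 1<<n bytes. A self-contained check of that claim, using only the rollsum calls shown in these examples (the import path is assumed to be Camlistore's):

package main

import (
	"crypto/rand"
	"fmt"

	"camlistore.org/pkg/rollsum" // assumed import path
)

func main() {
	data := make([]byte, 1<<20) // 1 MB of random input
	if _, err := rand.Read(data); err != nil {
		panic(err)
	}

	const nBits = 9 // same setting as chunksOf above
	rs := rollsum.New()
	splits := 0
	for _, b := range data {
		rs.Roll(b)
		if rs.OnSplitWithBits(nBits) {
			splits++
		}
	}
	if splits > 0 {
		fmt.Printf("%d splits, average chunk ~%d bytes\n", splits, len(data)/splits)
	}
}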
Example #4
func writeFileChunks(bs blobserver.StatReceiver, file *Builder, r io.Reader) (n int64, spans []span, outerr error) {
	src := &noteEOFReader{r: r}
	bufr := bufio.NewReaderSize(src, bufioReaderSize)
	spans = []span{} // the tree of spans, cut on interesting rollsum boundaries
	rs := rollsum.New()
	var last int64
	var buf bytes.Buffer
	blobSize := 0 // of the next blob being built, should be same as buf.Len()

	const chunksInFlight = 32 // at ~64 KB chunks, this is ~2MB memory per file
	gatec := syncutil.NewGate(chunksInFlight)
	firsterrc := make(chan error, 1)

	// uploadLastSpan runs in the same goroutine as the loop below and is responsible for
	// starting uploading the contents of the buf.  It returns false if there's been
	// an error and the loop below should be stopped.
	uploadLastSpan := func() bool {
		chunk := buf.String()
		buf.Reset()
		br := blob.SHA1FromString(chunk)
		spans[len(spans)-1].br = br
		select {
		case outerr = <-firsterrc:
			return false
		default:
			// No error seen so far, continue.
		}
		gatec.Start()
		go func() {
			defer gatec.Done()
			if _, err := uploadString(bs, br, chunk); err != nil {
				select {
				case firsterrc <- err:
				default:
				}
			}
		}()
		return true
	}

	for {
		c, err := bufr.ReadByte()
		if err == io.EOF {
			if n != last {
				spans = append(spans, span{from: last, to: n})
				if !uploadLastSpan() {
					return
				}
			}
			break
		}
		if err != nil {
			return 0, nil, err
		}

		buf.WriteByte(c)
		n++
		blobSize++
		rs.Roll(c)

		var bits int
		onRollSplit := rs.OnSplit()
		switch {
		case blobSize == maxBlobSize:
			bits = 20 // arbitrary node weight; 1<<20 == 1MB
		case src.sawEOF:
			// Don't split. End is coming soon enough.
			continue
		case onRollSplit && n > firstChunkSize && blobSize > tooSmallThreshold:
			bits = rs.Bits()
		case n == firstChunkSize:
			bits = 18 // 1 << 18 == 256KB
		default:
			// Don't split.
			continue
		}
		blobSize = 0

		// Take any spans from the end of the spans slice that
		// have a smaller 'bits' score and make them children
		// of this node.
		var children []span
		childrenFrom := len(spans)
		for childrenFrom > 0 && spans[childrenFrom-1].bits < bits {
			childrenFrom--
		}
		if nCopy := len(spans) - childrenFrom; nCopy > 0 {
			children = make([]span, nCopy)
			copy(children, spans[childrenFrom:])
			spans = spans[:childrenFrom]
		}

		spans = append(spans, span{from: last, to: n, bits: bits, children: children})
		last = n
		if !uploadLastSpan() {
			return
		}
	}

	// An upload error inside the loop has already triggered a return,
	// so outerr should be nil here; check it defensively anyway.
	if outerr != nil {
		return 0, nil, outerr
	}

	// Wait for all uploads to finish, one way or another, and then
	// see if any generated errors.
	// Once this loop is done, we own all the tokens in gatec, so nobody
	// else can have one outstanding.
	for i := 0; i < chunksInFlight; i++ {
		gatec.Start()
	}
	select {
	case err := <-firsterrc:
		return 0, nil, err
	default:
	}

	return n, spans, nil

}
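
The span type and its size method are defined elsewhere in the package. A minimal definition consistent with how these examples use it (a byte range, a bits score, a blob ref, nested children) would be the sketch below; the field types are inferred, and the older examples further down use *blobref.BlobRef where this one uses blob.Ref:

// Inferred from usage in these examples, not copied from the source.
type span struct {
	from, to int64    // byte range of this span's own chunk
	bits     int      // rollsum bits at the cut that ended it
	br       blob.Ref // ref of the uploaded chunk contents
	children []span   // earlier, lower-bits spans folded under this node
}

// size is the total byte count of the span plus its children, which is
// how addBytesParts in Example #7 computes fragment sizes.
func (s *span) size() int64 {
	size := s.to - s.from
	for _, cs := range s.children {
		size += cs.size()
	}
	return size
}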
Example #5
func writeFileChunks(bs blobserver.StatReceiver, fileMap Map, r io.Reader) (n int64, spans []span, outerr error) {
	src := &noteEOFReader{r: r}
	blobSize := 0 // of the next blob being built, should be same as buf.Len()
	bufr := bufio.NewReaderSize(src, bufioReaderSize)
	spans = []span{} // the tree of spans, cut on interesting rollsum boundaries
	rs := rollsum.New()
	last := n
	buf := new(bytes.Buffer)

	// TODO: keep multiple of these in-flight at a time.
	uploadLastSpan := func() bool {
		defer buf.Reset()
		br, err := uploadString(bs, buf.String())
		if err != nil {
			outerr = err
			return false
		}
		spans[len(spans)-1].br = br
		return true
	}

	for {
		c, err := bufr.ReadByte()
		if err == io.EOF {
			if n != last {
				spans = append(spans, span{from: last, to: n})
				if !uploadLastSpan() {
					return
				}
			}
			break
		}
		if err != nil {
			return 0, nil, err
		}

		buf.WriteByte(c)
		n++
		blobSize++
		rs.Roll(c)

		var bits int
		onRollSplit := rs.OnSplit()
		switch {
		case blobSize == maxBlobSize:
			bits = 20 // arbitrary node weight; 1<<20 == 1MB
		case src.sawEOF:
			// Don't split. End is coming soon enough.
			continue
		case onRollSplit && n > firstChunkSize && blobSize > tooSmallThreshold:
			bits = rs.Bits()
		case n == firstChunkSize:
			bits = 18 // 1 << 18 == 256KB
		default:
			// Don't split.
			continue
		}
		blobSize = 0

		// Take any spans from the end of the spans slice that
		// have a smaller 'bits' score and make them children
		// of this node.
		var children []span
		childrenFrom := len(spans)
		for childrenFrom > 0 && spans[childrenFrom-1].bits < bits {
			childrenFrom--
		}
		if nCopy := len(spans) - childrenFrom; nCopy > 0 {
			children = make([]span, nCopy)
			copy(children, spans[childrenFrom:])
			spans = spans[:childrenFrom]
		}

		spans = append(spans, span{from: last, to: n, bits: bits, children: children})
		last = n
		if !uploadLastSpan() {
			return
		}
	}

	return n, spans, nil

}
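
noteEOFReader, which this example and Example #4 use to stop splitting once the end of input is near, is not shown. It is presumably a small wrapper that records EOF from the underlying reader, along these lines:

// Sketch implied by the src.sawEOF checks above.
type noteEOFReader struct {
	r      io.Reader
	sawEOF bool
}

func (r *noteEOFReader) Read(p []byte) (n int, err error) {
	n, err = r.r.Read(p)
	if err == io.EOF {
		r.sawEOF = true
	}
	return
}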
Example #6
File: parse.go Project: rakoo/MMAS
// parseResponse chunks an HTTP response body with the rolling checksum,
// upserts each chunk into the database, and reports whether the
// dictionary should be rebuilt because the previous top chunk fell out
// of the new top ten.
func (bh *bodyHandler) parseResponse(body []byte) (changed bool, err error) {

	startParse := time.Now()

	rs := rollsum.New()
	rd := bytes.NewReader(body)
	buf := make([]byte, 0)

	tx, err := bh.db.Begin()
	if err != nil {
		return false, err
	}

	stmt, err := tx.Prepare(sqlUpSert)
	if err != nil {
		return false, err
	}

	known := 0
	for {
		b, err := rd.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			} else {
				return false, err
			}
		}
		rs.Roll(b)
		buf = append(buf, b)
		if rs.OnSplitWithBits(5) {
			h := sha1.Sum(buf)
			var s int
			// A failed Scan (e.g. sql.ErrNoRows for an unseen chunk)
			// is ignored on purpose: s simply stays 0.
			bh.db.QueryRow(`SELECT LENGTH(content) FROM chunks WHERE hash = ?`, h[:]).Scan(&s)
			known += s

			_, err = stmt.Exec(buf, h[:], h[:])
			if err != nil {
				return false, err
			}
			buf = buf[:0]
		}
	}

	if err := tx.Commit(); err != nil {
		return false, err
	}

	log.Printf("Best match: %d bytes on %d\n", known, len(body))

	log.Printf("Parsed response in %v ms\n", time.Since(startParse).Seconds()*1000)

	// Heuristic for changed: if the old top chunk is not in the new
	// first 10, consider the current dictionary as old
	top10Rows, err := bh.db.Query(`SELECT hash FROM chunks ORDER BY count DESC, hash DESC LIMIT 10`)
	if err != nil {
		return false, err
	}

	top10 := make([][]byte, 0, 10)
	for top10Rows.Next() {
		var hash []byte
		err := top10Rows.Scan(&hash)
		if err != nil {
			return false, err
		}
		top10 = append(top10, hash)
	}
	if err := top10Rows.Err(); err != nil {
		return false, err
	}

	if len(bh.topChunk) == 0 && len(top10) > 0 {
		bh.topChunk = top10[0]
		return true, nil
	}

	for _, newInTop := range top10 {
		if bytes.Compare(bh.topChunk, newInTop) == 0 {
			return false, nil
		}
	}

	bh.topChunk = top10[0]
	return true, nil
}
Example #7
func WriteFileMapRolling(bs blobserver.StatReceiver, fileMap map[string]interface{}, r io.Reader) (outbr *blobref.BlobRef, outerr error) {
	blobSize := 0
	bufr := bufio.NewReader(r)
	spans := []span{} // the tree of spans, cut on interesting rollsum boundaries
	rs := rollsum.New()
	n := int64(0)
	last := n
	buf := new(bytes.Buffer)

	uploadString := func(s string) (*blobref.BlobRef, error) {
		br := blobref.SHA1FromString(s)
		hasIt, err := serverHasBlob(bs, br)
		if err != nil {
			return nil, err
		}
		if hasIt {
			return br, nil
		}
		_, err = bs.ReceiveBlob(br, strings.NewReader(s))
		if err != nil {
			return nil, err
		}
		return br, nil
	}

	// TODO: keep multiple of these in-flight at a time.
	uploadLastSpan := func() bool {
		defer buf.Reset()
		br, err := uploadString(buf.String())
		if err != nil {
			outerr = err
			return false
		}
		spans[len(spans)-1].br = br
		return true
	}

	for {
		c, err := bufr.ReadByte()
		if err == io.EOF {
			if n != last {
				spans = append(spans, span{from: last, to: n})
				if !uploadLastSpan() {
					return
				}
			}
			break
		}
		if err != nil {
			return nil, err
		}
		buf.WriteByte(c)

		n++
		blobSize++
		rs.Roll(c)
		if !rs.OnSplit() {
			if blobSize < MaxBlobSize {
				continue
			}
		}
		blobSize = 0
		bits := rs.Bits()

		// Take any spans from the end of the spans slice that
		// have a smaller 'bits' score and make them children
		// of this node.
		var children []span
		childrenFrom := len(spans)
		for childrenFrom > 0 && spans[childrenFrom-1].bits < bits {
			childrenFrom--
		}
		if nCopy := len(spans) - childrenFrom; nCopy > 0 {
			children = make([]span, nCopy)
			copy(children, spans[childrenFrom:])
			spans = spans[:childrenFrom]
		}

		spans = append(spans, span{from: last, to: n, bits: bits, children: children})
		last = n
		if !uploadLastSpan() {
			return
		}
	}

	var addBytesParts func(dst *[]BytesPart, s []span) error

	uploadFile := func(isFragment bool, fileSize int64, s []span) (*blobref.BlobRef, error) {
		parts := []BytesPart{}
		err := addBytesParts(&parts, s)
		if err != nil {
			return nil, err
		}
		m := fileMap
		if isFragment {
			m = NewBytes()
		}
		err = PopulateParts(m, fileSize, parts)
		if err != nil {
			return nil, err
		}
		json, err := MapToCamliJSON(m)
		if err != nil {
			return nil, err
		}
		return uploadString(json)
	}

	addBytesParts = func(dst *[]BytesPart, spansl []span) error {
		for _, sp := range spansl {
			if len(sp.children) > 0 {
				childrenSize := int64(0)
				for _, cs := range sp.children {
					childrenSize += cs.size()
				}
				br, err := uploadFile(true, childrenSize, sp.children)
				if err != nil {
					return err
				}
				*dst = append(*dst, BytesPart{
					BytesRef: br,
					Size:     uint64(childrenSize),
				})
			}
			if sp.from != sp.to {
				*dst = append(*dst, BytesPart{
					BlobRef: sp.br,
					Size:    uint64(sp.to - sp.from),
				})
			}
		}
		return nil
	}

	// The top-level content parts
	return uploadFile(false, n, spans)
}
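
BytesPart is the schema type that describes one piece of the reassembled file. The fields touched by addBytesParts suggest at least the shape below; this is inferred from usage, and the real Camlistore type also carries an Offset field and JSON tags for the camli wire format:

// Inferred shape; not the verbatim schema definition.
type BytesPart struct {
	Size     uint64           // number of bytes this part contributes
	BlobRef  *blobref.BlobRef // raw chunk, set when sp.from != sp.to
	BytesRef *blobref.BlobRef // nested "bytes" blob covering children
}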
Example #8
func showSplits(file string) {
	f, err := os.Open(file)
	if err != nil {
		panic(err.Error())
	}
	bufr := bufio.NewReader(f)

	spans := []span{}
	rs := rollsum.New()
	n := int64(0)
	last := n

	for {
		c, err := bufr.ReadByte()
		if err != nil {
			if err == io.EOF {
				if n != last {
					spans = append(spans, span{from: last, to: n})
				}
				break
			}
			panic(err.Error())
		}
		n++
		rs.Roll(c)
		if rs.OnSplit() {
			bits := rs.Bits()
			sliceFrom := len(spans)
			for sliceFrom > 0 && spans[sliceFrom-1].bits < bits {
				sliceFrom--
			}
			nCopy := len(spans) - sliceFrom
			var children []span
			if nCopy > 0 {
				children = make([]span, nCopy)
				nCopied := copy(children, spans[sliceFrom:])
				if nCopied != nCopy {
					panic("n wrong")
				}
				spans = spans[:sliceFrom]
			}
			spans = append(spans, span{from: last, to: n, bits: bits, children: children})

			log.Printf("split at %d (after %d), bits=%d", n, n-last, bits)
			last = n
		}
	}

	var dumpSpans func(s []span, indent int)
	dumpSpans = func(s []span, indent int) {
		in := strings.Repeat(" ", indent)
		for _, sp := range s {
			fmt.Printf("%sfrom=%d, to=%d (len %d) bits=%d\n", in, sp.from, sp.to, sp.to-sp.from, sp.bits)
			if len(sp.children) > 0 {
				dumpSpans(sp.children, indent+4)
			}
		}
	}
	dumpSpans(spans, 0)
	fmt.Printf("\n\nNOTE NOTE NOTE: the camdebug tool hasn't been updated to use the splitting policy from pkg/schema/filewriter.go.")
}