Example #1
0
// NewReader returns a reader that consumes records from r, using a cache of up
// to maxSize bytes for known record hashes.
func NewReader(r io.Reader, maxSize int) (*Reader, error) {
	d, err := dedup.New(maxSize)
	if err != nil {
		return nil, err
	}
	return &Reader{delimited.NewReader(r), d}, nil
}
Example #2
0
func main() {
	flag.Parse()
	if len(flag.Args()) != 0 {
		flagutil.UsageErrorf("unknown arguments: %v", flag.Args())
	}

	written := make(map[[sha512.Size384]byte]struct{})

	var skipped uint64
	rd := delimited.NewReader(os.Stdin)
	wr := delimited.NewWriter(os.Stdout)
	for {
		rec, err := rd.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			log.Fatal(err)
		}

		hash := sha512.Sum384(rec)
		if _, ok := written[hash]; ok {
			skipped++
			continue
		}
		if err := wr.Put(rec); err != nil {
			log.Fatal(err)
		}
		written[hash] = struct{}{}
	}
	log.Printf("dedup_stream: skipped %d records", skipped)
}
Example #3
0
// NewReader reads a stream of Entry protobufs from r.
func NewReader(r io.Reader) EntryReader {
	return func(f func(*spb.Entry) error) error {
		rd := delimited.NewReader(r)
		for {
			var entry spb.Entry
			if err := rd.NextProto(&entry); err == io.EOF {
				return nil
			} else if err != nil {
				return fmt.Errorf("error decoding Entry: %v", err)
			}
			if err := f(&entry); err != nil {
				return err
			}
		}
	}
}
Example #4
0
func main() {
	flag.Parse()
	if flag.NArg() != 0 {
		flagutil.UsageErrorf("unknown arguments: %v", flag.Args())
	}

	rd, err := delimited.NewUniqReader(delimited.NewReader(os.Stdin), int(cacheSize.Bytes()))
	if err != nil {
		log.Fatalf("Error creating UniqReader: %v", err)
	}
	wr := delimited.NewWriter(os.Stdout)
	if err := delimited.Copy(wr, rd); err != nil {
		log.Fatal(err)
	}
	log.Printf("dedup_stream: skipped %d records", rd.Skipped())
}
Example #5
0
// ReadEntries reads a stream of Entry protobufs from r.
func ReadEntries(r io.Reader) <-chan *spb.Entry {
	ch := make(chan *spb.Entry)
	go func() {
		defer close(ch)
		rd := delimited.NewReader(r)
		for {
			var entry spb.Entry
			if err := rd.NextProto(&entry); err == io.EOF {
				break
			} else if err != nil {
				log.Fatalf("Error decoding Entry: %v", err)
			}
			ch <- &entry
		}
	}()
	return ch
}
Example #6
0
// New reads a kindex file from r, which is expected to be positioned at the
// beginning of an index file or a data source of equivalent format.
func New(r io.Reader) (*Compilation, error) {
	var rd *delimited.Reader
	if gz, err := gzip.NewReader(r); err != nil {
		return nil, err
	} else {
		rd = delimited.NewReader(gz)
	}

	// The first block is the CompilationUnit message.
	cu := new(apb.CompilationUnit)
	if rec, err := rd.Next(); err != nil {
		return nil, err
	} else if err := proto.Unmarshal(rec, cu); err != nil {
		return nil, err
	}

	// All the subsequent blocks are FileData messages.
	var files []*apb.FileData
	for {
		rec, err := rd.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, err
		}

		fd := new(apb.FileData)
		if err := proto.Unmarshal(rec, fd); err != nil {
			return nil, err
		}
		files = append(files, fd)
	}

	return &Compilation{
		Proto: cu,
		Files: files,
	}, nil
}
Example #7
0
// Iterator implements part of the Interface interface.
func (m *mergeSorter) Iterator() (iter Iterator, err error) {
	if m.finalized {
		return nil, ErrAlreadyFinalized
	}
	m.finalized = true // signal that further operations should fail

	it := &mergeIterator{workDir: m.workDir, marshaler: m.opts.Marshaler}

	if len(m.shards) == 0 {
		// Fast path for a single, in-memory shard
		it.buffer, m.buffer = m.buffer, nil
		sortutil.Sort(m.opts.Lesser, it.buffer)
		return it, nil
	}

	// This is a heap storing the head of each shard.
	merger := &sortutil.ByLesser{
		Lesser: &mergeElementLesser{Lesser: m.opts.Lesser},
	}
	it.merger = merger

	defer func() {
		// Try to cleanup on errors
		if err != nil {
			if cErr := it.Close(); cErr != nil {
				log.Printf("WARNING: error closing Iterator after error: %v", cErr)
			}
		}
	}()

	if len(m.buffer) != 0 {
		// To make the merging algorithm simpler, dump the last shard to disk.
		if err := m.dumpShard(); err != nil {
			m.buffer = nil
			return nil, fmt.Errorf("error dumping final shard: %v", err)
		}
	}
	m.buffer = nil

	// Initialize the merger heap by reading the first element of each shard.
	for _, shard := range m.shards {
		f, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode)
		if err != nil {
			return nil, fmt.Errorf("error opening shard %q: %v", shard, err)
		}

		r := io.Reader(f)
		if m.opts.CompressShards {
			r = snappy.NewReader(r)
		}

		rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize))
		first, err := rd.Next()
		if err != nil {
			f.Close()
			return nil, fmt.Errorf("error reading beginning of shard %q: %v", shard, err)
		}
		el, err := m.opts.Marshaler.Unmarshal(first)
		if err != nil {
			f.Close()
			return nil, fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err)
		}

		heap.Push(merger, &mergeElement{el: el, rd: rd, f: f})
	}

	return it, nil
}
Example #8
0
// Read implements part of the Interface interface.
func (m *mergeSorter) Read(f func(i interface{}) error) (err error) {
	if m.finalized {
		return ErrAlreadyFinalized
	}
	m.finalized = true // signal that further operations should fail

	// Ensure that the working directory is always cleaned up.
	defer func() {
		cleanupErr := os.RemoveAll(m.workDir)
		if err == nil {
			err = cleanupErr
		} else {
			log.Println("WARNING: error removing temporary directory:", m.workDir)
		}
	}()

	if len(m.shards) == 0 {
		// Fast path for a single, in-memory shard
		defer func() { m.buffer = nil }()
		sortutil.Sort(m.opts.Lesser, m.buffer)
		for len(m.buffer) > 0 {
			if err := f(m.buffer[0]); err != nil {
				return err
			}
			m.buffer = m.buffer[1:]
		}
		return nil
	}

	if len(m.buffer) != 0 {
		// To make the merging algorithm simpler, dump the last shard to disk.
		if err := m.dumpShard(); err != nil {
			m.buffer = nil
			return fmt.Errorf("error dumping final shard: %v", err)
		}
	}
	m.buffer = nil

	// This is a heap storing the head of each shard.
	merger := &sortutil.ByLesser{
		Lesser: &mergeElementLesser{Lesser: m.opts.Lesser},
	}

	defer func() {
		// Try to cleanup on errors
		for merger.Len() != 0 {
			x := heap.Pop(merger).(*mergeElement)
			_ = x.f.Close() // ignore errors (file is only open for reading)
		}
	}()

	// Initialize the merger heap by reading the first element of each shard.
	for _, shard := range m.shards {
		f, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode)
		if err != nil {
			return fmt.Errorf("error opening shard %q: %v", shard, err)
		}

		r := io.Reader(f)
		if m.opts.CompressShards {
			r = snappy.NewReader(r)
		}

		rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize))
		first, err := rd.Next()
		if err != nil {
			f.Close()
			return fmt.Errorf("error reading beginning of shard %q: %v", shard, err)
		}
		el, err := m.opts.Marshaler.Unmarshal(first)
		if err != nil {
			f.Close()
			return fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err)
		}

		heap.Push(merger, &mergeElement{el: el, rd: rd, f: f})
	}

	// While the merger heap is non-empty:
	//   el := pop the head of the heap
	//   pass it to the user-specific function
	//   push the next element el.rd to the merger heap
	for merger.Len() != 0 {
		x := heap.Pop(merger).(*mergeElement)

		// Give the value to the user-supplied function
		if err := f(x.el); err != nil {
			return err
		}

		// Read and parse the next value on the same shard
		rec, err := x.rd.Next()
		if err != nil {
			_ = x.f.Close()           // ignore errors (file is only open for reading)
			_ = os.Remove(x.f.Name()) // ignore errors (os.RemoveAll used in defer)
			if err == io.EOF {
				continue
			} else {
				return fmt.Errorf("error reading shard: %v", err)
			}
		}
		next, err := m.opts.Marshaler.Unmarshal(rec)
		if err != nil {
			return fmt.Errorf("error unmarshaling element: %v", err)
		}

		// Reuse mergeElement, push it back onto the merger heap with the next value
		x.el = next
		heap.Push(merger, x)
	}

	return nil
}