// NewReader returns a reader that consumes records from r, using a cache of up
// to maxSize bytes for known record hashes.
func NewReader(r io.Reader, maxSize int) (*Reader, error) {
	d, err := dedup.New(maxSize)
	if err != nil {
		return nil, err
	}
	return &Reader{delimited.NewReader(r), d}, nil
}
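// Usage sketch for the deduplicating reader above. This is illustrative, not
// part of the package: it assumes the returned *Reader exposes the same
// Next() ([]byte, error) contract as delimited.Reader, yielding only records
// whose hashes are not already in the cache.
func copyUnique(in io.Reader, out io.Writer) error {
	rd, err := NewReader(in, 1<<20) // hypothetical 1 MiB hash cache
	if err != nil {
		return err
	}
	wr := delimited.NewWriter(out)
	for {
		rec, err := rd.Next()
		if err == io.EOF {
			return nil
		} else if err != nil {
			return err
		}
		if err := wr.Put(rec); err != nil {
			return err
		}
	}
}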
func main() {
	flag.Parse()
	if len(flag.Args()) != 0 {
		flagutil.UsageErrorf("unknown arguments: %v", flag.Args())
	}

	written := make(map[[sha512.Size384]byte]struct{})
	var skipped uint64
	rd := delimited.NewReader(os.Stdin)
	wr := delimited.NewWriter(os.Stdout)
	for {
		rec, err := rd.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			log.Fatal(err)
		}

		hash := sha512.Sum384(rec)
		if _, ok := written[hash]; ok {
			skipped++
			continue
		}
		if err := wr.Put(rec); err != nil {
			log.Fatal(err)
		}
		written[hash] = struct{}{}
	}
	log.Printf("dedup_stream: skipped %d records", skipped)
}
// NewReader reads a stream of Entry protobufs from r.
func NewReader(r io.Reader) EntryReader {
	return func(f func(*spb.Entry) error) error {
		rd := delimited.NewReader(r)
		for {
			var entry spb.Entry
			if err := rd.NextProto(&entry); err == io.EOF {
				return nil
			} else if err != nil {
				return fmt.Errorf("error decoding Entry: %v", err)
			}
			if err := f(&entry); err != nil {
				return err
			}
		}
	}
}
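// A sketch of driving the EntryReader returned above (illustrative only; the
// callback here simply counts the entries decoded from stdin):
func countEntries() (int, error) {
	var n int
	err := NewReader(os.Stdin)(func(e *spb.Entry) error {
		n++
		return nil
	})
	return n, err
}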
func main() {
	flag.Parse()
	if flag.NArg() != 0 {
		flagutil.UsageErrorf("unknown arguments: %v", flag.Args())
	}

	rd, err := delimited.NewUniqReader(delimited.NewReader(os.Stdin), int(cacheSize.Bytes()))
	if err != nil {
		log.Fatalf("Error creating UniqReader: %v", err)
	}
	wr := delimited.NewWriter(os.Stdout)
	if err := delimited.Copy(wr, rd); err != nil {
		log.Fatal(err)
	}
	log.Printf("dedup_stream: skipped %d records", rd.Skipped())
}
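// Either main above runs as a filter over a delimited record stream, e.g.
// (a usage sketch; the flag name is an assumption based on the cacheSize
// variable used above):
//
//	... producer | dedup_stream --cache_size=1gb | consumer ...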
// ReadEntries reads a stream of Entry protobufs from r.
func ReadEntries(r io.Reader) <-chan *spb.Entry {
	ch := make(chan *spb.Entry)
	go func() {
		defer close(ch)
		rd := delimited.NewReader(r)
		for {
			var entry spb.Entry
			if err := rd.NextProto(&entry); err == io.EOF {
				break
			} else if err != nil {
				log.Fatalf("Error decoding Entry: %v", err)
			}
			ch <- &entry
		}
	}()
	return ch
}
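// Draining the channel returned by ReadEntries (a sketch; ranging over the
// channel consumes the stream until the goroutine hits EOF and closes it.
// Source is assumed to be a field of the storage-proto Entry):
func printSources(r io.Reader) {
	for entry := range ReadEntries(r) {
		fmt.Println(entry.Source)
	}
}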
// New reads a kindex file from r, which is expected to be positioned at the
// beginning of an index file or a data source of equivalent format.
func New(r io.Reader) (*Compilation, error) {
	gz, err := gzip.NewReader(r)
	if err != nil {
		return nil, err
	}
	rd := delimited.NewReader(gz)

	// The first block is the CompilationUnit message.
	cu := new(apb.CompilationUnit)
	if rec, err := rd.Next(); err != nil {
		return nil, err
	} else if err := proto.Unmarshal(rec, cu); err != nil {
		return nil, err
	}

	// All the subsequent blocks are FileData messages.
	var files []*apb.FileData
	for {
		rec, err := rd.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, err
		}
		fd := new(apb.FileData)
		if err := proto.Unmarshal(rec, fd); err != nil {
			return nil, err
		}
		files = append(files, fd)
	}
	return &Compilation{
		Proto: cu,
		Files: files,
	}, nil
}
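// Reading a compilation from a file on disk (a sketch; the package may
// already provide a convenience wrapper like this):
func openKindex(path string) (*Compilation, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return New(f)
}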
// Iterator implements part of the Interface interface.
func (m *mergeSorter) Iterator() (iter Iterator, err error) {
	if m.finalized {
		return nil, ErrAlreadyFinalized
	}
	m.finalized = true // signal that further operations should fail

	it := &mergeIterator{workDir: m.workDir, marshaler: m.opts.Marshaler}

	if len(m.shards) == 0 {
		// Fast path for a single, in-memory shard
		it.buffer, m.buffer = m.buffer, nil
		sortutil.Sort(m.opts.Lesser, it.buffer)
		return it, nil
	}

	// This is a heap storing the head of each shard.
	merger := &sortutil.ByLesser{
		Lesser: &mergeElementLesser{Lesser: m.opts.Lesser},
	}
	it.merger = merger

	defer func() {
		// Try to clean up on errors.
		if err != nil {
			if cErr := it.Close(); cErr != nil {
				log.Printf("WARNING: error closing Iterator after error: %v", cErr)
			}
		}
	}()

	if len(m.buffer) != 0 {
		// To make the merging algorithm simpler, dump the last shard to disk.
		if err := m.dumpShard(); err != nil {
			m.buffer = nil
			return nil, fmt.Errorf("error dumping final shard: %v", err)
		}
	}
	m.buffer = nil

	// Initialize the merger heap by reading the first element of each shard.
	for _, shard := range m.shards {
		f, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode)
		if err != nil {
			return nil, fmt.Errorf("error opening shard %q: %v", shard, err)
		}
		r := io.Reader(f)
		if m.opts.CompressShards {
			r = snappy.NewReader(r)
		}
		rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize))

		first, err := rd.Next()
		if err != nil {
			f.Close()
			return nil, fmt.Errorf("error reading beginning of shard %q: %v", shard, err)
		}
		el, err := m.opts.Marshaler.Unmarshal(first)
		if err != nil {
			f.Close()
			return nil, fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err)
		}

		heap.Push(merger, &mergeElement{el: el, rd: rd, f: f})
	}

	return it, nil
}
// Read implements part of the Interface interface.
func (m *mergeSorter) Read(f func(i interface{}) error) (err error) {
	if m.finalized {
		return ErrAlreadyFinalized
	}
	m.finalized = true // signal that further operations should fail

	// Ensure that the working directory is always cleaned up.
	defer func() {
		cleanupErr := os.RemoveAll(m.workDir)
		if err == nil {
			err = cleanupErr
		} else if cleanupErr != nil {
			log.Printf("WARNING: error removing temporary directory %q: %v", m.workDir, cleanupErr)
		}
	}()

	if len(m.shards) == 0 {
		// Fast path for a single, in-memory shard
		defer func() { m.buffer = nil }()
		sortutil.Sort(m.opts.Lesser, m.buffer)
		for len(m.buffer) > 0 {
			if err := f(m.buffer[0]); err != nil {
				return err
			}
			m.buffer = m.buffer[1:]
		}
		return nil
	}

	if len(m.buffer) != 0 {
		// To make the merging algorithm simpler, dump the last shard to disk.
		if err := m.dumpShard(); err != nil {
			m.buffer = nil
			return fmt.Errorf("error dumping final shard: %v", err)
		}
	}
	m.buffer = nil

	// This is a heap storing the head of each shard.
	merger := &sortutil.ByLesser{
		Lesser: &mergeElementLesser{Lesser: m.opts.Lesser},
	}
	defer func() {
		// Close any shard files still open after an early exit.
		for merger.Len() != 0 {
			x := heap.Pop(merger).(*mergeElement)
			_ = x.f.Close() // ignore errors (file is only open for reading)
		}
	}()

	// Initialize the merger heap by reading the first element of each shard.
	for _, shard := range m.shards {
		f, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode)
		if err != nil {
			return fmt.Errorf("error opening shard %q: %v", shard, err)
		}
		r := io.Reader(f)
		if m.opts.CompressShards {
			r = snappy.NewReader(r)
		}
		rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize))

		first, err := rd.Next()
		if err != nil {
			f.Close()
			return fmt.Errorf("error reading beginning of shard %q: %v", shard, err)
		}
		el, err := m.opts.Marshaler.Unmarshal(first)
		if err != nil {
			f.Close()
			return fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err)
		}

		heap.Push(merger, &mergeElement{el: el, rd: rd, f: f})
	}

	// While the merger heap is non-empty:
	//   el := pop the head of the heap
	//   pass el to the user-supplied function
	//   push the next element from el's shard onto the merger heap
	for merger.Len() != 0 {
		x := heap.Pop(merger).(*mergeElement)

		// Give the value to the user-supplied function.
		if err := f(x.el); err != nil {
			return err
		}

		// Read and parse the next value from the same shard.
		rec, err := x.rd.Next()
		if err != nil {
			_ = x.f.Close()           // ignore errors (file is only open for reading)
			_ = os.Remove(x.f.Name()) // ignore errors (os.RemoveAll used in defer)
			if err == io.EOF {
				continue
			}
			return fmt.Errorf("error reading shard: %v", err)
		}
		next, err := m.opts.Marshaler.Unmarshal(rec)
		if err != nil {
			return fmt.Errorf("error unmarshaling element: %v", err)
		}

		// Reuse the mergeElement, pushing it back onto the heap with the next value.
		x.el = next
		heap.Push(merger, x)
	}

	return nil
}
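// Sketch of the Add-then-Read contract implied by Read above. Add is an
// assumption about the rest of this Interface (only Iterator and Read are
// shown here); the callback sees elements in Lesser order:
func drainSorted(s Interface, elements []interface{}) error {
	for _, el := range elements {
		if err := s.Add(el); err != nil {
			return err
		}
	}
	// Read finalizes the sorter, merging any on-disk shards with the
	// remaining in-memory buffer.
	return s.Read(func(i interface{}) error {
		fmt.Println(i)
		return nil
	})
}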