func (m *mergeSorter) dumpShard() (err error) { defer func() { m.buffer = make([]interface{}, 0, m.opts.MaxInMemory) }() // Create a new shard file shardPath := filepath.Join(m.workDir, fmt.Sprintf("shard.%.6d", len(m.shards))) file, err := os.OpenFile(shardPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, shardFileMode) if err != nil { return fmt.Errorf("error creating shard: %v", err) } defer func() { replaceErrIfNil(&err, "error closing shard: %v", file.Close()) }() w := io.Writer(file) if m.opts.CompressShards { w = snappy.NewWriter(w) } // Buffer writing to the shard buf := bufio.NewWriterSize(w, m.opts.IOBufferSize) defer func() { replaceErrIfNil(&err, "error flushing shard: %v", buf.Flush()) }() // Sort the in-memory buffer of elements sortutil.Sort(m.opts.Lesser, m.buffer) // Write each element of the in-memory to shard file, in sorted order wr := delimited.NewWriter(buf) for len(m.buffer) > 0 { rec, err := m.opts.Marshaler.Marshal(m.buffer[0]) if err != nil { return fmt.Errorf("marshaling error: %v", err) } if _, err := wr.Write(rec); err != nil { return fmt.Errorf("writing error: %v", err) } m.buffer = m.buffer[1:] } m.shards = append(m.shards, shardPath) return nil }
// Iterator implements part of the Interface interface. func (m *mergeSorter) Iterator() (iter Iterator, err error) { if m.finalized { return nil, ErrAlreadyFinalized } m.finalized = true // signal that further operations should fail it := &mergeIterator{workDir: m.workDir, marshaler: m.opts.Marshaler} if len(m.shards) == 0 { // Fast path for a single, in-memory shard it.buffer, m.buffer = m.buffer, nil sortutil.Sort(m.opts.Lesser, it.buffer) return it, nil } // This is a heap storing the head of each shard. merger := &sortutil.ByLesser{ Lesser: &mergeElementLesser{Lesser: m.opts.Lesser}, } it.merger = merger defer func() { // Try to cleanup on errors if err != nil { if cErr := it.Close(); cErr != nil { log.Printf("WARNING: error closing Iterator after error: %v", cErr) } } }() if len(m.buffer) != 0 { // To make the merging algorithm simpler, dump the last shard to disk. if err := m.dumpShard(); err != nil { m.buffer = nil return nil, fmt.Errorf("error dumping final shard: %v", err) } } m.buffer = nil // Initialize the merger heap by reading the first element of each shard. for _, shard := range m.shards { f, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode) if err != nil { return nil, fmt.Errorf("error opening shard %q: %v", shard, err) } r := io.Reader(f) if m.opts.CompressShards { r = snappy.NewReader(r) } rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize)) first, err := rd.Next() if err != nil { f.Close() return nil, fmt.Errorf("error reading beginning of shard %q: %v", shard, err) } el, err := m.opts.Marshaler.Unmarshal(first) if err != nil { f.Close() return nil, fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err) } heap.Push(merger, &mergeElement{el: el, rd: rd, f: f}) } return it, nil }
// Read implements part of the Interface interface. func (m *mergeSorter) Read(f func(i interface{}) error) (err error) { if m.finalized { return ErrAlreadyFinalized } m.finalized = true // signal that further operations should fail // Ensure that the working directory is always cleaned up. defer func() { cleanupErr := os.RemoveAll(m.workDir) if err == nil { err = cleanupErr } else { log.Println("WARNING: error removing temporary directory:", m.workDir) } }() if len(m.shards) == 0 { // Fast path for a single, in-memory shard defer func() { m.buffer = nil }() sortutil.Sort(m.opts.Lesser, m.buffer) for len(m.buffer) > 0 { if err := f(m.buffer[0]); err != nil { return err } m.buffer = m.buffer[1:] } return nil } if len(m.buffer) != 0 { // To make the merging algorithm simpler, dump the last shard to disk. if err := m.dumpShard(); err != nil { m.buffer = nil return fmt.Errorf("error dumping final shard: %v", err) } } m.buffer = nil // This is a heap storing the head of each shard. merger := &sortutil.ByLesser{ Lesser: &mergeElementLesser{Lesser: m.opts.Lesser}, } defer func() { // Try to cleanup on errors for merger.Len() != 0 { x := heap.Pop(merger).(*mergeElement) _ = x.f.Close() // ignore errors (file is only open for reading) } }() // Initialize the merger heap by reading the first element of each shard. for _, shard := range m.shards { f, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode) if err != nil { return fmt.Errorf("error opening shard %q: %v", shard, err) } r := io.Reader(f) if m.opts.CompressShards { r = snappy.NewReader(r) } rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize)) first, err := rd.Next() if err != nil { f.Close() return fmt.Errorf("error reading beginning of shard %q: %v", shard, err) } el, err := m.opts.Marshaler.Unmarshal(first) if err != nil { f.Close() return fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err) } heap.Push(merger, &mergeElement{el: el, rd: rd, f: f}) } // While the merger heap is non-empty: // el := pop the head of the heap // pass it to the user-specific function // push the next element el.rd to the merger heap for merger.Len() != 0 { x := heap.Pop(merger).(*mergeElement) // Give the value to the user-supplied function if err := f(x.el); err != nil { return err } // Read and parse the next value on the same shard rec, err := x.rd.Next() if err != nil { _ = x.f.Close() // ignore errors (file is only open for reading) _ = os.Remove(x.f.Name()) // ignore errors (os.RemoveAll used in defer) if err == io.EOF { continue } else { return fmt.Errorf("error reading shard: %v", err) } } next, err := m.opts.Marshaler.Unmarshal(rec) if err != nil { return fmt.Errorf("error unmarshaling element: %v", err) } // Reuse mergeElement, push it back onto the merger heap with the next value x.el = next heap.Push(merger, x) } return nil }