Exemple #1
0
// dumpShard sorts the in-memory buffer and writes it out as a new shard file
// inside m.workDir, optionally snappy-compressed when m.opts.CompressShards is
// set. On success the shard path is appended to m.shards. Regardless of the
// outcome, m.buffer is replaced by a fresh, empty buffer with capacity
// m.opts.MaxInMemory before the function returns.
func (m *mergeSorter) dumpShard() (err error) {
	// Registered first so it runs last: always hand back an empty buffer.
	defer func() {
		m.buffer = make([]interface{}, 0, m.opts.MaxInMemory)
	}()

	// The shard is named after its index; O_EXCL refuses to reuse an existing file.
	name := fmt.Sprintf("shard.%.6d", len(m.shards))
	path := filepath.Join(m.workDir, name)
	out, openErr := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, shardFileMode)
	if openErr != nil {
		return fmt.Errorf("error creating shard: %v", openErr)
	}
	defer func() {
		// Report a close failure only if nothing else has gone wrong already.
		replaceErrIfNil(&err, "error closing shard: %v", out.Close())
	}()

	var sink io.Writer = out
	if m.opts.CompressShards {
		sink = snappy.NewWriter(sink)
	}

	// Batch the small per-record writes; the flush runs before the file is closed
	// because deferred calls execute in reverse order.
	buffered := bufio.NewWriterSize(sink, m.opts.IOBufferSize)
	defer func() {
		replaceErrIfNil(&err, "error flushing shard: %v", buffered.Flush())
	}()

	// Order the elements so the shard is written in sorted order.
	sortutil.Sort(m.opts.Lesser, m.buffer)

	// Emit one delimited record per element, consuming the buffer from the front.
	records := delimited.NewWriter(buffered)
	for ; len(m.buffer) > 0; m.buffer = m.buffer[1:] {
		rec, marshalErr := m.opts.Marshaler.Marshal(m.buffer[0])
		if marshalErr != nil {
			return fmt.Errorf("marshaling error: %v", marshalErr)
		}
		if _, writeErr := records.Write(rec); writeErr != nil {
			return fmt.Errorf("writing error: %v", writeErr)
		}
	}

	m.shards = append(m.shards, path)
	return nil
}
Exemple #2
0
// Iterator implements part of the Interface interface.
//
// It finalizes the sorter (a second call returns ErrAlreadyFinalized) and
// returns an iterator over the added elements. When no shard has been written
// to disk the iterator is backed directly by the sorted in-memory buffer;
// otherwise any remaining buffered elements are dumped to a final shard and
// the iterator is primed with the first element of every shard.
func (m *mergeSorter) Iterator() (iter Iterator, err error) {
	if m.finalized {
		return nil, ErrAlreadyFinalized
	}
	// From here on, every further operation on the sorter must fail.
	m.finalized = true

	result := &mergeIterator{
		workDir:   m.workDir,
		marshaler: m.opts.Marshaler,
	}

	if len(m.shards) == 0 {
		// Nothing was spilled to disk: hand the buffer over and sort it in place.
		result.buffer = m.buffer
		m.buffer = nil
		sortutil.Sort(m.opts.Lesser, result.buffer)
		return result, nil
	}

	// Heap holding the current head element of each shard.
	shardHeap := &sortutil.ByLesser{
		Lesser: &mergeElementLesser{Lesser: m.opts.Lesser},
	}
	result.merger = shardHeap

	// On failure, make a best-effort attempt to close the half-built iterator.
	defer func() {
		if err == nil {
			return
		}
		if closeErr := result.Close(); closeErr != nil {
			log.Printf("WARNING: error closing Iterator after error: %v", closeErr)
		}
	}()

	// Spill whatever is still in memory so that merging only deals with shards.
	// The buffer is dropped whether or not the dump succeeds.
	var dumpErr error
	if len(m.buffer) != 0 {
		dumpErr = m.dumpShard()
	}
	m.buffer = nil
	if dumpErr != nil {
		return nil, fmt.Errorf("error dumping final shard: %v", dumpErr)
	}

	// Seed the heap with the first element read from each shard.
	for _, path := range m.shards {
		file, openErr := os.OpenFile(path, os.O_RDONLY, shardFileMode)
		if openErr != nil {
			return nil, fmt.Errorf("error opening shard %q: %v", path, openErr)
		}

		var src io.Reader = file
		if m.opts.CompressShards {
			src = snappy.NewReader(src)
		}
		records := delimited.NewReader(bufio.NewReaderSize(src, m.opts.IOBufferSize))

		head, readErr := records.Next()
		if readErr != nil {
			// This file is not on the heap yet, so close it here.
			file.Close()
			return nil, fmt.Errorf("error reading beginning of shard %q: %v", path, readErr)
		}

		el, decodeErr := m.opts.Marshaler.Unmarshal(head)
		if decodeErr != nil {
			file.Close()
			return nil, fmt.Errorf("error unmarshaling beginning of shard %q: %v", path, decodeErr)
		}

		heap.Push(shardHeap, &mergeElement{el: el, rd: records, f: file})
	}

	return result, nil
}
Exemple #3
0
// Read implements part of the Interface interface.
//
// It finalizes the sorter (a second call returns ErrAlreadyFinalized) and
// passes each added element to f, ordered by m.opts.Lesser. Reading stops at
// the first error, which is returned: either the error produced by f or a
// failure while reading shards back from disk. The working directory is
// removed before Read returns; a removal failure is returned only when no
// other error occurred, and is logged otherwise.
func (m *mergeSorter) Read(f func(i interface{}) error) (err error) {
	if m.finalized {
		return ErrAlreadyFinalized
	}
	m.finalized = true // signal that further operations should fail

	// Ensure that the working directory is always cleaned up.
	defer func() {
		cleanupErr := os.RemoveAll(m.workDir)
		if cleanupErr == nil {
			return
		}
		if err == nil {
			err = cleanupErr
			return
		}
		// Keep the primary error as the result; only report the cleanup failure.
		log.Printf("WARNING: error removing temporary directory %s: %v", m.workDir, cleanupErr)
	}()

	if len(m.shards) == 0 {
		// Fast path for a single, in-memory shard
		defer func() { m.buffer = nil }()
		sortutil.Sort(m.opts.Lesser, m.buffer)
		for len(m.buffer) > 0 {
			if err := f(m.buffer[0]); err != nil {
				return err
			}
			m.buffer = m.buffer[1:]
		}
		return nil
	}

	if len(m.buffer) != 0 {
		// To make the merging algorithm simpler, dump the last shard to disk.
		if err := m.dumpShard(); err != nil {
			m.buffer = nil
			return fmt.Errorf("error dumping final shard: %v", err)
		}
	}
	m.buffer = nil

	// This is a heap storing the head of each shard.
	merger := &sortutil.ByLesser{
		Lesser: &mergeElementLesser{Lesser: m.opts.Lesser},
	}

	// Close every shard file still on the heap. This runs before the deferred
	// directory removal above because deferred calls execute in reverse order.
	defer func() {
		for merger.Len() != 0 {
			x := heap.Pop(merger).(*mergeElement)
			_ = x.f.Close() // ignore errors (file is only open for reading)
		}
	}()

	// Initialize the merger heap by reading the first element of each shard.
	for _, shard := range m.shards {
		file, err := os.OpenFile(shard, os.O_RDONLY, shardFileMode)
		if err != nil {
			return fmt.Errorf("error opening shard %q: %v", shard, err)
		}

		r := io.Reader(file)
		if m.opts.CompressShards {
			r = snappy.NewReader(r)
		}

		rd := delimited.NewReader(bufio.NewReaderSize(r, m.opts.IOBufferSize))
		first, err := rd.Next()
		if err != nil {
			// Not on the heap yet, so the deferred cleanup would miss this file.
			_ = file.Close() // ignore errors (file is only open for reading)
			return fmt.Errorf("error reading beginning of shard %q: %v", shard, err)
		}
		el, err := m.opts.Marshaler.Unmarshal(first)
		if err != nil {
			_ = file.Close() // ignore errors (file is only open for reading)
			return fmt.Errorf("error unmarshaling beginning of shard %q: %v", shard, err)
		}

		heap.Push(merger, &mergeElement{el: el, rd: rd, f: file})
	}

	// While the merger heap is non-empty:
	//   el := pop the head of the heap
	//   pass it to the user-specific function
	//   push the next element el.rd to the merger heap
	for merger.Len() != 0 {
		x := heap.Pop(merger).(*mergeElement)

		// Give the value to the user-supplied function
		if err := f(x.el); err != nil {
			// x has been popped, so the deferred heap cleanup will not see it;
			// close its file here to avoid leaking the descriptor.
			_ = x.f.Close() // ignore errors (file is only open for reading)
			return err
		}

		// Read and parse the next value on the same shard
		rec, err := x.rd.Next()
		if err != nil {
			_ = x.f.Close()           // ignore errors (file is only open for reading)
			_ = os.Remove(x.f.Name()) // ignore errors (os.RemoveAll used in defer)
			if err == io.EOF {
				// Shard exhausted; carry on with the remaining shards.
				continue
			}
			return fmt.Errorf("error reading shard: %v", err)
		}
		next, err := m.opts.Marshaler.Unmarshal(rec)
		if err != nil {
			// Same as above: x is off the heap, so close its file explicitly.
			_ = x.f.Close() // ignore errors (file is only open for reading)
			return fmt.Errorf("error unmarshaling element: %v", err)
		}

		// Reuse mergeElement, push it back onto the merger heap with the next value
		x.el = next
		heap.Push(merger, x)
	}

	return nil
}