Example #1
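This writer goroutine serializes CompressedSeq records as CSV, optionally through a bgzf.Writer when the source is compressed, and writes each record's starting byte offset to a separate index. Out-of-order sequences are buffered until the next expected Id arrives, so records always land in order.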
func (comdb *CompressedDB) writer() {
	var record []string
	var err error
	var cseq *CompressedSeq

	byteOffset := int64(0)
	buf := new(bytes.Buffer)

	var compressedWriter *bgzf.Writer
	if comdb.CompressedSource {
		// 0 indicates that bgzf should use GOMAXPROCS for compression.
		compressedWriter = bgzf.NewWriter(comdb.File, 0)
	}

	csvWriter := csv.NewWriter(buf)
	csvWriter.Comma = ','
	csvWriter.UseCRLF = false

	saved := make([]CompressedSeq, 0, 1000)
	nextIndex := comdb.NumSequences()

	// If we're appending to the index, set the byteOffset to be at the end
	// of the current compressed database.
	if comdb.indexSize > 0 {
		info, err := comdb.File.Stat()
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s\n", err)
			os.Exit(1)
		}
		byteOffset = info.Size()
	}

	for possible := range comdb.writerChan {
		// We have to preserve the order of compressed sequences, so we don't
		// write anything until we have the next sequence that we expect.
		if possible.Id < nextIndex {
			panic(fmt.Sprintf("BUG: Next sequence expected is '%d', but "+
				"we have an earlier sequence: %d", nextIndex, possible.Id))
		}
		saved = append(saved, possible)

		cseq, saved = nextSeqToWrite(nextIndex, saved)
		for cseq != nil {
			// Reset the buffer so it's empty. We want it to only contain
			// the next record we're writing.
			buf.Reset()

			// Allocate memory for creating the next record.
			// A record is a sequence name followed by four-tuples of links:
			// (coarse-seq-id, coarse-start, coarse-end, diff).
			record = make([]string, 0, 1+4*len(cseq.Links))
			record = append(record, cseq.Name)
			for _, link := range cseq.Links {
				record = append(record,
					fmt.Sprintf("%d", link.CoarseSeqId),
					fmt.Sprintf("%d", link.CoarseStart),
					fmt.Sprintf("%d", link.CoarseEnd),
					link.Diff)
			}

			// Write the record to our *buffer* and flush it.
			if err = csvWriter.Write(record); err != nil {
				fmt.Fprintf(os.Stderr, "%s\n", err)
				os.Exit(1)
			}
			csvWriter.Flush()

			// Pass the bytes on to the compressed file.
			if comdb.CompressedSource {
				if _, err = compressedWriter.Write(buf.Bytes()); err != nil {
					fmt.Fprintf(os.Stderr, "%s\n", err)
					os.Exit(1)
				}
				compressedWriter.Flush()
				compressedWriter.Wait()
			} else {
				if _, err = comdb.File.Write(buf.Bytes()); err != nil {
					fmt.Fprintf(os.Stderr, "%s\n", err)
					os.Exit(1)
				}
			}

			// Now write the byte offset that points to the start of this record.
			err = binary.Write(comdb.Index, binary.BigEndian, byteOffset)
			if err != nil {
				fmt.Fprintf(os.Stderr, "%s\n", err)
				os.Exit(1)
			}

			// Increment the byte offset to be at the end of this record.
			byteOffset += int64(buf.Len())

			nextIndex++
			cseq, saved = nextSeqToWrite(nextIndex, saved)
		}
	}
	if comdb.CompressedSource {
		compressedWriter.Close()
	}
	comdb.Index.Close()
	comdb.File.Close()
	comdb.writerDone <- struct{}{}
}
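The nextSeqToWrite helper is not shown in this example. A minimal sketch consistent with how it is called above, assuming CompressedSeq carries an integer Id field, might look like this (an illustration, not the original implementation):

func nextSeqToWrite(next int, saved []CompressedSeq) (*CompressedSeq, []CompressedSeq) {
	for i := range saved {
		if saved[i].Id == next {
			// Copy the match out and remove it from the buffer.
			cseq := saved[i]
			return &cseq, append(saved[:i], saved[i+1:]...)
		}
	}
	// The next expected sequence has not arrived yet.
	return nil, saved
}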
Example #2
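This test writes a set of words to a BGZF stream, records a bgzf.Chunk for each word while reading the stream back, and then checks that a ChunkReader over selected chunks yields the expected bytes under different merge, index-cleanliness, and truncation settings.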
func (s *S) TestIssue10(c *check.C) {
	for _, test := range issue10Tests {
		var buf bytes.Buffer

		// Write the set of words to a bgzf stream.
		w := bgzf.NewWriter(&buf, *conc)
		for _, wb := range test.words {
			w.Write([]byte(wb.word))
			if wb.flush {
				w.Flush()
			}
		}
		w.Close()

		for _, strategy := range []MergeStrategy{nil, adjacent} {
			if strategy != nil && !test.canSquash {
				continue
			}
			for _, clean := range []bool{false, true} {
				for _, truncFinal := range []bool{false, true} {
					if truncFinal && !test.canTrunc {
						continue
					}
					// Build an index into the words.
					r, err := bgzf.NewReader(bytes.NewReader(buf.Bytes()), *conc)
					c.Assert(err, check.Equals, nil)
					idx := make(map[string]bgzf.Chunk)
					for i, wb := range test.words {
						p := make([]byte, len(wb.word))
						n, err := r.Read(p)
						c.Assert(err, check.Equals, nil)
						c.Assert(string(p[:n]), check.Equals, wb.word)

						last := r.LastChunk()
						if !clean {
							// This simulates the index construction behaviour
							// that appears to be what is done by htslib. The
							// behaviour of bgzf is to elide seeks that will not
							// result in a productive read.
							if i != 0 && test.words[i-1].flush {
								last.Begin = idx[test.words[i-1].word].End
							}
						}
						idx[wb.word] = last
					}

					var chunks []bgzf.Chunk
					for _, w := range test.chunks {
						chunks = append(chunks, idx[w])
					}
					var want string
					if truncFinal {
						want = strings.Join(test.chunks[:len(test.chunks)-1], "")
						chunks[len(chunks)-2].End = chunks[len(chunks)-1].Begin
						chunks = chunks[:len(chunks)-1]
					} else {
						want = strings.Join(test.chunks, "")
					}

					if strategy != nil {
						chunks = strategy(chunks)
					}
					cr, err := NewChunkReader(r, chunks)
					c.Assert(err, check.Equals, nil)

					var got bytes.Buffer
					io.Copy(&got, cr)
					c.Check(got.String(), check.Equals, want,
						check.Commentf("clean=%t merge=%t trunc=%t chunks=%+v", clean, strategy != nil, truncFinal, chunks),
					)
				}
			}
		}
	}
}
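For reference, the adjacent merge strategy exercised above could plausibly be implemented as below. This is a hedged sketch assuming MergeStrategy is func([]bgzf.Chunk) []bgzf.Chunk and that the input chunks are sorted by Begin; the test relies on whatever implementation the package actually provides.

// adjacentMerge coalesces consecutive chunks whose virtual offsets touch or
// overlap. Hypothetical illustration only, not the package's implementation.
func adjacentMerge(chunks []bgzf.Chunk) []bgzf.Chunk {
	if len(chunks) == 0 {
		return chunks
	}
	merged := []bgzf.Chunk{chunks[0]}
	for _, c := range chunks[1:] {
		last := &merged[len(merged)-1]
		// bgzf.Offset values order by file offset, then in-block offset.
		touches := c.Begin.File < last.End.File ||
			(c.Begin.File == last.End.File && c.Begin.Block <= last.End.Block)
		if touches {
			// Extend the previous chunk if this one reaches further.
			if c.End.File > last.End.File ||
				(c.End.File == last.End.File && c.End.Block > last.End.Block) {
				last.End = c.End
			}
			continue
		}
		merged = append(merged, c)
	}
	return merged
}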