Beispiel #1
0
// metaSequence reads "##"-prefixed lines from the underlying reader until a
// line whose content after the prefix is "end-<moltype>", concatenating the
// whitespace-stripped content of every preceding line into the body of a new
// sequence named id.
//
// Blank lines are skipped. A non-blank line that does not start with "##"
// yields a "Corrupt metasequence" error. Any read error, including io.EOF
// before the terminator is seen, is returned unchanged with a nil sequence.
func (self *Reader) metaSequence(moltype, id string) (sequence *seq.Seq, err error) {
	var line, body []byte
	terminator := "end-" + moltype

	for {
		if line, err = self.r.ReadBytes('\n'); err != nil {
			return nil, err
		}

		// ReadBytes keeps the delimiter, so the line ending has to be removed
		// before the line can be recognised as blank.
		line = bytes.TrimRight(line, "\r\n")
		if len(line) == 0 {
			continue
		}
		if !bytes.HasPrefix(line, []byte("##")) {
			return nil, bio.NewError("Corrupt metasequence", 0, line)
		}

		line = bytes.TrimSpace(line[2:])
		if string(line) == terminator {
			break
		}

		// Drop all interior whitespace so the body holds sequence letters only.
		body = append(body, bytes.Join(bytes.Fields(line), nil)...)
	}

	sequence = seq.New(id, body, nil)
	sequence.Moltype = bio.ParseMoltype(moltype)

	return sequence, nil
}
Beispiel #2
0
// Map applies the operation carried by set to consecutive slices ("chunks") of
// set, processing the chunks concurrently on a Processor created with the
// requested number of threads.
//
// The chunk size is the smaller of ceil(set.Len()/threads) and maxChunkSize.
// A smaller chunk size reduces the impact of uneven chunk processing times at
// the cost of more scheduling overhead. A computed chunk size below one is
// raised to one so that the chunking loops always terminate.
//
// The returned results hold one value per chunk, in the order the values
// arrive on the processor's output; with more than one worker this need not
// be the order of the chunks in set. If any chunk reports an error, Map stops
// collecting, signals the producer to stop queueing further chunks, and
// returns the results gathered so far together with a "Map failed" error
// wrapping the chunk's error.
func Map(set Mapper, threads, maxChunkSize int) (results []interface{}, err error) {
	queue := make(chan Operator, 1)
	p := NewProcessor(queue, 0, threads)
	defer p.Stop()

	total := set.Len()
	chunkSize := util.Min(int(math.Ceil(float64(total)/float64(threads))), maxChunkSize)
	if chunkSize < 1 {
		// A zero or negative step would never advance through set.
		chunkSize = 1
	}

	quit := make(chan struct{})

	// Producer: queue one slice per chunk until done or told to quit.
	go func() {
		for start := 0; start < total; start += chunkSize {
			end := total
			if chunkSize < total-start {
				end = start + chunkSize
			}
			// Selecting on the send itself lets a quit signal release the
			// producer even while the queue is full.
			select {
			case <-quit:
				return
			case queue <- set.Slice(start, end):
			}
		}
	}()

	// Consumer: expect exactly one result per chunk.
	for done := 0; done < total; done += chunkSize {
		result := <-p.out
		if result.Err != nil {
			err = bio.NewError("Map failed", 0, result.Err)
			close(quit)
			break
		}
		results = append(results, result.Value)
	}

	return results, err
}
Beispiel #3
0
// Rewind seeks the reader's underlying source back to its origin. It returns
// a "Not a Seeker" error when the source does not implement io.Seeker, or the
// error reported by Seek otherwise.
func (self *Reader) Rewind() (err error) {
	seeker, canSeek := self.f.(io.Seeker)
	if !canSeek {
		err = bio.NewError("Not a Seeker", 0, self)
		return err
	}

	_, err = seeker.Seek(0, 0)
	return err
}
Beispiel #4
0
// Column returns the letters found at alignment position pos, one per
// sequence and in the order of the sequences in the alignment. A sequence
// that does not cover pos contributes fill instead.
//
// An error is returned when pos lies outside [self.Start(), self.End()).
func (self Alignment) Column(pos int, fill byte) (c []byte, err error) {
	if pos < self.Start() || pos >= self.End() {
		return nil, bio.NewError("Column out of range", 0, self.Start(), self.End(), pos)
	}

	c = make([]byte, len(self))
	for i, s := range self {
		// Translate the alignment coordinate into an index into this row's
		// letters; the row covers pos only if that index is in range.
		if rel := pos - s.Offset; rel >= 0 && rel < len(s.Seq) {
			c[i] = s.Seq[rel]
		} else {
			c[i] = fill
		}
	}

	return c, nil
}
Beispiel #5
0
// NewProcessor returns a Processor that starts worker goroutines reading
// Operators from queue, running each Operator's Operation and sending the
// outcome as a Result on an output channel with the given buffer size.
//
// threads is limited by GOMAXPROCS: if threads is greater than GOMAXPROCS or
// less than 1, GOMAXPROCS workers are started.
//
// Each worker runs until queue is closed and drained, or until the stop
// channel is found closed after it has delivered a result. A panic raised
// while processing is recovered and reported as a Result carrying a
// "concurrent.Processor panic" error, after which that worker exits. The
// output channel is closed by the last worker to exit.
func NewProcessor(queue chan Operator, buffer int, threads int) (p *Processor) {
	if available := runtime.GOMAXPROCS(0); threads > available || threads < 1 {
		threads = available
	}

	p = &Processor{
		in:      queue,
		out:     make(chan Result, buffer),
		stop:    make(chan struct{}),
		working: make(chan bool, threads),
		wg:      &sync.WaitGroup{},
	}

	// live counts workers that have not yet exited. It is decided under mu so
	// that exactly one worker - the last one out - closes p.out. Inspecting
	// len(p.working) for this is not safe: two workers leaving together can
	// both observe zero, and a worker leaving before its siblings have
	// registered can observe zero too early.
	var (
		mu   sync.Mutex
		live = threads
	)

	for i := 0; i < threads; i++ {
		p.wg.Add(1)
		go func() {
			p.working <- true
			defer func() {
				if e := recover(); e != nil {
					p.out <- Result{nil, bio.NewError("concurrent.Processor panic", 1, e)}
				}
				<-p.working

				mu.Lock()
				live--
				last := live == 0
				mu.Unlock()
				if last {
					close(p.out)
				}

				p.wg.Done()
			}()

			for input := range p.in {
				v, e := input.Operation()
				if p.out != nil {
					p.out <- Result{v, e}
				}
				// Non-blocking check so a stop request takes effect between
				// operations without stalling a running worker.
				select {
				case <-p.stop:
					return
				default:
				}
			}
		}()
	}

	return p
}
Beispiel #6
0
// WriteMetaData writes a "##" metadata entry to a GFF file and returns the
// number of bytes written and any error. The form of the entry depends on the
// dynamic type of d:
//
//	[]byte, string  written verbatim as a single "##<text>" line
//	*seq.Seq        written through a FASTA writer with every line prefixed
//	                by "##", followed by a "##end-<moltype>" line
//	*feat.Feature   written as "##sequence-region <ID> <start> <end>", with
//	                a non-negative start incremented when OneBased is set
//
// Any other type yields an "Unknown meta data type" error. The buffered
// writer is flushed after a successful write.
func (self *Writer) WriteMetaData(d interface{}) (n int, err error) {
	switch v := d.(type) {
	case []byte:
		n, err = self.w.WriteString("##" + string(v) + "\n")
	case string:
		n, err = self.w.WriteString("##" + v + "\n")
	case *seq.Seq:
		// NOTE(review): the sequence goes through a separate writer on
		// self.f while the terminator goes through self.w; confirm that
		// self.w holds no unflushed data at this point, otherwise output
		// may be reordered.
		sw := fasta.NewWriter(self.f, self.Width)
		sw.IDPrefix = fmt.Sprintf("##%s ", v.Moltype)
		sw.SeqPrefix = "##"
		if n, err = sw.Write(v); err != nil {
			return n, err
		}
		if err = sw.Flush(); err != nil {
			return n, err
		}
		var m int
		m, err = self.w.WriteString("##end-" + v.Moltype.String() + "\n")
		n += m
	case *feat.Feature:
		start := v.Start
		if self.OneBased && start >= 0 {
			start++
		}
		n, err = self.w.WriteString("##sequence-region " + string(v.ID) + " " +
			strconv.Itoa(start) + " " +
			strconv.Itoa(v.End) + "\n")
	default:
		n, err = 0, bio.NewError("Unknown meta data type", 0, d)
	}

	if err == nil {
		err = self.w.Flush()
	}

	return n, err
}
Beispiel #7
0
// Hash returns the sum of the entire contents of file computed with h, and
// any error encountered.
//
// The file is seeked to its origin before hashing, so the whole file is
// summed regardless of the current position, and again afterwards so the
// file is ready for other reads. h is not reset, neither before nor after:
// anything already written to h contributes to the sum, so reset h between
// files that are to be hashed independently.
//
// A failure to stat, seek or read the file is returned as is; a directory
// yields an "Is a directory" error. The sum is nil whenever err is non-nil.
func Hash(h hash.Hash, file *os.File) (sum []byte, err error) {
	var fi os.FileInfo
	if fi, err = file.Stat(); err != nil {
		return nil, err
	}
	if fi.IsDir() {
		return nil, bio.NewError("Is a directory", 0, file)
	}

	if _, err = file.Seek(0, 0); err != nil {
		return nil, err
	}

	// io.Copy treats io.EOF as normal termination, so err is non-nil only for
	// a genuine read failure (hash.Hash writes never fail).
	_, err = io.Copy(h, file)

	// Always try to rewind, but let a read error take precedence.
	if _, seekErr := file.Seek(0, 0); err == nil {
		err = seekErr
	}
	if err != nil {
		return nil, err
	}

	return h.Sum(nil), nil
}
Beispiel #8
0
// Read reads one line from the underlying reader and parses it as a
// tab-separated feature record. It returns the read error unchanged if the
// line cannot be read, and a "Bad bedtype" error if the line holds fewer
// fields than self.BedType.
//
// Start, end, score and strand values that fail to parse are left at zero
// rather than reported.
func (self *Reader) Read() (f *feat.Feature, err error) {
	var line string
	if line, err = self.r.ReadString('\n'); err != nil {
		return nil, err
	}
	self.line++

	if last := len(line) - 1; last >= 0 && line[last] == '\r' {
		line = line[:last]
	}
	line = strings.TrimSpace(line)

	elems := strings.SplitN(line, "\t", self.BedType+1)
	if len(elems) < self.BedType {
		return nil, bio.NewError(fmt.Sprintf("Bad bedtype on line %d", self.line), 0, line)
	}

	f = &feat.Feature{Moltype: bio.DNA}

	for i, field := range elems {
		switch i {
		case chromField:
			f.Location = field
			// Without a name column the ID is synthesised from the location.
			if self.BedType <= nameField {
				f.ID = elems[chromField] + ":" + elems[startField] + ".." + elems[endField]
			}
		case startField:
			if start, convErr := strconv.Atoi(field); convErr == nil {
				f.Start = start
			}
		case endField:
			if end, convErr := strconv.Atoi(field); convErr == nil {
				f.End = end
			}
		case nameField:
			f.ID = field
		case scoreField:
			if score, convErr := strconv.ParseFloat(field, 64); convErr == nil {
				f.Score = score
			}
		case strandField:
			// A missing key yields the zero strand.
			f.Strand = CharToStrand[field]
		case thickStartField, thickEndField, rgbField,
			blockCountField, blockSizesField, blockStartsField:
			// These fields are unsupported at this stage.
		}
	}

	return f, nil
}
Beispiel #9
0
// commentMetaline interprets the content of a metadata line (the text after
// the leading "##"), keyed on its first space-separated field.
//
// The gff-version, source-version, date and Type directives update the
// reader's state and then return the result of the next call to Read. A
// sequence-region directive or a DNA/RNA/Protein sequence section is returned
// as a Feature whose Meta field holds the parsed region or sequence. Any
// other line is returned as a Feature whose Meta field holds the line itself.
//
// A recognised directive that lacks its required arguments yields an
// "Incomplete ... metaline" error.
func (self *Reader) commentMetaline(line string) (f *feat.Feature, err error) {
	// Load these into a slice in a MetaField of the Feature
	fields := strings.Split(line, " ")

	switch fields[0] {
	case "gff-version":
		if len(fields) < 2 {
			return nil, bio.NewError("Incomplete gff-version metaline", 0, fields)
		}
		// An unparsable version falls back to the default rather than failing.
		if self.Version, err = strconv.Atoi(fields[1]); err != nil {
			self.Version = DefaultVersion
		}
		return self.Read()

	case "source-version":
		if len(fields) < 2 {
			return nil, bio.NewError("Incomplete source-version metaline", 0, fields)
		}
		self.SourceVersion = strings.Join(fields[1:], " ")
		return self.Read()

	case "date":
		if len(fields) < 2 {
			return nil, bio.NewError("Incomplete date metaline", 0, fields)
		}
		// A parse failure is not reported: the error is replaced by the
		// result of the following Read.
		self.Date, err = time.Parse(self.TimeFormat, strings.Join(fields[1:], " "))
		return self.Read()

	case "Type":
		if len(fields) < 2 {
			return nil, bio.NewError("Incomplete Type metaline", 0, fields)
		}
		self.Type = bio.ParseMoltype(fields[1])
		return self.Read()

	case "sequence-region":
		if len(fields) < 4 {
			return nil, bio.NewError("Incomplete sequence-region metaline", 0, fields)
		}
		var start, end int
		if start, err = strconv.Atoi(fields[2]); err != nil {
			return nil, err
		}
		if self.OneBased {
			start = bio.OneToZero(start)
		}
		if end, err = strconv.Atoi(fields[3]); err != nil {
			return nil, err
		}
		f = &feat.Feature{
			Meta: &feat.Feature{
				ID:    fields[1],
				Start: start,
				End:   end,
			},
		}

	case "DNA", "RNA", "Protein":
		if len(fields) < 2 {
			return nil, bio.NewError("Incomplete sequence metaline", 0, fields)
		}
		var s *seq.Seq
		if s, err = self.metaSequence(fields[0], fields[1]); err != nil {
			return nil, err
		}
		f = &feat.Feature{Meta: s}

	default:
		f = &feat.Feature{Meta: line}
	}

	return f, nil
}
Beispiel #10
0
// Stitch returns an alignment in which every sequence, and its Quality if
// present, has been reduced to the regions described by the features in f,
// flattened over the span of the alignment.
//
// Sequences marked Inplace are modified and reused in the result; all others
// are copied into new Seq values. Offsets are shifted by the start of the
// first flattened interval and clamped at zero, and Circular is cleared.
//
// An error is returned if a sequence that is not Inplace carries an Inplace
// Quality, or if an interval cannot be built from a feature.
//
// NOTE(review): fs[0] is indexed without a length check, so a non-empty
// alignment whose span overlaps none of the features panics - confirm the
// intended result for that case.
func (self Alignment) Stitch(f feat.FeatureSet) (a Alignment, err error) {
	for _, s := range self {
		if !s.Inplace && s.Quality != nil && s.Quality.Inplace {
			return nil, bio.NewError("Inplace operation on Quality with non-Inplace operation on parent Seq.", 0, s)
		}
	}

	t := interval.NewTree()
	for _, feature := range f {
		var iv *interval.Interval
		if iv, err = interval.New("", feature.Start, feature.End, 0, nil); err != nil {
			return nil, err
		}
		t.Insert(iv)
	}

	start := self.Start()
	a = make(Alignment, len(self))
	span, err := interval.New("", start, self.End(), 0, nil)
	if err != nil {
		panic("Seq.End() < Seq.Start()")
	}
	fs, _ := t.Flatten(span, 0, 0)

	var offset int
	for n, s := range self {
		if s.Inplace {
			s.Seq = s.stitch(fs)
			// Clamp the sequence's own shifted offset.
			if s.Offset -= fs[0].Start(); s.Offset < 0 {
				s.Offset = 0
			}
			s.Circular = false
			if s.Quality != nil {
				var q *Quality
				if s.Quality.Inplace {
					q = s.Quality
				} else {
					q = &Quality{ID: s.Quality.ID}
				}
				q.Qual = s.Quality.stitch(fs)
				if q.Offset = s.Quality.Offset - fs[0].Start(); q.Offset < 0 {
					q.Offset = 0
				}
				q.Circular = false
				s.Quality = q
			}
			a[n] = s
		} else {
			var q *Quality
			if s.Quality != nil {
				if offset = s.Quality.Offset - fs[0].Start(); offset < 0 {
					offset = 0
				}
				q = &Quality{
					ID:       s.Quality.ID,
					Qual:     s.Quality.stitch(fs),
					Offset:   offset,
					Circular: false,
				}
			}
			if offset = s.Offset - fs[0].Start(); offset < 0 {
				offset = 0
			}
			a[n] = &Seq{
				ID:       s.ID,
				Seq:      s.stitch(fs),
				Offset:   offset,
				Strand:   s.Strand,
				Circular: false,
				Moltype:  s.Moltype,
				Quality:  q,
			}
		}
	}

	return a, nil
}
Beispiel #11
0
// Join concatenates the sequences of a with the corresponding sequences of
// the receiver, row by row, and returns the resulting alignment.
//
// With where equal to Prepend each row of a is placed before the matching
// row of the receiver and the row's offset is reduced by the length of the
// prepended sequence; with Append each row of a is placed after it. Before
// joining, both alignments are made flush with fill on the sides that meet.
//
// Rows of the receiver marked Inplace are modified and reused in the result;
// other rows are copied. Quality is not carried over to the result.
//
// An error is returned if the two alignments hold different numbers of
// sequences, or if where is neither Prepend nor Append.
func (self Alignment) Join(a Alignment, fill byte, where int) (b Alignment, err error) {
	if len(self) != len(a) {
		return nil, bio.NewError("Alignments do not hold the same number of sequences", 0, []Alignment{self, a})
	}

	// Make the meeting edges flush. Each alignment is replaced by its own
	// flushed form.
	switch where {
	case Prepend:
		if !a.IsFlush(Right) {
			a = a.Flush(Right, fill)
		}
		if !self.IsFlush(Left) {
			self = self.Flush(Left, fill)
		}
	case Append:
		if !a.IsFlush(Left) {
			a = a.Flush(Left, fill)
		}
		if !self.IsFlush(Right) {
			self = self.Flush(Right, fill)
		}
	default:
		return nil, bio.NewError("Undefined location", 0, where)
	}

	var (
		ID    string
		ts    []byte
		shift int
	)

	b = make(Alignment, len(self))

	// s1 is the receiver's row, s2 the matching row of a.
	for i, s2 := range a {
		s1 := self[i]
		switch where {
		case Prepend:
			ID = s2.ID + "+" + s1.ID
			ts = make([]byte, len(s2.Seq), len(s2.Seq)+len(s1.Seq))
			copy(ts, s2.Seq)
			ts = append(ts, s1.Seq...)
			shift = s2.Len()
		case Append:
			ID = s1.ID + "+" + s2.ID
			if s1.Inplace {
				ts = append(s1.Seq, s2.Seq...)
			} else {
				ts = make([]byte, len(s1.Seq), len(s2.Seq)+len(s1.Seq))
				copy(ts, s1.Seq)
				ts = append(ts, s2.Seq...)
			}
		}

		if s1.Inplace {
			b[i] = s1
			b[i].ID = ID
			b[i].Seq = ts
			b[i].Offset -= shift
			b[i].Quality = nil // TODO Handle Quality
		} else {
			b[i] = &Seq{
				ID:      ID,
				Seq:     ts,
				Offset:  s1.Offset - shift,
				Strand:  s1.Strand,
				Moltype: s1.Moltype,
				Quality: nil, // TODO Handle Quality
			}
		}
	}

	return b, nil
}
Beispiel #12
0
// Filter a query sequence against the stored index. If query and the target are the same sequence,
// selfAlign can be used to avoid double searching - behavior is undefined if the sequences are not the same.
// A morass is used to store and sort individual filter hits.
//
// The selfAlign, complement and morass arguments are stored on the Filter for the duration of the
// call. An error is returned if the tube offset is smaller than the maximum error, if iterating the
// query's k-mers fails, if ending or flushing a tube fails, or if finalising the morass fails.
func (self *Filter) Filter(query *seq.Seq, selfAlign, complement bool, morass *morass.Morass) (err error) {
	self.selfAlign = selfAlign
	self.complement = complement
	self.morass = morass
	self.k = self.index.GetK()

	// Ukkonen's Lemma
	self.minKmersPerHit = MinWordsPerFilterHit(self.minMatch, self.k, self.maxError)

	// Maximum distance between SeqQ positions of two k-mers in a match
	// (More stringent bounds may be possible, but not a big problem
	// if two adjacent matches get merged).
	self.maxKmerDist = self.minMatch - self.k

	tubeWidth := self.tubeOffset + self.maxError

	if self.tubeOffset < self.maxError {
		return bio.NewError("TubeOffset < MaxError", 0, []int{self.tubeOffset, self.maxError})
	}

	// NOTE(review): a tubeOffset of zero passes the check above when maxError is also zero and
	// would then divide by zero here - confirm that tubeOffset is validated elsewhere.
	maxActiveTubes := (self.target.Len()+tubeWidth-1)/self.tubeOffset + 1
	self.tubes = make([]TubeState, maxActiveTubes)

	// Ticker tracks cycling of circular list of active tubes.
	ticker := tubeWidth

	// f is called by ForEachKmerOf for each k-mer of the query. It reports every indexed position
	// of that k-mer via commonKmer and calls tubeEnd each time the ticker reaches zero.
	f := func(index *kmerindex.Index, position, kmer int) {
		// The index positions for kmer are those in [FingerAt(kmer-1), FingerAt(kmer)), with the
		// range starting at zero for the first k-mer.
		from := 0
		if kmer > 0 {
			from = index.FingerAt(kmer - 1)
		}
		to := index.FingerAt(kmer)
		for i := from; i < to; i++ {
			self.commonKmer(index.PosAt(i), position)
		}

		if ticker--; ticker == 0 {
			if e := self.tubeEnd(position); e != nil {
				panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned
			}
			ticker = self.tubeOffset
		}
	}

	if err = self.index.ForEachKmerOf(query, 0, query.Len(), f); err != nil {
		return
	}

	if err = self.tubeEnd(query.Len() - 1); err != nil {
		return
	}

	// Flush every tube between the diagonals bounding the last query position, widened by
	// tubeWidth on either side.
	diagFrom := self.diagIndex(self.target.Len()-1, query.Len()-1) - tubeWidth
	diagTo := self.diagIndex(0, query.Len()-1) + tubeWidth

	tubeFrom := self.tubeIndex(diagFrom)
	if tubeFrom < 0 {
		tubeFrom = 0
	}

	tubeTo := self.tubeIndex(diagTo)

	for tubeIndex := tubeFrom; tubeIndex <= tubeTo; tubeIndex++ {
		if err = self.tubeFlush(tubeIndex); err != nil {
			return
		}
	}

	// Drop the tube state now that all tubes have been flushed.
	self.tubes = nil

	return self.morass.Finalise()
}
Beispiel #13
0
// Align aligns query against reference by dynamic programming and returns a
// two-row alignment holding the gapped reference followed by the gapped
// query, or an error if the scoring matrix is not square.
//
// NOTE(review): this method is described as Smith-Waterman, but the table
// fill has no zero floor and the traceback starts at the bottom-right corner
// and runs to the table edge, so the rows returned span both sequences end to
// end rather than a local region.
//
// The last row and column of the scoring matrix hold the gap scores. Letters
// are translated to matrix indices through self.LookUp.ValueToCode; a letter
// with a negative code is not scored. Cells involving such a letter are left
// at zero during the fill and are stepped through diagonally during the
// traceback, pairing the two letters.
func (self *Aligner) Align(reference, query *seq.Seq) (aln seq.Alignment, err error) {
	gap := len(self.Matrix) - 1
	for _, row := range self.Matrix {
		if len(row) != gap+1 {
			return nil, bio.NewError("Scoring matrix is not square.", 0, self.Matrix)
		}
	}
	r, c := reference.Len()+1, query.Len()+1
	table := make([][]int, r)
	for i := range table {
		table[i] = make([]int, c)
	}

	var scores [3]int

	// Fill: "up" consumes a reference letter against a gap, "left" consumes a
	// query letter against a gap.
	for i := 1; i < r; i++ {
		for j := 1; j < c; j++ {
			rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]
			if rVal < 0 || qVal < 0 {
				continue
			}
			scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal]
			scores[up] = table[i-1][j] + self.Matrix[rVal][gap]
			scores[left] = table[i][j-1] + self.Matrix[gap][qVal]
			table[i][j] = util.Max(scores[:]...)
		}
	}

	refAln := &seq.Seq{ID: reference.ID, Seq: make([]byte, 0, reference.Len())}
	queryAln := &seq.Seq{ID: query.ID, Seq: make([]byte, 0, query.Len())}

	// Traceback from the bottom-right corner, building both rows in reverse.
	i, j := r-1, c-1
	for i > 0 && j > 0 {
		rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]
		if rVal < 0 || qVal < 0 {
			// The fill left this cell unscored, so there is no direction to
			// recover. Pair the two letters and move on; without a step here
			// the loop would never terminate.
			i--
			j--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
			continue
		}

		// Recompute the candidates exactly as the fill did so that the
		// direction chosen is the one that produced table[i][j].
		scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal]
		scores[up] = table[i-1][j] + self.Matrix[rVal][gap]
		scores[left] = table[i][j-1] + self.Matrix[gap][qVal]
		switch d := maxIndex(scores[:]); d {
		case diag:
			i--
			j--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
		case up:
			i--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, self.GapChar)
		case left:
			j--
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
			refAln.Seq = append(refAln.Seq, self.GapChar)
		}
	}

	// Whatever remains of either sequence is aligned against gaps.
	for ; i > 0; i-- {
		refAln.Seq = append(refAln.Seq, reference.Seq[i-1])
		queryAln.Seq = append(queryAln.Seq, self.GapChar)
	}
	for ; j > 0; j-- {
		refAln.Seq = append(refAln.Seq, self.GapChar)
		queryAln.Seq = append(queryAln.Seq, query.Seq[j-1])
	}

	// The rows were built back to front; reverse them in place.
	for i, j := 0, len(refAln.Seq)-1; i < j; i, j = i+1, j-1 {
		refAln.Seq[i], refAln.Seq[j] = refAln.Seq[j], refAln.Seq[i]
	}
	for i, j := 0, len(queryAln.Seq)-1; i < j; i, j = i+1, j-1 {
		queryAln.Seq[i], queryAln.Seq[j] = queryAln.Seq[j], queryAln.Seq[i]
	}

	aln = seq.Alignment{refAln, queryAln}

	return aln, nil
}