Esempio n. 1
0
File: merge.go Progetto: frogs/biogo
// NewMerger builds a Merger that works on the sequence held by index (the
// target) and on query, using filterParams and a maximum inter-segment gap
// length of maxIGap.
// If selfCompare is true only the upper diagonal of the comparison matrix is examined.
func NewMerger(index *kmerindex.Index, query *seq.Seq, filterParams *Params, maxIGap int, selfCompare bool) *Merger {
	// The bin width is one less than the tube width (TubeOffset + MaxError).
	binWidth := filterParams.TubeOffset + filterParams.MaxError - 1
	leftPadding := diagonalPadding + binWidth

	// The terminating trapezoid sits one position past the end of the query.
	// It is stored both as eoTerm and as the initial head of trapOrder.
	pastEnd := query.Len() + 1
	terminator := &Trapezoid{
		Left:   pastEnd + leftPadding,
		Right:  pastEnd,
		Bottom: -1,
		Top:    pastEnd,
	}

	m := &Merger{
		target:         index.Seq,
		query:          query,
		filterParams:   filterParams,
		maxIGap:        maxIGap,
		selfComparison: selfCompare,
		binWidth:       binWidth,
		leftPadding:    leftPadding,
		bottomPadding:  index.GetK() + 2,
		eoTerm:         terminator,
		trapOrder:      terminator,
	}
	return m
}
Esempio n. 2
0
File: fastq.go Progetto: frogs/biogo
// Write emits a single sequence, together with its quality scores, and
// returns the number of bytes written and any error.
//
// An error is returned without writing anything when s carries no quality
// data or when the sequence and quality lengths differ. If a write to the
// underlying writer fails, the bytes written so far are reported alongside
// that error.
func (self *Writer) Write(s *seq.Seq) (n int, err error) {
	if s.Quality == nil {
		return 0, bio.NewError("No quality associated with sequence", 0, s)
	}
	if s.Len() != s.Quality.Len() {
		return 0, bio.NewError("Sequence length and quality length do not match", 0, s)
	}

	// Fill the per-record slots of the output template.
	self.template[1] = []byte(s.ID)
	self.template[3] = s.Seq

	// The "+" separator line repeats the ID only when QID is set.
	separator := []byte("\n+")
	if self.QID {
		separator = append(separator, s.ID...)
	}
	self.template[4] = append(separator, '\n')

	self.template[5] = self.encodeQuality(s.Quality.Qual)

	// Emit the template pieces in order, stopping at the first failed write.
	for i := range self.template {
		written, werr := self.w.Write(self.template[i])
		n += written
		if werr != nil {
			return n, werr
		}
	}

	return n, nil
}
Esempio n. 3
0
File: pair.go Progetto: frogs/biogo
// featureOf converts the coordinates from..to in the packed sequence contigs
// into a feat.Feature expressed relative to the contig that contains them.
//
// When comp is true the coordinates are first mirrored about the length of the
// packed sequence. The owning contig is located through the bin that holds the
// midpoint of the segment, and the resulting Start and End are clamped to that
// contig's extent.
//
// An error is returned when the segment is empty or inverted, when the bin or
// contig index falls outside the known range, or when contigs does not carry
// the seqMap metadata of a packed sequence.
func featureOf(contigs *seq.Seq, from, to int, comp bool) (feature *feat.Feature, err error) {
	total := contigs.Len()

	if comp {
		from, to = total-to, total-from
	}
	if from >= to {
		return nil, bio.NewError(fmt.Sprintf("%s: from >= to", contigs.ID), 0, nil)
	}

	// DPHit coordinates sometimes over/underflow.
	// This is a lazy hack to work around it, should really figure
	// out what is going on.
	if from < 0 {
		from = 0
	}
	if to > total {
		to = total
	}

	// Take midpoint of segment -- lazy hack again, endpoints
	// sometimes under / overflow
	bin := (from + to) / (2 * binSize)
	binCount := (total + binSize - 1) / binSize

	if bin < 0 || bin >= binCount {
		return nil, bio.NewError(fmt.Sprintf("%s: bin %d out of range 0..%d", contigs.ID, bin, binCount-1), 0, nil)
	}

	// Assert the metadata once, and report a mismatch as an error instead of
	// panicking inside a function that already has an error return.
	packed, ok := contigs.Meta.(seqMap)
	if !ok {
		return nil, bio.NewError(fmt.Sprintf("%s: sequence has no packed sequence metadata", contigs.ID), 0, nil)
	}

	contigIndex := packed.binMap[bin]

	if contigIndex < 0 || contigIndex >= len(packed.contigs) {
		return nil, bio.NewError(fmt.Sprintf("%s: contig index %d out of range 0..%d", contigs.ID, contigIndex, len(packed.contigs)-1), 0, nil)
	}

	// Clamping above can leave from beyond to when from itself lies past the
	// end of the packed sequence.
	length := to - from

	if length < 0 {
		return nil, bio.NewError(fmt.Sprintf("%s: length < 0", contigs.ID), 0, nil)
	}

	// Translate into the contig's own coordinate system and clamp to it.
	contig := packed.contigs[contigIndex]
	contigFrom := from - contig.from
	contigTo := contigFrom + length

	if contigFrom < 0 {
		contigFrom = 0
	}

	if contigTo > contig.seq.Len() {
		contigTo = contig.seq.Len()
	}

	return &feat.Feature{
		ID:    contig.seq.ID,
		Start: contigFrom,
		End:   contigTo,
	}, nil
}
Esempio n. 4
0
File: fasta.go Progetto: frogs/biogo
// Write writes a single sequence and returns the number of bytes written and
// any error.
//
// The ID line is IDPrefix followed by the sequence ID. The sequence data is
// then written in lines of at most Width bytes, each preceded by SeqPrefix and
// terminated by a newline. A sequence whose length is an exact multiple of
// Width (including an empty sequence) is not followed by an extra empty line.
// A non-positive Width disables wrapping: the whole sequence is written on a
// single line.
//
// Writing stops at the first error; n then reports the bytes written so far.
func (self *Writer) Write(s *seq.Seq) (n int, err error) {
	n, err = self.w.WriteString(string(self.IDPrefix) + s.ID + "\n")
	if err != nil {
		return n, err
	}

	seqLen := s.Len()

	// A zero width would never advance the loop and a negative one would
	// produce invalid slice bounds, so fall back to one unwrapped line.
	width := self.Width
	if width <= 0 {
		width = seqLen
	}

	// Use a strict bound so that no empty line is emitted once the data is
	// exhausted.
	for start := 0; start < seqLen; start += width {
		end := util.Min(start+width, seqLen)
		for _, elem := range [][]byte{self.SeqPrefix, s.Seq[start:end], {'\n'}} {
			written, werr := self.w.Write(elem)
			n += written
			if werr != nil {
				return n, werr
			}
		}
	}

	return n, nil
}
Esempio n. 5
0
// Pack adds sequence to the Packed sequence's contig and bin bookkeeping.
// It returns a string giving diagnostic information: the (truncated) sequence
// ID, the sequence length and the range of bins assigned to the sequence.
func (pa *Packer) Pack(sequence *seq.Seq) string {
	meta := pa.Packed.Meta.(seqMap)
	seqLen := sequence.Len()

	// Pad up to the next bin boundary, adding a further whole bin when that
	// padding would be shorter than minPadding.
	pad := binSize - seqLen%binSize
	if pad < minPadding {
		pad += binSize
	}

	// The padding owed to the previously packed sequence is added before this
	// sequence's start offset is recorded.
	pa.length += pa.lastPad
	start := pa.length
	pa.length += seqLen
	pa.lastPad = pad

	// Every bin covered by the sequence and its padding maps to the index the
	// new contig will occupy.
	contigIndex := len(meta.contigs)
	firstBin := len(meta.binMap)
	bins := make([]int, (pad+seqLen)/binSize)
	for b := range bins {
		bins[b] = contigIndex
	}

	meta.binMap = append(meta.binMap, bins...)
	meta.contigs = append(meta.contigs, contig{seq: sequence, from: start})
	pa.Packed.Meta = meta

	lastBin := len(meta.binMap) - 1
	label := sequence.ID[:util.Min(20, len(sequence.ID))]

	return fmt.Sprintf("%20s\t%10d\t%7d-%-d", label, seqLen, firstBin, lastBin)
}
Esempio n. 6
0
// New creates a Kmer Index with word size k over sequence and builds its kmer
// table.
//
// An error is returned when k lies outside MinKmerLen..MaxKmerLen or when
// sequence is shorter than k+1.
func New(k int, sequence *seq.Seq) (i *Index, err error) {
	if k > MaxKmerLen {
		return nil, bio.NewError("k greater than MaxKmerLen", 0, k, MaxKmerLen)
	}
	if k < MinKmerLen {
		return nil, bio.NewError("k less than MinKmerLen", 0, k, MinKmerLen)
	}
	if sequence.Len() < k+1 {
		return nil, bio.NewError("sequence shorter than k+1-mer length", 0, k+1, sequence.Len())
	}

	kmers := util.Pow4(k)

	i = &Index{
		Seq:     sequence,
		k:       k,
		kMask:   Kmer(kmers - 1),
		finger:  make([]Kmer, kmers+1), // Need a Tn+1 finger position so that Tn can be recognised
		indexed: false,
	}
	i.buildKmerTable()

	return i, nil
}
Esempio n. 7
0
File: sw.go Progetto: frogs/biogo
// Align aligns two sequences using the Smith-Waterman algorithm. It returns an
// alignment, or an error if the scoring matrix is not square.
//
// The last row/column of the scoring matrix holds the gap scores. Positions
// whose letters map to a negative code in LookUp.ValueToCode are skipped while
// filling the table, which leaves their cells at zero. When several cells share
// the maximum score, the one farthest down and to the right is used as the
// start of the traceback.
//
// The returned alignment holds the aligned reference first and the aligned
// query second, with GapChar marking gaps.
func (a *Aligner) Align(reference, query *seq.Seq) (aln seq.Alignment, err error) {
	gap := len(a.Matrix) - 1
	for _, row := range a.Matrix {
		if len(row) != gap+1 {
			return nil, bio.NewError("Scoring matrix is not square.", 0, a.Matrix)
		}
	}

	r, c := reference.Len()+1, query.Len()+1
	table := make([][]int, r)
	for i := range table {
		table[i] = make([]int, c)
	}

	best, maxI, maxJ := 0, 0, 0
	var scores [3]int

	// Fill phase.
	for i := 1; i < r; i++ {
		for j := 1; j < c; j++ {
			rVal, qVal := a.LookUp.ValueToCode[reference.Seq[i-1]], a.LookUp.ValueToCode[query.Seq[j-1]]
			if rVal < 0 || qVal < 0 {
				continue
			}

			scores[diag] = table[i-1][j-1] + a.Matrix[rVal][qVal]
			scores[up] = table[i-1][j] + a.Matrix[rVal][gap]
			scores[left] = table[i][j-1] + a.Matrix[gap][qVal]

			score := util.Max(scores[:]...)
			if score < 0 {
				score = 0
			}
			if score >= best { // greedy so make farthest down and right
				best, maxI, maxJ = score, i, j
			}
			table[i][j] = score
		}
	}

	refAln := &seq.Seq{ID: reference.ID, Seq: make([]byte, 0, reference.Len())}
	queryAln := &seq.Seq{ID: query.ID, Seq: make([]byte, 0, query.Len())}

	// Traceback phase. The bounds are tested before the table is read.
	for i, j := maxI, maxJ; i > 0 && j > 0 && table[i][j] != 0; {
		rVal, qVal := a.LookUp.ValueToCode[reference.Seq[i-1]], a.LookUp.ValueToCode[query.Seq[j-1]]
		if rVal < 0 || qVal < 0 {
			// Cells for unscorable letters are never filled, so a non-zero
			// cell should not reach here; stop instead of looping forever
			// without advancing i or j.
			break
		}

		// The gap scores must be looked up exactly as in the fill phase,
		// otherwise an asymmetric matrix sends the traceback down a path
		// that was not the one scored.
		scores[diag] = table[i-1][j-1] + a.Matrix[rVal][qVal]
		scores[up] = table[i-1][j] + a.Matrix[rVal][gap]
		scores[left] = table[i][j-1] + a.Matrix[gap][qVal]

		switch maxIndex(scores[:]) {
		case diag:
			i--
			j--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
		case up:
			i--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, a.GapChar)
		case left:
			j--
			refAln.Seq = append(refAln.Seq, a.GapChar)
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
		}
	}

	// The traceback collected both rows end-first; reverse them in place.
	for i, j := 0, len(refAln.Seq)-1; i < j; i, j = i+1, j-1 {
		refAln.Seq[i], refAln.Seq[j] = refAln.Seq[j], refAln.Seq[i]
	}
	for i, j := 0, len(queryAln.Seq)-1; i < j; i, j = i+1, j-1 {
		queryAln.Seq[i], queryAln.Seq[j] = queryAln.Seq[j], queryAln.Seq[i]
	}

	return seq.Alignment{refAln, queryAln}, nil
}
Esempio n. 8
0
// Filter filters a query sequence against the stored index. If query and the
// target are the same sequence, selfAlign can be used to avoid double
// searching - behavior is undefined if the sequences are not the same.
// A morass is used to store and sort individual filter hits.
func (f *Filter) Filter(query *seq.Seq, selfAlign, complement bool, morass *morass.Morass) error {
	f.selfAlign, f.complement, f.morass = selfAlign, complement, morass
	f.k = f.index.GetK()

	// Ukkonen's Lemma
	f.minKmersPerHit = MinWordsPerFilterHit(f.minMatch, f.k, f.maxError)

	// Maximum distance between SeqQ positions of two k-mers in a match
	// (More stringent bounds may be possible, but not a big problem
	// if two adjacent matches get merged).
	f.maxKmerDist = f.minMatch - f.k

	if f.tubeOffset < f.maxError {
		return bio.NewError("TubeOffset < MaxError", 0, []int{f.tubeOffset, f.maxError})
	}
	tubeWidth := f.tubeOffset + f.maxError

	f.tubes = make([]tubeState, (f.target.Len()+tubeWidth-1)/f.tubeOffset+1)

	// countdown tracks cycling of the circular list of active tubes: it
	// starts at tubeWidth and is reset to tubeOffset each time it expires.
	countdown := tubeWidth

	// visit records every indexed occurrence of kmer against the query
	// position and closes a tube whenever the countdown expires.
	visit := func(idx *kmerindex.Index, pos, kmer int) {
		lo := 0
		if kmer > 0 {
			lo = idx.FingerAt(kmer - 1)
		}
		for p, hi := lo, idx.FingerAt(kmer); p < hi; p++ {
			f.commonKmer(idx.PosAt(p), pos)
		}

		countdown--
		if countdown != 0 {
			return
		}
		if e := f.tubeEnd(pos); e != nil {
			panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned
		}
		countdown = f.tubeOffset
	}

	var err error
	if err = f.index.ForEachKmerOf(query, 0, query.Len(), visit); err != nil {
		return err
	}

	lastPos := query.Len() - 1
	if err = f.tubeEnd(lastPos); err != nil {
		return err
	}

	// Flush every tube between the two extreme diagonals, widened by
	// tubeWidth on each side.
	lowDiag := f.diagIndex(f.target.Len()-1, lastPos) - tubeWidth
	highDiag := f.diagIndex(0, lastPos) + tubeWidth

	firstTube := f.tubeIndex(lowDiag)
	if firstTube < 0 {
		firstTube = 0
	}
	lastTube := f.tubeIndex(highDiag)

	for t := firstTube; t <= lastTube; t++ {
		if err = f.tubeFlush(t); err != nil {
			return err
		}
	}

	f.tubes = nil

	return f.morass.Finalise()
}