Пример #1
0
// NewMerger builds a Merger from the provided kmerindex, query sequence and filter parameters.
// When selfCompare is true only the upper diagonal of the comparison matrix is checked, which saves time.
func NewMerger(index *kmerindex.Index, query *seq.Seq, filterParams *Params, selfCompare bool) (m *Merger) {
	// A bin is one position narrower than a tube (TubeOffset + MaxError).
	bin := filterParams.TubeOffset + filterParams.MaxError - 1
	pad := diagonalPadding + bin

	// One past the end of the query; used for three of the sentinel's edges.
	past := query.Len() + 1

	// This trapezoid is installed as both eoTerm and the initial trapOrder.
	sentinel := &Trapezoid{
		Left:   past + pad,
		Right:  past,
		Bottom: -1,
		Top:    past,
	}

	return &Merger{
		target:         index.Seq,
		filterParams:   filterParams,
		query:          query,
		selfComparison: selfCompare,
		bottomPadding:  index.GetK() + 2,
		leftPadding:    pad,
		binWidth:       bin,
		eoTerm:         sentinel,
		trapOrder:      sentinel,
	}
}
Пример #2
0
// Filter runs a query sequence against the stored index. If the query and the target are the
// same sequence, selfAlign can be set to avoid double searching - behavior is undefined if the
// sequences are not the same.
// A morass is used to store and sort individual filter hits.
// An error is returned if TubeOffset is smaller than MaxError, or if any of the k-mer
// iteration, tube end/flush steps or the final morass.Finalise call fails.
func (self *Filter) Filter(query *seq.Seq, selfAlign, complement bool, morass *morass.Morass) (err error) {
	self.selfAlign, self.complement, self.morass = selfAlign, complement, morass
	self.k = self.index.GetK()

	// Ukkonen's Lemma
	self.minKmersPerHit = MinWordsPerFilterHit(self.minMatch, self.k, self.maxError)

	// Upper bound on the distance between the SeqQ positions of two k-mers
	// belonging to one match. A tighter bound may exist, but the only cost of
	// being loose is that two adjacent matches may get merged.
	self.maxKmerDist = self.minMatch - self.k

	if self.tubeOffset < self.maxError {
		return bio.NewError("TubeOffset < MaxError", 0, []int{self.tubeOffset, self.maxError})
	}

	width := self.tubeOffset + self.maxError

	self.tubes = make([]TubeState, (self.target.Len()+width-1)/self.tubeOffset+1)

	// countdown tracks cycling of the circular list of active tubes: it starts
	// at one tube width and is re-armed with tubeOffset each time it expires.
	countdown := width

	visit := func(index *kmerindex.Index, position, kmer int) {
		// Indexed positions for this k-mer lie between the previous k-mer's
		// finger (0 for the first k-mer) and this k-mer's finger.
		var lo int
		if kmer > 0 {
			lo = index.FingerAt(kmer - 1)
		}
		for i, hi := lo, index.FingerAt(kmer); i < hi; i++ {
			self.commonKmer(index.PosAt(i), position)
		}

		countdown--
		if countdown != 0 {
			return
		}
		if e := self.tubeEnd(position); e != nil {
			// Per the original author's note, this panic is caught by
			// ForEachKmerOf and handed back as its returned error.
			panic(e)
		}
		countdown = self.tubeOffset
	}

	if err = self.index.ForEachKmerOf(query, 0, query.Len(), visit); err != nil {
		return err
	}

	if err = self.tubeEnd(query.Len() - 1); err != nil {
		return err
	}

	// Flush every tube spanned by the diagonals of the last query position,
	// widened by one tube width on each side.
	loDiag := self.diagIndex(self.target.Len()-1, query.Len()-1) - width
	hiDiag := self.diagIndex(0, query.Len()-1) + width

	firstTube := self.tubeIndex(loDiag)
	if firstTube < 0 {
		firstTube = 0
	}
	lastTube := self.tubeIndex(hiDiag)

	for t := firstTube; t <= lastTube; t++ {
		if err = self.tubeFlush(t); err != nil {
			return err
		}
	}

	// Release the tube state before finalising the morass.
	self.tubes = nil

	return self.morass.Finalise()
}
Пример #3
0
// Align aligns reference and query by dynamic programming over the Aligner's scoring Matrix
// and returns the two gapped sequences as a seq.Alignment (reference first, query second).
//
// The last row and column of Matrix hold the gap scores: Matrix[x][gap] scores a reference
// residue against a gap, and Matrix[gap][y] scores a gap against a query residue. The first
// row and column of the score table are left at zero and the traceback always starts at the
// bottom-right cell.
//
// NOTE(review): the original comment described this as Smith-Waterman. The recurrence has no
// zero floor and the traceback runs from the final cell, which looks like a global
// (Needleman-Wunsch style) alignment - confirm the intended name.
//
// Residues whose LookUp.ValueToCode entry is negative cannot be scored: their cells are left
// at zero during the fill, and during the traceback the two residues are simply paired with
// each other (a diagonal step).
//
// An error is returned if the scoring matrix is not square.
func (self *Aligner) Align(reference, query *seq.Seq) (aln seq.Alignment, err error) {
	gap := len(self.Matrix) - 1
	for _, row := range self.Matrix {
		if len(row) != gap+1 {
			return nil, bio.NewError("Scoring matrix is not square.", 0, self.Matrix)
		}
	}

	r, c := reference.Len()+1, query.Len()+1
	table := make([][]int, r)
	for i := range table {
		table[i] = make([]int, c)
	}

	var scores [3]int

	// Fill the score table.
	for i := 1; i < r; i++ {
		for j := 1; j < c; j++ {
			rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]
			if rVal < 0 || qVal < 0 {
				continue
			}
			scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal]
			scores[up] = table[i-1][j] + self.Matrix[rVal][gap]
			scores[left] = table[i][j-1] + self.Matrix[gap][qVal]
			table[i][j] = util.Max(scores[:]...)
		}
	}

	refAln := &seq.Seq{ID: reference.ID, Seq: make([]byte, 0, reference.Len())}
	queryAln := &seq.Seq{ID: query.ID, Seq: make([]byte, 0, query.Len())}

	// Trace back from the bottom-right cell; the aligned sequences are built
	// in reverse and flipped afterwards.
	i, j := r-1, c-1
	for i > 0 && j > 0 {
		rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]
		if rVal < 0 || qVal < 0 {
			// This cell was never scored. Previously the loop continued here
			// without moving and so never terminated; step diagonally so that
			// both residues are emitted and the traceback always progresses.
			i--
			j--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
			continue
		}

		// Recompute the candidate scores exactly as in the fill step, so the
		// chosen move is one that could have produced table[i][j].
		scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal]
		scores[up] = table[i-1][j] + self.Matrix[rVal][gap]
		scores[left] = table[i][j-1] + self.Matrix[gap][qVal]
		switch d := maxIndex(scores[:]); d {
		case diag:
			i--
			j--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
		case up:
			i--
			refAln.Seq = append(refAln.Seq, reference.Seq[i])
			queryAln.Seq = append(queryAln.Seq, self.GapChar)
		case left:
			j--
			queryAln.Seq = append(queryAln.Seq, query.Seq[j])
			refAln.Seq = append(refAln.Seq, self.GapChar)
		}
	}

	// Whatever remains of either sequence is aligned against gaps.
	for ; i > 0; i-- {
		refAln.Seq = append(refAln.Seq, reference.Seq[i-1])
		queryAln.Seq = append(queryAln.Seq, self.GapChar)
	}
	for ; j > 0; j-- {
		refAln.Seq = append(refAln.Seq, self.GapChar)
		queryAln.Seq = append(queryAln.Seq, query.Seq[j-1])
	}

	// Reverse both sequences into left-to-right order.
	for i, j := 0, len(refAln.Seq)-1; i < j; i, j = i+1, j-1 {
		refAln.Seq[i], refAln.Seq[j] = refAln.Seq[j], refAln.Seq[i]
	}
	for i, j := 0, len(queryAln.Seq)-1; i < j; i, j = i+1, j-1 {
		queryAln.Seq[i], queryAln.Seq[j] = queryAln.Seq[j], queryAln.Seq[i]
	}

	return seq.Alignment{refAln, queryAln}, nil
}
Пример #4
0
// main reads FASTA records from -in (stdin by default) and writes fragments of them to -out
// (stdout by default):
//
//   - records of length 20..85 are written once with their first 5 residues removed;
//   - records longer than 85 are cut into consecutive, non-overlapping pieces of -size
//     residues, and a trailing piece shorter than -size is dropped;
//   - records shorter than 20 are skipped.
//
// Every fragment is written under the ID of the record it came from. With -cpuprofile a CPU
// profile is written to the named file. Any setup failure is reported on stderr and the
// program exits with status 1; -help prints usage and exits with status 0.
func main() {
	var (
		in      *fasta.Reader
		out     *fasta.Writer
		e       error
		profile *os.File
	)

	inName := flag.String("in", "", "Filename for input. Defaults to stdin.")
	outName := flag.String("out", "", "Filename for output. Defaults to stdout.")
	size := flag.Int("size", 40, "Fragment size.")
	width := flag.Int("width", 60, "Fasta output width.")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.")
	help := flag.Bool("help", false, "Print this usage message.")

	flag.Parse()

	if *help {
		flag.Usage()
		os.Exit(0)
	}

	// fail reports err and terminates with a non-zero status. os.Exit does not
	// run deferred calls, so any running CPU profile is stopped explicitly
	// first (StopCPUProfile is a no-op when no profile is active).
	fail := func(err error) {
		fmt.Fprintf(os.Stderr, "Error: %v.\n", err)
		pprof.StopCPUProfile()
		os.Exit(1)
	}

	// A zero size would never advance the fragment loop below and a negative
	// one would produce an invalid slice, so reject both up front.
	if *size <= 0 {
		fail(fmt.Errorf("size must be positive, got %d", *size))
	}

	if *cpuprofile != "" {
		if profile, e = os.Create(*cpuprofile); e != nil {
			fail(e)
		}
		defer profile.Close()
		fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile)
		if e = pprof.StartCPUProfile(profile); e != nil {
			fail(e)
		}
		// Deferred after profile.Close, so it runs before the file is closed.
		defer pprof.StopCPUProfile()
	}

	// Stop on open failures: continuing would use (and deferred-close) a
	// reader or writer that was never successfully created.
	if *inName == "" {
		in = fasta.NewReader(os.Stdin)
	} else if in, e = fasta.NewReaderName(*inName); e != nil {
		fail(e)
	}
	defer in.Close()

	if *outName == "" {
		out = fasta.NewWriter(os.Stdout, *width)
	} else if out, e = fasta.NewWriterName(*outName, *width); e != nil {
		fail(e)
	}
	defer out.Close()

	var (
		sequence *seq.Seq
		err      error
	)

	// t is a scratch record reused for every fragment written.
	t := &seq.Seq{}

	for {
		// NOTE(review): any read error ends the loop silently, not only end of
		// input - confirm what in.Read returns at EOF before reporting others.
		if sequence, err = in.Read(); err != nil {
			break
		}
		length := sequence.Len()
		t.ID = sequence.ID
		switch {
		case length >= 20 && length <= 85:
			// Short records: emit once, minus the first 5 residues.
			t.Seq = sequence.Seq[5:]
			// NOTE(review): the result of out.Write is not checked here.
			out.Write(t)
		case length > 85:
			// Long records: emit each full-size window; a shorter tail is dropped.
			for start := 0; start+*size <= length; start += *size {
				t.Seq = sequence.Seq[start : start+*size]
				out.Write(t)
			}
		}
	}
}