// Create a new Merger using the provided kmerindex, query sequence and filter parameters. // If selfCompare is true only the upper diagonal of the comparison matrix is checked to save time. func NewMerger(index *kmerindex.Index, query *seq.Seq, filterParams *Params, selfCompare bool) (m *Merger) { tubeWidth := filterParams.TubeOffset + filterParams.MaxError binWidth := tubeWidth - 1 leftPadding := diagonalPadding + binWidth eoTerm := &Trapezoid{ Left: query.Len() + 1 + leftPadding, Right: query.Len() + 1, Bottom: -1, Top: query.Len() + 1, Next: nil, } m = &Merger{ target: index.Seq, filterParams: filterParams, query: query, selfComparison: selfCompare, bottomPadding: index.GetK() + 2, leftPadding: leftPadding, binWidth: binWidth, eoTerm: eoTerm, trapOrder: eoTerm, } return m }
// Filter a query sequence against the stored index. If query and the target are the same sequence, // selfAlign can be used to avoid double seaching - behavior is undefined if the the sequences are not the same. // A morass is used to store and sort individual filter hits. func (self *Filter) Filter(query *seq.Seq, selfAlign, complement bool, morass *morass.Morass) (err error) { self.selfAlign = selfAlign self.complement = complement self.morass = morass self.k = self.index.GetK() // Ukonnen's Lemma self.minKmersPerHit = MinWordsPerFilterHit(self.minMatch, self.k, self.maxError) // Maximum distance between SeqQ positions of two k-mers in a match // (More stringent bounds may be possible, but not a big problem // if two adjacent matches get merged). self.maxKmerDist = self.minMatch - self.k tubeWidth := self.tubeOffset + self.maxError if self.tubeOffset < self.maxError { return bio.NewError("TubeOffset < MaxError", 0, []int{self.tubeOffset, self.maxError}) } maxActiveTubes := (self.target.Len()+tubeWidth-1)/self.tubeOffset + 1 self.tubes = make([]TubeState, maxActiveTubes) // Ticker tracks cycling of circular list of active tubes. ticker := tubeWidth f := func(index *kmerindex.Index, position, kmer int) { from := 0 if kmer > 0 { from = index.FingerAt(kmer - 1) } to := index.FingerAt(kmer) for i := from; i < to; i++ { self.commonKmer(index.PosAt(i), position) } if ticker--; ticker == 0 { if e := self.tubeEnd(position); e != nil { panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned } ticker = self.tubeOffset } } if err = self.index.ForEachKmerOf(query, 0, query.Len(), f); err != nil { return } if err = self.tubeEnd(query.Len() - 1); err != nil { return } diagFrom := self.diagIndex(self.target.Len()-1, query.Len()-1) - tubeWidth diagTo := self.diagIndex(0, query.Len()-1) + tubeWidth tubeFrom := self.tubeIndex(diagFrom) if tubeFrom < 0 { tubeFrom = 0 } tubeTo := self.tubeIndex(diagTo) for tubeIndex := tubeFrom; tubeIndex <= tubeTo; tubeIndex++ { if err = self.tubeFlush(tubeIndex); err != nil { return } } self.tubes = nil return self.morass.Finalise() }
// Method to align two sequences using the Smith-Waterman algorithm. Returns an alignment or an error // if the scoring matrix is not square. func (self *Aligner) Align(reference, query *seq.Seq) (aln seq.Alignment, err error) { gap := len(self.Matrix) - 1 for _, row := range self.Matrix { if len(row) != gap+1 { return nil, bio.NewError("Scoring matrix is not square.", 0, self.Matrix) } } r, c := reference.Len()+1, query.Len()+1 table := make([][]int, r) for i := range table { table[i] = make([]int, c) } var scores [3]int for i := 1; i < r; i++ { for j := 1; j < c; j++ { if rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]; rVal < 0 || qVal < 0 { continue } else { scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal] scores[up] = table[i-1][j] + self.Matrix[rVal][gap] scores[left] = table[i][j-1] + self.Matrix[gap][qVal] table[i][j] = util.Max(scores[:]...) } } } refAln := &seq.Seq{ID: reference.ID, Seq: make([]byte, 0, reference.Len())} queryAln := &seq.Seq{ID: query.ID, Seq: make([]byte, 0, query.Len())} i, j := r-1, c-1 for i > 0 && j > 0 { if rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]; rVal < 0 || qVal < 0 { continue } else { scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal] scores[up] = table[i-1][j] + self.Matrix[gap][qVal] scores[left] = table[i][j-1] + self.Matrix[rVal][gap] switch d := maxIndex(scores[:]); d { case diag: i-- j-- refAln.Seq = append(refAln.Seq, reference.Seq[i]) queryAln.Seq = append(queryAln.Seq, query.Seq[j]) case up: i-- refAln.Seq = append(refAln.Seq, reference.Seq[i]) queryAln.Seq = append(queryAln.Seq, self.GapChar) case left: j-- queryAln.Seq = append(queryAln.Seq, query.Seq[j]) refAln.Seq = append(refAln.Seq, self.GapChar) } } } for ; i > 0; i-- { refAln.Seq = append(refAln.Seq, reference.Seq[i-1]) queryAln.Seq = append(queryAln.Seq, self.GapChar) } for ; j > 0; j-- { refAln.Seq = append(refAln.Seq, self.GapChar) queryAln.Seq = append(queryAln.Seq, query.Seq[j-1]) } for i, j := 0, len(refAln.Seq)-1; i < j; i, j = i+1, j-1 { refAln.Seq[i], refAln.Seq[j] = refAln.Seq[j], refAln.Seq[i] } for i, j := 0, len(queryAln.Seq)-1; i < j; i, j = i+1, j-1 { queryAln.Seq[i], queryAln.Seq[j] = queryAln.Seq[j], queryAln.Seq[i] } aln = seq.Alignment{refAln, queryAln} return }
func main() { var ( in *fasta.Reader out *fasta.Writer e error profile *os.File ) inName := flag.String("in", "", "Filename for input. Defaults to stdin.") outName := flag.String("out", "", "Filename for output. Defaults to stdout.") size := flag.Int("size", 40, "Fragment size.") width := flag.Int("width", 60, "Fasta output width.") cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.") help := flag.Bool("help", false, "Print this usage message.") flag.Parse() if *help { flag.Usage() os.Exit(1) } if *cpuprofile != "" { if profile, e = os.Create(*cpuprofile); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) os.Exit(0) } fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile) pprof.StartCPUProfile(profile) defer pprof.StopCPUProfile() } if *inName == "" { in = fasta.NewReader(os.Stdin) } else if in, e = fasta.NewReaderName(*inName); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) } defer in.Close() if *outName == "" { out = fasta.NewWriter(os.Stdout, *width) } else if out, e = fasta.NewWriterName(*outName, *width); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) } defer out.Close() var ( sequence *seq.Seq err error ) t := &seq.Seq{} for { if sequence, err = in.Read(); err != nil { break } length := sequence.Len() t.ID = sequence.ID switch { case length >= 20 && length <= 85: t.Seq = sequence.Seq[5:] out.Write(t) case length > 85: for start := 0; start+*size <= length; start += *size { t.Seq = sequence.Seq[start : start+*size] out.Write(t) } } } }