// Create a new Merger using the provided kmerindex, query sequence, filter parameters and maximum inter-segment gap length. // If selfCompare is true only the upper diagonal of the comparison matrix is examined. func NewMerger(index *kmerindex.Index, query *seq.Seq, filterParams *Params, maxIGap int, selfCompare bool) *Merger { tubeWidth := filterParams.TubeOffset + filterParams.MaxError binWidth := tubeWidth - 1 leftPadding := diagonalPadding + binWidth eoTerm := &Trapezoid{ Left: query.Len() + 1 + leftPadding, Right: query.Len() + 1, Bottom: -1, Top: query.Len() + 1, Next: nil, } return &Merger{ target: index.Seq, filterParams: filterParams, maxIGap: maxIGap, query: query, selfComparison: selfCompare, bottomPadding: index.GetK() + 2, leftPadding: leftPadding, binWidth: binWidth, eoTerm: eoTerm, trapOrder: eoTerm, } }
// Write a single sequence and return the number of bytes written and any error. func (self *Writer) Write(s *seq.Seq) (n int, err error) { if s.Quality == nil { return 0, bio.NewError("No quality associated with sequence", 0, s) } if s.Len() == s.Quality.Len() { self.template[1] = []byte(s.ID) self.template[3] = s.Seq if self.QID { self.template[4] = append(append([]byte("\n+"), []byte(s.ID)...), '\n') } else { self.template[4] = []byte("\n+\n") } self.template[5] = self.encodeQuality(s.Quality.Qual) var tn int for _, t := range self.template { tn, err = self.w.Write(t) n += tn if err != nil { return } } } else { return 0, bio.NewError("Sequence length and quality length do not match", 0, s) } return }
// Convert coordinates in a packed sequence into a feat.Feature. func featureOf(contigs *seq.Seq, from, to int, comp bool) (feature *feat.Feature, err error) { if comp { from, to = contigs.Len()-to, contigs.Len()-from } if from >= to { return nil, bio.NewError(fmt.Sprintf("%s: from > to", contigs.ID), 0, nil) } // DPHit coordinates sometimes over/underflow. // This is a lazy hack to work around it, should really figure // out what is going on. if from < 0 { from = 0 } if to > contigs.Len() { to = contigs.Len() } // Take midpoint of segment -- lazy hack again, endpoints // sometimes under / overflow bin := (from + to) / (2 * binSize) binCount := (contigs.Len() + binSize - 1) / binSize if bin < 0 || bin >= binCount { return nil, bio.NewError(fmt.Sprintf("%s: bin %d out of range 0..%d", contigs.ID, bin, binCount-1), 0, nil) } contigIndex := contigs.Meta.(seqMap).binMap[bin] if contigIndex < 0 || contigIndex >= len(contigs.Meta.(seqMap).contigs) { return nil, bio.NewError(fmt.Sprintf("%s: contig index %d out of range 0..%d", contigs.ID, contigIndex, len(contigs.Meta.(seqMap).contigs)), 0, nil) } length := to - from if length < 0 { return nil, bio.NewError(fmt.Sprintf("%s: length < 0", contigs.ID), 0, nil) } contig := contigs.Meta.(seqMap).contigs[contigIndex] contigFrom := from - contig.from contigTo := contigFrom + length if contigFrom < 0 { contigFrom = 0 } if contigTo > contig.seq.Len() { contigTo = contig.seq.Len() } return &feat.Feature{ ID: contig.seq.ID, Start: contigFrom, End: contigTo, }, nil }
// Write a single sequence and return the number of bytes written and any error. func (self *Writer) Write(s *seq.Seq) (n int, err error) { var ln int n, err = self.w.WriteString(string(self.IDPrefix) + s.ID + "\n") if err == nil { for i := 0; i*self.Width <= s.Len(); i++ { endLinePos := util.Min(self.Width*(i+1), s.Len()) for _, elem := range [][]byte{self.SeqPrefix, s.Seq[self.Width*i : endLinePos], {'\n'}} { ln, err = self.w.Write(elem) if n += ln; err != nil { return } } } } return }
// Pack a sequence into the Packed sequence. Returns a string giving diagnostic information. func (pa *Packer) Pack(sequence *seq.Seq) string { m := pa.Packed.Meta.(seqMap) c := contig{seq: sequence} padding := binSize - sequence.Len()%binSize if padding < minPadding { padding += binSize } pa.length += pa.lastPad c.from = pa.length pa.length += sequence.Len() pa.lastPad = padding bins := make([]int, (padding+sequence.Len())/binSize) for i := 0; i < len(bins); i++ { bins[i] = len(m.contigs) } m.binMap = append(m.binMap, bins...) m.contigs = append(m.contigs, c) pa.Packed.Meta = m return fmt.Sprintf("%20s\t%10d\t%7d-%-d", sequence.ID[:util.Min(20, len(sequence.ID))], sequence.Len(), len(m.binMap)-len(bins), len(m.binMap)-1) }
// Create a new Kmer Index with a word size k based on sequence func New(k int, sequence *seq.Seq) (i *Index, err error) { switch { case k > MaxKmerLen: return nil, bio.NewError("k greater than MaxKmerLen", 0, k, MaxKmerLen) case k < MinKmerLen: return nil, bio.NewError("k less than MinKmerLen", 0, k, MinKmerLen) case k+1 > sequence.Len(): return nil, bio.NewError("sequence shorter than k+1-mer length", 0, k+1, sequence.Len()) } i = &Index{ finger: make([]Kmer, util.Pow4(k)+1), // Need a Tn+1 finger position so that Tn can be recognised k: k, kMask: Kmer(util.Pow4(k) - 1), Seq: sequence, indexed: false, } i.buildKmerTable() return }
// Method to align two sequences using the Smith-Waterman algorithm. Returns an alignment or an error // if the scoring matrix is not square. func (a *Aligner) Align(reference, query *seq.Seq) (aln seq.Alignment, err error) { gap := len(a.Matrix) - 1 for _, row := range a.Matrix { if len(row) != gap+1 { return nil, bio.NewError("Scoring matrix is not square.", 0, a.Matrix) } } r, c := reference.Len()+1, query.Len()+1 table := make([][]int, r) for i := range table { table[i] = make([]int, c) } max, maxI, maxJ := 0, 0, 0 var ( score int scores [3]int ) for i := 1; i < r; i++ { for j := 1; j < c; j++ { if rVal, qVal := a.LookUp.ValueToCode[reference.Seq[i-1]], a.LookUp.ValueToCode[query.Seq[j-1]]; rVal < 0 || qVal < 0 { continue } else { scores[diag] = table[i-1][j-1] + a.Matrix[rVal][qVal] scores[up] = table[i-1][j] + a.Matrix[rVal][gap] scores[left] = table[i][j-1] + a.Matrix[gap][qVal] score = util.Max(scores[:]...) if score < 0 { score = 0 } if score >= max { // greedy so make farthest down and right max, maxI, maxJ = score, i, j } table[i][j] = score } } } refAln := &seq.Seq{ID: reference.ID, Seq: make([]byte, 0, reference.Len())} queryAln := &seq.Seq{ID: query.ID, Seq: make([]byte, 0, query.Len())} for i, j := maxI, maxJ; table[i][j] != 0 && i > 0 && j > 0; { if rVal, qVal := a.LookUp.ValueToCode[reference.Seq[i-1]], a.LookUp.ValueToCode[query.Seq[j-1]]; rVal < 0 || qVal < 0 { continue } else { scores[diag] = table[i-1][j-1] + a.Matrix[rVal][qVal] scores[up] = table[i-1][j] + a.Matrix[gap][qVal] scores[left] = table[i][j-1] + a.Matrix[rVal][gap] switch d := maxIndex(scores[:]); d { case diag: i-- j-- refAln.Seq = append(refAln.Seq, reference.Seq[i]) queryAln.Seq = append(queryAln.Seq, query.Seq[j]) case up: i-- refAln.Seq = append(refAln.Seq, reference.Seq[i]) queryAln.Seq = append(queryAln.Seq, a.GapChar) case left: j-- refAln.Seq = append(refAln.Seq, a.GapChar) queryAln.Seq = append(queryAln.Seq, query.Seq[j]) } } } for i, j := 0, len(refAln.Seq)-1; i < j; i, j = i+1, j-1 { refAln.Seq[i], refAln.Seq[j] = refAln.Seq[j], refAln.Seq[i] } for i, j := 0, len(queryAln.Seq)-1; i < j; i, j = i+1, j-1 { queryAln.Seq[i], queryAln.Seq[j] = queryAln.Seq[j], queryAln.Seq[i] } aln = seq.Alignment{refAln, queryAln} return }
// Filter a query sequence against the stored index. If query and the target are the same sequence, // selfAlign can be used to avoid double seaching - behavior is undefined if the the sequences are not the same. // A morass is used to store and sort individual filter hits. func (f *Filter) Filter(query *seq.Seq, selfAlign, complement bool, morass *morass.Morass) error { f.selfAlign = selfAlign f.complement = complement f.morass = morass f.k = f.index.GetK() // Ukonnen's Lemma f.minKmersPerHit = MinWordsPerFilterHit(f.minMatch, f.k, f.maxError) // Maximum distance between SeqQ positions of two k-mers in a match // (More stringent bounds may be possible, but not a big problem // if two adjacent matches get merged). f.maxKmerDist = f.minMatch - f.k tubeWidth := f.tubeOffset + f.maxError if f.tubeOffset < f.maxError { return bio.NewError("TubeOffset < MaxError", 0, []int{f.tubeOffset, f.maxError}) } maxActiveTubes := (f.target.Len()+tubeWidth-1)/f.tubeOffset + 1 f.tubes = make([]tubeState, maxActiveTubes) // Ticker tracks cycling of circular list of active tubes. ticker := tubeWidth var err error err = f.index.ForEachKmerOf(query, 0, query.Len(), func(index *kmerindex.Index, position, kmer int) { from := 0 if kmer > 0 { from = index.FingerAt(kmer - 1) } to := index.FingerAt(kmer) for i := from; i < to; i++ { f.commonKmer(index.PosAt(i), position) } if ticker--; ticker == 0 { if e := f.tubeEnd(position); e != nil { panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned } ticker = f.tubeOffset } }) if err != nil { return err } err = f.tubeEnd(query.Len() - 1) if err != nil { return err } diagFrom := f.diagIndex(f.target.Len()-1, query.Len()-1) - tubeWidth diagTo := f.diagIndex(0, query.Len()-1) + tubeWidth tubeFrom := f.tubeIndex(diagFrom) if tubeFrom < 0 { tubeFrom = 0 } tubeTo := f.tubeIndex(diagTo) for tubeIndex := tubeFrom; tubeIndex <= tubeTo; tubeIndex++ { err = f.tubeFlush(tubeIndex) if err != nil { return err } } f.tubes = nil return f.morass.Finalise() }