func (self *Reader) metaSequence(moltype, id string) (sequence *seq.Seq, err error) { var line, body []byte for { if line, err = self.r.ReadBytes('\n'); err == nil { if len(line) > 0 && line[len(line)-1] == '\r' { line = line[:len(line)-1] } if len(line) == 0 { continue } if len(line) < 2 || !bytes.HasPrefix(line, []byte("##")) { return nil, bio.NewError("Corrupt metasequence", 0, line) } line = bytes.TrimSpace(line[2:]) if string(line) == "end-"+moltype { break } else { line = bytes.Join(bytes.Fields(line), nil) body = append(body, line...) } } else { return nil, err } } sequence = seq.New(id, body, nil) sequence.Moltype = bio.ParseMoltype(moltype) return }
// Map routines to iterate a function over an array, potentially splitting the array slice into // chunks so that each chunk is processed concurrently. When using concurrent processing the // Chunk size is either the nearest even division of the total array over the chosen concurrent // processing goroutines or a specified maximum chunk size, whichever is smaller. Reducing // chunk size can reduce the impact of divergence in time for processing chunks, but may add // to overhead. func Map(set Mapper, threads, maxChunkSize int) (results []interface{}, err error) { queue := make(chan Operator, 1) p := NewProcessor(queue, 0, threads) defer p.Stop() chunkSize := util.Min(int(math.Ceil(float64(set.Len())/float64(threads))), maxChunkSize) quit := make(chan struct{}) go func() { for s := 0; s*chunkSize < set.Len(); s++ { select { case <-quit: break default: endChunk := util.Min(chunkSize*(s+1), set.Len()) queue <- set.Slice(chunkSize*s, endChunk) } } }() for r := 0; r*chunkSize < set.Len(); r++ { result := <-p.out if result.Err != nil { err = bio.NewError("Map failed", 0, err) close(quit) break } results = append(results, result.Value) } return }
// Rewind the reader. func (self *Reader) Rewind() (err error) { if s, ok := self.f.(io.Seeker); ok { _, err = s.Seek(0, 0) } else { err = bio.NewError("Not a Seeker", 0, self) } return }
func (self Alignment) Column(pos int, fill byte) (c []byte, err error) { if pos < self.Start() || pos >= self.End() { return nil, bio.NewError("Column out of range", 0, self.Start(), self.End(), pos) } c = make([]byte, len(self)) for i, s := range self { if pos-s.Offset >= 0 || pos-s.Offset < s.Offset+s.Len() { c[i] = s.Seq[pos] } else { c[i] = fill } } return }
// Return a new Processor to operate the function f over the number of threads specified taking
// input from queue and placing the result in buffer. Threads is limited by GOMAXPROCS, if threads is greater
// GOMAXPROCS or less than 1 then threads is set to GOMAXPROCS.
func NewProcessor(queue chan Operator, buffer int, threads int) (p *Processor) {
	// Clamp the requested worker count to the range [1, GOMAXPROCS].
	if available := runtime.GOMAXPROCS(0); threads > available || threads < 1 {
		threads = available
	}

	p = &Processor{
		in:   queue,
		out:  make(chan Result, buffer),
		stop: make(chan struct{}),
		// working is a counting channel: one token per live worker.
		working: make(chan bool, threads),
		wg:      &sync.WaitGroup{},
	}

	// Launch the worker pool. Each worker drains p.in until the input
	// channel is closed or p.stop is signalled.
	for i := 0; i < threads; i++ {
		p.wg.Add(1)
		go func() {
			p.working <- true // Register this worker as live.
			defer func() {
				// A panic in Operation is converted into an error Result
				// rather than killing the process.
				if e := recover(); e != nil {
					p.out <- Result{nil, bio.NewError("concurrent.Processor panic", 1, e)}
				}
				<-p.working // Deregister.
				// The last worker to leave closes the output channel so
				// consumers ranging over p.out terminate.
				// NOTE(review): the emptiness check and close are not atomic,
				// so two workers exiting together could both see len==0 or
				// neither could — confirm this race is acceptable upstream.
				if len(p.working) == 0 {
					close(p.out)
				}
				p.wg.Done()
			}()
			for input := range p.in {
				v, e := input.Operation()
				if p.out != nil {
					p.out <- Result{v, e}
				}
				// Non-blocking poll for a stop request between jobs.
				select {
				case <-p.stop:
					return
				default:
				}
			}
		}()
	}

	return
}
// Write meta data to a GFF file. func (self *Writer) WriteMetaData(d interface{}) (n int, err error) { switch d.(type) { case []byte, string: n, err = self.w.WriteString("##" + d.(string) + "\n") case *seq.Seq: sw := fasta.NewWriter(self.f, self.Width) sw.IDPrefix = fmt.Sprintf("##%s ", d.(*seq.Seq).Moltype) sw.SeqPrefix = "##" if n, err = sw.Write(d.(*seq.Seq)); err != nil { return } if err = sw.Flush(); err != nil { return } var m int m, err = self.w.WriteString("##end-" + d.(*seq.Seq).Moltype.String() + "\n") n += m if err != nil { return } err = self.w.Flush() return case *feat.Feature: start := d.(*feat.Feature).Start if self.OneBased && start >= 0 { start++ } n, err = self.w.WriteString("##sequence-region " + string(d.(*feat.Feature).ID) + " " + strconv.Itoa(start) + " " + strconv.Itoa(d.(*feat.Feature).End) + "\n") default: n, err = 0, bio.NewError("Unknown meta data type", 0, d) } if err == nil { err = self.w.Flush() } return }
// Hash returns the h hash sum of file ReadSeekStater and any error. The file is // Seek'd to the origin before and after the hash to ensure that the full file is summed and the // file is ready for other reads. The hash is not reset on return, so if individual files are to // be hashed with the same h, it should be reset. func Hash(h hash.Hash, file *os.File) (sum []byte, err error) { var fi os.FileInfo if fi, err = file.Stat(); err != nil || fi.IsDir() { return nil, bio.NewError("Is a directory", 0, file) } file.Seek(0, 0) for n, buffer := 0, make([]byte, bufferLen); err == nil || err == io.ErrUnexpectedEOF; { n, err = io.ReadAtLeast(file, buffer, bufferLen) h.Write(buffer[:n]) } if err == io.EOF || err == io.ErrUnexpectedEOF { err = nil } file.Seek(0, 0) sum = h.Sum(nil) return }
// Read a single feature and return it or an error. func (self *Reader) Read() (f *feat.Feature, err error) { var ( line string elems []string se error ok bool ) if line, err = self.r.ReadString('\n'); err == nil { self.line++ if len(line) > 0 && line[len(line)-1] == '\r' { line = line[:len(line)-1] } line = strings.TrimSpace(line) elems = strings.SplitN(line, "\t", self.BedType+1) if len(elems) < self.BedType { return nil, bio.NewError(fmt.Sprintf("Bad bedtype on line %d", self.line), 0, line) } } else { return } f = &feat.Feature{Moltype: bio.DNA} for i := range elems { switch i { case chromField: f.Location = elems[i] if self.BedType <= nameField { f.ID = elems[chromField] + ":" + elems[startField] + ".." + elems[endField] } case startField: f.Start, se = strconv.Atoi(elems[i]) if se != nil { f.Start = 0 } case endField: f.End, se = strconv.Atoi(elems[i]) if se != nil { f.End = 0 } case nameField: f.ID = elems[i] case scoreField: if f.Score, se = strconv.ParseFloat(elems[i], 64); se != nil { f.Score = 0 } case strandField: if f.Strand, ok = CharToStrand[elems[i]]; !ok { f.Strand = 0 } // The following fields are unsupported at this stage case thickStartField: case thickEndField: case rgbField: case blockCountField: case blockSizesField: case blockStartsField: } } return }
// commentMetaline interprets a "##" metaline of a GFF stream, updating reader
// state for recognised directives and recursing into Read for lines that carry
// no feature of their own. Unrecognised metalines are returned as a Feature
// whose Meta field holds the raw line.
func (self *Reader) commentMetaline(line string) (f *feat.Feature, err error) {
	// Load these into a slice in a MetaField of the Feature
	fields := strings.Split(string(line), " ")
	switch fields[0] {
	case "gff-version":
		// NOTE(review): fields[1] is indexed without a length check here,
		// unlike every other branch — a bare "##gff-version" line would
		// panic; confirm whether input is trusted to be well formed.
		if self.Version, err = strconv.Atoi(fields[1]); err != nil {
			// Unparseable version: fall back to the default rather than fail.
			self.Version = DefaultVersion
		}
		// Metaline consumed; deliver the next real feature.
		return self.Read()
	case "source-version":
		if len(fields) > 1 {
			self.SourceVersion = strings.Join(fields[1:], " ")
			return self.Read()
		} else {
			return nil, bio.NewError("Incomplete source-version metaline", 0, fields)
		}
	case "date":
		if len(fields) > 1 {
			// Parsed with the reader's configured layout; a parse failure
			// is carried in err but reading continues.
			self.Date, err = time.Parse(self.TimeFormat, strings.Join(fields[1:], " "))
			return self.Read()
		} else {
			return nil, bio.NewError("Incomplete date metaline", 0, fields)
		}
	case "Type":
		if len(fields) > 1 {
			self.Type = bio.ParseMoltype(fields[1])
			return self.Read()
		} else {
			return nil, bio.NewError("Incomplete Type metaline", 0, fields)
		}
	case "sequence-region":
		// ##sequence-region <name> <start> <end> becomes a Feature whose
		// Meta field describes the region.
		if len(fields) > 3 {
			var start, end int
			if start, err = strconv.Atoi(fields[2]); err != nil {
				return nil, err
			} else {
				// Convert to the reader's zero-based convention if needed.
				if self.OneBased {
					start = bio.OneToZero(start)
				}
			}
			if end, err = strconv.Atoi(fields[3]); err != nil {
				return nil, err
			}
			f = &feat.Feature{
				Meta: &feat.Feature{
					ID:    fields[1],
					Start: start,
					End:   end,
				},
			}
		} else {
			return nil, bio.NewError("Incomplete sequence-region metaline", 0, fields)
		}
	case "DNA", "RNA", "Protein":
		// Start of an inline metasequence block; read until ##end-<moltype>.
		if len(fields) > 1 {
			var s *seq.Seq
			if s, err = self.metaSequence(fields[0], fields[1]); err != nil {
				return
			} else {
				f = &feat.Feature{Meta: s}
			}
		} else {
			return nil, bio.NewError("Incomplete sequence metaline", 0, fields)
		}
	default:
		// Unknown directive: preserve the raw line for the caller.
		f = &feat.Feature{Meta: line}
	}

	return
}
func (self Alignment) Stitch(f feat.FeatureSet) (a Alignment, err error) { for _, s := range self { if !s.Inplace && s.Quality != nil && s.Quality.Inplace { return nil, bio.NewError("Inplace operation on Quality with non-Inplace operation on parent Seq.", 0, s) } } t := interval.NewTree() var i *interval.Interval for _, feature := range f { if i, err = interval.New("", feature.Start, feature.End, 0, nil); err != nil { return nil, err } else { t.Insert(i) } } start := self.Start() a = make(Alignment, len(self)) span, err := interval.New("", start, self.End(), 0, nil) if err != nil { panic("Seq.End() < Seq.Start()") } fs, _ := t.Flatten(span, 0, 0) var offset int for i, s := range self { if s.Inplace { s.Seq = s.stitch(fs) if s.Offset -= fs[0].Start(); offset < 0 { s.Offset = 0 } s.Circular = false if s.Quality != nil { var q *Quality if s.Quality.Inplace { q = s.Quality } else { q = &Quality{ID: s.Quality.ID} } q.Qual = s.Quality.stitch(fs) if q.Offset = s.Quality.Offset - fs[0].Start(); q.Offset < 0 { q.Offset = 0 } q.Circular = false s.Quality = q } a[i] = s } else { var q *Quality if s.Quality != nil { if offset = s.Quality.Offset - fs[0].Start(); offset < 0 { offset = 0 } q = &Quality{ ID: s.Quality.ID, Qual: s.Quality.stitch(fs), Offset: offset, Circular: false, } } if offset = s.Offset - fs[0].Start(); offset < 0 { offset = 0 } a[i] = &Seq{ ID: s.ID, Seq: s.stitch(fs), Offset: offset, Strand: s.Strand, Circular: false, Moltype: s.Moltype, Quality: q, } } } return }
func (self Alignment) Join(a Alignment, fill byte, where int) (b Alignment, err error) { if len(self) != len(a) { return nil, bio.NewError("Alignments do not hold the same number of sequences", 0, []Alignment{self, a}) } var ( ID string ts []byte shift int ) b = make(Alignment, len(self)) switch where { case Prepend: if !a.IsFlush(Right) { a = a.Flush(Right, fill) } if !self.IsFlush(Left) { a = self.Flush(Left, fill) } case Append: if !a.IsFlush(Left) { a = a.Flush(Left, fill) } if !self.IsFlush(Right) { a = self.Flush(Right, fill) } } for i, s2 := range self { s1 := self[i] switch where { case Prepend: ID = s2.ID + "+" + s1.ID ts = make([]byte, len(s2.Seq), len(s2.Seq)+len(s1.Seq)) copy(ts, s2.Seq) ts = append(ts, s1.Seq...) shift = s2.Len() case Append: ID = s1.ID + "+" + s2.ID if s1.Inplace { ts = append(s1.Seq, s2.Seq...) } else { ts = make([]byte, len(s1.Seq), len(s2.Seq)+len(s1.Seq)) copy(ts, s1.Seq) ts = append(ts, s2.Seq...) } } if s1.Inplace { b[i] = s1 b[i].ID = ID b[i].Seq = ts b[i].Offset -= shift b[i].Quality = nil // TODO Handle Quality } else { b[i] = &Seq{ ID: ID, Seq: ts, Offset: s1.Offset - shift, Strand: s1.Strand, Moltype: s1.Moltype, Quality: nil, // TODO Handle Quality } } } return }
// Filter a query sequence against the stored index. If query and the target are the same sequence,
// selfAlign can be used to avoid double searching - behavior is undefined if the sequences are not the same.
// A morass is used to store and sort individual filter hits.
func (self *Filter) Filter(query *seq.Seq, selfAlign, complement bool, morass *morass.Morass) (err error) {
	self.selfAlign = selfAlign
	self.complement = complement
	self.morass = morass
	self.k = self.index.GetK()

	// Ukonnen's Lemma
	self.minKmersPerHit = MinWordsPerFilterHit(self.minMatch, self.k, self.maxError)

	// Maximum distance between SeqQ positions of two k-mers in a match
	// (More stringent bounds may be possible, but not a big problem
	// if two adjacent matches get merged).
	self.maxKmerDist = self.minMatch - self.k

	// Each tube covers tubeOffset diagonals plus maxError of slack.
	tubeWidth := self.tubeOffset + self.maxError

	if self.tubeOffset < self.maxError {
		return bio.NewError("TubeOffset < MaxError", 0, []int{self.tubeOffset, self.maxError})
	}

	maxActiveTubes := (self.target.Len()+tubeWidth-1)/self.tubeOffset + 1
	self.tubes = make([]TubeState, maxActiveTubes)

	// Ticker tracks cycling of circular list of active tubes.
	ticker := tubeWidth

	// f is invoked for every k-mer of the query; it records each
	// target/query position pair sharing that k-mer, and periodically
	// closes out tubes as the ticker cycles.
	f := func(index *kmerindex.Index, position, kmer int) {
		// [from, to) spans this k-mer's entries in the index finger table.
		from := 0
		if kmer > 0 {
			from = index.FingerAt(kmer - 1)
		}
		to := index.FingerAt(kmer)
		for i := from; i < to; i++ {
			self.commonKmer(index.PosAt(i), position)
		}
		if ticker--; ticker == 0 {
			if e := self.tubeEnd(position); e != nil {
				panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned
			}
			ticker = self.tubeOffset
		}
	}

	if err = self.index.ForEachKmerOf(query, 0, query.Len(), f); err != nil {
		return
	}

	// Close out the final partial tube at the end of the query.
	if err = self.tubeEnd(query.Len() - 1); err != nil {
		return
	}

	// Flush every tube that could still hold live hits, spanning the full
	// diagonal range (widened by tubeWidth on both sides).
	diagFrom := self.diagIndex(self.target.Len()-1, query.Len()-1) - tubeWidth
	diagTo := self.diagIndex(0, query.Len()-1) + tubeWidth

	tubeFrom := self.tubeIndex(diagFrom)
	if tubeFrom < 0 {
		tubeFrom = 0
	}

	tubeTo := self.tubeIndex(diagTo)

	for tubeIndex := tubeFrom; tubeIndex <= tubeTo; tubeIndex++ {
		if err = self.tubeFlush(tubeIndex); err != nil {
			return
		}
	}

	// Release the tube storage and hand the collected hits to the morass
	// for sorting.
	self.tubes = nil

	return self.morass.Finalise()
}
// Method to align two sequences using the Smith-Waterman algorithm. Returns an alignment or an error // if the scoring matrix is not square. func (self *Aligner) Align(reference, query *seq.Seq) (aln seq.Alignment, err error) { gap := len(self.Matrix) - 1 for _, row := range self.Matrix { if len(row) != gap+1 { return nil, bio.NewError("Scoring matrix is not square.", 0, self.Matrix) } } r, c := reference.Len()+1, query.Len()+1 table := make([][]int, r) for i := range table { table[i] = make([]int, c) } var scores [3]int for i := 1; i < r; i++ { for j := 1; j < c; j++ { if rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]; rVal < 0 || qVal < 0 { continue } else { scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal] scores[up] = table[i-1][j] + self.Matrix[rVal][gap] scores[left] = table[i][j-1] + self.Matrix[gap][qVal] table[i][j] = util.Max(scores[:]...) } } } refAln := &seq.Seq{ID: reference.ID, Seq: make([]byte, 0, reference.Len())} queryAln := &seq.Seq{ID: query.ID, Seq: make([]byte, 0, query.Len())} i, j := r-1, c-1 for i > 0 && j > 0 { if rVal, qVal := self.LookUp.ValueToCode[reference.Seq[i-1]], self.LookUp.ValueToCode[query.Seq[j-1]]; rVal < 0 || qVal < 0 { continue } else { scores[diag] = table[i-1][j-1] + self.Matrix[rVal][qVal] scores[up] = table[i-1][j] + self.Matrix[gap][qVal] scores[left] = table[i][j-1] + self.Matrix[rVal][gap] switch d := maxIndex(scores[:]); d { case diag: i-- j-- refAln.Seq = append(refAln.Seq, reference.Seq[i]) queryAln.Seq = append(queryAln.Seq, query.Seq[j]) case up: i-- refAln.Seq = append(refAln.Seq, reference.Seq[i]) queryAln.Seq = append(queryAln.Seq, self.GapChar) case left: j-- queryAln.Seq = append(queryAln.Seq, query.Seq[j]) refAln.Seq = append(refAln.Seq, self.GapChar) } } } for ; i > 0; i-- { refAln.Seq = append(refAln.Seq, reference.Seq[i-1]) queryAln.Seq = append(queryAln.Seq, self.GapChar) } for ; j > 0; j-- { refAln.Seq = append(refAln.Seq, self.GapChar) 
queryAln.Seq = append(queryAln.Seq, query.Seq[j-1]) } for i, j := 0, len(refAln.Seq)-1; i < j; i, j = i+1, j-1 { refAln.Seq[i], refAln.Seq[j] = refAln.Seq[j], refAln.Seq[i] } for i, j := 0, len(queryAln.Seq)-1; i < j; i, j = i+1, j-1 { queryAln.Seq[i], queryAln.Seq[j] = queryAln.Seq[j], queryAln.Seq[i] } aln = seq.Alignment{refAln, queryAln} return }