// Join p to the sequence at the end specified by where. func (self *Seq) Join(p *Seq, where int) (err error) { if self.circular { return bio.NewError("Cannot join circular sequence: receiver.", 0, self) } else if p.circular { return bio.NewError("Cannot join circular sequence: parameter.", 0, p) } switch where { case seq.Start: p = p.Copy().(*Seq) p.S.Align(seq.End) self.S.Align(seq.Start) self.S.LeftPad = p.S.LeftPad case seq.End: p = p.Copy().(*Seq) p.S.Align(seq.Start) self.S.Align(seq.End) self.S.RightPad = p.S.RightPad default: return bio.NewError("Undefined location.", 0, where) } tt, offset := sequtils.Join(self.S.Letters, p.S.Letters, where) self.offset = offset self.S.Letters = tt.([]alphabet.Pack) return }
// Read a single sequence and return it or an error. // TODO: Does not read multi-line fastq. func (self *Reader) Read() (s seq.Sequence, err error) { var ( buff, line, label []byte isPrefix bool seqBuff []alphabet.QLetter t seqio.SequenceAppender ) inQual := false for { if buff, isPrefix, err = self.r.ReadLine(); err == nil { if isPrefix { line = append(line, buff...) continue } else { line = buff } line = bytes.TrimSpace(line) if len(line) == 0 { continue } switch { case !inQual && line[0] == '@': t = self.readHeader(line) label, line = line, nil case !inQual && line[0] == '+': if len(label) == 0 { return nil, bio.NewError("fastq: no header line parsed before +line in fastq format", 0) } if len(line) > 1 && bytes.Compare(label[1:], line[1:]) != 0 { return nil, bio.NewError("fastq: quality header does not match sequence header", 0) } inQual = true case !inQual: line = bytes.Join(bytes.Fields(line), nil) seqBuff = make([]alphabet.QLetter, len(line)) for i := range line { seqBuff[i].L = alphabet.Letter(line[i]) } case inQual: line = bytes.Join(bytes.Fields(line), nil) if len(line) != len(seqBuff) { return nil, bio.NewError("fastq: sequence/quality length mismatch", 0) } for i := range line { seqBuff[i].Q = alphabet.DecodeToQphred(line[i], self.enc) } t.AppendQLetters(seqBuff...) return t, nil } } else { return } } panic("cannot reach") }
// Write a single sequence and return the number of bytes written and any error. func (self *Writer) Write(s *seq.Seq) (n int, err error) { if s.Quality == nil { return 0, bio.NewError("No quality associated with sequence", 0, s) } if s.Len() == s.Quality.Len() { self.template[1] = []byte(s.ID) self.template[3] = s.Seq if self.QID { self.template[4] = append(append([]byte("\n+"), []byte(s.ID)...), '\n') } else { self.template[4] = []byte("\n+\n") } self.template[5] = self.encodeQuality(s.Quality.Qual) var tn int for _, t := range self.template { tn, err = self.w.Write(t) n += tn if err != nil { return } } } else { return 0, bio.NewError("Sequence length and quality length do not match", 0, s) } return }
func (p *Promise) fulfill(value interface{}) (err error) { r, set := p.messageState() if r.Err != nil { err = bio.NewError("Tried to fulfill a failed promise", 0, r.Err) } else { if !set || p.mutable { r.Value = value err = nil } else { err = bio.NewError("Tried to fulfill an already set immutable promise", 0) } } if err != nil && p.relay { if r.Err != nil { err = bio.NewError("Promise already failed - cannot relay", 0, r.Err) } else { r.Err = err } } p.message <- r return }
func (self *Seq) Join(s *Seq, where int) (j *Seq, err error) { var ( ts []byte ID string ) if self.Circular { return nil, bio.NewError("Cannot join circular molecule.", 0, self) } if !self.Inplace && self.Quality != nil && self.Quality.Inplace { return nil, bio.NewError("Inplace operation on Quality with non-Inplace operation on parent Seq.", 0, self) } switch where { case Prepend: ID = s.ID + "+" + self.ID ts = make([]byte, len(s.Seq), len(s.Seq)+len(self.Seq)) copy(ts, s.Seq) ts = append(ts, self.Seq...) case Append: ID = self.ID + "+" + s.ID if self.Inplace { ts = append(self.Seq, s.Seq...) } else { ts = make([]byte, len(self.Seq), len(s.Seq)+len(self.Seq)) copy(ts, self.Seq) ts = append(ts, s.Seq...) } } var q *Quality if self.Quality != nil && s.Quality != nil { q, err = self.Quality.Join(s.Quality, where) if err != nil { return } } if self.Inplace { j = self j.ID = ID j.Seq = ts j.Quality = q // self.Quality will become nil if either sequence lacks Quality } else { j = &Seq{ ID: ID, Seq: ts, Strand: self.Strand, Moltype: self.Moltype, Quality: q, } } if where == Prepend { j.Offset -= s.Len() } return }
func (self *Seq) Trunc(start, end int) (s *Seq, err error) { var ts []byte if !self.Inplace && self.Quality != nil && self.Quality.Inplace { return nil, bio.NewError("Inplace operation on Quality with non-Inplace operation on parent Seq.", 0, self) } if start < self.Offset || end < self.Offset || start > len(self.Seq)+self.Offset || end > len(self.Seq)+self.Offset { return nil, bio.NewError("Start or end position out of range.", 0, self) } if start <= end { if self.Inplace { ts = self.Seq[start-self.Offset : end-self.Offset] } else { ts = append([]byte(nil), self.Seq[start-self.Offset:end-self.Offset]...) } } else if self.Circular { if self.Inplace { ts = append(self.Seq[start-self.Offset:], self.Seq[:end-self.Offset]...) // not quite inplace for this op } else { ts = make([]byte, len(self.Seq)-start-self.Offset, len(self.Seq)+end-start) copy(ts, self.Seq[start-self.Offset:]) ts = append(ts, self.Seq[:end-self.Offset]...) } } else { return nil, bio.NewError("Start position greater than end position for non-circular molecule.", 0, self) } var q *Quality if self.Quality != nil { q, err = self.Quality.Trunc(start, end) if err != nil { err = bio.NewError("Quality.Trunc() returned error", 0, err) return } } if self.Inplace { s = self s.Seq = ts s.Circular = false s.Quality = q } else { s = &Seq{ ID: self.ID, Seq: ts, Offset: start, Strand: self.Strand, Circular: false, Moltype: self.Moltype, Quality: q, } } return }
// Pack a QLetter into a QPack. a.Len() == 4. func (self QLetter) Pack(a Nucleic) (QPack, error) { if a.Len() != 4 { return 0, bio.NewError("Invalid alphabet", 0, self) } if !a.IsValid(self.L) { return QPack(byte(self.Q) << 2), bio.NewError("Invalid letter", 0, self) } return QPack(byte(self.Q)<<2 | byte(a.IndexOf(self.L)&0x3)), nil }
// Convert coordinates in a packed sequence into a feat.Feature. func featureOf(contigs *seq.Seq, from, to int, comp bool) (feature *feat.Feature, err error) { if comp { from, to = contigs.Len()-to, contigs.Len()-from } if from >= to { return nil, bio.NewError(fmt.Sprintf("%s: from > to", contigs.ID), 0, nil) } // DPHit coordinates sometimes over/underflow. // This is a lazy hack to work around it, should really figure // out what is going on. if from < 0 { from = 0 } if to > contigs.Len() { to = contigs.Len() } // Take midpoint of segment -- lazy hack again, endpoints // sometimes under / overflow bin := (from + to) / (2 * binSize) binCount := (contigs.Len() + binSize - 1) / binSize if bin < 0 || bin >= binCount { return nil, bio.NewError(fmt.Sprintf("%s: bin %d out of range 0..%d", contigs.ID, bin, binCount-1), 0, nil) } contigIndex := contigs.Meta.(seqMap).binMap[bin] if contigIndex < 0 || contigIndex >= len(contigs.Meta.(seqMap).contigs) { return nil, bio.NewError(fmt.Sprintf("%s: contig index %d out of range 0..%d", contigs.ID, contigIndex, len(contigs.Meta.(seqMap).contigs)), 0, nil) } length := to - from if length < 0 { return nil, bio.NewError(fmt.Sprintf("%s: length < 0", contigs.ID), 0, nil) } contig := contigs.Meta.(seqMap).contigs[contigIndex] contigFrom := from - contig.from contigTo := contigFrom + length if contigFrom < 0 { contigFrom = 0 } if contigTo > contig.seq.Len() { contigTo = contig.seq.Len() } return &feat.Feature{ ID: contig.seq.ID, Start: contigFrom, End: contigTo, }, nil }
// Join p to the sequence at the end specified by where. func (self *Seq) Join(p *Seq, where int) (err error) { if self.circular { return bio.NewError("Cannot join circular sequence: receiver.", 1, self) } else if p.circular { return bio.NewError("Cannot join circular sequence: parameter.", 1, p) } tt, offset := sequtils.Join(self.S, p.S, where) self.offset = offset self.S = tt.([]alphabet.Letter) return }
// Read a single sequence and return it or an error. // TODO: Does not read interleaved fastq. func (self *Reader) Read() (sequence *seq.Seq, err error) { var line, label, seqBody, qualBody []byte sequence = &seq.Seq{} inQual := false READ: for { line, err = self.r.ReadBytes('\n') if err == nil { if len(line) > 0 && line[len(line)-1] == '\r' { line = line[:len(line)-1] } line = bytes.TrimSpace(line) if len(line) == 0 { continue } switch { case !inQual && line[0] == '@': label = line[1:] case !inQual && line[0] == '+': if len(label) == 0 { return nil, bio.NewError("No ID line parsed at +line in fastq format", 0) } if len(line) > 1 && bytes.Compare(label, line[1:]) != 0 { return nil, bio.NewError("Quality ID does not match sequence ID", 0) } inQual = true case !inQual: line = bytes.Join(bytes.Fields(line), nil) seqBody = append(seqBody, line...) case inQual: line = bytes.Join(bytes.Fields(line), nil) qualBody = append(qualBody, line...) if len(qualBody) >= len(seqBody) { break READ } } } else { return } } if len(seqBody) != len(qualBody) { return nil, bio.NewError("Quality length does not match sequence length", 0) } labelString := string(label) sequence = seq.New(labelString, seqBody, seq.NewQuality(labelString, self.decodeQuality(qualBody))) return }
// Join p to the sequence at the end specified by where. func (self *QSeq) Join(p *QSeq, where int) (err error) { if self.circular { return bio.NewError("Cannot join circular sequence: receiver.", 1, self) } else if p.circular { return bio.NewError("Cannot join circular sequence: parameter.", 1, p) } var tt interface{} tt, self.offset = sequtils.Join(self.S, p.S, where) self.S = tt.([]alphabet.QPack) return }
// Convert a string of bases into a Kmer, returns an error if string length does not match word length func (self *Index) KmerOf(kmertext string) (kmer Kmer, err error) { if len(kmertext) != self.k { return 0, bio.NewError("Sequence length does not match Kmer length", 0, self.k, kmertext) } for _, v := range kmertext { x := lookUp.ValueToCode[v] if x < 0 { return 0, bio.NewError("Kmer contains illegal character", 0, kmertext) } kmer = (kmer << 2) | Kmer(x) } return }
func checkPackedAlpha(alpha alphabet.Nucleic) error { if alpha.Len() != 4 { return bio.NewError("Cannot create packed sequence with alphabet length != 4", 0, alpha) } for _, v := range alphabet.BytesToLetters([]byte(alpha.String())) { if c, ok := alpha.Complement(v); ok && alpha.IndexOf(v) != alpha.IndexOf(c)^0x3 { // TODO: Resolution to the following problem: // Normal nucleotide alphabets (ACGT/ACGU) are safe with this in either case sensitive or // insensitive. Other alphabets may not be, in this case specify case sensitive. return bio.NewError("alphabet order not consistent with bit operations for packed.", 0, alpha) } } return nil }
// Return an array of positions for the Kmer string kmertext func (self *Index) GetPositionsString(kmertext string) (positions []int, err error) { switch { case len(kmertext) != self.k: return nil, bio.NewError("Sequence length does not match Kmer length", 0, self.k, kmertext) case !self.indexed: return nil, bio.NewError("Index not built: call Build()", 0, self) } var kmer Kmer if kmer, err = self.KmerOf(kmertext); err != nil { return nil, err } return self.GetPositionsKmer(kmer) }
func NewSeq(id string, subids []string, b [][]alphabet.Letter, alpha alphabet.Peptide, cons protein.Consensifyer) (*Seq, error) { switch lids, lseq := len(subids), len(b); { case lids == 0 && len(b) == 0: case lseq != 0 && lids == len(b[0]): if lids == 0 { subids = make([]string, len(b[0])) for i := range subids { subids[i] = fmt.Sprintf("%s:%d", id, i) } } default: return nil, bio.NewError("alignment: id/seq number mismatch", 0) } return &Seq{ ID: id, SubIDs: append([]string(nil), subids...), S: append([][]alphabet.Letter(nil), b...), alphabet: alpha, Consensify: cons, Stringify: func(s seq.Polymer) string { t := s.(*Seq).Consensus(false) return t.String() }, }, nil }
func (self *Reader) metaSequence(moltype, id string) (sequence *seq.Seq, err error) { var line, body []byte for { line, err = self.r.ReadBytes('\n') if err != nil { return nil, err } if len(line) > 0 && line[len(line)-1] == '\r' { line = line[:len(line)-1] } if len(line) == 0 { continue } if len(line) < 2 || !bytes.HasPrefix(line, []byte("##")) { return nil, bio.NewError("Corrupt metasequence", 0, line) } line = bytes.TrimSpace(line[2:]) if string(line) == "end-"+moltype { break } else { line = bytes.Join(bytes.Fields(line), nil) body = append(body, line...) } } sequence = seq.New(id, body, nil) sequence.Moltype = bio.ParseMoltype(moltype) return }
func NewQSeq(id string, subids []string, ql [][]alphabet.QLetter, alpha alphabet.Nucleic, encode alphabet.Encoding, cons nucleic.Consensifyer) (*QSeq, error) { switch lids, lseq := len(subids), len(ql); { case lids == 0 && len(ql) == 0: case lseq != 0 && lids == len(ql[0]): if lids == 0 { subids = make([]string, len(ql[0])) for i := range subids { subids[i] = fmt.Sprintf("%s:%d", id, i) } } default: return nil, bio.NewError("alignment: id/seq number mismatch", 0) } return &QSeq{ ID: id, SubIDs: append([]string(nil), subids...), S: append([][]alphabet.QLetter(nil), ql...), alphabet: alpha, encoding: encode, Strand: 1, Consensify: cons, Threshold: 2, LowQFilter: func(s seq.Sequence, _ alphabet.Letter) alphabet.Letter { return s.Alphabet().Ambiguous() }, Stringify: func(s seq.Polymer) string { t := s.(*QSeq).Consensus(false) t.Threshold = s.(*QSeq).Threshold t.LowQFilter = s.(*QSeq).LowQFilter return t.String() }, }, nil }
// Join segments of the sequence, returning any error. func (self *Seq) Compose(f feat.FeatureSet) (err error) { l := 0 for _, seg := range f { if seg.End < seg.Start { return bio.NewError("Feature end < start", 0, seg) } l += util.Min(seg.End, self.End()) - util.Max(seg.Start, self.Start()) } t := &Seq{} *t = *self t.S = &Packing{Letters: make([]alphabet.Pack, 0, (l+3)/4)} var tseg seq.Sequence for _, seg := range f { tseg, err = self.Subseq(util.Max(seg.Start, self.Start()), util.Min(seg.End, self.End())) if err != nil { return } tseg := tseg.(*Seq) if seg.Strand == -1 { tseg.RevComp() } tseg.S.Align(seq.Start) t.S.Align(seq.End) t.S.Letters = append(t.S.Letters, tseg.S.Letters...) t.S.RightPad = tseg.S.RightPad } *self = *t return }
// Map routines to iterate a function over an array, potentially splitting the array slice into // chunks so that each chunk is processed concurrently. When using concurrent processing the // Chunk size is either the nearest even division of the total array over the chosen concurrent // processing goroutines or a specified maximum chunk size, whichever is smaller. Reducing // chunk size can reduce the impact of divergence in time for processing chunks, but may add // to overhead. func Map(set Mapper, threads, maxChunkSize int) (results []interface{}, err error) { queue := make(chan Operator, 1) p := NewProcessor(queue, 0, threads) defer p.Stop() chunkSize := util.Min(int(math.Ceil(float64(set.Len())/float64(threads))), maxChunkSize) quit := make(chan struct{}) go func() { for s := 0; s*chunkSize < set.Len(); s++ { select { case <-quit: break default: endChunk := util.Min(chunkSize*(s+1), set.Len()) queue <- set.Slice(chunkSize*s, endChunk) } } }() for r := 0; r*chunkSize < set.Len(); r++ { result := <-p.out if result.Err != nil { err = bio.NewError("Map failed", 0, err) close(quit) break } results = append(results, result.Value) } return }
func (self *Multi) Join(a *Multi, where int) (err error) { if self.Count() != a.Count() { return bio.NewError("Multis do not hold the same number of sequences", 0, []*Multi{self, a}) } switch where { case seq.Start: if !a.IsFlush(seq.End) { a.Flush(seq.End, self.alphabet.Gap()) } if !self.IsFlush(seq.Start) { self.Flush(seq.Start, self.alphabet.Gap()) } case seq.End: if !a.IsFlush(seq.Start) { a.Flush(seq.Start, self.alphabet.Gap()) } if !self.IsFlush(seq.End) { self.Flush(seq.End, self.alphabet.Gap()) } } for i := 0; i < self.Count(); i++ { s := self.Get(i) as := a.Get(i) err = joinOne(s, as, where) if err != nil { return } } return }
func joinOne(s, as protein.Sequence, where int) (err error) { switch s.(type) { case *protein.Seq: if t, ok := as.(*protein.Seq); !ok { err = joinFailure(s, t) } else { err = s.(*protein.Seq).Join(t, where) } case *protein.QSeq: if t, ok := as.(*protein.QSeq); !ok { err = joinFailure(s, t) } else { err = s.(*protein.QSeq).Join(t, where) } case *Multi: if t, ok := as.(*Multi); !ok { err = joinFailure(s, t) } else { err = s.(*Multi).Join(t, where) } default: joinerRegistryLock.RLock() if joinerFunc, ok := joinerRegistry[reflect.TypeOf(s)]; ok { err = joinerFunc(s, as, where) } else { err = bio.NewError(fmt.Sprintf("Sequence type %T not handled.", s), 0, s) } joinerRegistryLock.RUnlock() } return }
// Rewind the reader. func (self *Reader) Rewind() (err error) { if s, ok := self.f.(io.Seeker); ok { _, err = s.Seek(0, 0) } else { err = bio.NewError("Not a Seeker", 0, self) } return }
// Truncate provides a function that may be used by polymer types to implement Truncator. // It makes use of reflection and so may be slower than type-specific implementations. // This is the reference implementation and should be used to compare type-specific // implementation against in testing. func Truncate(pol interface{}, start, end int, circular bool) (p interface{}, err error) { pv := reflect.ValueOf(pol) if l := pv.Len(); start < 0 || end < 0 || start > l || end > l { return nil, bio.NewError("Out of range.", 0, nil) } if start <= end { p = pv.Slice(start, end).Interface() } else if circular { tv := reflect.MakeSlice(pv.Type(), pv.Len()-start, pv.Len()+end-start) reflect.Copy(tv, pv.Slice(start, pv.Len())) p = reflect.AppendSlice(tv, pv.Slice(0, end)).Interface() } else { return nil, bio.NewError("Start position greater than end position for non-circular sequence.", 0, pol) } return }
func (self *Seq) RevComp() (s *Seq, err error) { var rs []byte if self.Inplace { rs = self.Seq } else { if self.Quality != nil && self.Quality.Inplace { return nil, bio.NewError("Inplace operation on Quality with non-Inplace operation on parent Seq.", 0, self) } rs = make([]byte, len(self.Seq)) } if self.Moltype == bio.DNA || self.Moltype == bio.RNA { i, j := 0, len(self.Seq)-1 for ; i < j; i, j = i+1, j-1 { rs[i], rs[j] = complement[self.Moltype][self.Seq[j]], complement[self.Moltype][self.Seq[i]] } if i == j { rs[i] = complement[self.Moltype][self.Seq[i]] } } else { return nil, bio.NewError("Cannot reverse-complement protein.", 0, self) } var q *Quality if self.Quality != nil { q = self.Quality.Reverse() } if self.Inplace { s = self s.Quality = q } else { s = &Seq{ ID: self.ID, Seq: rs, Offset: self.Offset + len(self.Seq), Strand: -self.Strand, Circular: self.Circular, Moltype: self.Moltype, Quality: q, } } return }
// Rewind the reader. func (self *Reader) Rewind() (err error) { if s, ok := self.f.(io.Seeker); ok { self.last = nil _, err = s.Seek(0, 0) self.r = bufio.NewReader(self.f) } else { err = bio.NewError("Not a Seeker", 0, self) } return }
// Returns the sums of alignment lengths. func (h DPHits) Sum() (a, b int, err error) { for _, hit := range h { la, lb := hit.Aepos-hit.Abpos, hit.Bepos-hit.Bbpos if la < 0 || lb < 0 { return 0, 0, bio.NewError("Area < 0", 0, hit) } a, b = a+la, b+lb } return }
func (self *Multi) Add(n ...protein.Sequence) (err error) { for _, s := range n { if s.Alphabet() != self.alphabet { return bio.NewError("Inconsistent alphabets", 0, self, s) } } self.S = append(self.S, n...) return }
// AppendQLetters appends the letters of a to the packed sequence. Only the L
// field of each QLetter is read by the code in this method.
//
// While the sequence has right padding, letters are first written into the
// last element of S.Letters, reducing RightPad by one for each letter; the
// remaining letters are packed with PackQLetters and appended.
//
// An error is returned if a letter written into the padding is not valid in
// the sequence's alphabet, or if a panic occurs during the call. Letters
// written before the error remain in place.
func (self *Seq) AppendQLetters(a ...alphabet.QLetter) (err error) {
	// Convert a panic raised below into an encoding error that names a
	// position reported by AllValidQLetter.
	// NOTE(review): presumably pos is the index of the first invalid letter - confirm.
	defer func() {
		if r := recover(); r != nil {
			_, pos := self.alphabet.AllValidQLetter(a)
			err = bio.NewError(fmt.Sprintf("Encoding error: %s %q at position %d.", r, a[pos], pos), 1, a)
		}
	}()

	// Fill the unused slots at the end of the last packed element.
	i := 0
	for ; self.S.RightPad > 0 && i < len(a); i, self.S.RightPad = i+1, self.S.RightPad-1 {
		if !self.alphabet.IsValid(a[i].L) {
			return bio.NewError(fmt.Sprintf("Invalid letter %q at position %d.", a[i], i), 0, nil)
		}
		// NOTE(review): the shift is 4-RightPad; whether that is a count of bits
		// or of letter slots depends on the Pack layout, which is not visible
		// here - confirm.
		self.S.Letters[len(self.S.Letters)-1] |= alphabet.Pack(self.alphabet.IndexOf(a[i].L)) << (4 - byte(self.S.RightPad))
	}

	// Pack whatever is left and append it.
	// NOTE(review): only the Letters of the new packing are used; any padding
	// it reports is not copied to self.S - confirm that is intended.
	self.S.Letters = append(self.S.Letters, PackQLetters(self.alphabet, a[i:]...).Letters...)

	return
}
func (self *Quality) Trunc(start, end int) (q *Quality, err error) { var tq []Qsanger if start < self.Offset || end < self.Offset || start > len(self.Qual)+self.Offset || end > len(self.Qual)+self.Offset { return nil, bio.NewError("Start or end position out of range.", 0, self) } if start <= end { if self.Inplace { tq = self.Qual[start-self.Offset : end-self.Offset] } else { tq = append([]Qsanger(nil), self.Qual[start-self.Offset:end-self.Offset]...) } } else if self.Circular { if self.Inplace { tq = append(self.Qual[start-self.Offset:], self.Qual[:end-self.Offset]...) // not quite inplace for this op } else { tq = make([]Qsanger, len(self.Qual)-start-self.Offset, len(self.Qual)+end-start) copy(tq, self.Qual[start-self.Offset:]) tq = append(tq, self.Qual[:end-self.Offset]...) } } else { return nil, bio.NewError("Start position greater than end position for non-circular molecule.", 0, self) } if self.Inplace { q = self q.Qual = tq q.Circular = false } else { q = &Quality{ ID: self.ID, Qual: tq, Offset: start, Strand: self.Strand, Circular: false, } } return }
// Append each byte of each a to the appropriate sequence in the reciever. func (self *QSeq) AppendColumns(a ...[]alphabet.QLetter) (err error) { for i, s := range a { if len(s) != self.Count() { return bio.NewError(fmt.Sprintf("Column %d does not match Count(): %d != %d.", i, len(s), self.Count()), 0, a) } } self.S = append(self.S, a...) return }