func translateStockholm(b byte) (seq.Residue, bool) { switch { case b >= 'a' && b <= 'z': return seq.Residue(b), true case b >= 'A' && b <= 'Z': return seq.Residue(b), true case b == '-': return '-', true case b == '.': return '.', true } return 0, false }
// TranslateNormal is the default translator for regular (NOT aligned) FASTA // files. func TranslateNormal(b byte) (seq.Residue, bool) { switch { case b >= 'a' && b <= 'z': return seq.Residue(unicode.ToTitle(rune(b))), true case b >= 'A' && b <= 'Z': return seq.Residue(b), true case b == '*': return 0, true case b == '-': return '-', true case b == '/': // PIR junk. Chain breaks? WTF? return '-', true } return 0, false }
// FastaSeq returns a new seq.Sequence from TuftsBCB/seq. func (s *sequence) FastaSeq() seq.Sequence { rs := make([]seq.Residue, len(s.Residues)) for i := range s.Residues { rs[i] = seq.Residue(s.Residues[i]) } return seq.Sequence{s.Name, rs} }
func translateA2M(b byte) (seq.Residue, bool) { switch { case b >= 'a' && b <= 'z': return seq.Residue(b), true case b >= 'A' && b <= 'Z': return seq.Residue(b), true case b == '*': return 0, true case b == '-': return '-', true case b == '.': return '.', true case b == '/': // PIR junk. Chain breaks? WTF? return '-', true } return 0, false }
func getSeq(line []byte) []seq.Residue { fs := bytes.Fields(line[17:]) var fseq []byte if len(fs) == 1 { fseq = fs[0] } else { fseq = fs[1] } rs := make([]seq.Residue, len(fseq)) for i, r := range fseq { rs[i] = seq.Residue(r) } return rs }
func asResidues(brs []byte, trusted bool) ([]seq.Residue, error) { rs := make([]seq.Residue, 0, len(brs)) for _, b := range brs { if trusted { rs = append(rs, seq.Residue(b)) } else { bNew, ok := translateStockholm(b) if !ok { return nil, fmt.Errorf("Invalid Stockholm residue '%c'.", b) } else if bNew > 0 { rs = append(rs, bNew) } } } return rs, nil }
func readHMM(buf *bytes.Buffer) (hmm *seq.HMM, err error) { var nullFields []string hmm = new(seq.HMM) for { line, err := buf.ReadBytes('\n') if err == io.EOF && len(line) == 0 { break } if err != nil && err != io.EOF { panic(err) } line = trim(line) if len(line) == 0 { continue } switch { case hasPrefix(line, "NULL"): // We can't read the NULL emissions yet, because we don't have // an alphabet. (Which we'll get on the next line.) // We'll slurp this into a seq.EProbs value in a little bit, after // we get an alphabet. nullFields = strings.Fields(str(line[4:])) case hasPrefix(line, "HMM"): // We slurp up three lines here. The first is the alphabet // (the current line). The second is the ordering of transition // probabilities. And the third are transition probabilities for // the begin state. We ignore the second two. if _, err := demandLine(buf); err != nil { return nil, fmt.Errorf("%s (expected transition ordering)", err) } if _, err := demandLine(buf); err != nil { return nil, fmt.Errorf("%s (expected start transitions)", err) } // Get the ordering of the alphabet. hmm.Alphabet = make([]seq.Residue, 0, 20) residues := bytes.Split(trim(line[3:]), []byte{'\t'}) for _, residue := range residues { hmm.Alphabet = append(hmm.Alphabet, seq.Residue(residue[0])) } // Remember those null probabilities? Well, we have an alphabet now. ep, err := readEmissions(hmm.Alphabet, nullFields) if err != nil { return nil, fmt.Errorf("Could not read NULL emissions '%s': %s", strings.Join(nullFields, " "), err) } hmm.Null = *ep default: // finally, reading a node in the HMM // Each node in the HMM is made up of two lines. // The first line starts with the amino acid in the reference // sequence, followed by the node number, followed by N match // emission probabilities (where N = len(alphabet)), and finally // followed by the node number again. (What the *f**k* is up with // that? Seriously.) // // The second line is made up of 7 transition probabilities, // followed by 3 diversity (the 'neff' stuff) scores. // // Also, each field is separated by spaces OR tabs. Lovely, eh? line2, err := demandLine(buf) if err != nil { return nil, fmt.Errorf("%s (expected transition probs)", err) } fields1 := strings.Fields(string(line)) fields2 := strings.Fields(string(line2)) node := seq.HMMNode{ Residue: seq.Residue(fields1[0][0]), } node.NodeNum, err = strconv.Atoi(fields1[1]) if err != nil { return nil, fmt.Errorf("Could not parse node number '%s': %s", fields1[1], err) } ep, err := readEmissions(hmm.Alphabet, fields1[2:]) if err != nil { return nil, fmt.Errorf("Could not read emissions '%s': %s", strings.Join(fields1[2:], " "), err) } node.MatEmit = *ep node.InsEmit = seq.NewEProbs(hmm.Alphabet) for _, residue := range hmm.Alphabet { node.InsEmit.Set(residue, hmm.Null.Lookup(residue)) } node.Transitions, err = readTransitions(fields2) if err != nil { return nil, fmt.Errorf("Could not read transitions '%s': %s", strings.Join(fields2, " "), err) } node.NeffM, node.NeffI, node.NeffD, err = readDiversity(fields2[7:]) if err != nil { return nil, fmt.Errorf("Could not read diversity '%s': %s", strings.Join(fields2[7:], " "), err) } hmm.Nodes = append(hmm.Nodes, node) } } return hmm, nil }
// ReadSequence is exported for use in other packages that read FASTA-like // files. // // The 'translate' function is used when sequences are checked for valid // characters. // // If you're just reading FASTA files, this method SHOULD NOT be used. func (r *Reader) ReadSequence(translate Translator) (seq.Sequence, error) { s := seq.Sequence{} seenHeader := false // Before entering the main loop, we have to check to see if we've // already read this entry's header. if r.nextHeader != nil { s.Name = trimHeader(r.nextHeader) r.nextHeader = nil seenHeader = true } for { line, err := r.buf.ReadBytes('\n') if err == io.EOF { if len(line) == 0 { return s, io.EOF } } else if err != nil { return seq.Sequence{}, fmt.Errorf("Error on line %d: %s", r.line, err) } line = bytes.TrimSpace(line) // If it's empty, increment the counter and skip ahead. if len(line) == 0 { r.line++ continue } // If the line starts with PIR junk, ignore the line. if bytes.HasPrefix(line, []byte("C;")) || bytes.HasPrefix(line, []byte("structure")) || bytes.HasPrefix(line, []byte("sequence")) { r.line++ continue } // If we haven't seen the header yet, this better be it. if !seenHeader { if line[0] != '>' { return seq.Sequence{}, fmt.Errorf("Expected '>', got '%c' on line %d.", line[0], r.line) } // Trim the '>' and load this line into the header. s.Name = trimHeader(line) seenHeader = true r.line++ continue } else if line[0] == '>' { // This means we've begun reading the next entry. // So slap this line into 'nextHeader' and return the current entry. r.nextHeader = line r.line++ return s, nil } // Finally, time to start reading the sequence. // If we trust the sequences, then we can just append this line // willy nilly. Otherwise we've got to check each character. if s.Residues == nil { s.Residues = make([]seq.Residue, 0, 50) } if r.TrustSequences { for _, b := range line { s.Residues = append(s.Residues, seq.Residue(b)) } } else { for _, b := range line { bNew, ok := translate(b) if !ok { return seq.Sequence{}, fmt.Errorf("Invalid character '%c' on line %d.", b, r.line) } // If the zero byte is returned from translate, then we // don't keep this residue around. if bNew > 0 { s.Residues = append(s.Residues, bNew) } } } r.line++ } panic("unreachable") }