Exemplo n.º 1
0
// translateStockholm validates a single byte of Stockholm sequence data.
//
// ASCII letters of either case and the two gap symbols '-' and '.' are
// accepted and returned unchanged. Every other byte is rejected by returning
// (0, false).
func translateStockholm(b byte) (seq.Residue, bool) {
	isLower := 'a' <= b && b <= 'z'
	isUpper := 'A' <= b && b <= 'Z'
	if isLower || isUpper || b == '-' || b == '.' {
		return seq.Residue(b), true
	}
	return 0, false
}
Exemplo n.º 2
0
// TranslateNormal is the default translator for regular (NOT aligned) FASTA
// files.
//
// Lowercase ASCII letters are converted to their title-case (uppercase) form,
// uppercase letters and '-' are returned as is, and '/' (seen in PIR-style
// input) is rewritten to '-'. A '*' is accepted but translated to the zero
// residue. Any other byte is rejected by returning (0, false).
func TranslateNormal(b byte) (seq.Residue, bool) {
	if 'a' <= b && b <= 'z' {
		return seq.Residue(unicode.ToTitle(rune(b))), true
	}
	if 'A' <= b && b <= 'Z' {
		return seq.Residue(b), true
	}
	switch b {
	case '*':
		return 0, true
	case '-', '/':
		return '-', true
	}
	return 0, false
}
Exemplo n.º 3
0
// FastaSeq returns a new seq.Sequence (from TuftsBCB/seq) built from s.
//
// The name is carried over as is. Each residue is converted to seq.Residue
// and stored in a freshly allocated slice, so the result does not share its
// residue storage with s.
func (s *sequence) FastaSeq() seq.Sequence {
	rs := make([]seq.Residue, len(s.Residues))
	for i, r := range s.Residues {
		rs[i] = seq.Residue(r)
	}
	// Keyed fields keep this literal valid if seq.Sequence ever gains or
	// reorders fields; unkeyed literals of imported structs are fragile.
	return seq.Sequence{Name: s.Name, Residues: rs}
}
Exemplo n.º 4
0
Arquivo: msa.go Projeto: ndaniels/io-1
// translateA2M validates a single byte of A2M sequence data.
//
// ASCII letters of either case, '-' and '.' are accepted and returned
// unchanged. A '/' (seen in PIR-style input) is rewritten to '-'. A '*' is
// accepted but translated to the zero residue. Any other byte is rejected by
// returning (0, false).
func translateA2M(b byte) (seq.Residue, bool) {
	switch b {
	case '*':
		return 0, true
	case '-', '/':
		return '-', true
	case '.':
		return '.', true
	}
	if ('a' <= b && b <= 'z') || ('A' <= b && b <= 'Z') {
		return seq.Residue(b), true
	}
	return 0, false
}
Exemplo n.º 5
0
Arquivo: hhr.go Projeto: ndaniels/io-1
// getSeq extracts the residues from one line of input.
//
// Everything from byte offset 17 onward is split on whitespace. If that
// yields exactly one field, the field is the sequence; otherwise the second
// field is. Each byte of the chosen field becomes one seq.Residue.
//
// getSeq panics if line is shorter than 17 bytes or if nothing but
// whitespace follows offset 17.
func getSeq(line []byte) []seq.Residue {
	cols := bytes.Fields(line[17:])

	// A lone column is the sequence itself; with more columns the sequence
	// is the second one.
	idx := 1
	if len(cols) == 1 {
		idx = 0
	}
	raw := cols[idx]

	residues := make([]seq.Residue, 0, len(raw))
	for _, c := range raw {
		residues = append(residues, seq.Residue(c))
	}
	return residues
}
Exemplo n.º 6
0
// asResidues converts raw bytes into a slice of residues.
//
// When trusted is true every byte is converted without inspection. Otherwise
// each byte is passed through translateStockholm: a rejected byte aborts the
// conversion with an error and a nil slice, and a byte translated to zero is
// dropped from the result.
func asResidues(brs []byte, trusted bool) ([]seq.Residue, error) {
	out := make([]seq.Residue, 0, len(brs))

	if trusted {
		for _, b := range brs {
			out = append(out, seq.Residue(b))
		}
		return out, nil
	}

	for _, b := range brs {
		r, ok := translateStockholm(b)
		if !ok {
			return nil, fmt.Errorf("Invalid Stockholm residue '%c'.", b)
		}
		if r > 0 {
			out = append(out, r)
		}
	}
	return out, nil
}
Exemplo n.º 7
0
// readHMM parses the HMM description held in buf and returns it as a seq.HMM.
//
// Input is consumed line by line; lines that are empty after trimming are
// skipped. Three kinds of line are recognized:
//
//   - A line starting with "NULL" carries the null emissions. They cannot be
//     parsed until the alphabet is known, so the fields are stashed.
//   - A line starting with "HMM" carries the tab-separated alphabet. The two
//     lines after it (transition ordering and start transitions) are consumed
//     and ignored, and the stashed NULL fields are then parsed.
//   - Any other line is the first of a two-line node record. The first line
//     holds the reference residue, the node number and the match emissions;
//     the second holds seven transition probabilities followed by the
//     diversity (neff) values. Fields are separated by spaces or tabs.
//
// Insert emissions of every node are copied from the null emissions.
//
// On any read or parse failure, including records with too few fields,
// readHMM returns a nil HMM and a non-nil error instead of panicking.
func readHMM(buf *bytes.Buffer) (hmm *seq.HMM, err error) {
	var nullFields []string
	hmm = new(seq.HMM)
	for {
		line, readErr := buf.ReadBytes('\n')
		if readErr == io.EOF && len(line) == 0 {
			break
		}
		// io.EOF with data means the last line had no trailing newline; it
		// is still processed. Anything else is a genuine read failure.
		if readErr != nil && readErr != io.EOF {
			return nil, readErr
		}
		line = trim(line)

		if len(line) == 0 {
			continue
		}
		switch {
		case hasPrefix(line, "NULL"):
			// The NULL emissions can't be read yet because the alphabet only
			// arrives on the "HMM" line. Keep the raw fields for later.
			nullFields = strings.Fields(str(line[4:]))
		case hasPrefix(line, "HMM"):
			// Three lines are consumed here. The first is the alphabet (the
			// current line), the second is the ordering of transition
			// probabilities and the third holds the transition probabilities
			// of the begin state. The latter two are ignored.
			if _, err := demandLine(buf); err != nil {
				return nil, fmt.Errorf("%s (expected transition ordering)", err)
			}
			if _, err := demandLine(buf); err != nil {
				return nil, fmt.Errorf("%s (expected start transitions)", err)
			}

			// Get the ordering of the alphabet.
			hmm.Alphabet = make([]seq.Residue, 0, 20)
			residues := bytes.Split(trim(line[3:]), []byte{'\t'})
			for _, residue := range residues {
				// An empty field (no alphabet at all, or two adjacent tabs)
				// has no first byte to take.
				if len(residue) == 0 {
					return nil, fmt.Errorf(
						"Could not read alphabet '%s': empty residue", line)
				}
				hmm.Alphabet = append(hmm.Alphabet, seq.Residue(residue[0]))
			}

			// With an alphabet in hand the stashed NULL fields can be parsed.
			ep, err := readEmissions(hmm.Alphabet, nullFields)
			if err != nil {
				return nil, fmt.Errorf("Could not read NULL emissions '%s': %s",
					strings.Join(nullFields, " "), err)
			}
			hmm.Null = *ep
		default: // a node in the HMM
			// Each node is made up of two lines.
			//
			// The first starts with the amino acid of the reference sequence,
			// followed by the node number, followed by N match emission
			// probabilities (where N = len(alphabet)), and finally the node
			// number again.
			//
			// The second holds 7 transition probabilities followed by 3
			// diversity (the 'neff' stuff) scores.
			//
			// Fields on both lines are separated by spaces OR tabs.
			line2, err := demandLine(buf)
			if err != nil {
				return nil, fmt.Errorf("%s (expected transition probs)", err)
			}
			fields1 := strings.Fields(string(line))
			fields2 := strings.Fields(string(line2))

			// Both the residue and the node number are indexed below.
			if len(fields1) < 2 {
				return nil, fmt.Errorf("Could not parse node '%s': "+
					"expected a residue and a node number", line)
			}
			node := seq.HMMNode{
				Residue: seq.Residue(fields1[0][0]),
			}

			node.NodeNum, err = strconv.Atoi(fields1[1])
			if err != nil {
				return nil, fmt.Errorf("Could not parse node number '%s': %s",
					fields1[1], err)
			}

			ep, err := readEmissions(hmm.Alphabet, fields1[2:])
			if err != nil {
				return nil, fmt.Errorf("Could not read emissions '%s': %s",
					strings.Join(fields1[2:], " "), err)
			}
			node.MatEmit = *ep

			// Insert emissions are taken straight from the null emissions.
			node.InsEmit = seq.NewEProbs(hmm.Alphabet)
			for _, residue := range hmm.Alphabet {
				node.InsEmit.Set(residue, hmm.Null.Lookup(residue))
			}

			node.Transitions, err = readTransitions(fields2)
			if err != nil {
				return nil, fmt.Errorf("Could not read transitions '%s': %s",
					strings.Join(fields2, " "), err)
			}

			// The diversity values start after the 7 transition fields;
			// slicing a shorter line would panic.
			if len(fields2) < 7 {
				return nil, fmt.Errorf("Could not read diversity '%s': "+
					"expected 7 transition fields before it",
					strings.Join(fields2, " "))
			}
			node.NeffM, node.NeffI, node.NeffD, err = readDiversity(fields2[7:])
			if err != nil {
				return nil, fmt.Errorf("Could not read diversity '%s': %s",
					strings.Join(fields2[7:], " "), err)
			}

			hmm.Nodes = append(hmm.Nodes, node)
		}
	}
	return hmm, nil
}
Exemplo n.º 8
0
// ReadSequence is exported for use in other packages that read FASTA-like
// files.
//
// The 'translate' function is used when sequences are checked for valid
// characters. It is only consulted when r.TrustSequences is false; a byte it
// rejects aborts the read with an error, and a byte it maps to zero is
// dropped from the sequence.
//
// One entry is read per call: a header line starting with '>' followed by
// residue lines, up to (but not including) the next header, which is kept
// for the following call. Blank lines and lines starting with "C;",
// "structure" or "sequence" are skipped. When the input is exhausted the
// entry accumulated so far is returned together with io.EOF. On any other
// error an empty sequence is returned.
//
// If you're just reading FASTA files, this method SHOULD NOT be used.
func (r *Reader) ReadSequence(translate Translator) (seq.Sequence, error) {
	s := seq.Sequence{}
	seenHeader := false

	// The previous call may already have consumed this entry's header while
	// looking for the end of its own entry.
	if r.nextHeader != nil {
		s.Name = trimHeader(r.nextHeader)
		r.nextHeader = nil
		seenHeader = true
	}

	// The loop has no condition and no break; every exit is a return.
	for {
		line, err := r.buf.ReadBytes('\n')
		if err == io.EOF {
			// io.EOF with data means the final line had no trailing
			// newline; it is processed like any other line.
			if len(line) == 0 {
				return s, io.EOF
			}
		} else if err != nil {
			return seq.Sequence{}, fmt.Errorf("Error on line %d: %s",
				r.line, err)
		}
		line = bytes.TrimSpace(line)

		// Empty lines only advance the line counter.
		if len(line) == 0 {
			r.line++
			continue
		}

		// Lines starting with PIR junk are ignored.
		if bytes.HasPrefix(line, []byte("C;")) ||
			bytes.HasPrefix(line, []byte("structure")) ||
			bytes.HasPrefix(line, []byte("sequence")) {

			r.line++
			continue
		}

		// If we haven't seen the header yet, this better be it.
		if !seenHeader {
			if line[0] != '>' {
				return seq.Sequence{},
					fmt.Errorf("Expected '>', got '%c' on line %d.",
						line[0], r.line)
			}

			// Trim the '>' and load this line into the header.
			s.Name = trimHeader(line)
			seenHeader = true

			r.line++
			continue
		}

		// A second header means the next entry has begun. Stash it in
		// 'nextHeader' for the following call and return the current entry.
		if line[0] == '>' {
			r.nextHeader = line

			r.line++
			return s, nil
		}

		// Finally, a line of residues. Trusted input is appended as is;
		// otherwise every character is checked through translate.
		if s.Residues == nil {
			s.Residues = make([]seq.Residue, 0, 50)
		}
		if r.TrustSequences {
			for _, b := range line {
				s.Residues = append(s.Residues, seq.Residue(b))
			}
		} else {
			for _, b := range line {
				bNew, ok := translate(b)
				if !ok {
					return seq.Sequence{},
						fmt.Errorf("Invalid character '%c' on line %d.",
							b, r.line)
				}

				// A zero residue from translate means the character is
				// valid but should not be kept.
				if bNew > 0 {
					s.Residues = append(s.Residues, bNew)
				}
			}
		}
		r.line++
	}
}