func read(r *fasta.Reader, fasta bool) (seq.MSA, error) { msa := seq.NewMSA() for { s, err := readSequence(r) if err == io.EOF { break } if err != nil { return seq.MSA{}, err } if fasta { msa.AddFasta(s) } else { msa.Add(s) } if len(msa.Entries) > 1 { // We can't use 's' directly, because a sequence added to an MSA // may be modified if it isn't already in A2M format. lastEntry := msa.Entries[len(msa.Entries)-1] if lastEntry.Len() != msa.Entries[0].Len() { return seq.MSA{}, fmt.Errorf("Sequence '%s' has length %d, but other "+ "sequences have length %d.", s.Name, lastEntry, msa.Entries[0].Len()) } } } return msa, nil }
func readStockholm(r io.Reader, trusted bool) (seq.MSA, error) { msa := seq.NewMSA() ef := fmt.Errorf scanner := bufio.NewScanner(r) if scanner.Scan() { first := bytes.ToLower(bytes.Trim(scanner.Bytes(), " #")) if !bytes.Equal([]byte("stockholm 1.0"), first) { fmt.Printf("%s\n", first) return seq.MSA{}, ef("First line does not contain 'STOCKHOLM 1.0'.") } } for scanner.Scan() { line := bytes.TrimSpace(scanner.Bytes()) if line[0] == '#' { continue } if line[0] == '/' && line[1] == '/' { // alignment done, says the spec break } pieces := bytes.Fields(line) residues, err := asResidues(pieces[len(pieces)-1], trusted) if err != nil { return seq.MSA{}, err } s := seq.Sequence{ Name: string(concat(pieces[0 : len(pieces)-1])), Residues: residues, } msa.Add(s) if len(msa.Entries) > 1 { // We can't use 's' directly, because a sequence added to an MSA // may be modified if it isn't already in A2M format. lastEntry := msa.Entries[len(msa.Entries)-1] if lastEntry.Len() != msa.Entries[0].Len() { return seq.MSA{}, fmt.Errorf("Sequence '%s' has length %d, but other "+ "sequences have length %d.", s.Name, lastEntry, msa.Entries[0].Len()) } } } if err := scanner.Err(); err != nil { return seq.MSA{}, err } return msa, nil }
func readSeqs(buf *bytes.Buffer) (HHMSecondary, seq.MSA, error) { // Remember, the sequence portion of an HHM file actually has two parts. // The first part is optional and contains secondary structure information. // These SS sequences can be identified by special sequence headers: // "ss_dssp", "sa_dssp", "ss_pred", "ss_conf", and "Consensus". // If a sequence doesn't contain a special header, then that signifies that // we should start reading the MSA, which comes after the SS information. doneSS := false ss := HHMSecondary{} msa := seq.NewMSA() reader := fasta.NewReader(buf) reader.TrustSequences = true seqs, err := reader.ReadAll() if err != nil { return HHMSecondary{}, seq.MSA{}, err } for _, s := range seqs { s := s if !doneSS { switch { case strings.HasPrefix(s.Name, "ss_dssp"): ss.SSdssp = &s case strings.HasPrefix(s.Name, "sa_dssp"): ss.SAdssp = &s case strings.HasPrefix(s.Name, "ss_pred"): ss.SSpred = &s case strings.HasPrefix(s.Name, "ss_conf"): ss.SSconf = &s case strings.HasPrefix(s.Name, "Consensus"): ss.Consensus = &s default: doneSS = true } } if doneSS { msa.Add(s) } } return ss, msa, nil }
func mkSeqHMM(c *command) { c.assertLeastNArg(3) structLib := util.StructureLibrary(c.flags.Arg(0)) outPath := c.flags.Arg(1) entries := c.flags.Args()[2:] util.AssertOverwritable(outPath, flagOverwrite) saveto := util.CreateFile(outPath) // Stores intermediate files produced by hhmake. tempDir, err := ioutil.TempDir("", "mk-seqlib-hmm") util.Assert(err, "Could not create temporary directory.") defer os.RemoveAll(tempDir) // Initialize a MSA for each structural fragment. var msas []seq.MSA var msaChans []chan seq.Sequence for i := 0; i < structLib.Size(); i++ { msa := seq.NewMSA() msa.SetLen(structLib.FragmentSize()) msas = append(msas, msa) msaChans = append(msaChans, make(chan seq.Sequence)) } // Now spin up a goroutine for each fragment that is responsible for // adding a sequence slice to itself. for i := 0; i < structLib.Size(); i++ { addToMSA(msaChans[i], &msas[i]) } // Create a channel that sends the PDB entries given. entryChan := make(chan string) go func() { for _, fp := range entries { entryChan <- fp } close(entryChan) }() progress := util.NewProgress(len(entries)) for i := 0; i < flagCpu; i++ { wgPDBChains.Add(1) go func() { for entryPath := range entryChan { _, chains, err := util.PDBOpen(entryPath) progress.JobDone(err) if err != nil { continue } for _, chain := range chains { structureToSequence(structLib, chain, nil, msaChans) } } wgPDBChains.Done() }() } wgPDBChains.Wait() progress.Close() // We've finishing reading all the PDB inputs. Now close the channels // and let the sequence fragments finish. for i := 0; i < structLib.Size(); i++ { close(msaChans[i]) } wgSeqFragments.Wait() util.Verbosef("Building profile HMMs from MSAs...") // Finally, add the sequence fragments to a new sequence fragment // library and save. hmms := make([]*seq.HMM, structLib.Size()) hhmake := func(i int) struct{} { fname := path.Join(tempDir, fmt.Sprintf("%d.fasta", i)) f := util.CreateFile(fname) util.Assert(msa.WriteFasta(f, msas[i])) hhm, err := hhsuite.HHMakePseudo.Run(fname) util.Assert(err) hmms[i] = hhm.HMM return struct{}{} // my unifier sucks, i guess } fun.ParMap(hhmake, fun.Range(0, structLib.Size())) lib, err := fragbag.NewSequenceHMM(structLib.Name(), hmms) util.Assert(err) util.Assert(fragbag.Save(saveto, lib)) }
func makeMSA(seqs []seq.Sequence) seq.MSA { msa := seq.NewMSA() msa.AddSlice(seqs) return msa }