func main() {
	if len(util.FlagCpuProf) > 0 {
		f := util.CreateFile(util.FlagCpuProf)
		pprof.StartCPUProfile(f)
		defer f.Close()
		defer pprof.StopCPUProfile()
	}

	// If --gob-it is set, read alignment distances from a directory of
	// alignment files and write them out as a GOB-encoded table.
	if len(flagGobIt) > 0 {
		astralDir := util.Arg(0)
		dists := readAlignmentDists(astralDir)
		enc := gob.NewEncoder(util.CreateFile(flagGobIt))
		util.Assert(enc.Encode(dists), "Could not GOB encode distances")
		return
	}

	// The first argument is either a directory of alignment files or a
	// GOB-encoded distance table produced by a previous --gob-it run.
	var dists *intern.Table
	if util.IsDir(util.Arg(0)) {
		dists = readAlignmentDists(util.Arg(0))
	} else {
		dec := gob.NewDecoder(util.OpenFile(util.Arg(0)))
		util.Assert(dec.Decode(&dists), "Could not GOB decode distances")
	}

	treeFile := util.Arg(1)
	outPath := util.Arg(2)

	treeReader := newick.NewReader(util.OpenFile(treeFile))
	tree, err := treeReader.ReadTree()
	util.Assert(err, "Could not read newick tree")

	csvw := csv.NewWriter(util.CreateFile(outPath))
	clusters := treeClusters(flagThreshold, dists, tree)
	util.Assert(csvw.WriteAll(clusters))
}
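// The two branches above form a simple compute-once cache: encode the
// distance table with gob on --gob-it, then decode it on later runs. The
// same round trip, reduced to a minimal stdlib-only sketch with a plain
// map standing in for *intern.Table (illustrative only, not used above):
func gobRoundTrip() {
	var buf bytes.Buffer
	in := map[string]float64{"a": 1.5}
	if err := gob.NewEncoder(&buf).Encode(in); err != nil {
		panic(err)
	}
	var out map[string]float64
	if err := gob.NewDecoder(&buf).Decode(&out); err != nil {
		panic(err)
	}
}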
func main() {
	db := util.OpenBowDB(util.Arg(0))
	out := util.CreateFile(util.Arg(1))

	printf := func(format string, v ...interface{}) {
		fmt.Fprintf(out, format, v...)
	}

	// Set our search options.
	bowOpts := bowdb.SearchDefault
	bowOpts.Limit = -1

	printf("QueryID\tResultID\tCosine\tEuclid\n")
	entries, err := db.ReadAll()
	util.Assert(err, "Could not read BOW database entries")
	for _, entry := range entries {
		results := db.Search(bowOpts, entry)
		for _, result := range results {
			printf("%s\t%s\t%0.4f\t%0.4f\n",
				entry.Id, result.Bowed.Id, result.Cosine, result.Euclid)
		}
		printf("\n")
	}

	util.Assert(out.Close())
	util.Assert(db.Close())
}
func mkStructure(c *command) {
	c.assertNArg(2)
	brkFile := c.flags.Arg(0)
	saveto := c.flags.Arg(1)
	util.AssertOverwritable(saveto, flagOverwrite)

	brkContents, err := ioutil.ReadAll(util.OpenFile(brkFile))
	util.Assert(err)

	// Fragments in a BRK file are delimited by "TER" records.
	pdbFragments := bytes.Split(brkContents, []byte("TER"))
	fragments := make([][]structure.Coords, 0)
	for i, pdbFrag := range pdbFragments {
		pdbFrag = bytes.TrimSpace(pdbFrag)
		if len(pdbFrag) == 0 {
			continue
		}
		fragments = append(fragments, coords(i, pdbFrag))
	}

	libName := stripExt(path.Base(brkFile))
	lib, err := fragbag.NewStructureAtoms(libName, fragments)
	util.Assert(err)
	util.Assert(fragbag.Save(util.CreateFile(saveto), lib))
}
func main() {
	var f io.Reader
	var err error

	f = util.OpenFile(flag.Arg(0))
	if strings.HasSuffix(flag.Arg(0), ".gz") {
		f, err = gzip.NewReader(f)
		util.Assert(err)
	}

	cifEntry, err := pdbx.Read(f)
	util.Assert(err, "Could not read PDBx/mmCIF file")

	fasEntries := make([]seq.Sequence, 0, 5)
	for _, ent := range cifEntry.Entities {
		for _, chain := range ent.Chains {
			if !isChainUsable(chain) || len(ent.Seq) == 0 {
				continue
			}
			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: ent.Seq,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	var fasOut io.Writer
	outPath := "stdout"
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		outPath = util.Arg(1)
		fasOut = util.CreateFile(outPath)
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", outPath)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)
			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
func main() {
	saveto := util.CreateFile(util.Arg(0))
	defer saveto.Close()

	w := func(format string, v ...interface{}) {
		_, err := fmt.Fprintf(saveto, format, v...)
		util.Assert(err)
	}

	var fmats []*bufio.Reader
	for _, fmat := range util.Args()[1:] {
		fmats = append(fmats, bufio.NewReader(util.OpenFile(fmat)))
	}

LOOP:
	for {
		var columns int
		scores := make([][]float64, len(fmats)) // matrix -> fields -> sas score
		for i, fmat := range fmats {
			line, err := fmat.ReadBytes('\n')
			if len(line) == 0 && err == io.EOF {
				break LOOP
			} else if err != io.EOF {
				util.Assert(err)
			}

			fields := bytes.Fields(line)
			columns = len(fields)
			scores[i] = make([]float64, columns)
			for j, sas := range fields {
				scores[i][j], err = strconv.ParseFloat(string(sas), 64)
				util.Assert(err)
			}
		}

		before := ""
		for j := 0; j < columns; j++ {
			best := scores[0][j]
			for i := 1; i < len(scores); i++ {
				if scores[i][j] < best {
					best = scores[i][j]
				}
			}
			if best == 0 {
				w("%s0", before)
			} else {
				w("%s%f", before, best)
			}
			before = " "
		}
		w("\n")
	}
}
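// Illustrative sketch (not used by the command above): the merge performed
// in the inner loop is just an element-wise minimum across rows. Assuming
// a non-empty input whose rows all have the same length, it reduces to:
func minAcross(scores [][]float64) []float64 {
	best := make([]float64, len(scores[0]))
	copy(best, scores[0])
	for _, row := range scores[1:] {
		for j, v := range row {
			if v < best[j] {
				best[j] = v
			}
		}
	}
	return best
}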
func main() {
	in, out := util.Arg(0), util.Arg(1)
	r, w := ioFromFile(in, flagInFmt).r, ioFromFile(out, flagOutFmt).w

	inf := util.OpenFile(in)
	defer inf.Close()

	msa, err := r(inf)
	util.Assert(err, "Error parsing '%s'", in)

	outf := util.CreateFile(out)
	defer outf.Close()

	util.Assert(w(outf, msa), "Error writing '%s'", out)
}
func mkPaired(c *command) {
	c.assertNArg(2)
	in := util.Library(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	util.AssertOverwritable(outPath, flagOverwrite)

	if _, ok := in.(fragbag.WeightedLibrary); ok {
		util.Fatalf("%s is a weighted library (not allowed)", in.Name())
	}

	name := fmt.Sprintf("paired-%s", in.Name())
	if fragbag.IsStructure(in) {
		// Concatenate every ordered pair of distinct fragments.
		var pairs [][]structure.Coords
		lib := in.(fragbag.StructureLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Atoms(i), lib.Atoms(j)

				// Copy into a fresh slice: `append(f1, f2...)` could
				// share and clobber f1's backing array across iterations.
				pair := make([]structure.Coords, 0, len(f1)+len(f2))
				pair = append(pair, f1...)
				pair = append(pair, f2...)
				pairs = append(pairs, pair)
			}
		}

		pairLib, err := fragbag.NewStructureAtoms(name, pairs)
		util.Assert(err)
		util.Assert(fragbag.Save(util.CreateFile(outPath), pairLib))
	} else if strings.Contains(in.Tag(), "hmm") {
		var pairs []*seq.HMM
		lib := in.(fragbag.SequenceLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Fragment(i).(*seq.HMM), lib.Fragment(j).(*seq.HMM)
				pairs = append(pairs, seq.HMMCat(f1, f2))
			}
		}

		pairLib, err := fragbag.NewSequenceHMM(name, pairs)
		util.Assert(err)
		util.Assert(fragbag.Save(util.CreateFile(outPath), pairLib))
	} else if strings.Contains(in.Tag(), "profile") {
		util.Fatalf("Sequence profiles not implemented.")
	} else {
		util.Fatalf("Unrecognized fragment library: %s", in.Tag())
	}
}
func main() {
	inFasta := util.Arg(0)
	outHHM := util.Arg(1)

	hhblits := hhsuite.HHBlitsDefault
	hhmake := hhsuite.HHMakePseudo
	hhblits.Verbose = !flagQuiet
	hhmake.Verbose = !flagQuiet

	HHM, err := hhsuite.BuildHHM(hhblits, hhmake, util.FlagSeqDB, inFasta)
	util.Assert(err, "Error building HHM")

	util.Assert(hmm.WriteHHM(util.CreateFile(outHHM), HHM),
		"Error writing HHM '%s'", outHHM)
}
func search(c *command) {
	c.assertLeastNArg(2)

	// Some search options don't translate directly to command line
	// parameters specified by the flag package.
	if flagSearchDesc {
		flagSearchOpts.Order = bowdb.OrderDesc
	}
	switch flagSearchSort {
	case "cosine":
		flagSearchOpts.SortBy = bowdb.SortByCosine
	case "euclid":
		flagSearchOpts.SortBy = bowdb.SortByEuclid
	default:
		util.Fatalf("Unknown sort field '%s'.", flagSearchSort)
	}

	db := util.OpenBowDB(c.flags.Arg(0))
	bowPaths := c.flags.Args()[1:]

	_, err := db.ReadAll()
	util.Assert(err, "Could not read BOW database entries")

	// Always hide the progress bar here.
	bows := util.ProcessBowers(bowPaths, db.Lib, false, flagCpu, true)
	out, outDone := outputter()

	// Launch goroutines to search queries in parallel.
	wgSearch := new(sync.WaitGroup)
	for i := 0; i < flagCpu; i++ {
		wgSearch.Add(1)
		go func() {
			defer wgSearch.Done()
			for b := range bows {
				sr := db.Search(flagSearchOpts, b)
				out <- searchResult{b, sr}
			}
		}()
	}

	wgSearch.Wait()
	close(out)
	<-outDone
	util.Assert(db.Close())
}
func main() {
	dbPath := util.Arg(0)
	fragLibDir := util.Arg(1)
	pdbFiles := flag.Args()[2:]

	util.Assert(createBowDb(dbPath, fragLibDir, pdbFiles))
	db := util.OpenBowDB(dbPath)

	_, err := db.ReadAll()
	util.Assert(err, "Could not read BOW database entries")

	bowOpts := bowdb.SearchDefault
	bowOpts.Limit = 200

	mattOpts := matt.DefaultConfig
	mattOpts.Verbose = false

	chains := createChains(pdbFiles)
	mattArgs := createMattArgs(chains)

	tabw := tabwriter.NewWriter(os.Stdout, 0, 4, 4, ' ', 0)
	header := []byte(
		"BOW entry\t" +
			"BOW chain\t" +
			"BOW dist\t" +
			"Matt entry\t" +
			"Matt chain\t" +
			"Matt dist\n")
	for i, chain := range chains {
		marg := mattArgs[i]
		bowOrdered := getBowOrdering(db, bowOpts, bow.BowerFromChain(chain))
		mattOrdered := getMattOrdering(mattOpts, marg, mattArgs)

		fmt.Printf("Ordering for %s (chain %c)\n",
			chain.Entry.IdCode, chain.Ident)
		compared := comparison([2]ordering{bowOrdered, mattOrdered})
		tabw.Write(header)
		tabw.Write([]byte(compared.String()))
		tabw.Flush()
		fmt.Println("\n")
	}
	util.Assert(db.Close())
}
func mkBowDb(c *command) {
	c.assertLeastNArg(3)
	dbPath := c.flags.Arg(0)
	flib := util.Library(c.flags.Arg(1))
	bowPaths := c.flags.Args()[2:]
	util.AssertOverwritable(dbPath, flagOverwrite)

	db, err := bowdb.Create(flib, dbPath)
	util.Assert(err)

	bows := util.ProcessBowers(bowPaths, flib, false, flagCpu, util.FlagQuiet)
	for b := range bows {
		db.Add(b)
	}
	util.Assert(db.Close())
}
func main() {
	a3mPath := util.Arg(0)

	// Read the entire file into memory before truncating it, since we
	// rewrite the same path below.
	fa3m := util.OpenFile(a3mPath)
	freader := fasta.NewReader(fa3m)
	freader.TrustSequences = true
	seqs, err := freader.ReadAll()
	util.Assert(err, "Could not read fasta format '%s'", a3mPath)
	util.Assert(fa3m.Close())

	w := util.CreateFile(a3mPath)
	fwriter := fasta.NewWriter(w)
	fwriter.Columns = 0
	for _, seq := range seqs {
		if len(seq.Residues) > 0 {
			util.Assert(fwriter.Write(seq))
		}
	}
	util.Assert(fwriter.Flush())
	util.Assert(w.Close())
}
func coords(num int, atomRecords []byte) []structure.Coords {
	r := bytes.NewReader(atomRecords)
	name := fmt.Sprintf("fragment %d", num)

	entry, err := pdb.Read(r, name)
	util.Assert(err, "Fragment contents could not be read in PDB format")

	atoms := entry.OneChain().CaAtoms()
	if len(atoms) == 0 {
		util.Fatalf("Fragment %d has no ATOM coordinates.", num)
	}
	return atoms
}
func main() {
	flag.BoolVar(&flagAllFragments, "all-fragments", flagAllFragments,
		"When set, all fragments will be shown, even if the best fragment\n"+
			"of each residue set is the same.")
	util.FlagParse(
		"fraglib align.{fasta,ali,a2m,a3m} out-csv",
		"Writes a CSV file to out-csv containing the best matching fragment\n"+
			"for each pairwise contiguous set of residues between the\n"+
			"first two proteins in the alignment.")
	util.AssertNArg(3)

	flib := util.SequenceLibrary(util.Arg(0))
	aligned := util.MSA(util.Arg(1))
	outPath := util.Arg(2)
	outcsv := util.CreateFile(outPath)

	csvWriter := csv.NewWriter(outcsv)
	csvWriter.Comma = '\t'
	defer csvWriter.Flush()

	pf := func(record ...string) {
		util.Assert(csvWriter.Write(record), "Problem writing to '%s'", outPath)
	}
	pf("start1", "end1", "start2", "end2", "frag1", "frag2", "rat1", "rat2")

	iter := newContiguous(
		flib.FragmentSize(), aligned.GetFasta(0), aligned.GetFasta(1))
	for iter.next() {
		best1 := flib.BestSequenceFragment(iter.res1)
		best2 := flib.BestSequenceFragment(iter.res2)
		if !flagAllFragments && best1 == best2 {
			continue
		}
		if best1 == -1 || best2 == -1 {
			continue
		}

		p1 := flib.AlignmentProb(best1, iter.res1)
		p2 := flib.AlignmentProb(best2, iter.res2)
		if p1.Distance(p2) > 0.14 {
			continue
		}
		pf(
			fmt.Sprintf("%d", iter.s1()),
			fmt.Sprintf("%d", iter.e1()),
			fmt.Sprintf("%d", iter.s2()),
			fmt.Sprintf("%d", iter.e2()),
			fmt.Sprintf("%d", best1),
			fmt.Sprintf("%d", best2),
			fmt.Sprintf("%f", p1),
			fmt.Sprintf("%f", p2),
		)
	}
}
func main() {
	rfasta := util.OpenFasta(util.Arg(0))
	dir := util.Arg(1)
	util.Assert(os.MkdirAll(dir, 0777))

	fr := fasta.NewReader(rfasta)
	for {
		s, err := fr.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			util.Assert(err)
		}

		// Use only the first whitespace-delimited field of the header as
		// the output file name.
		s.Name = strings.Fields(s.Name)[0]

		fw := util.CreateFile(path.Join(dir, s.Name+".fasta"))
		w := fasta.NewWriter(fw)
		util.Assert(w.Write(s))
		util.Assert(w.Flush())
		util.Assert(fw.Close())
	}
}
func main() {
	pdbs := util.OpenFile(flag.Arg(0))
	defer pdbs.Close()

	entries, err := slct.NewReader(pdbs).ReadAll()
	util.Assert(err)
	for _, entry := range entries {
		if flagPaths {
			fmt.Println(util.PDBPath(entry.ChainID))
		} else {
			fmt.Println(entry.ChainID)
		}
	}
}
func main() {
	pdbf1, chain1, s1, e1 := util.Arg(0), util.Arg(1), util.Arg(2), util.Arg(3)
	pdbf2, chain2, s2, e2 := util.Arg(4), util.Arg(5), util.Arg(6), util.Arg(7)

	entry1 := util.PDBRead(pdbf1)
	entry2 := util.PDBRead(pdbf2)

	s1n, e1n := util.ParseInt(s1), util.ParseInt(e1)
	s2n, e2n := util.ParseInt(s2), util.ParseInt(e2)

	r, err := pdb.RMSD(entry1, chain1[0], s1n, e1n, entry2, chain2[0], s2n, e2n)
	util.Assert(err)
	fmt.Println(r)
}
func readVectors(fpath string) map[string]bow.Bow {
	f := util.OpenFile(fpath)
	defer f.Close()

	bows := make(map[string]bow.Bow, 5000)
	for _, line := range util.ReadLines(f) {
		fields := strings.Fields(line)
		b := bow.NewBow(len(fields[1:]))
		for _, sfreq := range fields[1:] {
			freq, err := strconv.ParseFloat(sfreq, 32)
			util.Assert(err)
			b.Freqs = append(b.Freqs, float32(freq))
		}
		bows[fields[0]] = b
	}
	return bows
}
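// For reference, readVectors expects one whitespace-delimited record per
// line: an identifier followed by that entry's fragment frequencies. An
// input line might look like this (illustrative values only):
//
//	1abcA 0.0 2.0 0.0 1.0 5.0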
func main() {
	flag.BoolVar(&flagAllFragments, "all-fragments", flagAllFragments,
		"When set, all fragments will be shown, even if the best fragment\n"+
			"of each ATOM set is the same.")
	util.FlagParse(
		"fraglib align.{fasta,ali,a2m,a3m} pdb-file out-csv",
		"Writes a CSV file to out-csv containing the best matching fragment\n"+
			"for each pairwise contiguous set of alpha-carbon atoms of the\n"+
			"first two proteins in the alignment and PDB file.")
	util.AssertNArg(4)

	flib := util.StructureLibrary(util.Arg(0))
	aligned := util.MSA(util.Arg(1))
	pentry := util.PDBRead(util.Arg(2))
	outPath := util.Arg(3)
	outcsv := util.CreateFile(outPath)

	csvWriter := csv.NewWriter(outcsv)
	csvWriter.Comma = '\t'
	defer csvWriter.Flush()

	pf := func(record ...string) {
		util.Assert(csvWriter.Write(record), "Problem writing to '%s'", outPath)
	}
	pf("start1", "end1", "start2", "end2", "frag1", "frag2", "frag_rmsd")

	iter := newContiguous(
		flib.FragmentSize(),
		aligned.GetFasta(0), aligned.GetFasta(1),
		pentry.Chains[0], pentry.Chains[1])
	for iter.next() {
		best1 := flib.BestStructureFragment(iter.atoms1)
		best2 := flib.BestStructureFragment(iter.atoms2)
		if !flagAllFragments && best1 == best2 {
			continue
		}

		bestRmsd := structure.RMSD(flib.Atoms(best1), flib.Atoms(best2))
		pf(
			fmt.Sprintf("%d", iter.s1()),
			fmt.Sprintf("%d", iter.e1()),
			fmt.Sprintf("%d", iter.s2()),
			fmt.Sprintf("%d", iter.e2()),
			fmt.Sprintf("%d", best1),
			fmt.Sprintf("%d", best2),
			fmt.Sprintf("%f", bestRmsd),
		)
	}
}
func readDomains(fpath string) *inDomains {
	domains := &inDomains{
		intern.NewInterner(),
		make([]string, 0, 2000),
		make([]intern.Atom, 0, 2000),
	}

	scanner := bufio.NewScanner(util.OpenFile(fpath))
	for scanner.Scan() {
		d := strings.Fields(scanner.Text())[0]
		d = stripExt(path.Base(util.CathPath(d)))
		a := domains.in.Atom(d)
		domains.ids = append(domains.ids, d)
		domains.atoms = append(domains.atoms, a)
	}
	util.Assert(scanner.Err())
	return domains
}
func readMatrix(domains *inDomains, fpath string) *intern.Table {
	var (
		err  error
		fval float64
		sval string
	)

	tab := intern.NewTableInterner(domains.in)
	scanner := bufio.NewScanner(util.OpenFile(fpath))
	for i := 0; scanner.Scan(); i++ {
		// It'd be much simpler to use Split here, but let's be quicker.
		// In particular, avoid allocating.
		// Also, we're dealing with the line as a string since it's quicker
		// than using bytes and converting each number to a string for
		// strconv.ParseFloat.
		line := scanner.Text()
		bstart, j := 0, -1
		for bend, b := range line {
			// This actually skips the very last element in the table, but
			// it's OK because the value at [k, k] is always 0.
			switch {
			case b == ' ' || b == '\n' || bend+1 == len(line):
				sval = line[bstart:bend]
				bstart = bend + 1
				j++
				// Falls through to process this value.
			default:
				continue
			}
			if j > i && len(sval) > 0 { // upper triangular
				fval, err = strconv.ParseFloat(sval, 64)
				if err != nil {
					panic(err)
				}
				tab.Set(domains.atoms[i], domains.atoms[j], fval)
			}
		}
	}
	util.Assert(scanner.Err())
	return tab
}
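// Illustrative sketch (not used above): the hand-rolled tokenizer in
// readMatrix is equivalent, up to allocation, to this simpler Fields-based
// loop, which also keeps only the upper triangle of the matrix.
func readMatrixSimple(domains *inDomains, fpath string) *intern.Table {
	tab := intern.NewTableInterner(domains.in)
	scanner := bufio.NewScanner(util.OpenFile(fpath))
	for i := 0; scanner.Scan(); i++ {
		for j, sval := range strings.Fields(scanner.Text()) {
			if j <= i {
				continue // upper triangular
			}
			fval, err := strconv.ParseFloat(sval, 64)
			util.Assert(err)
			tab.Set(domains.atoms[i], domains.atoms[j], fval)
		}
	}
	util.Assert(scanner.Err())
	return tab
}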
func main() {
	pdbEntry := util.PDBRead(flag.Arg(0))

	fasEntries := make([]seq.Sequence, 0, 5)
	if !flagSeparateChains {
		var fasEntry seq.Sequence
		if len(pdbEntry.Chains) == 1 {
			fasEntry.Name = chainHeader(pdbEntry.OneChain())
		} else {
			fasEntry.Name = strings.ToLower(pdbEntry.IdCode)
		}

		// Concatenate the residues of every usable chain. (Named
		// `residues` rather than `seq` to avoid shadowing the seq package.)
		residues := make([]seq.Residue, 0, 100)
		for _, chain := range pdbEntry.Chains {
			if isChainUsable(chain) {
				residues = append(residues, chain.Sequence...)
			}
		}
		fasEntry.Residues = residues
		if len(fasEntry.Residues) == 0 {
			util.Fatalf("Could not find any amino acids.")
		}
		fasEntries = append(fasEntries, fasEntry)
	} else {
		for _, chain := range pdbEntry.Chains {
			if !isChainUsable(chain) {
				continue
			}
			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: chain.Sequence,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	var fasOut io.Writer
	outPath := "stdout"
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		outPath = util.Arg(1)
		fasOut = util.CreateFile(outPath)
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", outPath)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)
			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
func readAlignmentDists(dir string) *intern.Table {
	dists := intern.NewTable(11000)
	threads := util.FlagCpu

	addDists := make(chan []pair)
	alignFile := make(chan string)
	done := make(chan struct{})

	// A single collector goroutine owns the table, so the workers never
	// write to it concurrently.
	go func() {
		for fileDists := range addDists {
			for _, pair := range fileDists {
				a1, a2 := dists.Atom(pair.key[0]), dists.Atom(pair.key[1])
				dists.Set(a1, a2, pair.dist)
			}
		}
		done <- struct{}{}
	}()

	wg := new(sync.WaitGroup)
	for i := 0; i < threads; i++ {
		wg.Add(1)
		go func() {
			for fpath := range alignFile {
				log.Printf("Reading %s (%s)", fpath, time.Now())

				f := util.OpenFile(fpath)
				csvr := csv.NewReader(f)
				csvr.Comma = '\t'
				csvr.TrimLeadingSpace = true
				csvr.FieldsPerRecord = -1 // data is poorly formatted

				records, err := csvr.ReadAll()
				util.Assert(err, "[%s]", fpath)
				// Close eagerly; a defer here would hold every file open
				// until the worker goroutine exits.
				util.Assert(f.Close())

				fileDists := make([]pair, 0, 100000)
				for _, record := range records {
					if len(record) != 9 {
						continue
					}
					p := recordToDist(record)
					fileDists = append(fileDists, p)
				}
				addDists <- fileDists
			}
			wg.Done()
		}()
	}

	for _, fpath := range util.RecursiveFiles(dir) {
		if strings.HasPrefix(path.Base(fpath), ".") {
			continue
		}
		alignFile <- fpath
	}
	close(alignFile)
	wg.Wait()
	close(addDists)
	<-done
	return dists
}
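// The concurrency shape of readAlignmentDists, reduced to a minimal
// self-contained sketch: one producer feeds a work channel, N workers fan
// out, and a single collector fans results back in. The channels are torn
// down in dependency order: close the work channel, wait for the workers,
// close the results channel, then wait for the collector. (Illustrative
// helper only; not part of this package.)
func fanOutFanIn(items []string, workers int, work func(string) int) []int {
	jobs := make(chan string)
	results := make(chan int)
	done := make(chan []int)

	// Collector: the only goroutine that touches the output slice.
	go func() {
		var all []int
		for r := range results {
			all = append(all, r)
		}
		done <- all
	}()

	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := range jobs {
				results <- work(j)
			}
		}()
	}

	for _, it := range items {
		jobs <- it
	}
	close(jobs)
	wg.Wait()
	close(results)
	return <-done
}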
func readFloat(s string) float64 {
	num, err := strconv.ParseFloat(s, 64)
	util.Assert(err, "Expected float, but got '%s'.", s)
	return num
}
func main() {
	if len(util.FlagCpuProf) > 0 {
		f := util.CreateFile(util.FlagCpuProf)
		pprof.StartCPUProfile(f)
		defer f.Close()
		defer pprof.StopCPUProfile()
	}

	// Read all CATH domains, the best-of-all matrix, and the matrix for
	// each aligner.
	domains := readDomains(util.Arg(0))
	boa := readMatrix(domains, util.Arg(1))

	aligners := make([]aligner, 0)
	flibs := make([]flib, 0)
	for i := 2; i < util.NArg(); i += 2 {
		fpath := util.Arg(i)
		if path.Ext(fpath) == ".bowdb" {
			db := util.OpenBowDB(fpath)
			records, err := db.ReadAll()
			util.Assert(err)

			bowed := make([]bow.Bowed, domains.in.Len())
			for _, b := range records {
				if !domains.in.Exists(b.Id) {
					util.Fatalf("Found ID in bowdb that isn't in the list "+
						"of CATH domains provided: %s", b.Id)
				}
				bowed[domains.in.Atom(b.Id)] = b
			}
			flibs = append(flibs, flib{db, bowed, util.Arg(i + 1)})
		} else {
			aligners = append(aligners, aligner{
				readMatrix(domains, fpath),
				util.Arg(i + 1),
			})
		}
	}

	// Now remove CATH domains that don't have a corresponding structure
	// file. We don't do this initially since the matrix files are indexed
	// with respect to all CATH domains (including ones without structure).
	// This is an artifact of the fact that the matrices were generated
	// with a very old version of CATH.
	domains.removeOldDomains()

	// Sanity check: the best-of-all matrix scored against itself must
	// yield an AUC of exactly 1.
	if a := matrixAuc(domains, boa, boa, flagThreshold); a != 1.0 {
		util.Fatalf("Something is wrong. The AUC of the best-of-all matrix "+
			"with respect to itself is %f, but it should be 1.0.", a)
	}

	if len(aligners) > 0 {
		fmt.Println("Computing AUC for aligners...")
		writeAuc := func(aligner aligner) struct{} {
			w := util.CreateFile(aligner.outpath)
			a := matrixAuc(domains, boa, aligner.matrix, flagThreshold)
			fmt.Fprintf(w, "%f\n", a)
			return struct{}{}
		}
		fun.ParMap(writeAuc, aligners)
	}
	if len(flibs) > 0 {
		fmt.Println("Computing AUC for bowdbs...")
		writeAuc := func(flib flib) struct{} {
			w := util.CreateFile(flib.outpath)
			a := flibAuc(domains, boa, flib, flagThreshold)
			fmt.Fprintf(w, "%f\n", a)
			return struct{}{}
		}
		fun.ParMap(writeAuc, flibs)
	}
}
func mkSeqHMM(c *command) {
	c.assertLeastNArg(3)
	structLib := util.StructureLibrary(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	entries := c.flags.Args()[2:]
	util.AssertOverwritable(outPath, flagOverwrite)
	saveto := util.CreateFile(outPath)

	// Stores intermediate files produced by hhmake.
	tempDir, err := ioutil.TempDir("", "mk-seqlib-hmm")
	util.Assert(err, "Could not create temporary directory.")
	defer os.RemoveAll(tempDir)

	// Initialize a MSA for each structural fragment.
	var msas []seq.MSA
	var msaChans []chan seq.Sequence
	for i := 0; i < structLib.Size(); i++ {
		msa := seq.NewMSA()
		msa.SetLen(structLib.FragmentSize())
		msas = append(msas, msa)
		msaChans = append(msaChans, make(chan seq.Sequence))
	}

	// Now spin up a goroutine for each fragment that is responsible for
	// adding a sequence slice to itself.
	for i := 0; i < structLib.Size(); i++ {
		addToMSA(msaChans[i], &msas[i])
	}

	// Create a channel that sends the PDB entries given.
	entryChan := make(chan string)
	go func() {
		for _, fp := range entries {
			entryChan <- fp
		}
		close(entryChan)
	}()

	progress := util.NewProgress(len(entries))
	for i := 0; i < flagCpu; i++ {
		wgPDBChains.Add(1)
		go func() {
			for entryPath := range entryChan {
				_, chains, err := util.PDBOpen(entryPath)
				progress.JobDone(err)
				if err != nil {
					continue
				}
				for _, chain := range chains {
					structureToSequence(structLib, chain, nil, msaChans)
				}
			}
			wgPDBChains.Done()
		}()
	}
	wgPDBChains.Wait()
	progress.Close()

	// We've finished reading all the PDB inputs. Now close the channels
	// and let the sequence fragments finish.
	for i := 0; i < structLib.Size(); i++ {
		close(msaChans[i])
	}
	wgSeqFragments.Wait()

	util.Verbosef("Building profile HMMs from MSAs...")

	// Finally, add the sequence fragments to a new sequence fragment
	// library and save.
	hmms := make([]*seq.HMM, structLib.Size())
	hhmake := func(i int) struct{} {
		fname := path.Join(tempDir, fmt.Sprintf("%d.fasta", i))
		f := util.CreateFile(fname)
		util.Assert(msa.WriteFasta(f, msas[i]))

		hhm, err := hhsuite.HHMakePseudo.Run(fname)
		util.Assert(err)
		hmms[i] = hhm.HMM
		return struct{}{} // fun.ParMap requires a return value
	}
	fun.ParMap(hhmake, fun.Range(0, structLib.Size()))

	lib, err := fragbag.NewSequenceHMM(structLib.Name(), hmms)
	util.Assert(err)
	util.Assert(fragbag.Save(saveto, lib))
}
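// The per-fragment consumers created by addToMSA (defined elsewhere in
// this package) presumably follow the owner-goroutine pattern: each MSA is
// mutated by exactly one goroutine draining one channel, so no locking is
// needed, and wgSeqFragments tracks when every consumer has drained its
// channel. A self-contained sketch of that pattern, with []byte standing
// in for seq.Sequence and a plain slice for the MSA (hypothetical helper,
// not the real addToMSA):
func ownerConsumer(in chan []byte, dst *[][]byte, wg *sync.WaitGroup) {
	wg.Add(1)
	go func() {
		defer wg.Done()
		for s := range in {
			*dst = append(*dst, s) // only this goroutine touches dst
		}
	}()
}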
func mkSeqProfile(c *command) {
	c.assertLeastNArg(3)
	structLib := util.StructureLibrary(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	entries := c.flags.Args()[2:]
	util.AssertOverwritable(outPath, flagOverwrite)
	saveto := util.CreateFile(outPath)

	// Initialize a frequency and null profile for each structural fragment.
	var freqProfiles []*seq.FrequencyProfile
	var fpChans []chan seq.Sequence
	for i := 0; i < structLib.Size(); i++ {
		fp := seq.NewFrequencyProfile(structLib.FragmentSize())
		freqProfiles = append(freqProfiles, fp)
		fpChans = append(fpChans, make(chan seq.Sequence))
	}

	// Now spin up a goroutine for each fragment that is responsible for
	// adding a sequence slice to itself.
	nullChan, nullProfile := addToNull()
	for i := 0; i < structLib.Size(); i++ {
		addToProfile(fpChans[i], freqProfiles[i])
	}

	// Create a channel that sends the PDB entries given.
	entryChan := make(chan string)
	go func() {
		for _, fp := range entries {
			entryChan <- fp
		}
		close(entryChan)
	}()

	progress := util.NewProgress(len(entries))
	for i := 0; i < flagCpu; i++ {
		wgPDBChains.Add(1)
		go func() {
			for entryPath := range entryChan {
				_, chains, err := util.PDBOpen(entryPath)
				progress.JobDone(err)
				if err != nil {
					continue
				}
				for _, chain := range chains {
					structureToSequence(structLib, chain, nullChan, fpChans)
				}
			}
			wgPDBChains.Done()
		}()
	}
	wgPDBChains.Wait()
	progress.Close()

	// We've finished reading all the PDB inputs. Now close the channels
	// and let the sequence fragments finish.
	close(nullChan)
	for i := 0; i < structLib.Size(); i++ {
		close(fpChans[i])
	}
	wgSeqFragments.Wait()

	// Finally, add the sequence fragments to a new sequence fragment
	// library and save.
	profs := make([]*seq.Profile, structLib.Size())
	for i := 0; i < structLib.Size(); i++ {
		profs[i] = freqProfiles[i].Profile(nullProfile)
	}
	lib, err := fragbag.NewSequenceProfile(structLib.Name(), profs)
	util.Assert(err)
	util.Assert(fragbag.Save(saveto, lib))
}
func main() {
	rfasta := util.OpenFasta(util.Arg(0))
	count, err := fasta.QuickSequenceCount(rfasta)
	util.Assert(err)
	fmt.Println(count)
}
func main() {
	outDir := util.Arg(0)
	fasInps := util.Args()[1:]
	util.Assert(os.MkdirAll(outDir, 0777))

	fastaChan := make(chan string)
	wg := new(sync.WaitGroup)
	for i := 0; i < max(1, runtime.GOMAXPROCS(0)); i++ {
		// Add before spawning: calling wg.Add inside the goroutine races
		// with the wg.Wait below.
		wg.Add(1)
		go func() {
			defer wg.Done()
			for fasta := range fastaChan {
				util.Verbosef("Computing map for '%s'...", fasta)
				fmap := util.GetFmap(fasta)
				outF := path.Join(outDir, fmt.Sprintf("%s.fmap", fmap.Name))
				util.FmapWrite(util.CreateFile(outF), fmap)
			}
		}()
	}

	for _, fasta := range fasInps {
		fastaChan <- fasta
	}
	close(fastaChan)
	wg.Wait()
}