func viewLib(c *command) { c.assertNArg(1) lib := util.Library(c.flags.Arg(0)) fmt.Printf("Name: %s\n", lib.Name()) fmt.Printf("Tag: %s\n", strings.Join(libraryTag(lib), "/")) fmt.Printf("Size: %d\n", lib.Size()) fmt.Printf("Fragment Size: %d\n", lib.FragmentSize()) fmt.Printf("IsStructure: %v\n", fragbag.IsStructure(lib)) fmt.Printf("IsSequence: %v\n", fragbag.IsSequence(lib)) }
func mkPaired(c *command) { c.assertNArg(2) in := util.Library(c.flags.Arg(0)) outPath := c.flags.Arg(1) util.AssertOverwritable(outPath, flagOverwrite) if _, ok := in.(fragbag.WeightedLibrary); ok { util.Fatalf("%s is a weighted library (not allowed)", in.Name()) } name := fmt.Sprintf("paired-%s", in.Name()) if fragbag.IsStructure(in) { var pairs [][]structure.Coords lib := in.(fragbag.StructureLibrary) nfrags := lib.Size() for i := 0; i < nfrags; i++ { for j := 0; j < nfrags; j++ { if i == j { continue } f1, f2 := lib.Atoms(i), lib.Atoms(j) pairs = append(pairs, append(f1, f2...)) } } pairLib, err := fragbag.NewStructureAtoms(name, pairs) util.Assert(err) fragbag.Save(util.CreateFile(outPath), pairLib) } else if strings.Contains(in.Tag(), "hmm") { var pairs []*seq.HMM lib := in.(fragbag.SequenceLibrary) nfrags := lib.Size() for i := 0; i < nfrags; i++ { for j := 0; j < nfrags; j++ { if i == j { continue } f1, f2 := lib.Fragment(i).(*seq.HMM), lib.Fragment(j).(*seq.HMM) pairs = append(pairs, seq.HMMCat(f1, f2)) } } pairLib, err := fragbag.NewSequenceHMM(name, pairs) util.Assert(err) fragbag.Save(util.CreateFile(outPath), pairLib) } else if strings.Contains(in.Tag(), "profile") { util.Fatalf("Sequence profiles not implemented.") } else { util.Fatalf("Unrecognized fragment library: %s", in.Tag()) } }
// ProcessBowers is a convenient wrapper around BowerOpen that processes each // bower value in parallel and sends the resulting BOW value on the channel // returned. The number of goroutines spawned is equivalent to N. // // It is appropriate for fpaths to be the arguments given to be command line // arguments. Each directory is recursively expanded to its files, and special // syntax is parsed as well. // // If `hideProgress` is true, then a progress bar will not be emitted to // stderr. // // If `models` is true, then every model in a PDB bower file will have its // BOW computed. Otherwise, the first model from each chain will be used. func ProcessBowers( fpaths []string, lib fragbag.Library, models bool, n int, hideProgress bool, ) <-chan bow.Bowed { if n <= 0 { n = 1 } results := make(chan bow.Bowed, n*2) fpaths = AllFilesFromArgs(fpaths) go func() { var progress *Progress totalJobs := 0 if !hideProgress { totalJobs = numJobs(fpaths) progress = NewProgress(totalJobs) } // We use two levels of concurrency here. The first is at the level // of translating files into bowers. The second is at the level of // computing BOWs from bowers. // The first level is necessary because there can a large number of // bower files given, where each file only produces a few BOWs. // The second level is necessary because there is a lot of variation // between the number of bowers that a single file can produce. For // example, while most PDB files only produce a few, some FASTA files // can produce millions. files := make(chan string, n*2) bs := make(chan interface{}, n*2) // channel of bowers wgBowers := new(sync.WaitGroup) wgFiles := new(sync.WaitGroup) // goroutines for computing BOWs from bowers for i := 0; i < n; i++ { wgBowers.Add(1) go func() { defer wgBowers.Done() for b := range bs { var bw bow.Bowed if fragbag.IsStructure(lib) { lib := lib.(fragbag.StructureLibrary) bw = b.(bow.StructureBower).StructureBow(lib) } else if fragbag.IsSequence(lib) { lib := lib.(fragbag.SequenceLibrary) bw = b.(bow.SequenceBower).SequenceBow(lib) } else { Fatalf("Unknown fragment library %T", lib) } results <- bw } }() } // goroutines for translating files into bowers for i := 0; i < n; i++ { wgFiles.Add(1) go func() { defer wgFiles.Done() for fpath := range files { var err error for b := range BowerOpen(fpath, lib, models) { if b.Err != nil { err = b.Err } else { bs <- b.Bower } if IsFasta(fpath) { // each sequence counts progress.JobDone(err) } } if IsPDB(fpath) { // PDB file only counts as one job progress.JobDone(err) } } }() } for _, fpath := range fpaths { files <- fpath } close(files) wgFiles.Wait() close(bs) wgBowers.Wait() progress.Close() close(results) }() return results }
// BowerOpen reads the contents of `fpath` and attempts to interpret it as a // value (or values) that implement the `bow.Bower` interface. The list of // `bow.Bower` values returned is guaranteed to be homogenous: they will // either be all `bow.SequenceBower` values or `bow.StructureBower` values. // // The actual return value of the function is a receive-only channel of BowerErr // values. Each BowerErr value either has the `Bower` member set or has the // `err` field set to an error that prevented the file from being opened. // Errors in this case are reserved for files that appear to be capable of // producing a BOW, but were unable to be read. // // If the fpath given cannot be detected as a bower file, then a closed empty // channel will be returned. A warning is also emitted to stderr. // // `lib` is a fragment library that is used to help interpret what kind of // value must be in `r`. For example, if `lib` is a sequence fragment library, // then `BowerOpen` is guaranteed to return a `Bower` value that implements the // `bow.SequenceBower` interface. // // As of now, `BowerOpen` can read these types of files: // // File extension Format Interpretation // *.{ent.gz,pdb,ent} PDB whatever `lib` is // *.{fasta,fas,fasta.gz,fas.gz} FASTA sequence // everything else error invalid // // Note that special syntax for PDB file names is supported. Namely, chain // identifiers can be appended to the end of the file name, and only that chain // will be included in the `bow.Bower` value. Otherwise, all chains in the PDB // entry will be returned as individual `bow.Bower` values. // // The format is simple and easily demonstrated by examples: // // 1ctf.end.gz Chains A and B // 1ctf.ent.gz:A Only chain A // 1ctf.ent.gz:B Only chain B // 1ctf.ent.gz:A,B Chains A and B // // A secondary format is also accepted. The following are equivalent to their // corresponding examples above: // // 1ctf // 1ctfA // 1ctfB // 1ctf:A,B // // Finally, `fpath` may be the name of a PDB identifier and its file path will // be inferred from the value of the `PDB_PATH` environment variable. // Alternatively, `fpath` may be the name of a SCOP domain, and its // corresponding PDB file will be inferred from the value of the // `SCOP_PDB_PATH` environment variable. func BowerOpen(fpath string, lib fragbag.Library, models bool) <-chan BowerErr { if lib == nil { Fatalf("Files can only be converted to Fragbag frequency vectors " + "if a fragment library is specified.") } bowers := make(chan BowerErr, 100) switch { case IsPDB(fpath): go func() { defer close(bowers) entry, chains, err := PDBOpen(fpath) if err != nil { err = fmt.Errorf("Error reading '%s': %s", fpath, err) bowers <- BowerErr{Err: err} return } if fragbag.IsStructure(lib) { for i := range chains { if !chains[i].IsProtein() { continue } if !models { b := bow.BowerFromChain(chains[i]) bowers <- BowerErr{Bower: b} } else { for _, m := range chains[i].Models { b := bow.BowerFromModel(m) bowers <- BowerErr{Bower: b} } } } } else { for i := range chains { if !chains[i].IsProtein() { continue } s := chains[i].AsSequence() if s.Len() == 0 { s = aminoFromStructure(chains[i]) if s.Len() == 0 { Warnf("Chain '%s:%c' has no amino sequence.", entry.IdCode, chains[i].Ident) continue } } bowers <- BowerErr{Bower: bow.BowerFromSequence(s)} } } }() return bowers case IsFasta(fpath) && !fragbag.IsStructure(lib): go func() { defer close(bowers) r, fp, err := fastaOpen(fpath) if err != nil { err = fmt.Errorf("Error reading file: %s", err) bowers <- BowerErr{Err: err} return } defer fp.Close() fr := fasta.NewReader(r) for { s, err := fr.Read() if err != nil { if err == io.EOF { break } err = fmt.Errorf("Error reading file: %s", err) bowers <- BowerErr{Err: err} return } bowers <- BowerErr{Bower: bow.BowerFromSequence(s)} } }() return bowers } Warnf("I don't know how to produce a Fragbag frequency vector "+ "from the file '%s'.", fpath) close(bowers) return bowers }