func (s *S) TestReadFasta(c *check.C) {
	r := fasta.NewReader(strings.NewReader(fa), linear.NewSeq("", nil, alphabet.Protein))
	m, _ := multi.NewMulti("", nil, seq.DefaultConsensus)
	a, err := NewReader(r, m).Read()
	c.Check(err, check.Equals, nil)
	c.Check(a.Rows(), check.Equals, 11)
}
func (s *S) TestReadFromFunc(c *check.C) {
	var (
		obtainNfa []string
		obtainSfa [][]alphabet.Letter
	)
	sc := seqio.NewScannerFromFunc(
		fasta.NewReader(
			bytes.NewBufferString(testaln0),
			linear.NewSeq("", nil, alphabet.Protein),
		).Read,
	)
	for sc.Next() {
		t := sc.Seq().(*linear.Seq)
		header := t.Name()
		if desc := t.Description(); len(desc) > 0 {
			header += " " + desc
		}
		obtainNfa = append(obtainNfa, header)
		obtainSfa = append(obtainSfa, t.Slice().(alphabet.Letters))
	}
	c.Check(sc.Error(), check.Equals, nil)
	c.Check(obtainNfa, check.DeepEquals, expectNfa)
	for i := range obtainSfa {
		c.Check(len(obtainSfa[i]), check.Equals, len(expectSfa[i]))
		c.Check(obtainSfa[i], check.DeepEquals, expectSfa[i])
	}
}
func main() {
	flag.Parse()
	if *exclude == "" {
		flag.Usage()
		os.Exit(1)
	}
	nameSet := make(map[string]struct{})
	f, err := os.Open(*exclude)
	if err != nil {
		log.Fatalf("failed to open exclude file %q: %v", *exclude, err)
	}
	ls := bufio.NewScanner(f)
	for ls.Scan() {
		nameSet[ls.Text()] = struct{}{}
	}
	err = ls.Err()
	if err != nil {
		log.Fatalf("failed to read exclude file: %v", err)
	}
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		if _, ok := nameSet[s.ID]; ok {
			continue
		}
		fmt.Printf("%60a\n", s)
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()

	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}
				tmp := *seq
				tmp.ID += fmt.Sprintf("//%d_%d", start, end)
				tmp.Seq = tmp.Seq[start:end]
				fmt.Printf("%60a\n", &tmp)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}
func getFasta(fn string) (seq.Sequence, error) {
	fastaFile, err := os.Open(fn)
	if err != nil {
		return nil, fmt.Errorf("failed to read file %q: %v", fn, err)
	}
	defer fastaFile.Close()
	t := linear.NewSeq("", nil, alphabet.Protein)
	reader := fasta.NewReader(fastaFile, t)
	s, err := reader.Read()
	if err != nil {
		return nil, err
	}
	return s, nil
}
// writeFlankSeqs writes fasta files containing the sequence of unmapped flanks
// identified in the primary hits provided. cutoff specifies the minimum sequence
// length to consider. left and right specify the filenames for the left and right
// flank fasta sequence files.
func writeFlankSeqs(reads string, hits hitSet, cutoff int, left, right string) error {
	f, err := os.Open(reads)
	if err != nil {
		return err
	}
	defer f.Close()
	lf, err := os.Create(left)
	if err != nil {
		return err
	}
	rf, err := os.Create(right)
	if err != nil {
		return err
	}
	r := fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))
	sc := seqio.NewScanner(r)
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		h, ok := hits[seq.Name()]
		if !ok {
			continue
		}
		all := seq.Seq
		if h.qStart >= cutoff {
			seq.Seq = all[:h.qStart]
			_, err := fmt.Fprintf(lf, "%60a\n", seq)
			if err != nil {
				return err
			}
		}
		if h.qLen-h.qEnd >= cutoff {
			seq.Seq = all[h.qEnd:]
			_, err := fmt.Fprintf(rf, "%60a\n", seq)
			if err != nil {
				return err
			}
		}
	}
	err = sc.Error()
	if err != nil {
		return err
	}
	err = lf.Close()
	if err != nil {
		return err
	}
	return rf.Close()
}
func readFasta(fn string) (name string, seq string, err error) {
	fFasta, err := os.Open(fn)
	if err != nil {
		return "", "", err
	}
	defer fFasta.Close()
	t := linear.NewSeq("", nil, alphabet.Protein)
	reader := fasta.NewReader(fFasta, t)
	s, err := reader.Read()
	if err != nil {
		return "", "", err
	}
	sl := s.(*linear.Seq)
	return sl.Name(), sl.String(), nil
}
func readContigs(file string) (map[string]*linear.Seq, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	seqs := make(map[string]*linear.Seq)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		seqs[s.ID] = s
	}
	if err := sc.Error(); err != nil {
		return nil, err
	}
	return seqs, nil
}
func readContigs(file string) (map[string]int, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	lengths := make(map[string]int)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq()
		lengths[s.Name()] = s.Len()
	}
	if err := sc.Error(); err != nil {
		return nil, err
	}
	return lengths, nil
}
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	names := make(map[string][]string)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		idx := strings.LastIndex(seq.ID, "/")
		names[seq.ID[:idx]] = append(names[seq.ID[:idx]], seq.ID[idx+1:])
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
	f.Close()

	base := filepath.Base(*in)
	unique, err := os.Create(base + ".unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".unique.text", err)
	}
	defer unique.Close()
	nonUnique, err := os.Create(base + ".non-unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".non-unique.text", err)
	}
	defer nonUnique.Close()
	for name, coords := range names {
		switch len(coords) {
		case 0:
		case 1:
			fmt.Fprintln(unique, name)
		default:
			fmt.Fprintf(nonUnique, "%s\t%v\n", name, coords)
		}
	}
}
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	inFile, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open input: %v", err)
	}
	defer inFile.Close()
	*in = filepath.Base(*in)
	sc := seqio.NewScanner(fasta.NewReader(inFile, linear.NewSeq("", nil, alphabet.DNA)))
	var i, size int
	out, err := os.Create(fmt.Sprintf("%s-%d.fa", *in, i))
	if err != nil {
		log.Fatalf("failed to open file bundle %d: %v", i, err)
	}
	for sc.Next() {
		if sc.Seq().Len() < *cut {
			continue
		}
		if size != 0 && size+sc.Seq().Len() > *bundle {
			err = out.Close()
			if err != nil {
				log.Fatalf("failed to close file bundle %d: %v", i, err)
			}
			i++
			size = 0
			out, err = os.Create(fmt.Sprintf("%s-%d.fa", *in, i))
			if err != nil {
				log.Fatalf("failed to open file bundle %d: %v", i, err)
			}
		}
		size += sc.Seq().Len()
		fmt.Fprintf(out, "%60a\n", sc.Seq())
	}
	if sc.Error() != nil {
		log.Fatal(sc.Error())
	}
	err = out.Close()
	if err != nil {
		log.Fatalf("failed to close file bundle %d: %v", i, err)
	}
}
func BenchmarkSWAlign(b *testing.B) {
	t := &linear.Seq{}
	t.Alpha = alphabet.DNAgapped
	r := fasta.NewReader(strings.NewReader(crspFa), t)
	swsa, _ := r.Read()
	swsb, _ := r.Read()

	smith := SW{
		{2, -1, -1, -1, -1},
		{-1, 2, -1, -1, -1},
		{-1, -1, 2, -1, -1},
		{-1, -1, -1, 2, -1},
		{-1, -1, -1, -1, 0},
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		smith.Align(swsa, swsb)
	}
}
func BenchmarkNWAlign(b *testing.B) {
	t := &linear.Seq{}
	t.Alpha = alphabet.DNAgapped
	r := fasta.NewReader(strings.NewReader(crspFa), t)
	nwsa, _ := r.Read()
	nwsb, _ := r.Read()

	needle := NW{
		{10, -3, -1, -4, -5},
		{-3, 9, -5, 0, -5},
		{-1, -5, 7, -3, -5},
		{-4, 0, -3, 8, -5},
		{-4, -4, -4, -4, 0},
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		needle.Align(nwsa, nwsb)
	}
}
func mangle() {
	seen := make(map[string]bool)
	hash := sha1.New()
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		if s.Desc == "" {
			s.Desc = s.ID
		} else {
			s.Desc = fmt.Sprintf("%s %s", s.ID, s.Desc)
		}
		hash.Write([]byte(s.Desc))
		s.ID = fmt.Sprintf("%040x", hash.Sum(nil))
		if seen[s.ID] {
			log.Fatalf("duplicate sha1: %s", s.ID)
		}
		seen[s.ID] = true
		hash.Reset()
		fmt.Printf("%60a\n", s)
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
func main() {
	flag.Parse()
	if *in == "" || *typ < 0 || 2 < *typ {
		flag.Usage()
		os.Exit(1)
	}
	cfn := []func(s seq.Sequence, start, end int) (float64, error){
		0: complexity.WF,
		1: complexity.Entropic,
		2: complexity.Z,
	}[*typ]
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	defer f.Close()
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		// err is always nil for a linear.Seq Start() and End().
		c, _ := cfn(seq, seq.Start(), seq.End())
		if *dist {
			fmt.Printf("%s\t%v\t%d\n", seq.Name(), c, seq.Len())
			continue
		}
		if c >= *thresh {
			fmt.Printf("%60a\n", seq)
		}
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
func BenchmarkSWAffineAlign(b *testing.B) {
	b.StopTimer()
	t := &linear.Seq{}
	t.Alpha = alphabet.DNAgapped
	r := fasta.NewReader(strings.NewReader(crspFa), t)
	swsa, _ := r.Read()
	swsb, _ := r.Read()

	smith := SWAffine{
		Matrix: Linear{
			{2, -1, -1, -1, -1},
			{-1, 2, -1, -1, -1},
			{-1, -1, 2, -1, -1},
			{-1, -1, -1, 2, -1},
			{-1, -1, -1, -1, 0},
		},
		GapOpen: -5,
	}

	b.StartTimer()
	for i := 0; i < b.N; i++ {
		smith.Align(swsa, swsb)
	}
}
func consensus(in io.Reader) (*linear.QSeq, *multi.Multi, error) {
	m, err := muscle.Muscle{Quiet: true}.BuildCommand()
	if err != nil {
		return nil, nil, err
	}
	m.Stdin = in
	buf := &bytes.Buffer{}
	m.Stdout = buf
	err = m.Run()
	if err != nil {
		return nil, nil, err
	}

	var (
		r  = fasta.NewReader(buf, &linear.Seq{Annotation: seq.Annotation{Alpha: alphabet.DNA}})
		ms = &multi.Multi{ColumnConsense: seq.DefaultQConsensus}
	)
	sc := seqio.NewScanner(r)
	for sc.Next() {
		ms.Add(sc.Seq())
	}
	return ms.Consensus(true), ms, sc.Error()
}
func unmangle(mapfile string) {
	table := make(map[string]string)
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		fields := strings.Fields(s.Desc)
		if len(fields) == 0 {
			log.Fatalf("no id for sequence %s", s.ID)
		}
		table[s.ID] = fields[0]
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}

	f, err := os.Open(mapfile)
	if err != nil {
		log.Fatalf("failed to open map file %q: %v", mapfile, err)
	}
	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()
		fields := strings.Fields(line)
		if len(fields) <= *queryNameField {
			log.Fatalf("unexpected number of fields in line %q", line)
		}
		id := table[fields[*queryNameField]]
		if id == "" {
			log.Fatalf("no id for map query %s", fields[*queryNameField])
		}
		fields[*queryNameField] = id
		for i, f := range fields {
			if i != 0 {
				fmt.Print("\t")
			}
			fmt.Print(f)
		}
		fmt.Println()
	}
}
func BenchmarkNWAffineAlign(b *testing.B) {
	b.StopTimer()
	t := &linear.Seq{}
	t.Alpha = alphabet.DNAgapped
	r := fasta.NewReader(strings.NewReader(crspFa), t)
	nwsa, _ := r.Read()
	nwsb, _ := r.Read()

	needle := NWAffine{
		Matrix: Linear{
			{10, -3, -1, -4, -5},
			{-3, 9, -5, 0, -5},
			{-1, -5, 7, -3, -5},
			{-4, 0, -3, 8, -5},
			{-4, -4, -4, -4, 0},
		},
		GapOpen: -10,
	}

	b.StartTimer()
	for i := 0; i < b.N; i++ {
		needle.Align(nwsa, nwsb)
	}
}
func main() {
	flag.Var(&alnmat, "align", "specify the match, mismatch and gap parameters")
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()

	w := gff.NewWriter(os.Stdout, 60, true)
	w.WriteComment("Right coordinates (field 5) and strand (field 7) are hypothetical.")

	var out *os.File
	if *fastaOut != "" {
		out, err = os.Create(*fastaOut)
		if err != nil {
			log.Fatalf("failed to create fasta insertion output file %q: %v", *fastaOut, err)
		}
		defer out.Close()
	}

	hw := *window / 2
	sw := makeTable(alphabet.DNAgapped, alnmat)
	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	loop:
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}

				if out != nil {
					insert := *seq
					if insert.Desc != "" {
						insert.Desc += " "
					}
					insert.Desc += fmt.Sprintf("[%d,%d)", start, end)
					insert.Seq = insert.Seq[start:end]
					fmt.Fprintf(out, "%60a\n", &insert)
				}

				var lOff, lEnd, rOff, rEnd int
				// If we have refined ends, use them.
				if dup := f.FeatAttributes.Get("Dup"); dup != "" {
					d, err := strconv.Atoi(dup)
					if err != nil {
						log.Fatalf("failed to get duplication length: %v", err)
					}
					lOff = max(0, start-d)
					lEnd = start
					rOff = end
					rEnd = min(len(seq.Seq), end+d)
				} else {
					lOff = max(0, start-hw)
					lEnd = min(len(seq.Seq), start+hw)
					rOff = max(0, end-hw)
					rEnd = min(len(seq.Seq), end+hw)
					// Ensure windows don't overlap.
					if lEnd > rOff {
						lEnd = (lEnd + rOff) / 2
						rOff = lEnd
					}
				}

				if lEnd-lOff < *thresh || rEnd-rOff < *thresh {
					// Don't do fruitless work.
					continue loop
				}

				left := *seq
				left.ID = "prefix"
				left.Seq = left.Seq[lOff:lEnd]
				right := *seq
				right.ID = "postfix"
				right.Seq = right.Seq[rOff:rEnd]

				aln, err := sw.Align(&right, &left)
				if err != nil {
					log.Fatal(err)
				}
				fa := align.Format(&right, &left, aln, '-')
				for _, seg := range fa {
					var n int
					for _, l := range seg.(alphabet.Letters) {
						if l != '-' {
							n++
						}
					}
					if n < *thresh {
						continue loop
					}
				}
				var sc int
				for _, seg := range aln {
					type scorer interface {
						Score() int
					}
					sc += seg.(scorer).Score()
				}
				f.FeatAttributes = append(f.FeatAttributes, gff.Attribute{
					Tag: "TSD",
					Value: fmt.Sprintf(`%v %d %d %v "%v" %d`,
						fa[0],
						aln[len(aln)-1].Features()[0].End()+lOff,
						aln[0].Features()[1].Start()+rOff,
						fa[1],
						aln,
						sc,
					),
				})
				w.Write(f)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}
func main() {
	flag.IntVar(&maxFam, "maxFam", 0, "maxFam indicates maximum family size considered (0 == no limit).")
	flag.BoolVar(&subSample, "subsample", false, "choose maxFam members of a family if the family has more than maxFam members.")
	flag.BoolVar(&consFasta, "fasta", false, "output consensus as fasta with quality case filtering.")
	flag.StringVar(&cDir, "cDir", "", "target directory for consensus output. If not empty Dir is deleted first.")
	flag.StringVar(&aDir, "aDir", "", "target directory for alignment output. If not empty dir is deleted first.")
	flag.StringVar(&sDir, "sDir", "", "target directory for sequence information output. If not empty dir is deleted first.")
	flag.Parse()

	fmt.Printf("Initialising Files: %s\n", flag.Args()[0])
	checks(cDir, aDir, sDir)

	// Open the input file.
	f, fErr := os.Open(flag.Args()[0])
	if fErr != nil {
		log.Fatalf("error: could not open %s to read: %v", flag.Args()[0], fErr)
	}
	defer f.Close()

	// Read in the sequences.
	var v []seq.Sequence
	r := fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))
	sc := seqio.NewScanner(r)
	for sc.Next() {
		v = append(v, sc.Seq())
	}
	if sc.Error() != nil {
		log.Fatalf("failed to read sequences: %v", sc.Error())
	}

	// Check that there aren't too many sequences.
	if maxFam != 0 && !subSample && len(v) > maxFam {
		log.Fatalf("too many sequences: %d", len(v))
	}

	var aveLength int
	if subSample {
		// Shuffle first.
		w := make([]seq.Sequence, 0, len(v))
		for _, j := range rand.Perm(len(v)) {
			w = append(w, v[j])
		}
		v = w

		// Calculate the lengths and average length of all sequences read.
		lenAllSeq := make([]int, len(v))
		for num, s := range v {
			lenAllSeq[num] = s.Len()
		}
		totalLength := 0
		for _, l := range lenAllSeq {
			totalLength += l
		}
		aveLength = totalLength / len(v)
		fmt.Printf("Average length: %v\n", aveLength)
	}

	// FIXME(brittany): Make this more scientifically robust.
	var (
		sampled int
		buf     bytes.Buffer
	)

	// Create a file for the sequence information.
	seqFile := fmt.Sprintf("%s_included_sequences.fq", flag.Args()[0])
	seqOut, sErr := os.Create(filepath.Join(aDir, seqFile))
	if sErr != nil {
		log.Fatalf("failed to create %s: %v", seqFile, sErr)
	}
	defer seqOut.Close()
	seqBufOut := bufio.NewWriter(seqOut)
	defer seqBufOut.Flush()

	// Print the subsampled sequences to the file, recording the length of each.
	var lenSubSeq []int
	for _, s := range v {
		if subSample && sampled == maxFam {
			break
		}
		sampled++
		fmt.Fprintf(&buf, "%60a\n", s)
		fmt.Fprintf(seqBufOut, "including: %s %s\n", s.Name(), s.Description())
		lenSubSeq = append(lenSubSeq, s.Len())
	}
	totalSubLength := 0
	for _, l := range lenSubSeq {
		totalSubLength += l
	}
	fmt.Printf("total length of subbed: %v\n", totalSubLength)
	aveSubLength := 0
	if len(lenSubSeq) != 0 {
		aveSubLength = totalSubLength / len(lenSubSeq)
	}
	fmt.Printf("Average length of subbed: %v\n", aveSubLength)
	fmt.Printf("The sub seq variable for %v sequences: %v\n", len(lenSubSeq), lenSubSeq)

	var (
		c   *linear.QSeq
		m   *multi.Multi
		err error
	)

	// Create the consensus.
	fmt.Println("Creating consensus")
	c, m, err = consensus(&buf)
	if err != nil {
		log.Printf("failed to generate consensus for %s: %v", flag.Args()[0], err)
		return
	}
	c.ID = fmt.Sprintf("%s_consensus", flag.Args()[0])
	c.Desc = fmt.Sprintf("%d members total %v members sampled", len(v), sampled)
	c.Threshold = 42
	c.QFilter = seq.CaseFilter
	conLength := c.Len()
	// Find a way to make the consensus the same case.

	// Calculate the ratio of consensus length to mean sequence length.
	fmt.Printf("Length of consensus: %v\n", conLength)
	cutoffTotal := float64(conLength) / float64(aveLength)
	cutoffSubs := float64(conLength) / float64(aveSubLength)
	fmt.Printf("ratio for all seqs: %f, \nratio for just sub-sampled: %f\n", cutoffTotal, cutoffSubs)

	// Create a file for the consensus length information.
	confile := fmt.Sprintf("%s_consensus-length-%v.txt", flag.Args()[0], maxFam)
	conOut, err := os.Create(filepath.Join(cDir, confile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", confile, err)
	}
	fmt.Fprintf(conOut, "number sampled: %v \nratio for all seqs: %f \nratio for just sub-sampled: %f\n\n %f\t%f",
		maxFam, cutoffTotal, cutoffSubs, cutoffTotal, cutoffSubs)

	// Create a file for the consensus.
	file := fmt.Sprintf("%s_consensus.fq%v", flag.Args()[0], maxFam)
	out, err := os.Create(filepath.Join(cDir, file))
	if err != nil {
		log.Fatalf("failed to create %s: %v", file, err)
	}
	if consFasta {
		fmt.Fprintf(out, "%60a\n", c)
	} else {
		fmt.Fprintf(out, "%q\n", c)
	}

	// Create a file for the multiple alignment.
	alignFile := fmt.Sprintf("%s_multiple_alignment%v.fq", flag.Args()[0], maxFam)
	alignOut, err := os.Create(filepath.Join(aDir, alignFile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", alignFile, err)
	}
	if consFasta {
		fmt.Fprintf(alignOut, "%60a\n", m)
	} else {
		fmt.Fprintf(alignOut, "%q\n", m)
	}

	out.Close()
	fmt.Printf("Complete\n\n")
}