func readAnnotations(file string) (map[string]*interval.IntTree, error) { f, err := os.Open(file) if err != nil { return nil, err } trees := make(map[string]*interval.IntTree) sc := featio.NewScanner(gff.NewReader(f)) for id := uintptr(1); sc.Next(); id++ { f := sc.Feat().(*gff.Feature) t, ok := trees[f.SeqName] if !ok { t = &interval.IntTree{} trees[f.SeqName] = t } t.Insert(gffInterval{f, id}, true) } err = sc.Error() if err != nil { log.Fatalf("error during GFF read: %v", err) } for _, t := range trees { t.AdjustRanges() } return trees, nil }
func GFFToRelatable(fh io.Reader) (interfaces.RelatableChannel, error) { ch := make(chan interfaces.Relatable, 16) go func() { var g *gff.Reader g = gff.NewReader(fh) for { feat, err := g.Read() if err != nil { if err == io.EOF { break } else { log.Println(err) break } } // since Read returns the interface, first cast back // to gff.Feature so we have the needed Attributes. gfeat := feat.(*gff.Feature) f := Gff{Feature: gfeat, related: make([]interfaces.Relatable, 0, 7)} ch <- &f } close(ch) }() return ch, nil }
func main() { flag.Parse() w := gff.NewWriter(os.Stdout, 60, false) sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) r := f.FeatAttributes.Get("Repeat") fields := strings.Fields(r) if len(fields) < 4 { log.Fatal("invalid repeat attribute") } end, err := strconv.Atoi(fields[3]) if err != nil { log.Fatalf("failed to parse end coordinate: %v", err) } remainder, err := strconv.Atoi(fields[4]) if err != nil { log.Fatalf("failed to parse remains coordinate: %v", err) } length := end + remainder if length < *thresh { continue } w.Write(f) } if err := sc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } }
func main() { flag.Parse() if *in == "" { flag.Usage() os.Exit(1) } f, err := os.Open(*in) if err != nil { log.Fatalf("failed to open %q: %v", *in, err) } events := make(map[string][]*gff.Feature) fsc := featio.NewScanner(gff.NewReader(f)) for fsc.Next() { f := fsc.Feat().(*gff.Feature) fields := strings.Fields(f.FeatAttributes.Get("Read")) if len(fields) != 3 { log.Fatalf("bad record: %+v", f) } events[fields[0]] = append(events[fields[0]], f) } if err := fsc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } f.Close() for _, ref := range flag.Args() { f, err = os.Open(ref) if err != nil { log.Fatalf("failed to open reference %q: %v", ref, err) } ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))) for ssc.Next() { seq := ssc.Seq().(*linear.Seq) for _, f := range events[seq.Name()] { fields := strings.Fields(f.FeatAttributes.Get("Read")) if len(fields) != 3 { log.Fatalf("bad record: %+v", f) } start, err := strconv.Atoi(fields[1]) if err != nil { log.Fatalf("failed to get start coordinate: %v", err) } end, err := strconv.Atoi(fields[2]) if err != nil { log.Fatalf("failed to get end coordinate: %v", err) } tmp := *seq tmp.ID += fmt.Sprintf("//%d_%d", start, end) tmp.Seq = tmp.Seq[start:end] fmt.Printf("%60a\n", &tmp) } } if err := ssc.Error(); err != nil { log.Fatalf("error during fasta read: %v", err) } f.Close() } }
func filterFeats(annot string, classes, names []string, thresh float64) ([]interval.IntTree, error) { ntab := make(map[string]int, len(names)) for i, n := range names { ntab[n] = i } ts := make([]interval.IntTree, len(names)) cm := make(map[string]struct{}) for _, c := range classes { cm[c] = struct{}{} } f, err := os.Open(annot) if err != nil { return nil, err } fs := featio.NewScanner(gff.NewReader(f)) for fs.Next() { f := fs.Feat().(*gff.Feature) if f.FeatScore == nil || math.Exp(*f.FeatScore) < thresh { continue } var class string if att := f.FeatAttributes.Get(f.Feature); att != "" { // This gets the repeat attributes only. class = f.Feature + "/" + strings.Fields(att)[1] } else { class = f.Feature } if _, ok := cm[class]; !ok { last := strings.LastIndex(class, "/") if last == strings.Index(class, "/") { continue } if _, ok := cm[class[:last]]; !ok { continue } } if chr, ok := ntab[f.SeqName]; ok { ts[chr].Insert(intGff{f}, true) } } if err := fs.Error(); err != nil { return nil, err } for i := range ts { ts[i].AdjustRanges() } return ts, nil }
func annotFeats(annot string, classes, names []string) ([]interval.IntTree, error) { ntab := make(map[string]int, len(names)) for i, n := range names { ntab[n] = i } ts := make([]interval.IntTree, len(names)) cm := make(map[string]struct{}) for _, c := range classes { cm[c] = struct{}{} } f, err := os.Open(annot) if err != nil { return nil, err } fs := featio.NewScanner(gff.NewReader(f)) for id := uintptr(0); fs.Next(); id++ { f := fs.Feat().(*gff.Feature) var class string att := f.FeatAttributes.Get("repeat") if att == "" { // Ignore non-repeat features. continue } repeatFields := strings.Fields(att) class = f.Feature + "/" + repeatFields[1] if _, ok := cm[class]; !ok { last := strings.LastIndex(class, "/") if last == strings.Index(class, "/") { continue } if _, ok := cm[class[:last]]; !ok { continue } } if chr, ok := ntab[f.SeqName]; ok { ts[chr].Insert(intGff{f, id}, true) } } if err := fs.Error(); err != nil { return nil, err } for i := range ts { ts[i].AdjustRanges() } return ts, nil }
func main() { flag.Parse() if *exclude == "" { flag.Usage() os.Exit(1) } nameSet := make(map[string]struct{}) f, err := os.Open(*exclude) if err != nil { log.Fatalf("failed to open exclude file %q: %v", *exclude, err) } ls := bufio.NewScanner(f) for ls.Scan() { nameSet[ls.Text()] = struct{}{} } err = ls.Err() if err != nil { log.Fatalf("failed to read exclude file: %v", err) } w := gff.NewWriter(os.Stdout, 60, true) var excl *gff.Writer if *retain { excl = gff.NewWriter(os.Stderr, 60, true) } sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) n := f.FeatAttributes.Get("Read") if _, ok := nameSet[n]; ok { if excl != nil { _, err := excl.Write(f) if err != nil { log.Fatalf("failed to write feature: %v", err) } } continue } _, err := w.Write(f) if err != nil { log.Fatalf("failed to write feature: %v", err) } } if err := sc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } }
func main() { flag.Parse() var grps []map[string]int sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) r := f.FeatAttributes.Get("Repeat") g := f.FeatAttributes.Get("Group") typ := strings.Fields(r)[0] if !*doGrouping { fmt.Printf("%s\t%s\n", g, typ) } gid, err := strconv.Atoi(g) if err != nil { log.Fatalf("failed to parse group id: %v", err) } grps = add(grps, gid, typ) } if err := sc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } if !*doGrouping { return } for gid, g := range grps { if g == nil { continue } fmt.Printf("%d\t", gid) m := sortedMap(g) for i, t := range m { if i != 0 { fmt.Print(" ") } fmt.Printf("%s:%d", t.typ, t.n) } name := nameHeuristic(m) fmt.Printf("\t%s\t%s\n", name, trunc(name, 5)) } }
// TestReadFromFunc checks that a scanner built with featio.NewScannerFromFunc
// from the Read method of a gff.Reader yields the expected features.
//
// Each table entry pairs GFF text with the features it should decode to. The
// scanner is drained, every feature is compared against the expectation at
// the same position, and the test then checks that the scanner reports no
// error and that the number of features seen equals the number expected.
//
// NOTE(review): the fixture's field separators appear as single spaces in
// this view — confirm the literal against the original file before editing.
func (s *S) TestReadFromFunc(c *check.C) {
	for i, g := range []struct {
		gff  string
		feat []*gff.Feature
	}{
		{
			gff: `SEQ1 EMBL atg 103 105 . + 0 SEQ1 EMBL exon 103 172 . + 0 SEQ1 EMBL splice5 172 173 . + . SEQ1 netgene splice5 172 173 0.94 + . SEQ1 genie sp5-20 163 182 2.3 + . SEQ1 genie sp5-10 168 177 2.1 + . SEQ2 grail ATG 17 19 2.1 - 0 `,
			// In the expectations FeatStart is one less than the start
			// column of the fixture, while FeatEnd equals the end column.
			feat: []*gff.Feature{
				{SeqName: "SEQ1", Source: "EMBL", Feature: "atg", FeatStart: 102, FeatEnd: 105, FeatScore: nil, FeatFrame: gff.Frame0, FeatStrand: seq.Plus},
				{SeqName: "SEQ1", Source: "EMBL", Feature: "exon", FeatStart: 102, FeatEnd: 172, FeatScore: nil, FeatFrame: gff.Frame0, FeatStrand: seq.Plus},
				{SeqName: "SEQ1", Source: "EMBL", Feature: "splice5", FeatStart: 171, FeatEnd: 173, FeatScore: nil, FeatFrame: gff.NoFrame, FeatStrand: seq.Plus},
				{SeqName: "SEQ1", Source: "netgene", Feature: "splice5", FeatStart: 171, FeatEnd: 173, FeatScore: floatPtr(0.94), FeatFrame: gff.NoFrame, FeatStrand: seq.Plus},
				{SeqName: "SEQ1", Source: "genie", Feature: "sp5-20", FeatStart: 162, FeatEnd: 182, FeatScore: floatPtr(2.3), FeatFrame: gff.NoFrame, FeatStrand: seq.Plus},
				{SeqName: "SEQ1", Source: "genie", Feature: "sp5-10", FeatStart: 167, FeatEnd: 177, FeatScore: floatPtr(2.1), FeatFrame: gff.NoFrame, FeatStrand: seq.Plus},
				{SeqName: "SEQ2", Source: "grail", Feature: "ATG", FeatStart: 16, FeatEnd: 19, FeatScore: floatPtr(2.1), FeatFrame: gff.Frame0, FeatStrand: seq.Minus},
			},
		},
	} {
		// The scanner is driven by the reader's Read method value rather
		// than by the reader itself.
		sc := featio.NewScannerFromFunc(
			gff.NewReader(
				bytes.NewBufferString(g.gff),
			).Read,
		)
		// j counts the features seen and indexes the expected feature.
		var j int
		for sc.Next() {
			f := sc.Feat()
			c.Check(f, check.DeepEquals, g.feat[j], check.Commentf("Test: %d Line: %d", i, j+1))
			j++
		}
		c.Check(sc.Error(), check.Equals, nil)
		c.Check(j, check.Equals, len(g.feat))
	}
}
func readMappings(file string) (map[string]*gff.Feature, error) { f, err := os.Open(file) if err != nil { return nil, err } mapping := make(map[string]*gff.Feature) sc := featio.NewScanner(gff.NewReader(f)) for id := uintptr(1); sc.Next(); id++ { f := sc.Feat().(*gff.Feature) read := f.FeatAttributes.Get("Read") if read == "" { continue } // Currently reefer only expects a single hit per read. mapping[strings.Fields(read)[0]] = f } if err != nil { log.Fatalf("error during GFF read: %v", err) } return mapping, nil }
func main() { groups := make(map[string]struct { chrom string start, end int }) sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) g := f.FeatAttributes.Get("Group") if g == "" { continue } grp, ok := groups[g] if !ok { groups[g] = struct { chrom string start, end int }{chrom: f.SeqName, start: f.FeatStart, end: f.FeatEnd} continue } if f.FeatStart < grp.start { grp.start = f.FeatStart } if grp.end < f.FeatEnd { grp.end = f.FeatEnd } groups[g] = grp } if err := sc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } for k, v := range groups { fmt.Printf("%s\t%d\t%d\t%s\n", v.chrom, v.start, v.end, k) } }
func main() { vm := make(map[string]*vector) fm := make(map[string]string) sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) if f.Feature != "repeat" { continue } att := f.FeatAttributes.Get("repeat") if att == "" { continue } fields := strings.Fields(att) s := mustAtoi(fields[2]) e := mustAtoi(fields[3]) v, ok := vm[fields[0]] if !ok { v = &vector{} vm[fields[0]] = v fm[fields[0]] = fields[1] } for i := e; i >= s; i-- { v.inc(i) } } for typ, vec := range vm { for pos, val := range *vec { if val != 0 { fmt.Printf("%s\t%s\t%d\t%d\n", fm[typ], typ, pos, val) } } } }
func main() { nameSet := make(map[string]struct{}) sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) n := f.FeatAttributes.Get("Read") if n == "" { continue } nameSet[n] = struct{}{} } if err := sc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } names := make([]string, 0, len(nameSet)) for n := range nameSet { names = append(names, n) } sort.Strings(names) for _, n := range names { fmt.Println(n) } }
func main() { if len(os.Args) < 2 { fmt.Fprintln(os.Stderr, "invalid invocation: must have at least one reads file") os.Exit(1) } extract := make(map[string][2]int) sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) read := f.FeatAttributes.Get("Read") if read == "" { continue } fields := strings.Fields(read) name := fields[0] start, err := strconv.Atoi(fields[1]) if err != nil { log.Fatalf("failed to parse %q: %v", read, err) } end, err := strconv.Atoi(fields[2]) if err != nil { log.Fatalf("failed to parse %q: %v", read, err) } extract[name] = [2]int{start, end} } err := sc.Error() if err != nil { log.Fatalf("error during GFF read: %v", err) } for _, reads := range os.Args[1:] { sf, err := os.Open(reads) if err != nil { log.Fatalf("failed to open %q: %v", reads, err) } sr, err := sam.NewReader(sf) if err != nil { log.Fatalf("failed to open SAM input %q: %v", reads, err) } for { r, err := sr.Read() if err != nil { if err != io.EOF { log.Fatalf("unexpected error reading SAM: %v", err) } break } v, ok := extract[r.Name] if !ok { continue } // Currently reefer only expects a single hit per read, // so any multiples are due to duplicate read file input. // Update this behaviour if we change reefer to look at // remapping soft-clipped segments. delete(extract, r.Name) reverse := r.Flags&sam.Reverse != 0 rng := fmt.Sprintf("//%d_%d", v[0], v[1]) if reverse { rng += "(-)" len := r.Seq.Length v[0], v[1] = len-v[1], len-v[0] } v[0] = feat.OneToZero(v[0]) s := linear.NewSeq( r.Name+rng, alphabet.BytesToLetters(r.Seq.Expand())[v[0]:v[1]], alphabet.DNA, ) if reverse { s.Desc = "(sequence revcomp relative to read)" } fmt.Printf("%60a\n", s) } sf.Close() } }
func main() { flag.Parse() if *in == "" || *ref == "" || *mapfile == "" || *contigs == "" { flag.Usage() os.Exit(0) } refTrees, err := readAnnotations(*ref) if err != nil { log.Fatalf("failed to read annotation trees: %v", err) } mapping, err := readMappings(*mapfile) if err != nil { log.Fatalf("failed to read mapping file: %v", err) } contigLength, err := readContigs(*contigs) if err != nil { log.Fatalf("failed to read contig file: %v", err) } f, err := os.Open(*in) if err != nil { log.Fatalf("failed to open %q: %v", *in, err) } w := gff.NewWriter(os.Stdout, 60, true) sc := featio.NewScanner(gff.NewReader(f)) for sc.Next() { f := sc.Feat().(*gff.Feature) ok, err := within(*buf, f.SeqName) if err != nil { log.Fatalf("failed to parse sequence name: %s: %v", f.SeqName, err) } if !ok { log.Printf("too close to read end: excluding %+v", f) continue } repeat := f.FeatAttributes.Get("Repeat") if repeat == "" { continue } fields := strings.Fields(repeat) name := strings.Split(f.SeqName, "//") if len(name) != 2 { log.Fatalf("unexpected sequence name in input: %q", f.SeqName) } contigSide, ok := mapping[name[0]] if !ok { log.Fatalf("unexpected sequence name in input: %q", f.SeqName) } if contigSide.FeatStart+f.FeatStart < *buf { log.Printf("too close to contig start:\n\texcluding %#v\n\tcontig %#v\n\n%d < %d", f, contigSide, contigSide.FeatStart, *buf) continue } length, ok := contigLength[contigSide.SeqName] if !ok { log.Fatalf("unexpected sequence name in contig mapping: %q", contigSide.SeqName) } if length-((contigSide.FeatEnd-contigSide.FeatStart)+f.FeatEnd) < *buf { log.Printf("too close to contig end:\n\texcluding %#v\n\tcontig %#v", f, contigSide) continue } t, ok := refTrees[contigSide.SeqName] if !ok { log.Fatalf("no tree for %v mapped by %v", contigSide.SeqName, f.SeqName) } var n int hits := t.Get(gffInterval{Feature: contigSide}) for _, h := range hits { f := h.(gffInterval) repeat := f.FeatAttributes.Get("Repeat") if repeat == "" { continue } hitClass := 
strings.Fields(repeat)[1] if fields[1] == hitClass { n++ } } if n != 0 { log.Printf("too many hits: excluding %+v", f) for _, h := range hits { log.Printf("\t%+v", h.(gffInterval).Feature) } continue } w.Write(f) } err = sc.Error() if err != nil { log.Fatalf("error during GFF read: %v", err) } }
func main() { flag.Parse() if *in == "" { flag.Usage() os.Exit(1) } f, err := os.Open(*in) if err != nil { log.Fatalf("failed to open %q: %v", *in, err) } defer f.Close() names := make(map[string]map[string]struct{}) sc := featio.NewScanner(gff.NewReader(f)) for sc.Next() { feat := sc.Feat().(*gff.Feature) read := feat.FeatAttributes.Get("Read") if read == "" { continue } read = strings.Fields(read)[0] idx := strings.LastIndex(read, "/") e, ok := names[read[:idx]] if !ok { e = make(map[string]struct{}) names[read[:idx]] = e } e[read[idx+1:]] = struct{}{} } if err := sc.Error(); err != nil { log.Fatalf("error during fasta read: %v", err) } f.Close() base := filepath.Base(*in) unique, err := os.Create(base + ".unique.text") if err != nil { log.Fatalf("failed to create %q: %v", base+".unique.text", err) } defer unique.Close() nonUnique, err := os.Create(base + ".non-unique.text") if err != nil { log.Fatalf("failed to create %q: %v", base+".non-unique.text", err) } defer nonUnique.Close() for name, coords := range names { switch len(coords) { case 0: case 1: fmt.Fprintln(unique, name) default: s := make([]string, 0, len(coords)) for c := range coords { s = append(s, c) } sort.Strings(s) fmt.Fprintf(nonUnique, "%s\t%v\n", name, s) } } }
// main looks for matching sequence on either side of each insertion event
// and writes the events for which an acceptable alignment is found, with the
// alignment attached as a "TSD" attribute, as GFF on standard output.
//
// The events are the features of the GFF file *in; each must carry a "Read"
// attribute of exactly three whitespace-separated fields: sequence name,
// start and end. Every remaining command line argument is read as a FASTA
// file. For each sequence with recorded events, a window to the left of
// start and a window to the right of end are aligned with the table built by
// makeTable. When *fastaOut is set, the segment [start, end) of each event is
// also written to that file.
func main() {
	flag.Var(&alnmat, "align", "specify the match, mismatch and gap parameters")
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	// Load all events up front, grouped by the sequence name they refer to.
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()

	w := gff.NewWriter(os.Stdout, 60, true)
	w.WriteComment("Right coordinates (field 5) and strand (field 7) are hypothetical.")

	// out stays nil unless an insertion FASTA output file was requested.
	var out *os.File
	if *fastaOut != "" {
		out, err = os.Create(*fastaOut)
		if err != nil {
			log.Fatalf("failed to create fasta insertion output file %q: %v", *fastaOut, err)
		}
		defer out.Close()
	}

	// hw is the half window used on each side of a coordinate when the event
	// carries no "Dup" attribute.
	hw := *window / 2
	sw := makeTable(alphabet.DNAgapped, alnmat)
	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
		// NOTE(review): "continue loop" below advances to the next sequence,
		// so any remaining events for the current sequence are not examined
		// — confirm this is intended.
	loop:
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}

				// Emit the inserted segment itself, with its half-open
				// range appended to the description, when requested.
				if out != nil {
					insert := *seq
					if insert.Desc != "" {
						insert.Desc += " "
					}
					insert.Desc += fmt.Sprintf("[%d,%d)", start, end)
					insert.Seq = insert.Seq[start:end]
					fmt.Fprintf(out, "%60a\n", &insert)
				}

				// [lOff, lEnd) is the left window and [rOff, rEnd) the right
				// window, both clamped to the sequence.
				var lOff, lEnd, rOff, rEnd int
				// If we have refined ends, use them.
				if dup := f.FeatAttributes.Get("Dup"); dup != "" {
					d, err := strconv.Atoi(dup)
					if err != nil {
						log.Fatalf("failed to get duplication length: %v", err)
					}
					// d bases immediately before start and after end.
					lOff = max(0, start-d)
					lEnd = start
					rOff = end
					rEnd = min(len(seq.Seq), end+d)
				} else {
					// Windows of up to 2*hw centred on start and on end.
					lOff = max(0, start-hw)
					lEnd = min(len(seq.Seq), start+hw)
					rOff = max(0, end-hw)
					rEnd = min(len(seq.Seq), end+hw)
					// Ensure windows don't overlap.
					if lEnd > rOff {
						lEnd = (lEnd + rOff) / 2
						rOff = lEnd
					}
				}

				if lEnd-lOff < *thresh || rEnd-rOff < *thresh {
					// Don't do fruitless work.
					continue loop
				}

				// Shallow copies of the sequence restricted to each window.
				left := *seq
				left.ID = "prefix"
				left.Seq = left.Seq[lOff:lEnd]
				right := *seq
				right.ID = "postfix"
				right.Seq = right.Seq[rOff:rEnd]

				aln, err := sw.Align(&right, &left)
				if err != nil {
					log.Fatal(err)
				}

				// Reject alignments in which either formatted segment has
				// fewer than *thresh non-gap letters.
				fa := align.Format(&right, &left, aln, '-')
				for _, seg := range fa {
					var n int
					for _, l := range seg.(alphabet.Letters) {
						if l != '-' {
							n++
						}
					}
					if n < *thresh {
						continue loop
					}
				}

				// Sum the scores of all alignment segments.
				var sc int
				for _, seg := range aln {
					type scorer interface {
						Score() int
					}
					sc += seg.(scorer).Score()
				}
				// The TSD value holds, in order: the first formatted segment,
				// two coordinates shifted by the window offsets, the second
				// formatted segment, the quoted alignment and the total score.
				f.FeatAttributes = append(f.FeatAttributes, gff.Attribute{
					Tag:   "TSD",
					Value: fmt.Sprintf(`%v %d %d %v "%v" %d`, fa[0], aln[len(aln)-1].Features()[0].End()+lOff, aln[0].Features()[1].Start()+rOff, fa[1], aln, sc),
				})
				w.Write(f)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}