// WriteMetaData writes a meta data line to a GFF file. The type of metadata line // depends on the type of d: strings and byte slices are written verbatim, an int is // interpreted as a version number and can only be written before any other data, // feat.Moltype and gff.Sequence types are written as sequence type lines, gff.Features // and gff.Regions are written as sequence regions, sequences are written _n GFF // format and time.Time values are written as date line. All other type return an // ErrNotHandled. func (w *Writer) WriteMetaData(d interface{}) (n int, err error) { defer func() { w.header = true }() switch d := d.(type) { case string: return fmt.Fprintf(w.w, "##%s\n", d) case []byte: return fmt.Fprintf(w.w, "##%s\n", d) case int: if w.header { return 0, ErrCannotHeader } return fmt.Fprintf(w.w, "##gff-version %d\n", d) case feat.Moltype: return fmt.Fprintf(w.w, "##Type %s\n", d) case Sequence: return fmt.Fprintf(w.w, "##Type %s %s\n", d.Type, d.SeqName) case *Feature: return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", d.SeqName, feat.ZeroToOne(d.FeatStart), d.FeatEnd) case feat.Feature: return w.Write(d) case time.Time: return fmt.Fprintf(w.w, "##date %s\n", d.Format(w.TimeFormat)) } return 0, ErrNotHandled }
// Write writes a single feature and return the number of bytes written and any error. func (w *Writer) Write(pair *Pair) (n int, err error) { t := w.t t.SeqName = pair.B.Location().Name() t.FeatStart = pair.B.Start() t.FeatEnd = pair.B.End() t.FeatScore = floatPtr(float64(pair.Score)) t.FeatStrand = pair.Strand t.FeatFrame = gff.NoFrame t.FeatAttributes = append(t.FeatAttributes[:0], gff.Attribute{ Tag: "Target", Value: fmt.Sprintf("%s %d %d", pair.A.Location().Name(), feat.ZeroToOne(pair.A.Start()), pair.A.End()), }, gff.Attribute{ Tag: "maxe", Value: fmt.Sprintf("%.2g", pair.Error), }, ) return w.w.Write(t) }
// Write writes a single feature and return the number of bytes written and any error. // gff.Features are written as a canonical GFF line, seq.Sequences are written as inline // sequence in GFF format (note that only sequences of feat.Moltype DNA, RNA and Protein // are supported). gff.Sequences are not handled as they have a zero length. All other // feat.Feature are written as sequence region metadata lines. func (w *Writer) Write(f feat.Feature) (n int, err error) { if f.Start() >= f.End() { return 0, ErrBadFeature } w.header = true switch f := f.(type) { case *Feature: defer func() { if err != nil { return } _, err = w.w.Write([]byte{'\n'}) if err != nil { return } n++ }() n, err = fmt.Fprintf(w.w, "%s\t%s\t%s\t%d\t%d\t", f.SeqName, f.Source, f.Feature, feat.ZeroToOne(f.FeatStart), f.FeatEnd, ) if err != nil { return n, err } var _n int if f.FeatScore != nil && !math.IsNaN(*f.FeatScore) { if w.Precision < 0 { _n, err = fmt.Fprintf(w.w, "%v", *f.FeatScore) } else { _n, err = fmt.Fprintf(w.w, "%.*f", w.Precision, *f.FeatScore) } if err != nil { return n, err } n += _n } else { _, err = w.w.Write([]byte{'.'}) if err != nil { return n, err } n++ } _n, err = fmt.Fprintf(w.w, "\t%s\t%s", f.FeatStrand, f.FeatFrame, ) n += _n if err != nil { return n, err } if f.FeatAttributes != nil { _n, err = fmt.Fprintf(w.w, "\t%v", f.FeatAttributes) if err != nil { return n, err } n += _n } else if f.Comments != "" { _, err = w.w.Write([]byte{'\t'}) if err != nil { return } n++ } if f.Comments != "" { _n, err = fmt.Fprintf(w.w, "\t%s", f.Comments) n += _n } return n, err case seq.Sequence: sw := fasta.NewWriter(w.w, w.Width) moltype := f.Alphabet().Moltype() if moltype < feat.DNA || moltype > feat.Protein { return 0, ErrNotHandled } sw.IDPrefix = [...][]byte{ feat.DNA: []byte("##DNA "), feat.RNA: []byte("##RNA "), feat.Protein: []byte("##Protein "), }[moltype] sw.SeqPrefix = []byte("##") n, err = sw.Write(f) if err != nil { return n, err } var _n int _n, err = w.w.Write([...][]byte{ feat.DNA: []byte("##end-DNA\n"), feat.RNA: []byte("##end-RNA\n"), feat.Protein: []byte("##end-Protein\n"), }[moltype]) return n + _n, err case Sequence: return 0, ErrNotHandled case *Region: return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", f.SeqName, feat.ZeroToOne(f.RegionStart), f.RegionEnd) default: return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", f.Name(), feat.ZeroToOne(f.Start()), f.End()) } }
// deletions analyses *sam.Records from mapping reads to the given reference // using the suffix array file if provided. If run is false, blasr is not // run and the existing blasr output is used to provide the *sam.Records. // procs specifies the number of blasr threads to use. func deletions(reads, ref, suff, ext string, procs int, run bool, window, min int, br *refiner, w *gff.Writer) error { base := filepath.Base(reads) b := blasr.BLASR{ Cmd: *blasrPath, Reads: reads, Genome: ref, SuffixArray: suff, BestN: 1, SAM: true, Clipping: "soft", SAMQV: true, CIGARSeqMatch: true, Aligned: base + ".blasr." + ext, Unaligned: base + ".blasr.unmapped.fasta", Procs: procs, } if run { cmd, err := b.BuildCommand() if err != nil { return err } cmd.Stdout = errStream cmd.Stderr = errStream err = cmd.Run() if err != nil { return err } } f, err := os.Open(b.Aligned) if err != nil { return err } defer f.Close() cost := [...]float64{ sam.CigarInsertion: -2, sam.CigarDeletion: -2, sam.CigarEqual: 1, sam.CigarMismatch: -1, // Included for explicitness sam.CigarSoftClipped: 0, // Included to ensure no bounds panic. // All CIGAR operations not listed above // are given a zero cost. sam.CigarBack: 0, } _, err = w.WriteComment(fmt.Sprintf("smoothing window=%d", window)) if err != nil { return nil } _, err = w.WriteComment(fmt.Sprintf("minimum feature length=%d", min)) if err != nil { return nil } gf := &gff.Feature{ Source: "reefer", Feature: "discordance", FeatFrame: gff.NoFrame, FeatAttributes: gff.Attributes{{Tag: "Read"}, {Tag: "Dup"}}, } var sr interface { Read() (*sam.Record, error) } switch ext { case "sam": sr, err = sam.NewReader(f) if err != nil { return err } case "bam": var br *bam.Reader br, err = bam.NewReader(f, 0) if err != nil { return err } defer br.Close() sr = br default: panic("reefer: invalid extension") } for { r, err := sr.Read() if err != nil { if err != io.EOF { return err } break } var ( scores []costPos ref = r.Start() query int ) for _, co := range r.Cigar { for i := 0; i < co.Len(); i++ { scores = append(scores, costPos{ ref: ref, query: query, cost: cost[co.Type()], }) consume := co.Type().Consumes() ref += consume.Reference query += consume.Query } } if len(scores) <= window { continue } smoothed := make([]costPos, len(scores)-window) for i := range scores[:len(scores)-window] { smoothed[i] = mean(scores[i : i+window]) } var d deletion for i, v := range smoothed[1:] { switch { case d.record == nil && v.cost < 0 && smoothed[i].cost >= 0: d = deletion{record: r, rstart: v.ref + 1, qstart: v.query + 1} case d.record != nil && v.cost >= 0 && smoothed[i].cost < 0: d.rend = v.ref d.qend = v.query if d.rend-d.rstart >= min || d.qend-d.qstart >= min { gf.SeqName = d.record.Ref.Name() gf.FeatStrand = strandFor(d.record) if gf.FeatStrand == seq.Minus { len := d.record.Seq.Length d.qstart, d.qend = len-d.qend, len-d.qstart } // Adjust ends based on paired SW alignments. var refined bool d, refined, err = br.adjust(d) if err != nil && *verbose { log.Printf("failed alignment %s: %v", d.record.Name, err) } gf.FeatStart = d.rstart gf.FeatEnd = d.rend if gf.FeatStart == gf.FeatEnd { // This is disgusting garbage resulting from // GFF not allowing zero length features. gf.FeatEnd++ } if refined { gf.FeatAttributes = gf.FeatAttributes[:2] gf.FeatAttributes[1].Value = strconv.Itoa(d.dup) } else { gf.FeatAttributes = gf.FeatAttributes[:1] } gf.FeatAttributes[0].Value = fmt.Sprintf("%s %d %d", d.record.Name, feat.ZeroToOne(d.qstart), d.qend) _, err = w.Write(gf) if err != nil { return err } } d.record = nil } } } return nil }