// hitSetFrom returns a hitSet from mapping reads to the given reference // using the suffix array file if provided. If run is false, blasr is not // run and the existing blasr output is used to reconstruct the hitSet. // procs specifies the number of blasr threads to use. func hitSetFrom(reads, ref, suff string, procs int, run bool) (hitSet, error) { base := filepath.Base(reads) b := blasr.BLASR{ Cmd: *blasrPath, Reads: reads, Genome: ref, SuffixArray: suff, BestN: 1, Format: 4, Aligned: base + ".blasr", Unaligned: base + ".blasr.unmapped", Procs: procs, } if run { cmd, err := b.BuildCommand() if err != nil { return nil, err } cmd.Stdout = errStream cmd.Stderr = errStream err = cmd.Run() if err != nil { return nil, err } } f, err := os.Open(b.Aligned) if err != nil { return nil, err } defer f.Close() hits := make(hitSet) sc := bufio.NewScanner(f) for sc.Scan() { b, err := newBlasrHit(sc.Text()) if err != nil { return nil, err } hits[b.qName] = b } return hits, sc.Err() }
// deletions analyses *sam.Records from mapping reads to the given reference // using the suffix array file if provided. If run is false, blasr is not // run and the existing blasr output is used to provide the *sam.Records. // procs specifies the number of blasr threads to use. func deletions(reads, ref, suff, ext string, procs int, run bool, window, min int, br *refiner, w *gff.Writer) error { base := filepath.Base(reads) b := blasr.BLASR{ Cmd: *blasrPath, Reads: reads, Genome: ref, SuffixArray: suff, BestN: 1, SAM: true, Clipping: "soft", SAMQV: true, CIGARSeqMatch: true, Aligned: base + ".blasr." + ext, Unaligned: base + ".blasr.unmapped.fasta", Procs: procs, } if run { cmd, err := b.BuildCommand() if err != nil { return err } cmd.Stdout = errStream cmd.Stderr = errStream err = cmd.Run() if err != nil { return err } } f, err := os.Open(b.Aligned) if err != nil { return err } defer f.Close() cost := [...]float64{ sam.CigarInsertion: -2, sam.CigarDeletion: -2, sam.CigarEqual: 1, sam.CigarMismatch: -1, // Included for explicitness sam.CigarSoftClipped: 0, // Included to ensure no bounds panic. // All CIGAR operations not listed above // are given a zero cost. sam.CigarBack: 0, } _, err = w.WriteComment(fmt.Sprintf("smoothing window=%d", window)) if err != nil { return nil } _, err = w.WriteComment(fmt.Sprintf("minimum feature length=%d", min)) if err != nil { return nil } gf := &gff.Feature{ Source: "reefer", Feature: "discordance", FeatFrame: gff.NoFrame, FeatAttributes: gff.Attributes{{Tag: "Read"}, {Tag: "Dup"}}, } var sr interface { Read() (*sam.Record, error) } switch ext { case "sam": sr, err = sam.NewReader(f) if err != nil { return err } case "bam": var br *bam.Reader br, err = bam.NewReader(f, 0) if err != nil { return err } defer br.Close() sr = br default: panic("reefer: invalid extension") } for { r, err := sr.Read() if err != nil { if err != io.EOF { return err } break } var ( scores []costPos ref = r.Start() query int ) for _, co := range r.Cigar { for i := 0; i < co.Len(); i++ { scores = append(scores, costPos{ ref: ref, query: query, cost: cost[co.Type()], }) consume := co.Type().Consumes() ref += consume.Reference query += consume.Query } } if len(scores) <= window { continue } smoothed := make([]costPos, len(scores)-window) for i := range scores[:len(scores)-window] { smoothed[i] = mean(scores[i : i+window]) } var d deletion for i, v := range smoothed[1:] { switch { case d.record == nil && v.cost < 0 && smoothed[i].cost >= 0: d = deletion{record: r, rstart: v.ref + 1, qstart: v.query + 1} case d.record != nil && v.cost >= 0 && smoothed[i].cost < 0: d.rend = v.ref d.qend = v.query if d.rend-d.rstart >= min || d.qend-d.qstart >= min { gf.SeqName = d.record.Ref.Name() gf.FeatStrand = strandFor(d.record) if gf.FeatStrand == seq.Minus { len := d.record.Seq.Length d.qstart, d.qend = len-d.qend, len-d.qstart } // Adjust ends based on paired SW alignments. var refined bool d, refined, err = br.adjust(d) if err != nil && *verbose { log.Printf("failed alignment %s: %v", d.record.Name, err) } gf.FeatStart = d.rstart gf.FeatEnd = d.rend if gf.FeatStart == gf.FeatEnd { // This is disgusting garbage resulting from // GFF not allowing zero length features. gf.FeatEnd++ } if refined { gf.FeatAttributes = gf.FeatAttributes[:2] gf.FeatAttributes[1].Value = strconv.Itoa(d.dup) } else { gf.FeatAttributes = gf.FeatAttributes[:1] } gf.FeatAttributes[0].Value = fmt.Sprintf("%s %d %d", d.record.Name, feat.ZeroToOne(d.qstart), d.qend) _, err = w.Write(gf) if err != nil { return err } } d.record = nil } } } return nil }