// Write writes r to the BAM stream. func (bw *Writer) Write(r *sam.Record) error { if len(r.Name) == 0 || len(r.Name) > 254 { return errors.New("bam: name absent or too long") } if r.Qual != nil && len(r.Qual) != r.Seq.Length { return errors.New("bam: sequence/quality length mismatch") } tags := buildAux(r.AuxFields) recLen := bamFixedRemainder + len(r.Name) + 1 + // Null terminated. len(r.Cigar)<<2 + // CigarOps are 4 bytes. len(r.Seq.Seq) + len(r.Qual) + len(tags) bw.buf.Reset() wb := errWriter{w: &bw.buf} bin := binaryWriter{w: &wb} // Write record header data. bin.writeInt32(int32(recLen)) bin.writeInt32(int32(r.Ref.ID())) bin.writeInt32(int32(r.Pos)) bin.writeUint8(byte(len(r.Name) + 1)) bin.writeUint8(r.MapQ) bin.writeUint16(uint16(r.Bin())) //r.bin bin.writeUint16(uint16(len(r.Cigar))) bin.writeUint16(uint16(r.Flags)) bin.writeInt32(int32(r.Seq.Length)) bin.writeInt32(int32(r.MateRef.ID())) bin.writeInt32(int32(r.MatePos)) bin.writeInt32(int32(r.TempLen)) // Write variable length data. wb.Write(append([]byte(r.Name), 0)) writeCigarOps(&bin, r.Cigar) wb.Write(doublets(r.Seq.Seq).Bytes()) if r.Qual != nil { wb.Write(r.Qual) } else { for i := 0; i < r.Seq.Length; i++ { wb.WriteByte(0xff) } } wb.Write(tags) if wb.err != nil { return wb.err } _, err := bw.bg.Write(bw.buf.Bytes()) return err }
func (d *DiversityFilter) Diff(r *sam.Record, genome []byte) (diff, length int) { start := r.Start() end := r.End() if start < 0 || end > len(genome) { if *debug { text, err := r.MarshalSAM(sam.FlagDecimal) raiseError(err) log.Printf("acc: %s, genome length %d, read starts at %d and ends at %d: %s\n", r.Ref.Name(), len(genome), start, end, text) } length = 0 return } refSeq := genome[start:end] diff = 0 read := map2Ref(r) length = len(read) for i := 0; i < length; i++ { if read[i] != refSeq[i] { diff++ } } return }
func (d *DiversityFilter) filter(buf []*sam.Record, acc string, genome []byte) (out []*sam.Record, acc1 string, genome1 []byte) { fn := func(txn *lmdb.Txn) error { dbi, err := txn.OpenDBI("read", 0) if err != nil { return err } for _, r := range buf { key := []byte(r.Name) val, err := txn.Get(dbi, key) if err != nil { if lmdb.IsNotFound(err) { val, err = r.MarshalText() if err != nil { return err } err = txn.Put(dbi, key, val, 0) if err != nil { return err } } else { return err } } else { var mate *sam.Record = &sam.Record{} err := mate.UnmarshalText(val) raiseError(err) if r.Ref.Name() == mate.Ref.Name() { if acc != r.Ref.Name() { genome, err = d.findGenome(r, d.featureDB, "fna") raiseError(err) acc = r.Ref.Name() if *debug { log.Println(acc) } } diff1, len1 := d.Diff(r, genome) diff2, len2 := d.Diff(mate, genome) if len1 > 0 && len2 > 0 && float64(diff1+diff2)/float64(len1+len2) <= d.Cutoff { out = append(out, r) out = append(out, mate) } else { if *debug { log.Printf("%d, %d, %d, %d\n", diff1, diff2, len1, len2) } } } txn.Del(dbi, key, val) } } return nil } retry: err := d.db.Update(fn) if lmdb.IsMapFull(err) { d.sizeDB *= 2 err = d.db.SetMapSize(d.sizeDB) raiseError(err) goto retry } raiseError(err) genome1 = genome acc1 = acc return }
// Read returns the next sam.Record in the BAM stream. func (br *Reader) Read() (*sam.Record, error) { if br.c != nil && vOffset(br.r.LastChunk().End) >= vOffset(br.c.End) { return nil, io.EOF } r := errReader{r: br.r} bin := binaryReader{r: &r} // Read record header data. blockSize := int(bin.readInt32()) r.n = 0 // The blocksize field is not included in the blocksize. // br.r.Chunk() is only valid after the call the Read(), so this // must come after the first read in the record. tx := br.r.Begin() defer func() { br.lastChunk = tx.End() }() var rec sam.Record refID := bin.readInt32() rec.Pos = int(bin.readInt32()) nLen := bin.readUint8() rec.MapQ = bin.readUint8() _ = bin.readUint16() nCigar := bin.readUint16() rec.Flags = sam.Flags(bin.readUint16()) lSeq := bin.readInt32() nextRefID := bin.readInt32() rec.MatePos = int(bin.readInt32()) rec.TempLen = int(bin.readInt32()) if r.err != nil { return nil, r.err } // Read variable length data. name := make([]byte, nLen) if nf, _ := r.Read(name); nf != int(nLen) { return nil, errors.New("bam: truncated record name") } rec.Name = string(name[:len(name)-1]) // The BAM spec indicates name is null terminated. rec.Cigar = readCigarOps(&bin, nCigar) if r.err != nil { return nil, r.err } seq := make(doublets, (lSeq+1)>>1) if nf, _ := r.Read(seq.Bytes()); nf != int((lSeq+1)>>1) { return nil, errors.New("bam: truncated sequence") } rec.Seq = sam.Seq{Length: int(lSeq), Seq: seq} rec.Qual = make([]byte, lSeq) if nf, _ := r.Read(rec.Qual); nf != int(lSeq) { return nil, errors.New("bam: truncated quality") } auxTags := make([]byte, blockSize-r.n) r.Read(auxTags) if r.n != blockSize { return nil, errors.New("bam: truncated auxilliary data") } rec.AuxFields = parseAux(auxTags) if r.err != nil { return nil, r.err } refs := int32(len(br.h.Refs())) if refID != -1 { if refID < -1 || refID >= refs { return nil, errors.New("bam: reference id out of range") } rec.Ref = br.h.Refs()[refID] } if nextRefID != -1 { if nextRefID < -1 || nextRefID >= refs { return nil, errors.New("bam: mate reference id out of range") } rec.MateRef = br.h.Refs()[nextRefID] } return &rec, nil }
// Add records the SAM record as having being located at the given chunk. func (i *Index) Add(r *sam.Record, c bgzf.Chunk) error { return i.idx.Add(r, uint32(r.Bin()), c, isPlaced(r), isMapped(r)) }