// readFlipped() reads the compressed bitstream that indicates whether a read // was flipped or not. If the file does not exist, returns nil. func readFlipped(flippedFN string) []bool { // open the file; return empty if nothing there flippedIn, err := os.Open(flippedFN) if err == nil { log.Printf("Reading flipped bits from %s", flippedFN) defer flippedIn.Close() flippedZ, err := gzip.NewReader(flippedIn) DIE_ON_ERR(err, "Couldn't create unzipper for flipped file") defer flippedZ.Close() flippedBits := bitio.NewReader(bufio.NewReader(flippedZ)) defer flippedBits.Close() flipped := make([]bool, 0, 1000000) for { b, err := flippedBits.ReadBit() if err != nil { break } if b > 0 { flipped = append(flipped, true) } else { flipped = append(flipped, false) } } log.Printf("Read %d bits indicating whether reads were flipped.", len(flipped)) return flipped } else { log.Printf("No flipped bit file (%s) found; ignoring.", flippedFN) return nil } }
// decodeKmersFromFile() opens the given gzipped bittree file and extracts the // stored kmers. func decodeKmersFromFile(filename string, k int) []string { log.Printf("Decoding kmer buckets from %v", filename) // open the file and wrap a bit reader around it bittree, err := os.Open(filename) DIE_ON_ERR(err, "Couldn't open bitree file %s", filename) defer bittree.Close() bittreeZ, err := gzip.NewReader(bittree) DIE_ON_ERR(err, "Couldn't create gzipper") defer bittreeZ.Close() in := bitio.NewReader(bufio.NewReader(bittreeZ)) defer in.Close() // start a routine to produce the bits bits := make(chan byte, 1000000) go readBits(in, bits) // make a channel to get the output out := make(chan string, 1000000) // decode and pass the input to the decoded output go decodeBitTree(bits, k, out) kmers := make([]string, 0) for s := range out { kmers = append(kmers, s) } log.Printf("done; found %v kmers", len(kmers)) return kmers }
// main() encodes or decodes a set of reads based on the first command line // argument (which is either encode or decode). func main() { fmt.Println("kpath Copyright (C) 2014 Carl Kingsford & Rob Patro\n") fmt.Println("This program comes with ABSOLUTELY NO WARRANTY; This is free software, and") fmt.Println("you are welcome to redistribute it under certain conditions; see") fmt.Println("accompanying LICENSE.txt file.\n") log.Println("Starting kpath version 0.6.1 (6-19-14)") startTime := time.Now() log.Printf("Maximum threads = %v", maxThreads) runtime.GOMAXPROCS(maxThreads) // parse the command line const ( ENCODE int = 1 DECODE int = 2 ) if len(os.Args) < 2 { encodeFlags.PrintDefaults() os.Exit(1) } var mode int if os.Args[1][0] == 'e' { mode = ENCODE log.SetPrefix("kpath (encode): ") } else { mode = DECODE log.SetPrefix("kpath (decode): ") } encodeFlags.Parse(os.Args[2:]) if globalK <= 0 || globalK > 16 { log.Fatalf("K must be specified as a small positive integer with -k") } log.Printf("Using kmer size = %d", globalK) setShiftKmerMask() if refFile == "" { log.Fatalf("Must specify gzipped fasta as reference with -ref") } if readFile == "" { log.Println("Must specify input file with -reads") log.Fatalln("If decoding, just give basename of encoded files.") } if outFile == "" { log.Println("Must specify output location with -out") log.Println("If encoding, omit extension.") } if cpuProfile != "" { log.Printf("Writing CPU profile to %s", cpuProfile) cpuF, err := os.Create(cpuProfile) DIE_ON_ERR(err, "Couldn't create CPU profile file %s", cpuProfile) pprof.StartCPUProfile(cpuF) defer pprof.StopCPUProfile() } // count the kmers in the reference var hash KmerHash waitForReference := make(chan struct{}) go func() { refStart := time.Now() hash = countKmersInReference(globalK, refFile) log.Printf("There are %v unique %v-mers in the reference\n", len(hash), globalK) log.Printf("Time: Took %v seconds to read reference.", time.Now().Sub(refStart).Seconds()) close(waitForReference) return }() writeGlobalOptions() if mode == ENCODE { /* encode -k -ref -reads=FOO.seq -out=OUT will encode into OUT.{enc,bittree,counts} */ log.Printf("Reading from %s", readFile) log.Printf("Writing to %s, %s, %s", outFile+".enc", outFile+".bittree", outFile+".counts") // create the output file outF, err := os.Create(outFile + ".enc") DIE_ON_ERR(err, "Couldn't create output file %s", outFile) defer outF.Close() //outBuf := bufio.NewWriterSize(outF, 200000000) //defer outBuf.Flush() writer := bitio.NewWriter(outF) defer writer.Close() // create encoder encoder := arithc.NewEncoder(writer) defer encoder.Finish() // encode reads <-waitForReference tempReadFile, buckets, counts := preprocessWithBuckets(readFile, outFile, hash) n := encodeReadsFromTempFile(tempReadFile, buckets, counts, hash, encoder) log.Printf("Reads Flipped: %v", flipped) log.Printf("Encoded %v reads (may be < # of input reads due to duplicates).", n) } else { /* decode -k -ref -reads=FOO -out=OUT.seq will look for FOO.enc, FOO.bittree, FOO.counts and decode into OUT.seq */ tailsFN := readFile + ".enc" headsFN := readFile + ".bittree" countsFN := readFile + ".counts" log.Printf("Reading from %s, %s, and %s", tailsFN, headsFN, countsFN) // read the bucket names var kmers []string waitForBuckets := make(chan struct{}) go func() { kmers = decodeKmersFromFile(headsFN, globalK) sort.Strings(kmers) close(waitForBuckets) runtime.Goexit() return }() // read the bucket counts var counts []int var readlen int waitForCounts := make(chan struct{}) go func() { counts, readlen = readBucketCounts(countsFN) close(waitForCounts) runtime.Goexit() return }() // read the flipped bits --- flipped by be 0-length if no file could be // found; this indicates that either nothing was flipped or we don't // care about orientation var flipped []bool waitForFlipped := make(chan struct{}) go func() { flipped = readFlipped(readFile + ".flipped") close(waitForFlipped) runtime.Goexit() return }() // read the NLocations, which might be 0-length if no file could be // found; this indicates that the Ns were recorded some other way. var NLocations [][]byte waitForNLocations := make(chan struct{}) go func() { NLocations = readNLocations(readFile + ".ns") close(waitForNLocations) runtime.Goexit() return }() // open encoded read file encIn, err := os.Open(tailsFN) DIE_ON_ERR(err, "Can't open encoded read file %s", tailsFN) defer encIn.Close() readerBuf := bufio.NewReader(encIn) // create a bit reader wrapper around it reader := bitio.NewReader(readerBuf) defer reader.Close() // create a decoder around it decoder, err := arithc.NewDecoder(reader) DIE_ON_ERR(err, "Couldn't create decoder!") // create the output file log.Printf("Writing to %s", outFile) outF, err := os.Create(outFile) DIE_ON_ERR(err, "Couldn't create output file %s", outFile) defer outF.Close() <-waitForReference <-waitForBuckets <-waitForCounts <-waitForFlipped <-waitForNLocations log.Printf("Read length = %d", readlen) decodeReads(kmers, counts, flipped, NLocations, hash, readlen, outF, decoder) } log.Printf("Default interval used %v times and context used %v times", defaultIntervalSum, contextExists) endTime := time.Now() log.Printf("kpath took %v to run.", endTime.Sub(startTime).Seconds()) /* UNCOMMENT TO DEBUG GARBAGE COLLECTION WITH GO 1.2 var stats debug.GCStats stats.PauseQuantiles = make([]time.Duration, 5) debug.ReadGCStats(&stats) log.Printf("Last GC=%v\nNum GC=%v\nPause for GC=%v\nPauseHistory=%v", stats.LastGC, stats.NumGC, stats.PauseTotal.Seconds(), stats.Pause) */ }