// preprocessWithBuckets() reads the reads, creates the buckets, saves the
// buckets and their counts, and then encodes each read.
func preprocessWithBuckets(
	readFile string,
	outBaseName string,
	hash KmerHash,
) (*os.File, []string, []int) {
	// read the reads and flip as needed
	reads := readAndFlipReads(readFile, hash, flipReadsOption)
	readLength := len(reads[0].Seq)
	log.Printf("Estimated 2-bit encoding size: %d",
		uint64(math.Ceil(float64(2*len(reads)*readLength)/8.0)))

	// if the user wants the flip orientations written out
	waitForFlipped := make(chan struct{})
	if writeFlippedOption {
		outFlipped, err := os.Create(outBaseName + ".flipped")
		DIE_ON_ERR(err, "Couldn't create flipped file: %s", outBaseName+".flipped")
		defer outFlipped.Close()

		outFlippedZ, err := gzip.NewWriterLevel(outFlipped, gzip.BestCompression)
		DIE_ON_ERR(err, "Couldn't create gzipper for flipped file.")
		defer outFlippedZ.Close()

		flippedBits := bitio.NewWriter(outFlippedZ)
		defer flippedBits.Close()

		go func() {
			writeFlipped(flippedBits, reads)
			close(waitForFlipped)
		}()
	} else {
		close(waitForFlipped)
	}

	// if the user wants to write out the N positions, write them out
	waitForNs := make(chan struct{})
	if writeNsOption {
		outNs, err := os.Create(outBaseName + ".ns")
		DIE_ON_ERR(err, "Couldn't create N location file: %s", outBaseName+".ns")
		defer outNs.Close()

		outNsZ, err := gzip.NewWriterLevel(outNs, gzip.BestCompression)
		DIE_ON_ERR(err, "Couldn't create gzipper for N location file.")
		defer outNsZ.Close()

		go func() {
			writeNLocations(outNsZ, reads)
			close(waitForNs)
		}()
	} else {
		close(waitForNs)
	}

	// create the buckets and counts
	buckets, counts := listBuckets(reads)

	// write the bittree for the buckets out to a file
	outBT, err := os.Create(outBaseName + ".bittree")
	DIE_ON_ERR(err, "Couldn't create bucket file: %s", outBaseName+".bittree")
	defer outBT.Close()

	// compress the file with gzip as we are writing it
	outBZ, err := gzip.NewWriterLevel(outBT, gzip.BestCompression)
	DIE_ON_ERR(err, "Couldn't create gzipper for bucket file")
	defer outBZ.Close()

	// create a writer that lets us write bits
	writer := bitio.NewWriter(outBZ)
	defer writer.Close()

	/*** The main work to encode the bucket names ***/
	waitForBuckets := make(chan struct{})
	go func() {
		encodeKmersToFile(buckets, writer)
		close(waitForBuckets)
	}()

	// write out the counts
	countF, err := os.Create(outBaseName + ".counts")
	DIE_ON_ERR(err, "Couldn't create counts file: %s", outBaseName+".counts")
	defer countF.Close()

	// compress it as we are writing it
	countZ, err := gzip.NewWriterLevel(countF, gzip.BestCompression)
	DIE_ON_ERR(err, "Couldn't create gzipper for count file")
	defer countZ.Close()

	/*** The main work to encode the bucket counts ***/
	waitForCounts := make(chan struct{})
	go func() {
		writeCounts(countZ, readLength, counts)
		close(waitForCounts)
	}()

	// create a temp file containing the processed reads
	processedFile, err := ioutil.TempFile("", "kpath-encode-")
	DIE_ON_ERR(err, "Couldn't create temporary file in %s", os.TempDir())
	md5Hash := md5.New()
	waitForTemp := make(chan struct{})
	go func() {
		for i := range reads {
			md5Hash.Write(reads[i].Seq)
			processedFile.Write(reads[i].Seq)
			processedFile.Write([]byte{'\n'})
		}
		processedFile.Seek(0, 0)
		close(waitForTemp)
	}()

	// Wait for each of the coders to finish
	<-waitForBuckets
	<-waitForCounts
	<-waitForNs
	<-waitForFlipped
	<-waitForTemp

	// the digest is only complete (and safe to read) once the writer
	// goroutine above has finished
	log.Printf("MD5 hash of reads = %x", md5Hash.Sum(nil))

	log.Printf("Done processing; reads are of length %d ...", readLength)
	return processedFile, buckets, counts
}
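
// bucketSyncSketch is an illustrative sketch only (it is not called anywhere
// in kpath): it shows the synchronization idiom used by preprocessWithBuckets()
// above, in which each writer goroutine signals completion by closing a
// dedicated chan struct{} and the caller blocks on a receive from that
// channel. The doWork parameter is a stand-in for any one of the writers.
func bucketSyncSketch(doWork func()) {
	done := make(chan struct{})
	go func() {
		doWork()    // e.g., compress and write one output stream
		close(done) // closing, rather than sending, signals completion
	}()
	// other writers or work could proceed concurrently here
	<-done // a receive on a closed channel returns immediately
}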
// main() encodes or decodes a set of reads based on the first command line
// argument (which is either encode or decode).
func main() {
	fmt.Println("kpath Copyright (C) 2014 Carl Kingsford & Rob Patro\n")
	fmt.Println("This program comes with ABSOLUTELY NO WARRANTY; This is free software, and")
	fmt.Println("you are welcome to redistribute it under certain conditions; see")
	fmt.Println("accompanying LICENSE.txt file.\n")

	log.Println("Starting kpath version 0.6.1 (6-19-14)")
	startTime := time.Now()

	log.Printf("Maximum threads = %v", maxThreads)
	runtime.GOMAXPROCS(maxThreads)

	// parse the command line
	const (
		ENCODE int = 1
		DECODE int = 2
	)
	if len(os.Args) < 2 {
		encodeFlags.PrintDefaults()
		os.Exit(1)
	}
	var mode int
	if os.Args[1][0] == 'e' {
		mode = ENCODE
		log.SetPrefix("kpath (encode): ")
	} else {
		mode = DECODE
		log.SetPrefix("kpath (decode): ")
	}
	encodeFlags.Parse(os.Args[2:])

	if globalK <= 0 || globalK > 16 {
		log.Fatalf("K must be specified as a small positive integer with -k")
	}
	log.Printf("Using kmer size = %d", globalK)
	setShiftKmerMask()

	if refFile == "" {
		log.Fatalf("Must specify gzipped fasta as reference with -ref")
	}

	if readFile == "" {
		log.Println("Must specify input file with -reads")
		log.Fatalln("If decoding, just give basename of encoded files.")
	}

	if outFile == "" {
		log.Println("Must specify output location with -out")
		log.Fatalln("If encoding, omit extension.")
	}

	if cpuProfile != "" {
		log.Printf("Writing CPU profile to %s", cpuProfile)
		cpuF, err := os.Create(cpuProfile)
		DIE_ON_ERR(err, "Couldn't create CPU profile file %s", cpuProfile)
		pprof.StartCPUProfile(cpuF)
		defer pprof.StopCPUProfile()
	}

	// count the kmers in the reference
	var hash KmerHash
	waitForReference := make(chan struct{})
	go func() {
		refStart := time.Now()
		hash = countKmersInReference(globalK, refFile)
		log.Printf("There are %v unique %v-mers in the reference\n", len(hash), globalK)
		log.Printf("Time: Took %v seconds to read reference.",
			time.Now().Sub(refStart).Seconds())
		close(waitForReference)
	}()

	writeGlobalOptions()

	if mode == ENCODE {
		/* encode -k -ref -reads=FOO.seq -out=OUT
		   will encode into OUT.{enc,bittree,counts} */
		log.Printf("Reading from %s", readFile)
		log.Printf("Writing to %s, %s, %s",
			outFile+".enc", outFile+".bittree", outFile+".counts")

		// create the output file
		outF, err := os.Create(outFile + ".enc")
		DIE_ON_ERR(err, "Couldn't create output file %s", outFile)
		defer outF.Close()

		//outBuf := bufio.NewWriterSize(outF, 200000000)
		//defer outBuf.Flush()

		writer := bitio.NewWriter(outF)
		defer writer.Close()

		// create encoder
		encoder := arithc.NewEncoder(writer)
		defer encoder.Finish()

		// encode reads
		<-waitForReference
		tempReadFile, buckets, counts := preprocessWithBuckets(readFile, outFile, hash)
		n := encodeReadsFromTempFile(tempReadFile, buckets, counts, hash, encoder)
		log.Printf("Reads Flipped: %v", flipped)
		log.Printf("Encoded %v reads (may be < # of input reads due to duplicates).", n)

	} else {
		/* decode -k -ref -reads=FOO -out=OUT.seq
		   will look for FOO.enc, FOO.bittree, FOO.counts
		   and decode into OUT.seq */
		tailsFN := readFile + ".enc"
		headsFN := readFile + ".bittree"
		countsFN := readFile + ".counts"
		log.Printf("Reading from %s, %s, and %s", tailsFN, headsFN, countsFN)

		// read the bucket names
		var kmers []string
		waitForBuckets := make(chan struct{})
		go func() {
			kmers = decodeKmersFromFile(headsFN, globalK)
			sort.Strings(kmers)
			close(waitForBuckets)
		}()

		// read the bucket counts
		var counts []int
		var readlen int
		waitForCounts := make(chan struct{})
		go func() {
			counts, readlen = readBucketCounts(countsFN)
			close(waitForCounts)
		}()

		// read the flipped bits --- flipped may be 0-length if no file could
		// be found; this indicates that either nothing was flipped or we
		// don't care about orientation
		var flipped []bool
		waitForFlipped := make(chan struct{})
		go func() {
			flipped = readFlipped(readFile + ".flipped")
			close(waitForFlipped)
		}()

		// read the NLocations, which might be 0-length if no file could be
		// found; this indicates that the Ns were recorded some other way.
		var NLocations [][]byte
		waitForNLocations := make(chan struct{})
		go func() {
			NLocations = readNLocations(readFile + ".ns")
			close(waitForNLocations)
		}()

		// open encoded read file
		encIn, err := os.Open(tailsFN)
		DIE_ON_ERR(err, "Can't open encoded read file %s", tailsFN)
		defer encIn.Close()
		readerBuf := bufio.NewReader(encIn)

		// create a bit reader wrapper around it
		reader := bitio.NewReader(readerBuf)
		defer reader.Close()

		// create a decoder around it
		decoder, err := arithc.NewDecoder(reader)
		DIE_ON_ERR(err, "Couldn't create decoder!")

		// create the output file
		log.Printf("Writing to %s", outFile)
		outF, err := os.Create(outFile)
		DIE_ON_ERR(err, "Couldn't create output file %s", outFile)
		defer outF.Close()

		<-waitForReference
		<-waitForBuckets
		<-waitForCounts
		<-waitForFlipped
		<-waitForNLocations

		log.Printf("Read length = %d", readlen)
		decodeReads(kmers, counts, flipped, NLocations, hash, readlen, outF, decoder)
	}

	log.Printf("Default interval used %v times and context used %v times",
		defaultIntervalSum, contextExists)

	endTime := time.Now()
	log.Printf("kpath took %v to run.", endTime.Sub(startTime).Seconds())

	/* UNCOMMENT TO DEBUG GARBAGE COLLECTION WITH GO 1.2
	var stats debug.GCStats
	stats.PauseQuantiles = make([]time.Duration, 5)
	debug.ReadGCStats(&stats)
	log.Printf("Last GC=%v\nNum GC=%v\nPause for GC=%v\nPauseHistory=%v",
		stats.LastGC, stats.NumGC, stats.PauseTotal.Seconds(), stats.Pause)
	*/
}
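
// Example invocations (illustrative only: the file names and the k value are
// placeholders, and the binary is assumed to be built as "kpath"):
//
//	kpath encode -k=15 -ref=ref.fa.gz -reads=reads.seq -out=OUT
//	    encodes reads.seq into OUT.enc, OUT.bittree, and OUT.counts
//	    (plus OUT.flipped / OUT.ns if those options are enabled)
//
//	kpath decode -k=15 -ref=ref.fa.gz -reads=OUT -out=decoded.seq
//	    reads OUT.enc, OUT.bittree, and OUT.counts and writes decoded.seq
//
// The same k (a positive integer of at most 16) must be used for encoding
// and decoding.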