func main() { var ( in *fasta.Reader out, profile *os.File e error ) inName := flag.String("in", "", "Filename for input to be factorised. Defaults to stdin.") outName := flag.String("out", "", "Filename for output. Defaults to stdout.") k := flag.Int("k", 8, "kmer size to use.") cat := flag.Int("cat", 5, "number of categories.") iter := flag.Int("i", 1000, "iterations.") limit := flag.Int("time", 10, "time limit for NMF.") lo := flag.Int("lo", 1, "minimum number of kmer frequency to use in NMF.") hi := flag.Float64("hi", 0.5, "maximum proportion of kmer representation to use in NMF.") sf := flag.Float64("sf", 0.01, "factor for sparcity of estimating matrices for NMF.") tol := flag.Float64("tol", 0.001, "tolerance for NMF.") threads := flag.Int("threads", 2, "number of threads to use.") seed := flag.Int64("seed", -1, "seed for random number generator (-1 uses system clock).") cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.") help := flag.Bool("help", false, "print this usage message.") flag.Parse() if *help { flag.Usage() os.Exit(1) } runtime.GOMAXPROCS(*threads) sparse.MaxProcs = *threads fmt.Fprintf(os.Stderr, "Using %d threads.\n", runtime.GOMAXPROCS(0)) if *cpuprofile != "" { if profile, e = os.Create(*cpuprofile); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) os.Exit(0) } fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile) pprof.StartCPUProfile(profile) defer pprof.StopCPUProfile() } if *inName == "" { fmt.Fprintln(os.Stderr, "Reading sequences from stdin.") in = fasta.NewReader(os.Stdin) } else if in, e = fasta.NewReaderName(*inName); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) os.Exit(0) } else { fmt.Fprintf(os.Stderr, "Reading sequence from `%s'.\n", *inName) } defer in.Close() if *outName == "" { fmt.Fprintln(os.Stderr, "Writing output to stdout.") out = os.Stdout } else if out, e = os.Create(*outName); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) } else { fmt.Fprintf(os.Stderr, "Writing output to `%s'.\n", *outName) } defer out.Close() totalkmers := make(map[kmerindex.Kmer]float64) kmerlists := make([]map[kmerindex.Kmer]float64, 0) seqTable := make([]string, 0) for { if sequence, err := in.Read(); err != nil { break } else { var freqs map[kmerindex.Kmer]float64 if kindex, e := kmerindex.New(*k, sequence); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.\n", e) fmt.Fprintln(os.Stderr) os.Exit(0) } else { freqs, _ = kindex.NormalisedKmerFrequencies() kmerlists = append(kmerlists, freqs) for kmer, freq := range freqs { totalkmers[kmer] += freq } } seqTable = append(seqTable, string(sequence.ID)) } } kmerArray := make([][]float64, 0) kmerTable := make([]kmerindex.Kmer, 0) for kmer, _ := range totalkmers { var count int for _, kmerlist := range kmerlists { if kmerlist[kmer] > 0 { count++ } } if count < *lo || float64(count)/float64(len(kmerlists)) > *hi { continue } row := make([]float64, len(kmerlists)) for i, kmerlist := range kmerlists { row[i] = float64(kmerlist[kmer]) } kmerArray = append(kmerArray, row) kmerTable = append(kmerTable, kmer) } var kmerMatrix *sparse.Sparse func() { defer func() { if r := recover(); r != nil { fmt.Fprintf(os.Stderr, "Error: %v.", r) os.Exit(0) } }() kmerMatrix = sparse.Matrix(kmerArray) }() f := func(i, j int, v float64) float64 { if kmerMatrix.At(i, j) != 0 { return 1 } return 0 } nonZero := kmerMatrix.Apply(f).Sum() r, c := kmerMatrix.Dims() density := nonZero / float64(r*c) if *seed == -1 { *seed = time.Now().UnixNano() } fmt.Fprintf(os.Stderr, "Using %v as random seed.\n", *seed) rand.Seed(*seed) rows, cols := kmerMatrix.Dims() Wo := sparse.Random(rows, *cat, density**sf) Ho := sparse.Random(*cat, cols, density**sf) fmt.Fprintf(os.Stderr, "Dimensions of Kmer matrix = (%v, %v)\nDensity = %.3f %%\n%v\n", r, c, (density)*100, kmerMatrix) W, H, ok := nmf.Factors(kmerMatrix, Wo, Ho, *tol, *iter, time.Duration(*limit)*1e9) fmt.Fprintf(os.Stderr, "norm(H) = %v norm(W) = %v\n\nFinished = %v\n\n", H.Norm(matrix.Fro), W.Norm(matrix.Fro), ok) printFeature(out, kmerMatrix, W, H, seqTable, kmerTable, *k) }
func main() { var ( in *bufio.Reader out *bufio.Writer profile *os.File e error ) inName := flag.String("in", "", "Filename for input to be factorised. Defaults to stdin.") outName := flag.String("out", "", "Filename for output. Defaults to stdout.") transpose := flag.Bool("t", false, "Transpose columns and rows.") sep := flag.String("sep", "\t", "Column delimiter.") cat := flag.Int("cat", 5, "number of categories.") iter := flag.Int("i", 1000, "iterations.") rep := flag.Int("rep", 1, "Resample replicates.") limit := flag.Int("time", 10, "time limit for NMF.") sf := flag.Float64("sf", 0.01, "factor for sparcity of estimating matrices for NMF.") tol := flag.Float64("tol", 0.001, "tolerance for NMF.") threads := flag.Int("threads", 2, "number of threads to use.") seed := flag.Int64("seed", -1, "seed for random number generator (-1 uses system clock).") cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.") help := flag.Bool("help", false, "print this usage message.") flag.Parse() if *help { flag.Usage() os.Exit(1) } runtime.GOMAXPROCS(*threads) sparse.MaxProcs = *threads fmt.Fprintf(os.Stderr, "Using %d threads.\n", runtime.GOMAXPROCS(0)) if *cpuprofile != "" { if profile, e = os.Create(*cpuprofile); e != nil { fmt.Fprintf(os.Stderr, "Error: %v.", e) os.Exit(0) } fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile) pprof.StartCPUProfile(profile) defer pprof.StopCPUProfile() } if *inName == "" { fmt.Fprintln(os.Stderr, "Reading table from stdin.") in = bufio.NewReader(os.Stdin) } else if f, err := os.Open(*inName); err != nil { fmt.Fprintf(os.Stderr, "Error: %v.", err) os.Exit(0) } else { defer f.Close() in = bufio.NewReader(f) fmt.Fprintf(os.Stderr, "Reading table from `%s'.\n", *inName) } if *outName == "" { fmt.Fprintln(os.Stderr, "Writing output to stdout.") out = bufio.NewWriter(os.Stdout) } else if f, err := os.Create(*outName); err != nil { fmt.Fprintf(os.Stderr, "Error: %v.", err) os.Exit(0) } else { defer f.Close() out = bufio.NewWriter(f) fmt.Fprintf(os.Stderr, "Writing output to `%s'.\n", *outName) } defer out.Flush() var colNames, rowNames []string array := make([][]float64, 0) if line, err := in.ReadString('\n'); err != nil { fmt.Fprintln(os.Stderr, "No table to read\n") os.Exit(0) } else { line = strings.TrimSpace(line) colNames = strings.Split(line, "\t") colNames = colNames[1:] } for count := 1; ; count++ { if line, err := in.ReadString('\n'); err != nil { break } else { line = strings.TrimSpace(line) if row := strings.Split(line, *sep); len(row) != len(colNames)+1 { fmt.Fprintf(os.Stderr, "Table row mismatch at line %d.\n", count) os.Exit(0) } else { rowData := make([]float64, len(row)-1) for i, val := range row[1:] { if rowData[i], e = strconv.ParseFloat(val, 64); e != nil { fmt.Fprintf(os.Stderr, "Float conversion error %v at line %d element %d.\n", e, count, i) os.Exit(0) } } rowNames = append(rowNames, row[0]) array = append(array, rowData) } } } var dataMatrix *sparse.Sparse func() { defer func() { if r := recover(); r != nil { fmt.Fprintf(os.Stderr, "Error: %v.", r) os.Exit(0) } }() dataMatrix = sparse.Matrix(array) }() f := func(i, j int, v float64) float64 { if dataMatrix.At(i, j) != 0 { return 1 } return 0 } nonZero := dataMatrix.Apply(f).Sum() if *transpose { colNames, rowNames = rowNames, colNames dataMatrix = dataMatrix.T() } r, c := dataMatrix.Dims() density := nonZero / float64(r*c) if *seed == -1 { *seed = time.Now().UnixNano() } fmt.Fprintf(os.Stderr, "Using %v as random seed.\n", *seed) rand.Seed(*seed) rows, cols := dataMatrix.Dims() fmt.Fprintf(os.Stderr, "Dimensions of matrix = (%v, %v)\nDensity = %.3f %%\n%v\n", r, c, (density)*100, dataMatrix) for run := 0; run < *rep; run++ { if *rep > 1 { fmt.Fprintf(os.Stderr, "Replicate #%d\n", run+1) } Wo := sparse.Random(rows, *cat, density**sf) Ho := sparse.Random(*cat, cols, density**sf) W, H, ok := nmf.Factors(dataMatrix, Wo, Ho, *tol, *iter, time.Duration(*limit)*1e9) fmt.Fprintf(os.Stderr, "norm(H) = %v norm(W) = %v\n\nFinished = %v\n\n", H.Norm(matrix.Fro), W.Norm(matrix.Fro), ok) printFeature(out, run, dataMatrix, W, H, rowNames, colNames) } }