func doFillRsids(c *cli.Context) {
	arg_bucket := c.String("bucket")
	arg_setup := c.Bool("setup")
	arg_overwrite := c.Bool("overwrite")
	arg_strict := c.Bool("strict")

	if len(c.Args()) <= 2 {
		fmt.Fprintln(os.Stderr)
		fmt.Fprintln(os.Stderr, "[FATAL] too few arguments")
		fmt.Fprintln(os.Stderr)
		cli.ShowCommandHelp(c, "fill-rsids")
		os.Exit(1)
	} else if !arg_overwrite && arg_strict {
		fmt.Fprintln(os.Stderr)
		fmt.Fprintln(os.Stderr, "[FATAL] -strict option is only effective with -overwrite option")
		fmt.Fprintln(os.Stderr)
		cli.ShowCommandHelp(c, "fill-rsids")
		os.Exit(1)
	} else if arg_bucket == "" {
		fmt.Fprintln(os.Stderr)
		fmt.Fprintln(os.Stderr, "[FATAL] -bucket is required")
		fmt.Fprintln(os.Stderr)
		cli.ShowCommandHelp(c, "fill-rsids")
		os.Exit(1)
	}

	databaseName := "bolt.db"
	bucketName := []byte(path.Base(arg_bucket))

	// Store chrpos <=> rsid mappings into bolt.db
	db, err := bolt.Open(databaseName, 0600, nil)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	if arg_setup {
		f, err := os.Open(arg_bucket)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		gz, err := gzip.NewReader(f)
		if err != nil {
			panic(err)
		}
		defer gz.Close()

		// TODO: workaround for non-uniq chrpos. skip high rs numbers?
		err = db.Batch(func(tx *bolt.Tx) error {
			bucket, err := tx.CreateBucketIfNotExists(bucketName)
			if err != nil {
				return err
			}

			map_reader := bufio.NewReaderSize(gz, 128*1024)

			map_line, err := lib.Readln(map_reader)
			for err == nil {
				// chrom/pos to rs id mapping resource file ([TAB] delimited)
				//
				// | rs id  | chrom  | pos    |
				// |--------|--------|--------|
				// | xxxxx  | xx     | xxxxx  |
				//
				records := strings.Split(map_line, "\t")
				rsId := strings.Replace(records[0], "rs", "", 1)
				rsChr := records[1]
				rsPos, _ := strconv.ParseInt(records[2], 10, 64)

				if rsChr != "" && rsChr != "NotOn" && rsChr != "Multi" && rsChr != "Un" && rsChr != "PAR" {
					// | chrom id   | 0-filled pos   |
					// |------------|----------------|
					// | xx         | xxxxxxxxx      |
					// | (2 digits) | (9 digits)     |
					chrpos := lib.ChrPos(rsChr, rsPos)
					key := lib.Itob(chrpos)
					val := []byte(rsId) // TODO: put/get rsId as byte(int)
					err = bucket.Put(key, val)
				}

				map_line, err = lib.Readln(map_reader)
			}
			if err != nil && err != io.EOF {
				return err
			}
			return nil
		})
		if err != nil {
			panic(err)
		}
		// os.Exit(0)
	}

	// Parse VCF header lines
	reader := bufio.NewReaderSize(os.Stdin, 64*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			fmt.Println(line)
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println(line)
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Parse VCF body lines
	pattern := regexp.MustCompile(`rs(\d+)`)

	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")
		chrom := records[0]
		pos, _ := strconv.ParseInt(records[1], 10, 64)
		snpId := records[2]
		rsIdFound := pattern.FindStringSubmatch(snpId)

		// Skip or fill rs id. Switch by '-overwrite' option.
		//
		// | input     | overwrite = t | overwrite = f |
		// |-----------|---------------|---------------|
		// | "rsxxxx"  | fill          | skip          |
		// | "."       | fill          | fill          |
		//
		if rsIdFound != nil && !arg_overwrite {
			// Skip
			fmt.Println(line)
		} else if (rsIdFound != nil && arg_overwrite) || rsIdFound == nil {
			// Fill
			result := []string{}
			result = append(result, records[0:2]...)

			err = db.View(func(tx *bolt.Tx) error {
				bucket := tx.Bucket(bucketName)
				if bucket == nil {
					return fmt.Errorf("Bucket %q not found!", bucketName)
				}

				val := bucket.Get(lib.Itob(lib.ChrPos(chrom, pos)))
				if val != nil {
					// Fill rs id if locus is found.
					result = append(result, "rs"+string(val))
				} else {
					if arg_strict {
						// Fill '.' if locus is not found ('-strict' option).
						result = append(result, ".")
					} else {
						// Keep original record (including '.') if locus is not found.
						result = append(result, snpId)
					}
				}
				return nil
			})
			if err != nil {
				panic(err)
			}

			result = append(result, records[3:]...)
			fmt.Println(strings.Join(result, "\t"))
		}

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
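// NOTE: lib.ChrPos and lib.Itob (from the lib package) implement the bolt key
// encoding sketched in the comments above: a 2-digit chromosome id concatenated
// with a 9-digit zero-filled position, stored as a fixed-width byte slice. The
// following is only an illustrative sketch of that assumed scheme, with
// hypothetical names and a guessed X/Y/MT mapping, not the actual lib code:
//
//	// chrPosSketch: e.g. chr1:123456 -> 1*1e9 + 123456 = 1000123456
//	func chrPosSketch(chrom string, pos int64) int64 {
//		special := map[string]int64{"X": 23, "Y": 24, "MT": 25} // assumed ids
//		id, ok := special[chrom]
//		if !ok {
//			id, _ = strconv.ParseInt(chrom, 10, 64)
//		}
//		return id*1000000000 + pos
//	}
//
//	// itobSketch: fixed-width big-endian bytes, so bolt keys sort by chrpos.
//	// (Would require "encoding/binary".)
//	func itobSketch(v int64) []byte {
//		b := make([]byte, 8)
//		binary.BigEndian.PutUint64(b, uint64(v))
//		return b
//	}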
func doToTab(c *cli.Context) {
	is_without_header := c.Bool("without-header")
	is_without_chr_pos := c.Bool("without-chr-pos")
	is_rs_id_as_int := c.Bool("rs-id-as-int")
	is_genotype_as_pg_array := c.Bool("genotype-as-pg-array")
	is_chrx_genotype_as_homo := c.Bool("chrx-genotype-as-homo")

	reader := bufio.NewReaderSize(os.Stdin, 128*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			// pass
		} else if strings.HasPrefix(line, "#CHROM") {
			if !is_without_header {
				fields := strings.Split(line, "\t")
				if !is_without_chr_pos {
					fmt.Print("#CHROM\tPOS\tID\t")
				} else {
					fmt.Print("ID\t")
				}
				fmt.Println(strings.Join(fields[9:], "\t"))
			}
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	pattern := regexp.MustCompile(`rs(\d+)`)

	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		chrom := records[0]
		pos := records[1]

		id := records[2]
		if is_rs_id_as_int {
			id_found := pattern.FindStringSubmatch(records[2])
			if id_found != nil {
				id = id_found[1]
			}
		}

		ref := records[3]
		alt := strings.Split(records[4], ",")
		format := strings.Split(records[8], ":")
		gts := records[9:]

		genotypes := []string{}
		for i := range gts {
			gt := strings.Split(gts[i], ":")
			for j := range gt {
				var genotype string
				if format[j] == "GT" {
					_gt := gt2genotype(ref, alt, gt[j])

					if is_chrx_genotype_as_homo && chrom == "X" {
						if len(_gt) == 1 {
							_gt = append(_gt, _gt...)
						}
					}

					if is_genotype_as_pg_array {
						genotype = "{" + strings.Join(_gt, ",") + "}"
					} else {
						genotype = strings.Join(_gt, "/")
					}
					genotypes = append(genotypes, genotype)
				}
			}
		}

		result := []string{}
		if !is_without_chr_pos {
			result = []string{chrom, pos, id}
		} else {
			result = []string{id}
		}
		result = append(result, genotypes...)
		fmt.Println(strings.Join(result, "\t"))

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
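// NOTE: gt2genotype is defined elsewhere in this package; it expands a VCF GT
// value such as "0/1" into allele strings using REF and ALT. A minimal sketch
// of the assumed behavior (hypothetical name; missing alleles '.' are simply
// skipped here and half-calls are not specially handled):
//
//	func gt2genotypeSketch(ref string, alt []string, gt string) []string {
//		alleles := append([]string{ref}, alt...) // index 0 = REF, 1.. = ALT
//		result := []string{}
//		for _, idx := range regexp.MustCompile(`[|/]`).Split(gt, -1) {
//			i, err := strconv.Atoi(idx)
//			if err == nil && i < len(alleles) {
//				result = append(result, alleles[i])
//			}
//		}
//		return result
//	}
//
// E.g., with REF=A and ALT=G, "0/1" becomes ["A", "G"], printed as "A/G" or,
// with -genotype-as-pg-array, as "{A,G}".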
func doFix(c *cli.Context) {
	arg_remove_chr_string := c.Bool("remove-chr-string")
	arg_remove_qual := c.Bool("remove-qual")
	arg_remove_filter := c.Bool("remove-filter")
	arg_remove_info := c.Bool("remove-info")
	arg_keep_gt_only := c.Bool("keep-only-gt")

	// Parse header lines
	reader := bufio.NewReaderSize(os.Stdin, 128*1024)

	contig_pattern := regexp.MustCompile(`##contig=<(.+)>`)
	info_pattern := regexp.MustCompile(`##INFO=<(.+)>`)
	format_pattern := regexp.MustCompile(`##FORMAT=<(.+)>`)
	filter_pattern := regexp.MustCompile(`##FILTER=<(.+)>`)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			contig_founds := contig_pattern.FindStringSubmatch(line)
			info_founds := info_pattern.FindStringSubmatch(line)
			format_founds := format_pattern.FindStringSubmatch(line)
			filter_founds := filter_pattern.FindStringSubmatch(line)

			if arg_remove_chr_string && contig_founds != nil {
				// Remove 'chr' from contig meta-infos in header
				result := []string{}
				for _, x := range strings.Split(contig_founds[1], ",") {
					if strings.HasPrefix(x, "ID") {
						result = append(result, strings.Replace(x, "chr", "", 1))
					} else {
						result = append(result, x)
					}
				}
				fmt.Println("##contig=<" + strings.Join(result, ",") + ">")
			} else if arg_remove_info && info_founds != nil {
				// Skip INFO meta-info
			} else if arg_remove_filter && filter_founds != nil {
				// Skip FILTER meta-info
			} else if arg_keep_gt_only && format_founds != nil {
				// Skip FORMAT meta-info tags except GT
				for _, x := range strings.Split(format_founds[1], ",") {
					if x == "ID=GT" {
						fmt.Println(line)
						continue
					}
				}
			} else {
				fmt.Println(line)
			}
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println(line)
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Parse body lines
	//
	// > 1.4.1 Fixed fields
	// > There are 8 fixed fields per record. All data lines are tab-delimited.
	// > In all cases, missing values are specified with a dot ('.').
	//
	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		var chrom string
		if arg_remove_chr_string {
			chrom = strings.Replace(records[0], "chr", "", 1)
		} else {
			chrom = records[0]
		}

		// > 6. QUAL - quality: Phred-scaled quality score for the assertion made in ALT.
		// > ... If unknown, the missing value should be specified. (Numeric)
		var qual string
		if arg_remove_qual {
			qual = "."
		} else {
			qual = records[5]
		}

		// > 7. FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position.
		// > ... If filters have not been applied, then this field should be set to the missing value.
		// > (String, no white-space or semi-colons permitted)
		var filter string
		if arg_remove_filter {
			filter = "."
		} else {
			filter = records[6]
		}

		var info string
		if arg_remove_info {
			info = "."
		} else {
			info = records[7]
		}

		var format string
		genotypes := []string{}
		if arg_keep_gt_only {
			// > 1.4.2 Genotype fields
			// > ... The first sub-field must always be the genotype (GT) if it is present.
			format = "GT"
			for _, genotype := range records[9:] {
				genotypes = append(genotypes, strings.Split(genotype, ":")[0])
			}
		} else {
			format = records[8]
			genotypes = records[9:]
		}

		result := []string{}
		result = append(result, chrom)
		result = append(result, records[1:5]...)
		result = append(result, qual)
		result = append(result, filter)
		result = append(result, info)
		result = append(result, format)
		result = append(result, genotypes...)
		fmt.Println(strings.Join(result, "\t"))

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
func doFilter(c *cli.Context) {
	arg_keep_ids := c.String("keep-ids")
	arg_keep_pos := c.String("keep-pos")
	arg_keep_only_pass := c.Bool("keep-only-pass")

	pattern := regexp.MustCompile(`rs(\d+)`)
	keep_ids := make(map[int]bool)
	keep_pos := make(map[int64]bool)

	// Get SNP IDs to be kept if exists
	if arg_keep_ids != "" {
		var ids_fp *os.File
		var err error

		ids_fp, err = os.Open(arg_keep_ids)
		if err != nil {
			panic(err)
		}
		defer ids_fp.Close()

		ids_reader := bufio.NewReaderSize(ids_fp, 128*1024)

		ids_line, err := lib.Readln(ids_reader)
		for err == nil {
			id_found := pattern.FindStringSubmatch(ids_line)
			if id_found != nil {
				keep_id, _ := strconv.Atoi(id_found[1])
				keep_ids[keep_id] = true
			}
			ids_line, err = lib.Readln(ids_reader)
		}
		if err != nil && err != io.EOF {
			panic(err)
		}
	}

	// Get loci to be kept if exists
	if arg_keep_pos != "" {
		var pos_fp *os.File
		var err error

		pos_fp, err = os.Open(arg_keep_pos)
		if err != nil {
			panic(err)
		}
		defer pos_fp.Close()

		pos_reader := bufio.NewReaderSize(pos_fp, 128*1024)

		pos_line, err := lib.Readln(pos_reader)
		for err == nil {
			records := strings.Split(pos_line, "\t")
			chrom := records[0]
			pos, _ := strconv.ParseInt(records[1], 10, 64)
			chrpos := lib.ChrPos(chrom, pos)
			keep_pos[chrpos] = true
			pos_line, err = lib.Readln(pos_reader)
		}
		if err != nil && err != io.EOF {
			panic(err)
		}
	}

	// Parse header lines
	reader := bufio.NewReaderSize(os.Stdin, 128*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			fmt.Println(line)
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println(line)
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Parse body lines
	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		var is_pass bool

		// Filter by id
		if arg_keep_ids != "" {
			id_found := pattern.FindStringSubmatch(records[2])
			if id_found != nil {
				id, _ := strconv.Atoi(id_found[1])
				if keep_ids[id] {
					is_pass = true
				}
			}
		}

		// Filter by loci
		if arg_keep_pos != "" {
			chrom := records[0]
			pos, _ := strconv.ParseInt(records[1], 10, 64)
			chrpos := lib.ChrPos(chrom, pos)
			if keep_pos[chrpos] {
				is_pass = true
			}
		}

		// Filter by FILTER = PASS
		if arg_keep_only_pass {
			if records[6] == "PASS" {
				is_pass = true
			} else {
				is_pass = false
			}
		}

		if is_pass {
			fmt.Println(line)
		}

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
func doSubset(c *cli.Context) {
	arg_keep_id := c.String("keep-id")
	arg_keep_ids := c.String("keep-ids")
	arg_keep_index := c.String("keep-index")

	reader := bufio.NewReaderSize(os.Stdin, 64*1024)

	// Parse header lines
	var sample_ids []string

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			fmt.Println(line)
		} else if strings.HasPrefix(line, "#CHROM") {
			fields := strings.Split(line, "\t")
			fmt.Print(strings.Join(fields[0:9], "\t"))
			fmt.Print("\t")
			sample_ids = fields[9:]
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Get indices of sample IDs to be kept
	keep_ids := []string{}
	keep_idxs := []int{}

	if arg_keep_id != "" || arg_keep_ids != "" {
		if arg_keep_id != "" {
			// A sample ID to be kept. E.g., NA00001
			keep_ids = append(keep_ids, arg_keep_id)
		} else {
			// Path to a file of sample IDs to be kept. Each line contains one sample ID.
			fp, err := os.Open(arg_keep_ids)
			if err != nil {
				panic(err)
			}
			defer fp.Close()

			ids_reader := bufio.NewReaderSize(fp, 128*1024)

			ids_line, err := lib.Readln(ids_reader)
			for err == nil {
				keep_ids = append(keep_ids, ids_line)
				ids_line, err = lib.Readln(ids_reader)
			}
			if err != nil && err != io.EOF {
				panic(err)
			}
		}

		for i := range keep_ids {
			for j := range sample_ids {
				if keep_ids[i] == sample_ids[j] {
					keep_idxs = append(keep_idxs, j)
					break
				}
			}
		}

		if len(keep_idxs) == 0 {
			fmt.Println()
			log.Fatal("No sample IDs matched.")
		}
	} else if arg_keep_index != "" {
		// An index of sample ID field to be kept. E.g., to keep 1st sample, set: 0
		_keep_idx, _ := strconv.Atoi(arg_keep_index)
		if _keep_idx >= len(sample_ids) {
			// Indices are 0-based, so an index equal to len(sample_ids) is already out of range.
			fmt.Println()
			log.Fatal("No sample IDs matched.")
		}
		keep_idxs = append(keep_idxs, _keep_idx)
	}

	fmt.Println(strings.Join(subset(sample_ids, keep_idxs), "\t"))

	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		result := []string{}
		result = append(result, records[0:9]...)
		result = append(result, subset(records[9:], keep_idxs)...)
		fmt.Println(strings.Join(result, "\t"))

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
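// NOTE: subset is a small helper defined elsewhere in this package; it picks
// the elements of a string slice at the given indices, in order. Assumed
// behavior (illustrative sketch with a hypothetical name):
//
//	func subsetSketch(records []string, idxs []int) []string {
//		result := make([]string, 0, len(idxs))
//		for _, i := range idxs {
//			result = append(result, records[i])
//		}
//		return result
//	}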
func doFreq(c *cli.Context) {
	reader := bufio.NewReaderSize(os.Stdin, 128*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			// pass
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println("#CHROM\tPOS\tID\tAllele\tFreq")
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	pattern := regexp.MustCompile(`[|/]`)

	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		chrom := records[0]
		pos := records[1]
		id := records[2]
		ref := records[3]
		alt := strings.Split(records[4], ",")
		format := strings.Split(records[8], ":")
		gts := records[9:]

		alleles := []string{}
		alleles = append(alleles, ref)
		alleles = append(alleles, alt...)

		var count []int
		for i := 0; i < len(alleles); i++ {
			count = append(count, 0)
		}

		for i := range gts {
			gt := strings.Split(gts[i], ":")
			for j := range gt {
				if format[j] == "GT" {
					gt_idxs := pattern.Split(gt[j], -1)
					for i := range gt_idxs {
						gt_idx, _ := strconv.Atoi(gt_idxs[i])
						count[gt_idx] += 1
					}
				}
			}
		}

		total := float64(sum(count)) // TODO: decimal?

		freqs := []string{}
		for i := range count {
			freqs = append(freqs, fmt.Sprintf("%.4f", float64(count[i])/total)) // TODO:
		}

		result := []string{chrom, pos, id, strings.Join(alleles, ","), strings.Join(freqs, ",")}
		fmt.Println(strings.Join(result, "\t"))

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
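// NOTE: sum is assumed to be a trivial integer-slice helper defined elsewhere
// in this package, along the lines of (hypothetical name):
//
//	func sumSketch(xs []int) int {
//		total := 0
//		for _, x := range xs {
//			total += x
//		}
//		return total
//	}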
func doUpdate(c *cli.Context) {
	arg_rs_merge_arch := c.String("rs-merge-arch")
	if arg_rs_merge_arch == "" {
		cli.ShowCommandHelp(c, "update")
		os.Exit(1)
	}

	f, err := os.Open(arg_rs_merge_arch)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	gz, err := gzip.NewReader(f)
	if err != nil {
		panic(err)
	}
	defer gz.Close()

	// [dbSNP Column Description for table: RsMergeArch](http://www.ncbi.nlm.nih.gov/projects/SNP/snp_db_table_description.cgi?t=RsMergeArch)
	//
	// - Table name and description
	//
	//   | Table Description                                                                                                           |
	//   |-----------------------------------------------------------------------------------------------------------------------------|
	//   | "refSNP(rs) cluster is based on unique genome position. On new genome assembly, previously different contig may             |
	//   |  align. So different rs clusters map to the same location. In this case, we merge the rs. This table tracks this merging."  |
	//
	// - Table column and description
	//
	//   | Column            | Description                                                                 | Type          | Byte | Order |
	//   |-------------------+-----------------------------------------------------------------------------+---------------+------+-------|
	//   | rsHigh            | Since rs# is assigned sequentially. Low number means the rs occurs         | int           |    4 |     1 |
	//   |                   | early. So we always merge high rs number into low rs number.               |               |      |       |
	//   | rsLow             |                                                                             | int           |    4 |     2 |
	//   | build_id          | dbSNP build id when this rsHigh was merged into rsLow.                     | smallint      |    2 |     3 |
	//   | orien             | The orientation between rsHigh and rsLow.                                  | tinyint       |    1 |     4 |
	//   | create_time       |                                                                             | smalldatetime |    4 |     5 |
	//   | last_updated_time |                                                                             | smalldatetime |    4 |     6 |
	//   | rsCurrent         | rsCurrent is the current rs for rsHigh. If rs9 is merged into rs5 which is | int           |    4 |     7 |
	//   |                   | later merged into rs2, then rsCurrent is 2 for rsHigh=9.                   |               |      |       |
	//   | orien2Current     |                                                                             | tinyint       |    1 |     8 |
	//
	// This table/column description is last updated at: Mar 18 2015 02:51:00:000PM.

	// Get merge mappings of rs IDs
	rsHigh2current := make(map[int]int)

	map_reader := bufio.NewReaderSize(gz, 128*1024)

	map_line, err := lib.Readln(map_reader)
	for err == nil {
		records := strings.Split(map_line, "\t")
		rsHigh, _ := strconv.Atoi(records[0])
		rsCurrent, _ := strconv.Atoi(records[6])
		rsHigh2current[rsHigh] = rsCurrent
		map_line, err = lib.Readln(map_reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Parse header lines
	reader := bufio.NewReaderSize(os.Stdin, 128*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			fmt.Println(line)
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println(line)
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	pattern := regexp.MustCompile(`rs(\d+)`)

	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		// Update rs ID
		var id_updated_str string
		id_found := pattern.FindStringSubmatch(records[2])
		if id_found != nil {
			id, _ := strconv.Atoi(id_found[1])
			id_updated := rsHigh2current[id]
			if id_updated != 0 {
				id_updated_str = "rs" + strconv.Itoa(id_updated) // Map to current ID
			} else {
				id_updated_str = records[2] // ID is not listed in merge history
			}
		} else {
			id_updated_str = records[2] // ID is not rs ID
		}

		result := []string{}
		result = append(result, records[0:2]...)
		result = append(result, id_updated_str)
		result = append(result, records[3:]...)
		fmt.Println(strings.Join(result, "\t"))

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}