Esempio n. 1
0
func doFillRsids(c *cli.Context) {
	arg_bucket := c.String("bucket")
	arg_setup := c.Bool("setup")
	arg_overwrite := c.Bool("overwrite")
	arg_strict := c.Bool("strict")

	if len(c.Args()) <= 2 {
		fmt.Fprintln(os.Stderr)
		fmt.Fprintln(os.Stderr, "[FATAL] too few arguments")
		fmt.Fprintln(os.Stderr)
		cli.ShowCommandHelp(c, "fill-rsids")
		os.Exit(1)
	} else if !arg_overwrite && arg_strict {
		fmt.Fprintln(os.Stderr)
		fmt.Fprintln(os.Stderr, "[FATAL] -strict option is only effective with -overwrite option")
		fmt.Fprintln(os.Stderr)
		cli.ShowCommandHelp(c, "fill-rsids")
		os.Exit(1)
	} else if arg_bucket == "" {
		fmt.Fprintln(os.Stderr)
		fmt.Fprintln(os.Stderr, "[FATAL] -bucket is required")
		fmt.Fprintln(os.Stderr)
		cli.ShowCommandHelp(c, "fill-rsids")
		os.Exit(1)
	}

	databaseName := "bolt.db"
	bucketName := []byte(path.Base(arg_bucket))

	// Store chrpos <=> rsid mappings into bolt.db
	db, err := bolt.Open(databaseName, 0600, nil)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	if arg_setup {
		f, err := os.Open(arg_bucket)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		gz, err := gzip.NewReader(f)
		if err != nil {
			panic(err)
		}
		defer gz.Close()

		// TODO: workaround for non-uniq chrpos. skip high rs numbers?
		err = db.Batch(func(tx *bolt.Tx) error {
			bucket, err := tx.CreateBucketIfNotExists(bucketName)
			if err != nil {
				return err
			}

			map_reader := bufio.NewReaderSize(gz, 128*1024)
			map_line, err := lib.Readln(map_reader)
			for err == nil {
				// chrom/pos to rs id mapping resource file ([TAB] delimited)
				//
				// | rs id  | chrom  | pos    |
				// |--------|--------|--------|
				// |  xxxxx |     xx |  xxxxx |
				//
				records := strings.Split(map_line, "\t")
				rsId := strings.Replace(records[0], "rs", "", 1)
				rsChr := records[1]
				rsPos, _ := strconv.ParseInt(records[2], 10, 64)

				if rsChr != "" && rsChr != "NotOn" && rsChr != "Multi" && rsChr != "Un" && rsChr != "PAR" {
					// | chrom id   |  0-filled pos  |
					// |------------|----------------|
					// |        xx  |     xxxxxxxxx  |
					// | (2 digits) |     (9 digits) |
					chrpos := lib.ChrPos(rsChr, rsPos)
					key := lib.Itob(chrpos)
					val := []byte(rsId) // TODO: put/get rsId as byte(int)
					err = bucket.Put(key, val)
				}

				map_line, err = lib.Readln(map_reader)
			}
			if err != nil && err != io.EOF {
				return err
			}

			return nil
		})
		if err != nil {
			panic(err)
		}

		//
		os.Exit(0)
	}

	// Parse VCF header lines
	reader := bufio.NewReaderSize(os.Stdin, 64*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			fmt.Println(line)
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println(line)
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Parse VCF body lines
	pattern := regexp.MustCompile(`rs(\d+)`)

	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")

		chrom := records[0]
		pos, _ := strconv.ParseInt(records[1], 10, 64)
		snpId := records[2]

		rsIdFound := pattern.FindStringSubmatch(snpId)
		// Skip or fill rs id. Switch by '-overwrite' option.
		//
		// | input     | overwrite = t | overwrite = f |
		// |-----------|---------------|---------------|
		// | "rsxxxx"  | fill          | skip          |
		// | "."       | fill          | fill          |
		if rsIdFound != nil && !arg_overwrite {
			// Skip
			fmt.Println(line)
		} else if (rsIdFound != nil && arg_overwrite) || rsIdFound == nil {
			// Fill
			result := []string{}
			result = append(result, records[0:2]...)

			err = db.View(func(tx *bolt.Tx) error {
				bucket := tx.Bucket(bucketName)
				if bucket == nil {
					return fmt.Errorf("Bucket %q not found!", bucketName)
				}

				val := bucket.Get(lib.Itob(lib.ChrPos(chrom, pos)))

				if val != nil {
					// Fill rs id if locus is found.
					result = append(result, "rs"+string(val))
				} else {

					if arg_strict {
						// Fill '.' if locus in not found ('-strict' option).
						result = append(result, ".")
					} else {
						// Keep original record (including '.') if locus is not found.
						result = append(result, snpId)
					}
				}

				return nil
			})
			if err != nil {
				panic(err)
			}

			result = append(result, records[3:]...)
			fmt.Println(strings.Join(result, "\t"))
		}

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
}
Esempio n. 2
0
func doFilter(c *cli.Context) {
	arg_keep_ids := c.String("keep-ids")
	arg_keep_pos := c.String("keep-pos")
	arg_keep_only_pass := c.Bool("keep-only-pass")

	pattern := regexp.MustCompile(`rs(\d+)`)

	keep_ids := make(map[int]bool)
	keep_pos := make(map[int64]bool)

	// Get SNP IDs to be kept if exists
	if arg_keep_ids != "" {
		var ids_fp *os.File
		var err error

		ids_fp, err = os.Open(arg_keep_ids)
		if err != nil {
			panic(err)
		}
		defer ids_fp.Close()

		ids_reader := bufio.NewReaderSize(ids_fp, 128*1024)
		ids_line, err := lib.Readln(ids_reader)
		for err == nil {
			id_found := pattern.FindStringSubmatch(ids_line)
			if id_found != nil {
				keep_id, _ := strconv.Atoi(id_found[1])
				keep_ids[keep_id] = true
			}

			ids_line, err = lib.Readln(ids_reader)
		}
		if err != nil && err != io.EOF {
			panic(err)
		}
	}

	// Get loci to be kept if exists
	if arg_keep_pos != "" {
		var pos_fp *os.File
		var err error

		pos_fp, err = os.Open(arg_keep_pos)
		if err != nil {
			panic(err)
		}
		defer pos_fp.Close()

		pos_reader := bufio.NewReaderSize(pos_fp, 128*1024)
		pos_line, err := lib.Readln(pos_reader)
		for err == nil {
			records := strings.Split(pos_line, "\t")
			chrom := records[0]
			pos, _ := strconv.ParseInt(records[1], 10, 64)
			chrpos := lib.ChrPos(chrom, pos)
			keep_pos[chrpos] = true

			pos_line, err = lib.Readln(pos_reader)
		}
		if err != nil && err != io.EOF {
			panic(err)
		}
	}

	// Parse header lines
	reader := bufio.NewReaderSize(os.Stdin, 128*1024)

	line, err := lib.Readln(reader)
	for err == nil {
		if strings.HasPrefix(line, "##") {
			fmt.Println(line)
		} else if strings.HasPrefix(line, "#CHROM") {
			fmt.Println(line)
			break
		} else {
			err = errors.New("Invalid VCF header")
			break
		}

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Parse body lines
	line, err = lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")
		var is_pass bool

		// Filter by id
		if arg_keep_ids != "" {
			id_found := pattern.FindStringSubmatch(records[2])
			if id_found != nil {
				id, _ := strconv.Atoi(id_found[1])

				if keep_ids[id] {
					is_pass = true
				}
			}
		}

		// Filter by loci
		if arg_keep_pos != "" {
			chrom := records[0]
			pos, _ := strconv.ParseInt(records[1], 10, 64)
			chrpos := lib.ChrPos(chrom, pos)

			if keep_pos[chrpos] {
				is_pass = true
			}
		}

		// Filter by FILTER = PASS
		if arg_keep_only_pass {
			if records[6] == "PASS" {
				is_pass = true
			} else {
				is_pass = false
			}
		}

		if is_pass {
			fmt.Println(line)
		}

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)

	}
}