/
vcf-update.go
134 lines (116 loc) · 4.57 KB
/
vcf-update.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
package main
import (
"bufio"
"compress/gzip"
"errors"
"fmt"
"github.com/codegangsta/cli"
"github.com/knmkr/go-vcf-tools/lib"
"io"
"os"
"regexp"
"strconv"
"strings"
)
// doUpdate implements the "update" command. It copies a VCF stream from stdin
// to stdout, replacing every rs ID in the ID column that has been merged in
// dbSNP with its current rs ID.
//
// The merge history is read from the gzip-compressed, tab-separated
// RsMergeArch dump named by the "rs-merge-arch" option. When that option is
// empty the command help is shown and the process exits with status 1.
// I/O failures and an invalid VCF header cause a panic.
func doUpdate(c *cli.Context) {
	mergeArchPath := c.String("rs-merge-arch")
	if mergeArchPath == "" {
		cli.ShowCommandHelp(c, "update")
		os.Exit(1)
	}

	// Get merge mappings of rs IDs
	rsHigh2current := loadRsMergeMap(mergeArchPath)

	// VCF files routinely hold millions of records, so buffer stdout instead
	// of issuing one write per line. The deferred Flush only matters on the
	// panic path; the normal path flushes explicitly below and checks the error.
	out := bufio.NewWriterSize(os.Stdout, 128*1024)
	defer out.Flush()

	// Parse header lines
	reader := bufio.NewReaderSize(os.Stdin, 128*1024)
	line, err := lib.Readln(reader)
	for err == nil {
		isColumnHeader := false
		switch {
		case strings.HasPrefix(line, "##"):
			fmt.Fprintln(out, line)
		case strings.HasPrefix(line, "#CHROM"):
			fmt.Fprintln(out, line)
			isColumnHeader = true
		default:
			err = errors.New("Invalid VCF header")
		}
		if isColumnHeader || err != nil {
			break
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// Anchored so that only a token that is exactly "rs<digits>" is rewritten.
	pattern := regexp.MustCompile(`^rs(\d+)$`)

	line, err = lib.Readln(reader)
	for err == nil {
		// Only the third column (ID) is edited, so split off the first three
		// columns and keep the rest of the record as one untouched string.
		records := strings.SplitN(line, "\t", 4)
		if len(records) > 2 {
			records[2] = updateRsIDs(records[2], pattern, rsHigh2current)
			line = strings.Join(records, "\t")
		}
		// Lines with fewer than three columns have no ID column; they are
		// passed through unchanged rather than crashing on an index error.
		fmt.Fprintln(out, line)

		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}

	// bufio.Writer remembers the first write error, so this single check also
	// reports any failure from the Fprintln calls above.
	if err := out.Flush(); err != nil {
		panic(err)
	}
}

// loadRsMergeMap reads the gzip-compressed RsMergeArch dump at path and
// returns a map from a merged rs number (rsHigh, column 1) to the rs number it
// currently resolves to (rsCurrent, column 7). Lines with fewer than seven
// tab-separated columns, or whose rsHigh/rsCurrent values are not integers,
// are skipped. It panics when the file cannot be opened, is not valid gzip, or
// reading fails with anything other than io.EOF.
//
// [dbSNP Column Description for table: RsMergeArch](http://www.ncbi.nlm.nih.gov/projects/SNP/snp_db_table_description.cgi?t=RsMergeArch)
//
// - Table name and description
//
// | Table Description                                                                                                             |
// |-------------------------------------------------------------------------------------------------------------------------------|
// | "refSNP(rs) cluster is based on unique genome position. On new genome assembly, previously different contig may               |
// | align. So different rs clusters map to the same location. In this case, we merge the rs. This table tracks this merging."     |
//
// - Table column and description
//
// | Column            | Description                                                                | Type          | Byte | Order |
// |-------------------+----------------------------------------------------------------------------+---------------+------+-------|
// | rsHigh            | Since rs# is assigned sequentially. Low number means the rs occurs         | int           | 4    | 1     |
// |                   | early. So we always merge high rs number into low rs number.               |               |      |       |
// | rsLow             |                                                                            | int           | 4    | 2     |
// | build_id          | dbSNP build id when this rsHigh was merged into rsLow.                     | smallint      | 2    | 3     |
// | orien             | The orientation between rsHigh and rsLow.                                  | tinyint       | 1    | 4     |
// | create_time       |                                                                            | smalldatetime | 4    | 5     |
// | last_updated_time |                                                                            | smalldatetime | 4    | 6     |
// | rsCurrent         | rsCurrent is the current rs for rsHigh. If rs9 is merged into rs5 which is | int           | 4    | 7     |
// |                   | later merged into rs2, then rsCurrent is 2 for rsHigh=9.                   |               |      |       |
// | orien2Current     |                                                                            | tinyint       | 1    | 8     |
//
// This table/column description is last updated at: Mar 18 2015 02:51:00:000PM.
func loadRsMergeMap(path string) map[int]int {
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	gz, err := gzip.NewReader(f)
	if err != nil {
		panic(err)
	}
	defer gz.Close()

	rsHigh2current := make(map[int]int)
	reader := bufio.NewReaderSize(gz, 128*1024)
	line, err := lib.Readln(reader)
	for err == nil {
		records := strings.Split(line, "\t")
		if len(records) > 6 {
			rsHigh, errHigh := strconv.Atoi(records[0])
			rsCurrent, errCurrent := strconv.Atoi(records[6])
			// An unparseable entry carries no usable mapping; skipping it
			// leaves the affected IDs unchanged in the output.
			if errHigh == nil && errCurrent == nil {
				rsHigh2current[rsHigh] = rsCurrent
			}
		}
		line, err = lib.Readln(reader)
	}
	if err != nil && err != io.EOF {
		panic(err)
	}
	return rsHigh2current
}

// updateRsIDs rewrites the VCF ID column value field. The value is treated as
// a semicolon-separated list; each token matched by pattern (whose first
// capture group must be the rs number) is replaced by "rs" plus its entry in
// merged when that entry exists and is non-zero. All other tokens are kept
// verbatim. If merging makes two tokens identical, only the first is kept.
// When nothing is replaced, field is returned unchanged.
func updateRsIDs(field string, pattern *regexp.Regexp, merged map[int]int) string {
	ids := strings.Split(field, ";")
	changed := false
	for i, id := range ids {
		found := pattern.FindStringSubmatch(id)
		if found == nil {
			continue // ID is not rs ID
		}
		n, err := strconv.Atoi(found[1])
		if err != nil {
			continue // digits do not fit in an int; cannot be in the table
		}
		if current := merged[n]; current != 0 {
			ids[i] = "rs" + strconv.Itoa(current) // Map to current ID
			changed = true
		}
		// Otherwise the ID is not listed in merge history; keep it.
	}
	if !changed {
		return field
	}
	if len(ids) == 1 {
		return ids[0]
	}

	// Two IDs may now resolve to the same current rs ID; drop the repeats
	// while preserving the original order.
	seen := make(map[string]struct{}, len(ids))
	unique := ids[:0]
	for _, id := range ids {
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}
		unique = append(unique, id)
	}
	return strings.Join(unique, ";")
}