/
complexity.go
141 lines (130 loc) · 3.44 KB
/
complexity.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/*
Author: Vinhthuy Phan, Shanshan Gao
Copyright 2014
Measures of complexity: I, Ik, D, Dk, Rk
*/
package genomecomplexity
import (
// "fmt"
"os"
"bufio"
"bytes"
"math"
"io/ioutil"
)
// Index bundles a sequence with the suffix-array data the complexity
// measures in this file are computed from. It is populated by Build.
type Index struct {
	// data is the sequence as returned by ReadSequence.
	data []byte
	// sa is the suffix array over data; lcp[i] is the length of the longest
	// common prefix of the suffixes starting at sa[i] and sa[i+1].
	sa, lcp []int
	// Length is set by Build to len(sa).
	Length int
}
// Build loads the sequence stored in filename via ReadSequence and fills in
// the index: idx.data, idx.sa, idx.lcp and idx.Length.
//
// After Build, lcp[i] holds the length of the longest common prefix of the
// suffixes starting at sa[i] and sa[i+1], so lcp has len(data)-1 entries.
//
// An empty sequence yields an empty index (zero-length sa and lcp, Length 0)
// instead of panicking.
func (idx *Index) Build(filename string) {
	idx.data = ReadSequence(filename)
	n := len(idx.data)
	idx.sa = make([]int, n)
	if n == 0 {
		// make([]int, n-1) below would panic with a negative length; an
		// empty sequence has no suffixes, so there is nothing to compute.
		idx.lcp = []int{}
		idx.Length = 0
		return
	}

	// Presumably ComputeSuffixArray writes the suffix array of data into
	// idx.sa; its implementation is not visible here.
	ws := &WorkSpace{}
	ws.ComputeSuffixArray(idx.data, idx.sa)

	idx.lcp = make([]int, n-1)
	for i := 1; i < n; i++ {
		idx.lcp[i-1] = idx.lcp_len(i)
	}
	idx.Length = len(idx.sa)
}
// lcp_len returns the length of the longest common prefix of the two
// suffixes data[sa[m]:] and data[sa[m-1]:], found by direct byte comparison.
// m must be at least 1, since sa[m-1] is read.
func (idx *Index) lcp_len(m int) int {
	n := len(idx.data)
	cur, prev := idx.sa[m], idx.sa[m-1]
	matched := 0
	// Extend the match until either suffix runs out or the bytes differ.
	for cur+matched < n && prev+matched < n && idx.data[cur+matched] == idx.data[prev+matched] {
		matched++
	}
	return matched
}
// D returns the rate of distinct substrings of the indexed sequence.
//
// Each suffix contributes the number of its prefixes not shared with the
// preceding suffix in suffix-array order (its length minus the matching lcp
// entry); the first suffix contributes its full length. The total is divided
// by n(n+1)/2, where n is the sequence length.
//
// The index must be non-empty: sa[0] is read unconditionally.
func (idx Index) D() float64 {
	n := len(idx.data)
	distinct := uint64(n - idx.sa[0])
	for rank := 1; rank < n; rank++ {
		distinct += uint64(n - idx.sa[rank] - idx.lcp[rank-1])
	}
	return 2.0 * (float64(distinct) / float64(n)) / float64(n+1)
}
// Dk returns the rate of distinct k-mers: the number of distinct substrings
// of length k divided by n-k+1, the number of k-mer start positions in a
// sequence of length n.
//
// A suffix is counted when it is at least k bytes long and shares fewer than
// k leading bytes with the preceding suffix in suffix-array order.
//
// The index must be non-empty: sa[0] is read unconditionally.
func (idx Index) Dk(k int) float64 {
	n := len(idx.data)
	lastStart := n - k // largest start position that still has k bytes after it

	var distinct uint64
	if idx.sa[0] <= lastStart {
		distinct++
	}
	for rank := 1; rank < n; rank++ {
		// Skip suffixes that repeat the previous k-mer or are too short.
		if idx.lcp[rank-1] >= k || idx.sa[rank] > lastStart {
			continue
		}
		distinct++
	}
	return float64(distinct) / float64(n-k+1)
}
// Block scans lcp forward from index m and returns the index just before the
// first entry that is smaller than k. If no such entry exists at or after m,
// it returns len(data)-2, the last lcp index.
//
// When lcp[m] >= k the result is the last index of the run of entries >= k
// that starts at m; when lcp[m] < k the result is m-1.
func (idx Index) Block(m int, k int) int {
	last := len(idx.data) - 2
	for pos := m; pos <= last; pos++ {
		if idx.lcp[pos] < k {
			return pos - 1
		}
	}
	return last
}
// Rk returns the k-repeat density: the number of suffixes that belong to a
// run of adjacent suffixes sharing at least k leading bytes, divided by
// n-k+1 for a sequence of length n.
//
// A run of lcp entries >= k spanning indices pos..end covers end-pos+2
// suffixes, because each lcp entry relates two neighbouring suffixes.
func (idx Index) Rk(k int) float64 {
	n := len(idx.data)
	var repeated uint64
	for pos := 0; pos < n-1; pos++ {
		if idx.lcp[pos] < k {
			continue
		}
		end := idx.Block(pos, k)
		repeated += uint64(end - pos + 2)
		pos = end // the loop increment resumes the scan at end+1
	}
	return float64(repeated) / float64(n-k+1)
}
// I returns the I complexity (Becher & Heiber, 2012) of the indexed
// sequence: the sum over every lcp entry v of log4(v+2) - log4(v+1).
//
// NOTE(review): only the entries of idx.lcp are summed, so the first suffix
// in suffix-array order contributes no term of its own; confirm this matches
// the intended definition.
func (idx *Index) I() float64 {
	log4 := math.Log(4.0)
	total := 0.0
	for i := range idx.lcp {
		v := idx.lcp[i]
		total += (math.Log(float64(v+2)) - math.Log(float64(v+1))) / log4
	}
	return total
}
// Ik returns a k-bounded variant of the I complexity: the sum of
// log4(v+2) - log4(v+1) over lcp entries v = lcp[i-1], restricted to suffixes
// sa[i] that are at least k bytes long and share fewer than k leading bytes
// with the preceding suffix in suffix-array order.
func (idx Index) Ik(k int) float64 {
	n := len(idx.data)
	log4 := math.Log(4.0)
	total := 0.0
	for rank := 1; rank < n; rank++ {
		v := idx.lcp[rank-1]
		// Skip suffixes that repeat the previous k-mer or are too short.
		if v >= k || n-idx.sa[rank] < k {
			continue
		}
		total += (math.Log(float64(v+2)) - math.Log(float64(v+1))) / log4
	}
	return total
}
// ReadSequence loads the sequence stored in file and returns it as bytes.
//
// A path ending in ".fasta" is parsed line by line: empty lines and lines
// whose first byte is '>' are skipped; every other line has leading and
// trailing "\n", "\r" and space bytes trimmed and every 'N' byte removed, and
// the results are concatenated. Any other path is returned exactly as stored
// on disk.
//
// ReadSequence panics if the file cannot be opened or read, or if a FASTA
// line exceeds the scanner's line-length limit.
func ReadSequence(file string) []byte {
	const fastaExt = ".fasta"
	// Check the length before slicing: a path shorter than the extension
	// would otherwise panic with a slice-bounds error.
	if len(file) < len(fastaExt) || file[len(file)-len(fastaExt):] != fastaExt {
		raw, err := ioutil.ReadFile(file)
		if err != nil {
			panic(err)
		}
		return raw
	}

	f, err := os.Open(file)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Sequences are often stored on a single very long line; the default
	// Scanner limit (bufio.MaxScanTokenSize) would stop the scan early.
	const maxLineBytes = 1 << 30
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	unknown := []byte("N")
	seq := make([]byte, 0)
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 || line[0] == '>' {
			continue
		}
		line = bytes.Trim(line, "\n\r ")
		seq = append(seq, bytes.Replace(line, unknown, nil, -1)...)
	}
	// Without this check a read error or an over-long line would silently
	// yield a truncated sequence.
	if err := scanner.Err(); err != nil {
		panic(err)
	}
	return seq
}