/
unbwst.go
70 lines (64 loc) · 1.78 KB
/
unbwst.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
package bwst
import (
"bytes"
"math/big" // bitset
"sort" // TODO: don't use sort to unbwst; interfaces are suboptimal
)
// UnBWST computes the inverse of the Burrows-Wheeler-Scott transform of b.
// The result is returned in a newly allocated slice of the same length as b;
// b itself is not modified. An empty or nil input yields an empty, non-nil
// slice.
//
// The inverse is built in three steps:
//
//  1. Pair every position of the ascending-sorted bytes of b with a position
//     of b holding the same byte value, matching the k-th occurrence of a
//     value in sorted order with its k-th occurrence in b.
//  2. Follow those pairings; they form a permutation, and every cycle of the
//     permutation spells out one word.
//  3. Order the words ascending and lay them out from the end of the output
//     towards the front, so the output is the words in descending order.
func UnBWST(b []byte) []byte {
	n := len(b)

	// Counting sort. After the prefix-sum pass, next[c] is the index in the
	// sorted sequence of the first occurrence of byte value c that has not
	// been paired yet.
	var next [256]int
	for _, c := range b {
		next[c]++
	}
	sum := 0
	for c, count := range next {
		next[c] = sum
		sum += count
	}

	// sorted is b in ascending byte order; links[i] is the index in b that is
	// paired with sorted[i]. Walking b front to back hands out the occurrences
	// of each value in order, which is exactly "first unused index in b".
	sorted := make([]byte, n)
	links := make([]int, n)
	for j, c := range b {
		i := next[c]
		sorted[i] = c
		links[i] = j
		next[c] = i + 1
	}

	// visited is a bitset over positions of sorted. Setting bit n up front
	// sizes it once, so later SetBit calls never have to grow it; bit n is
	// never inspected as a position.
	visited := new(big.Int)
	visited.SetBit(visited, n, 1)

	// Extract one word per cycle of links, starting each cycle at its smallest
	// position.
	var words [][]byte
	for i := range sorted {
		if visited.Bit(i) == 1 {
			continue
		}
		var word []byte
		for x := i; visited.Bit(x) == 0; x = links[x] {
			word = append(word, sorted[x])
			visited.SetBit(visited, x, 1)
		}
		words = append(words, word)
	}

	// The words are appended in discovery order, so the check below can skip
	// the sort whenever that order is already ascending.
	less := func(i, j int) bool { return bytes.Compare(words[i], words[j]) < 0 }
	if !sort.SliceIsSorted(words, less) {
		sort.Slice(words, less)
	}

	// Fill the output back to front: the smallest word ends up last.
	s := make([]byte, n)
	x := n
	for _, word := range words {
		x -= len(word)
		copy(s[x:], word)
	}
	return s
}
// bytesorter is a byte slice carrying the Len, Less and Swap methods that
// sort.Sort needs to order its elements by ascending byte value.
type bytesorter []byte

// Len returns the number of bytes in the slice.
func (s bytesorter) Len() int {
	return len(s)
}

// Less reports whether the byte at index i is strictly smaller than the byte
// at index j.
func (s bytesorter) Less(i, j int) bool {
	left, right := s[i], s[j]
	return left < right
}

// Swap exchanges the bytes at indices i and j.
func (s bytesorter) Swap(i, j int) {
	atJ, atI := s[j], s[i]
	s[i] = atJ
	s[j] = atI
}
// multibytesorter is a slice of byte slices carrying the Len, Less and Swap
// methods that sort.Sort needs to order its elements ascending, as compared
// by bytes.Compare.
type multibytesorter [][]byte

// Len returns the number of byte slices held.
func (m multibytesorter) Len() int {
	return len(m)
}

// Less reports whether the element at index i compares lower than the element
// at index j under bytes.Compare.
func (m multibytesorter) Less(i, j int) bool {
	left, right := m[i], m[j]
	return bytes.Compare(left, right) < 0
}

// Swap exchanges the elements at indices i and j.
func (m multibytesorter) Swap(i, j int) {
	atJ, atI := m[j], m[i]
	m[i] = atJ
	m[j] = atI
}