/
reader.go
167 lines (147 loc) · 3.47 KB
/
reader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
package multigz
import (
"bufio"
"errors"
"io"
"io/ioutil"
"github.com/klauspost/compress/gzip"
)
var (
	// errWrongOffset is returned by Reader.Seek when the requested offset
	// does not land on a valid gzip member header in the underlying stream.
	errWrongOffset = errors.New("the offset does not appear to match the gzip layout")
)
// Offset represents a specific point in the decompressed stream where we want
// to seek at. The normal way to obtain an Offset is to call Reader.Offset() or
// Writer.Offset() at the specific point in the stream we are interested into;
// later, it is possible to call Reader.Seek() passing the Offset to efficiently
// get back to that point.
type Offset struct {
	// Block is the byte offset within the compressed file at which the gzip
	// member containing the target position begins.
	Block int64
	// Off is the number of decompressed bytes into that member.
	Off int64
}
type countReader struct {
R *bufio.Reader
Cnt *int64
}
func (cw *countReader) Read(data []byte) (n int, err error) {
n, err = cw.R.Read(data)
(*cw.Cnt) += int64(n)
return
}
func (cw *countReader) ReadByte() (ch byte, err error) {
(*cw.Cnt) += 1
return cw.R.ReadByte()
}
// A multigz.Reader is 100% equivalent to a gzip.Reader, but allows to seek
// within the compressed file to specific positions.
//
// The idea is to use a multi-pass approach; in the first pass, you can go
// through the file and record the positions of interest by calling Offset().
// Then, you can seek to a specific offset by calling Seek().
type Reader struct {
	// gz decompresses the current gzip member; nil after Close() or once
	// the underlying stream has been fully consumed.
	gz *gzip.Reader
	// ur is the counting reader that feeds gz (built by
	// createUnderlyingReader), so compressed-byte consumption is tracked.
	ur io.Reader
	// r is the seekable compressed source.
	r io.ReadSeeker
	// cnt counts raw (compressed) bytes consumed from r via ur.
	cnt int64
	// noff is the decompressed offset within the current gzip member.
	noff int64
	// block is the raw offset at which the current gzip member starts.
	block int64
	// delim records whether at least one member boundary was crossed.
	delim bool
}
// NewReader creates a multigz.Reader on top of the given seekable
// compressed stream. The first gzip header is parsed immediately; a header
// error is returned to the caller.
func NewReader(r io.ReadSeeker) (*Reader, error) {
	mr := &Reader{r: r}
	gz, err := gzip.NewReader(mr.createUnderlyingReader())
	if err != nil {
		return nil, err
	}
	// Stop at each member boundary so Read() can record block offsets.
	gz.Multistream(false)
	mr.gz = gz
	return mr, nil
}
// createUnderlyingReader (re)builds the buffered, byte-counting reader that
// feeds the gzip decoder, wiring its counter to or.cnt, and stores it in
// or.ur before returning it.
func (or *Reader) createUnderlyingReader() io.Reader {
	cr := &countReader{
		R:   bufio.NewReader(or.r),
		Cnt: &or.cnt,
	}
	or.ur = cr
	return cr
}
// Read implements io.Reader over the concatenation of the decompressed
// contents of every gzip member in the underlying stream. Member boundaries
// are crossed transparently; when one is crossed, the raw offset of the new
// member is recorded in or.block so that Offset()/Seek() can address it.
// After the last member is exhausted (or after Close) it returns io.EOF.
func (or *Reader) Read(data []byte) (int, error) {
	if or.gz == nil {
		return 0, io.EOF
	}
	nread := 0
	for len(data) > 0 {
		n, err := or.gz.Read(data)
		// Account for the produced bytes before inspecting err: gzip.Read
		// may return n > 0 together with io.EOF, and those bytes must not
		// be dropped when we handle the member boundary below.
		or.noff += int64(n)
		nread += n
		data = data[n:]
		if err == io.EOF {
			// End of the current gzip member: remember where the next one
			// starts in the raw stream and restart decompression there.
			or.noff = 0
			or.block = or.cnt
			or.gz.Close()
			if rerr := or.gz.Reset(or.ur); rerr != nil {
				or.gz = nil
				if rerr == io.EOF {
					// Real end of file: no further members follow.
					return nread, nil
				}
				// A corrupt/truncated header is an error, not a clean EOF.
				return nread, rerr
			}
			or.delim = true // we crossed a member boundary
			or.gz.Multistream(false)
			continue
		}
		if err != nil {
			return nread, err
		}
	}
	return nread, nil
}
// Close releases the gzip decoder; subsequent Reads return io.EOF.
// Calling Close more than once is a harmless no-op.
func (or *Reader) Close() error {
	gz := or.gz
	if gz == nil {
		return nil
	}
	or.gz = nil
	return gz.Close()
}
// Offset reports the current position in the decompressed stream; hand it
// back to Seek() later to return efficiently to this exact point.
func (or *Reader) Offset() Offset {
	pos := Offset{
		Block: or.block,
		Off:   or.noff,
	}
	return pos
}
// Seek repositions the reader at the point in the decompressed stream
// described by o, which must have been obtained from Reader.Offset() or
// Writer.Offset() on the same gzip layout.
//
// If o lies at or ahead of the current position within the current gzip
// block, the intermediate decompressed bytes are simply discarded.
// Otherwise the raw stream is rewound to the start of o's block and the
// first o.Off decompressed bytes are skipped. Returns errWrongOffset when
// o.Block does not point at a gzip member header.
func (or *Reader) Seek(o Offset) error {
	cur := or.Offset()
	if cur.Block == o.Block && cur.Off <= o.Off {
		// Fast path: same block, target at or after us — just skip ahead.
		_, err := io.CopyN(ioutil.Discard, or, o.Off-cur.Off)
		return err
	}
	// Rewind the raw stream to the beginning of the requested gzip block;
	// a failed seek must not be ignored, or our position bookkeeping would
	// silently diverge from the real stream position.
	if _, err := or.r.Seek(o.Block, io.SeekStart); err != nil {
		return err
	}
	or.cnt = o.Block
	if or.gz == nil {
		// The reader previously hit final EOF (or was Closed): build a
		// fresh gzip decoder over the repositioned stream.
		gz, err := gzip.NewReader(or.createUnderlyingReader())
		if err != nil {
			return err
		}
		or.gz = gz
	} else {
		or.gz.Close()
		if err := or.gz.Reset(or.createUnderlyingReader()); err != nil {
			or.gz = nil
			if err == io.EOF {
				// Nothing to read at this position: the offset does not
				// match a block boundary of this file.
				return errWrongOffset
			}
			// Propagate header/IO errors instead of silently continuing
			// with a broken decoder.
			return err
		}
	}
	or.gz.Multistream(false)
	or.block = o.Block
	or.noff = 0
	// Skip decompressed bytes up to the requested intra-block offset.
	_, err := io.CopyN(ioutil.Discard, or, o.Off)
	return err
}
// Return true if we found at least a multi-gzip separator while reading this
// file.
// This function does not take into account the fact that short files can
// be effectively treated as multigz even if technically they aren't. Unless
// you know that you've read enough bytes out of this file, you should use
// the global function IsProbablyMultiGzip() which is a more general solution.
func (or *Reader) IsProbablyMultiGzip() bool {
	return or.delim
}