worker.go

package crawlbot

import (
	"bytes"
	"io/ioutil"
	"net/http"

	"github.com/phayes/errors"
)
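
// worker fetches and processes one URL at a time on behalf of its parent Crawler.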
type worker struct {
	state   bool         // true when the worker is busy; false when it is idle and ready for new work
	url     string       // Current URL being processed
	results chan result  // Channel on which to send results
	crawler *Crawler     // Its parent crawler
	client  *http.Client // The HTTP client used for requests
}
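
// result carries the outcome of processing a single URL back to the crawler.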
type result struct {
	err     error    // Error encountered while processing the URL, if any
	url     string   // The URL that was processed
	newurls []string // Newly discovered URLs found in the response
	owner   *worker  // The worker that produced this result
}

// setup marks the worker as busy and records the URL it is about to process.
func (w *worker) setup(targetURL string) {
	w.state = true
	w.url = targetURL
}

// teardown marks the worker as idle again, ready for new work.
func (w *worker) teardown() {
	w.state = false
	w.url = ""
}
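
// process fetches w.url in a goroutine: it performs the HTTP GET, runs the
// crawler's Handler on the response, and passes any newly discovered URLs
// back on the results channel.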
func (w *worker) process() {
	go func() {
		// Do the HTTP GET and build the Response object
		var resp Response
		httpresp, err := w.client.Get(w.url)
		if httpresp != nil {
			resp = Response{Response: httpresp}
		} else {
			resp = Response{}
		}
		resp.URL = w.url
		resp.Crawler = w.crawler
		if err != nil {
			resp.Err = errors.Wrap(err, ErrReqFailed)
			w.crawler.Handler(&resp)
			w.sendResults(nil, resp.Err)
			return
		}

		// Check the response headers using CheckHeader
		if err = w.crawler.CheckHeader(w.crawler, w.url, resp.StatusCode, resp.Header); err != nil {
			resp.Err = errors.Wrap(err, ErrHeaderRejected)
			w.crawler.Handler(&resp)
			resp.Body.Close()
			w.sendResults(nil, resp.Err)
			return
		}

		// Read the body into memory
		resp.bytes, err = ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			resp.Err = errors.Wrap(err, ErrBodyRead)
			w.crawler.Handler(&resp)
			w.sendResults(nil, resp.Err)
			return
		}

		// Replace the body with a readCloser that reads from the in-memory bytes
		resp.Body = &readCloser{bytes.NewReader(resp.bytes)}

		// Run the user-supplied handler
		w.crawler.Handler(&resp)

		// Reset the body so LinkFinder can read it from the start,
		// even if the handler already consumed it
		resp.Body = &readCloser{bytes.NewReader(resp.bytes)}

		// Find links, keeping only URLs that pass CheckURL
		newurls := make([]string, 0)
		for _, url := range w.crawler.LinkFinder(&resp) {
			if err := w.crawler.CheckURL(w.crawler, url); err == nil {
				newurls = append(newurls, url)
			}
		}

		// We're done; send back the results
		w.sendResults(newurls, nil)
	}()
}

// sendResults packages up the outcome for w.url and sends it to the crawler.
func (w *worker) sendResults(newurls []string, err error) {
	result := result{
		err:     err,
		url:     w.url,
		newurls: newurls,
		owner:   w,
	}
	w.results <- result
}

// readCloser wraps a bytes.Reader with a no-op Close so it satisfies
// io.ReadCloser and can stand in for the original response Body.
type readCloser struct {
	*bytes.Reader
}

func (r *readCloser) Close() error {
	return nil
}
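
// A minimal usage sketch (illustrative only; the real scheduling lives in the
// parent Crawler elsewhere in this package, and the names c and enqueue are
// assumptions, not part of this file):
//
//	results := make(chan result)
//	w := &worker{results: results, crawler: c, client: http.DefaultClient}
//	w.setup("http://example.com")
//	w.process()      // fetch and handle the URL asynchronously
//	res := <-results // block until the worker reports back
//	w.teardown()     // mark the worker idle again
//	for _, u := range res.newurls {
//		enqueue(u) // hypothetical: feed discovered URLs back to the scheduler
//	}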