// worker.go (forked from PuerkitoBio/gocrawl)
package gocrawl

import (
	"exp/html"
	"github.com/PuerkitoBio/goquery"
	"github.com/temoto/robotstxt.go"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"time"
)

// The worker is dedicated to fetching and visiting a given host, respecting
// this host's robots.txt crawling policies.
type worker struct {
	host           string
	visitor        func(*http.Response, *goquery.Document) ([]*url.URL, bool)
	push           chan<- *workerResponse
	pop            popChannel
	stop           chan bool
	userAgent      string
	robotUserAgent string
	logFunc        func(LogFlags, string, ...interface{})
	index          int
	wg             *sync.WaitGroup
	crawlDelay     time.Duration
	idleTTL        time.Duration
	robotsGroup    *robotstxt.Group
	fetcher        Fetcher
}
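
// Illustrative sketch (not part of the original source): how a crawler might
// wire up and launch a worker for a host. All values below are assumed for
// demonstration; pushChan, stopChan and newPopChannel stand in for the
// package's actual plumbing.
//
//	w := &worker{
//		host:           "example.com",
//		userAgent:      "gocrawl/example",
//		robotUserAgent: "gocrawl",
//		push:           pushChan,        // crawler's workerResponse channel
//		pop:            newPopChannel(), // assumed popChannel constructor
//		stop:           stopChan,
//		logFunc:        func(f LogFlags, msg string, args ...interface{}) {},
//		crawlDelay:     5 * time.Second,
//		wg:             &wg,
//	}
//	wg.Add(1)
//	go w.run()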

// Start crawling the host.
func (this *worker) run() {
	defer func() {
		this.logFunc(LogInfo, "worker done.\n")
		this.wg.Done()
	}()

	// Initialize the Fetcher to default if nil
	if this.fetcher == nil {
		this.fetcher = new(defaultFetcher)
	}

	// Enter loop to process URLs until stop signal is received
	for {
		var idleChan <-chan time.Time

		this.logFunc(LogInfo, "waiting for pop...\n")

		// Initialize the idle timeout channel, if required
		if this.idleTTL > 0 {
			idleChan = time.After(this.idleTTL)
		}

		select {
		case <-this.stop:
			this.logFunc(LogInfo, "stop signal received.\n")
			return

		case <-idleChan:
			this.logFunc(LogInfo, "idle timeout received.\n")
			this.notifyURLProcessed(nil, false, nil, true)
			return

		case batch := <-this.pop:
			// Got a batch of URLs to crawl; loop and check at each iteration
			// whether a stop signal was received.
			for _, u := range batch {
				this.logFunc(LogInfo, "popped: %s\n", u.String())
				if this.isAllowedPerRobotsPolicies(u) {
					this.requestUrl(u)
				} else {
					// Must still notify the Crawler that this URL was processed, although not visited
					this.notifyURLProcessed(u, false, nil, false)
				}

				select {
				case <-this.stop:
					this.logFunc(LogInfo, "stop signal received.\n")
					return
				default:
					// Nothing, just continue...
				}
			}
		}
	}
}
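
// A note on the idle timeout above (illustrative, not from the original
// source): receiving from a nil channel blocks forever, so when idleTTL is
// zero, idleChan stays nil and the idle case can never fire. The pattern in
// isolation:
//
//	var idleChan <-chan time.Time // nil while disabled
//	if ttl > 0 {
//		idleChan = time.After(ttl) // armed only when a TTL is set
//	}
//	select {
//	case <-stop: // always armed
//	case <-idleChan: // never fires while idleChan is nil
//	}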

// Checks if the given URL can be fetched based on robots.txt policies.
func (this *worker) isAllowedPerRobotsPolicies(u *url.URL) bool {
	if this.robotsGroup != nil {
		// Is this URL allowed per robots.txt policy?
		ok := this.robotsGroup.Test(u.Path)
		if !ok {
			this.logFunc(LogIgnored, "ignored on robots.txt policy: %s\n", u.String())
		}
		return ok
	}
	// No robots.txt group loaded means no restrictions apply
	return true
}
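
// Illustrative sketch (not from the original source): the robotstxt calls used
// above, shown standalone. FromBytes, FindGroup and Test come from the temoto
// robotstxt package; the policy body and agent string are assumed.
//
//	data, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow: /private/"))
//	if err == nil {
//		group := data.FindGroup("gocrawl")
//		allowed := group.Test("/private/page.html") // false under this policy
//		_ = allowed
//	}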

// Process the specified URL.
func (this *worker) requestUrl(u *url.URL) {
	var agent string
	var isRobot bool

	if isRobot = isRobotsTxtUrl(u); isRobot {
		agent = this.robotUserAgent
	} else {
		agent = this.userAgent
	}

	// Fetch the document, using the robot user agent if this is a robots.txt
	// request, so that the host admin can see which robots are making requests.
	if res, e := this.fetcher.Fetch(u, agent); e != nil {
		this.logFunc(LogError, "ERROR fetching %s: %s\n", u.String(), e.Error())
		this.notifyURLProcessed(u, false, nil, false)
	} else {
		var harvested []*url.URL
		var visited bool

		// Close the body on function end
		defer res.Body.Close()

		// Crawl delay starts now
		wait := time.After(this.crawlDelay)

		// Special case if this is the robots.txt
		if isRobot {
			if data, e := robotstxt.FromResponse(res); e != nil {
				// this.robotsGroup will be nil, which allows access by default.
				// Reasonable, since no robots.txt means full access by default,
				// so an invalid robots.txt gets the same behavior.
				this.logFunc(LogError, "ERROR parsing robots.txt for host %s: %s\n", u.Host, e.Error())
			} else {
				if this.robotsGroup = data.FindGroup(this.robotUserAgent); this.robotsGroup != nil {
					// Use robots.txt crawl-delay, if specified
					if this.robotsGroup.CrawlDelay > 0 {
						this.crawlDelay = this.robotsGroup.CrawlDelay
						this.logFunc(LogInfo, "override crawl-delay: %v\n", this.crawlDelay)
					}
				}
			}
		} else {
			// Normal path
			if res.StatusCode >= 200 && res.StatusCode < 300 {
				// Success, visit the URL
				harvested = this.visitUrl(res)
				visited = true
			} else {
				// Error based on status code received
				this.logFunc(LogError, "ERROR status code for %s: %s\n", u.String(), res.Status)
			}
		}
		this.notifyURLProcessed(u, visited, harvested, false)

		// Wait for crawl delay
		<-wait
	}
}
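
// Illustrative sketch (not from the original source): the crawl-delay pattern
// used above. The timer starts as soon as the fetch returns, so parsing and
// visiting overlap the delay, and the final receive blocks only for whatever
// time remains.
//
//	wait := time.After(delay) // start counting immediately after the fetch
//	process(res)              // parse and visit while the delay elapses
//	<-wait                    // block for the remainder, then move on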

// Send a response to the crawler.
// TODO : Rename.
func (this *worker) notifyURLProcessed(u *url.URL, visited bool, harvested []*url.URL, idleDeath bool) {
	// Do NOT notify for robots.txt URLs: this is an under-the-cover request,
	// not an actual URL enqueued for crawling.
	if !isRobotsTxtUrl(u) {
		// Push harvested URLs back to the crawler, even if empty (the channel
		// communication decrements the reference count of pending URLs)
		this.push <- &workerResponse{this.host, u, visited, harvested, idleDeath}
	}
}
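
// A note on the unconditional push above (illustrative, not from the original
// source): the crawler tracks pending URLs with a reference count, incremented
// on enqueue and decremented on each workerResponse, so a response must be
// sent even when nothing was harvested or the crawler would wait forever.
// Schematically:
//
//	pending++          // crawler: URL handed to a worker
//	resp := <-pushChan // crawler: exactly one response per URL
//	pending--          // crawler can stop when pending == 0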

// Process the response for a URL.
func (this *worker) visitUrl(res *http.Response) []*url.URL {
	var doc *goquery.Document
	var harvested []*url.URL
	var doLinks bool

	// Load a goquery document and call the visitor function
	if node, e := html.Parse(res.Body); e != nil {
		this.logFunc(LogError, "ERROR parsing %s: %s\n", res.Request.URL.String(), e.Error())
	} else {
		doc = goquery.NewDocumentFromNode(node)
		doc.Url = res.Request.URL
	}

	// Visit the document (with a nil goquery doc if parsing failed)
	if this.visitor != nil {
		if harvested, doLinks = this.visitor(res, doc); doLinks && doc != nil {
			// Links were not processed by the visitor, so process links
			harvested = this.processLinks(doc)
		}
	} else {
		this.logFunc(LogInfo, "missing visitor function: %s\n", res.Request.URL.String())
	}
	return harvested
}
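
// Illustrative sketch (not from the original source): building a goquery
// document from an already-parsed html.Node, as done above, rather than having
// goquery fetch the page itself. res stands for an *http.Response in scope.
//
//	if node, err := html.Parse(res.Body); err == nil {
//		doc := goquery.NewDocumentFromNode(node)
//		doc.Url = res.Request.URL // needed later to resolve relative hrefs
//	}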

// Scrape the document's content to gather all links
func (this *worker) processLinks(doc *goquery.Document) (result []*url.URL) {
	urls := doc.Root.Find("a[href]").Map(func(_ int, s *goquery.Selection) string {
		val, _ := s.Attr("href")
		return val
	})
	for _, s := range urls {
		// If the href starts with "#", it is a fragment-only link pointing back
		// to this same document, so ignore it
		if len(s) > 0 && !strings.HasPrefix(s, "#") {
			if parsed, e := url.Parse(s); e == nil {
				parsed = doc.Url.ResolveReference(parsed)
				result = append(result, parsed)
			} else {
				this.logFunc(LogIgnored, "ignored on unparsable URL %s: %s\n", s, e.Error())
			}
		}
	}
	return
}
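
// Illustrative sketch (not from the original source): how ResolveReference
// turns a relative href harvested above into an absolute URL. The values are
// assumed for demonstration.
//
//	base, _ := url.Parse("http://example.com/section/page.html")
//	ref, _ := url.Parse("../other.html")
//	abs := base.ResolveReference(ref)
//	// abs.String() == "http://example.com/other.html"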