// coding: utf-8
// creep.go (originally gotest1:Walk.go) by Ricky Seltzer rickyseltzer@gmail.com. Version 1.0 on 2013-06-04
// Todo: There are too many locks and unlocks in this. Mostly to keep track of statistics, but also
// to protect the beenThereDoneThat map, which is the only really good reason for them.
/*
Package creep implements a web crawler. It reads web pages and follows links to the rest of
the web, recursively, ad infinitum, within the limits provided. We use the term creep to avoid name clashes
with other software called 'walk' and 'crawl'. I'm thinking of changing it to 'stroll'.
*/
package creep
import (
"errors"
"fmt"
"io/ioutil"
"log"
"net/http"
"regexp"
//"strings"
"sync"
"time"
)
/* METHOD:
* Url requests are sent along a request channel.
* Responses, with any errors, are sent back along a response channel.
* A pool of worker goroutines is started to receive the requests, process them, and send responses
* back along the response channel.
* Processing a request means getting the url, searching the body for links, and sending a request
* for each link along the request channel. We avoid fetching duplicates by keeping a hash (map) of
* each url fetched.
*/
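// workerPoolSketch is a self-contained illustration, added for exposition and never called, of
// the pool-of-workers pattern described in the METHOD comment above: n goroutines all receive
// from one request channel and send onto one response channel. The string payloads are
// placeholders for the real RequestUrl/ResponseFromWeb types used later in this file.
func workerPoolSketch(n int, requests <-chan string, responses chan<- string) {
	for i := 0; i < n; i++ {
		go func() {
			for req := range requests { // Each worker drains the shared request channel.
				responses <- "processed: " + req // Stand-in for fetching the url and finding its links.
			}
		}()
	}
}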
/* Todo: <form action="http://gsa.icann.org/search" method="get">
*/
/*
* HOW DO WE KNOW WHEN WE ARE DONE?
* When the number of responses received matches the number of requests made?
* This would imply that the subsidiary (linked-to) pages need to make their requests
* before the response from the parent page is sent back to the requestor. Out of order.
* A problem? Possibly for the later data processing.
* The more general solution is to use a sync.WaitGroup. It is a counter meant for exactly this
* purpose. Even so, we do not guarantee in-order results, since a linked-to page could have
* been seen earlier anyway. A minimal sketch of the pattern follows.
*/
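// waitGroupSketch is a minimal illustration, added for exposition and never called, of the
// sync.WaitGroup pattern described above: Add before a piece of work is queued, Done when it
// finishes, Wait until the counter drains back to zero. The work items here are placeholders.
func waitGroupSketch(work []string) {
	var wg sync.WaitGroup
	for _, w := range work {
		wg.Add(1) // Count the item before it is handed to a goroutine.
		go func(item string) {
			defer wg.Done() // Uncount it when processing finishes, even on early return.
			_ = item        // A real worker would fetch and parse the item here.
		}(w)
	}
	wg.Wait() // Returns only after every Add has been matched by a Done.
}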
/* TODO: Better throttling — done.
* Todo: Use command-line arguments, at least for the test: Not supported by the testing package, though.
* Todo: Put all this in a separate function: shouldWeContinue() bool
*/
var urlFindRe *regexp.Regexp   // Finds linked-to urls in the body of a page.
var fileRe *regexp.Regexp      // Detects file:// urls, which need a different http transport.
var urlRejectRe *regexp.Regexp // Further examines links to reject ones that are not web pages.
// Prevent duplicate fetching by keeping already-seen urls in a map, along with some counters.
// Grouping the shared data in one struct is only a mental reminder that it must be accessed under
// the embedded lock; the struct itself provides no safety or synchronization.
type synchStuff struct {
	sync.RWMutex                      // Embedded without a field name so Lock()/Unlock() can be called directly.
	beenThereDoneThat map[string]bool // Urls already queued or fetched.
	urlsFetched       int             // Fetch attempts, whether they succeed or fail.
	dupsStopped       int             // Duplicate urls skipped.
	queueCnt          int             // Urls accepted onto the request channel.
	rejectCnt         int             // Urls rejected by the filtering rules.
}
var synched synchStuff
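// seenBefore is a small illustrative helper, added for exposition; the crawler inlines this
// logic where it also needs to update counters inside the same critical section. It shows the
// intended read-locked lookup on the shared beenThereDoneThat map.
func seenBefore(u string) bool {
	synched.RLock() // A read lock is enough for a lookup that does not modify the map.
	defer synched.RUnlock()
	return synched.beenThereDoneThat[u]
}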
var maxUrlsFetched int = 544 // Overridden by the value loaded from the test data file.
var maxGoRoutines int = 5    // Overridden likewise.
var goingCount int = 0       // Rough count of requests handed to workers; diagnostic only, not lock-protected.
var startTime time.Time
var just1Domain bool = false // Don't go beyond the initial domain.
// Only makes sense when there is just one url in the starting array.
const reqChanCapacity = 60000
// This enables the program to wait until all the data is processed, and then exit.
var waitGroup sync.WaitGroup
// Includes the answer to the Get of the url, the url itself, error, elapsed time.
type ResponseFromWeb struct {
Url string // Original url
HttpResponse *http.Response // Response from http.Get()
Err error // Error from Get()
ElapsedTime time.Duration // Time duration of Get()
}
var respChan chan *ResponseFromWeb // result of Get() sent back along this channel to 'caller'.
// For a channel of url requests. At one time I thought each request would be more than a string.
type RequestUrl struct {
Url string
}
var reqChan chan *RequestUrl // Channel of requests from main program to this package.
var fileClient *http.Client // For handling file:// urls. Good for testing, mainly.
var routineStatus []rune // A status letter for the running-state of each goroutine. For diagnosis
// and termination.
const ExitCommandUrl string = "ExitExitExitExit" // Fake Url that tells goroutine to exit.
func init() {
// Bug: What if there is a newline in the "wrong" place?
urlFindRe = regexp.MustCompile(`href="((https?|file)://[^">#\?]+)`) // Note the back-quotes
fileRe = regexp.MustCompile(`(file)://`)
urlRejectRe = regexp.MustCompile(`\.(css|ico|js|py|pdf|png|mp3|mp4|jpg|jpeg|swf|exe|dll|so|lib)\/?$`)
t := &http.Transport{}
t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
fileClient = &http.Client{Transport: t}
}
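// regexSketch is a small illustration, added for exposition and never called by the crawler, of
// what urlFindRe extracts: submatch [1] is the bare url, with the query string and fragment
// already cut off by the character class in the pattern. The sample page is a made-up snippet.
func regexSketch() {
	page := `<a href="http://example.com/page?x=1#top">link</a>`
	for _, m := range urlFindRe.FindAllStringSubmatch(page, -1) {
		fmt.Println(m[1]) // Prints: http://example.com/page
	}
}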
// CreepWebSites is the main external entry point for package creep. Call it only once at a time,
// but you can give it an array of starting urls to process.
func CreepWebSites(urls []string, maxPermittedUrls int, maxGoRo int, justOneDomain bool) <-chan *ResponseFromWeb {
if nil == respChan {
respChan = make(chan *ResponseFromWeb, 100) // buffered
}
go reallyCreepWebSites(urls, maxPermittedUrls, maxGoRo, justOneDomain)
return respChan
}
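// exampleCreepUsage is a hedged, minimal sketch, not part of the original API, showing how a
// caller might consume the channel returned by CreepWebSites. The starting url and the limits
// passed here are illustrative assumptions, not recommended values.
func exampleCreepUsage() {
	respCh := CreepWebSites([]string{"http://example.com"}, 50, 4, true)
	for resp := range respCh { // The channel is closed when the creep finishes.
		if resp.Url == "DONE" { // Sentinel sent by sendAllDone().
			break
		}
		if resp.Err != nil {
			fmt.Printf("fetch of %s failed: %v\n", resp.Url, resp.Err)
			continue
		}
		fmt.Printf("fetched %s in %v\n", resp.Url, resp.ElapsedTime)
	}
}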
func reallyCreepWebSites(urls []string, maxPermittedUrls int, maxGoRo int, justOneDomain bool) {
maxUrlsFetched = maxPermittedUrls
just1Domain = justOneDomain
// map of urls to avoid duplicate fetch.
synched.beenThereDoneThat = make(map[string]bool, 20+maxUrlsFetched)
synched.urlsFetched = 0
synched.dupsStopped = 0 // Don't need synchronization yet. Not until goroutines.
maxGoRoutines = maxGoRo
fmt.Printf(" Creeping: %2d maxUrlsFetched, %2d maxGoRoutines, %5d reqChanCapacity, %s justOneDomain.\n",
maxUrlsFetched, maxGoRoutines, reqChanCapacity, boolTF(just1Domain))
// Make some channels:
reqChan = make(chan *RequestUrl, reqChanCapacity) // Big buffer. Enough?
// TODO: What to do when the request channel is full: stash requests on the disk? Probably just
// block. But a blocked worker goroutine is not running to finish the jobs that would let it
// unblock. Fatal embrace? (A non-blocking alternative is sketched in tryEnqueue, after this function.)
routineStatus = make([]rune, 1+maxGoRoutines) // Allocate the diagnostic status array; element [0] is for the main routine.
for i := 0; i < len(routineStatus); i++ {
routineStatus[i] = '-' // Uninitialized, no goroutines here yet.
}
if 1 > len(urls) {
log.Fatal("No urls to process")
}
startTime = time.Now()
for w := 0; w < maxGoRoutines; w++ { // Start the worker goroutines.
go getUrl(reqChan, respChan, 1+w)
}
setCurrentDomain(urls[0])
if just1Domain { // I think this is the only reasonable case that can actually terminate.
if 1 < len(urls) {
log.Fatal("For just-one-domain, must have just one starting url")
}
enQueue(urls[0], reqChan, 0)
} else {
for _, thisurl := range urls { // For each url in the initial test list.
enQueue(thisurl, reqChan, 0)
}
}
fmt.Printf(" Waiting:\n")
routineStatus[0] = 'x' // Main entry point has returned to the caller; goroutines are started.
waitUntilDone()
// Now check channel of domains we've ignored until now.
ldc := len(domainChan)
for i := 0; i < ldc; i++ {
dom, ok := <-domainChan
if !ok {
break
}
fmt.Printf("Another domain: '%s'\n", dom)
}
waitUntilGoroutinesDead(reqChan)
sendAllDone(9999999)
close(reqChan)
time.Sleep(time.Second)
close(respChan)
return
}
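// tryEnqueue is a hedged sketch, not used by the crawler, of a non-blocking send that drops a
// request instead of blocking when reqChan is full; one possible answer to the "fatal embrace"
// question in the TODO inside reallyCreepWebSites. What to do with the dropped url (count it,
// persist it to disk) is left open.
func tryEnqueue(u string) bool {
	select {
	case reqChan <- &RequestUrl{Url: u}:
		return true // There was room on the request channel.
	default:
		return false // Channel full; the caller could count or stash the url instead.
	}
}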
// Simply wait until all Urls are done. I suppose we could do this with a special channel.
// This is actually easier. MAYBE NOT.
func waitUntilDone() {
time.Sleep(5 * time.Second)
waitGroup.Wait() // Wait until all Urls are processed.
fmt.Printf("Waited: done req:resp %3d:%3d. %5d urls fetched, %5d in map, %5d dupes stopped, %5d rejected, at %v\n",
len(reqChan), len(respChan), synched.urlsFetched, mapLength(), synched.dupsStopped, synched.rejectCnt,
time.Since(startTime))
//log.Println("go Status: ", string(routineStatus))
}
func waitUntilGoroutinesDead(reqChan chan *RequestUrl) {
// Then we wait until all goroutines are done.
// kill every worker goroutine
for i := 1; i < len(routineStatus); i++ {
// That is, once for each goroutine. --
// put an exit-goroutine status command on the request channel.
reqChan <- &RequestUrl{Url: ExitCommandUrl}
}
time.Sleep(900 * time.Millisecond)
var doneCnt int = 0
var blockedCnt int = 0
var doneLimit int = len(routineStatus) - 1 // Because of the extra element [0]
for (doneCnt + blockedCnt) < doneLimit {
time.Sleep(100 * time.Millisecond)
fmt.Printf("Wait for goroutines: ")
showStatus()
doneCnt = 0
blockedCnt = 0
for i := 1; i < len(routineStatus); i++ {
switch routineStatus[i] {
case 'X', 'K', 'L', 'N', 'T': // Exiting
doneCnt++
case 'q', 'C':
blockedCnt++ // This is observed when the channels are full.
default:
}
}
}
fmt.Printf("Waited: dead req:resp %3d:%3d. %5d urls fetched, %5d in map, %5d dupes stopped, %5d rejected, at %v\n",
len(reqChan), len(respChan), synched.urlsFetched, mapLength(), synched.dupsStopped, synched.rejectCnt,
time.Since(startTime))
//log.Println("go Status: ", string(routineStatus))
}
// Display a bool as a single letter, T or F.
func boolTF(george bool) string {
if george {
return "T"
} else {
return "F"
}
}
/* getUrl is the worker goroutine for getting a url and processing it.
 * The number of such worker routines is fixed at program startup time.
 */
func getUrl(reqChan chan *RequestUrl, respChan chan *ResponseFromWeb, routineNumber int) {
routineStatus[routineNumber] = '0' // virgin. No activity yet.
outerLoop:
for {
for _, st := range []rune{'K', 'L', 'N', 'T', 'X'} {
if st == routineStatus[routineNumber] {
break outerLoop
}
}
routineStatus[routineNumber] = 'W' // Blocked waiting on request channel.
theReq, ok := <-reqChan
if !ok {
break // request channel is closed. Who closes this? We do. So won't happen here?
}
thisUrl := theReq.Url
if ExitCommandUrl == thisUrl { // "ExitExitExitExit"
//sendAllDone(routineNumber)
break
}
synched.Lock()
killSelf := synched.urlsFetched > maxUrlsFetched
synched.Unlock()
if killSelf {
routineStatus[routineNumber] = 'K' // First killSelf — too many urls.
fmt.Printf("\t ->>1 Too many urls fetched %4d after %v\n", synched.urlsFetched, time.Since(startTime))
//sendAllDone(routineNumber) // End-of-stream back to caller.
return
}
routineStatus[routineNumber] = 'G' // Going
goingCount++
reallyGetUrl(thisUrl, reqChan, respChan, routineNumber) // Do the bulk of the work.
}
routineStatus[routineNumber] = 'X' // This goroutine is eXiting. req chan closed, or exit cmd.
return
}
/* reallyGetUrl() is a not-strictly-necessary subroutine of getUrl that does most of the work.
* 1. See if we have got this url already.
* 2. See if we have exceeded number of url fetches permitted.
* 3. Use the right protocol to get the file:// or the http:// url.
* 4. Send the response back on the response channel.
* 5. Search the response body for links that we ought to follow.
*/
func reallyGetUrl(thisUrl string, reqChan chan *RequestUrl, respChan chan *ResponseFromWeb, routineNumber int) {
if 12 > len(thisUrl) {
fmt.Println("URL too short ", len(thisUrl), thisUrl)
return
}
// in lieu of canonicalizing, remove trailing slash from thisUrl:
if thisUrl[len(thisUrl)-1] == '/' {
thisUrl = thisUrl[0 : len(thisUrl)-1]
//fmt.Printf("=Deslashed= '%s'\n", thisUrl )
}
synched.Lock() // Write-lock shared data.
if synched.beenThereDoneThat[thisUrl] {
// Been there, done that. This may be the 2nd check of the url, if it was queued from a link.
synched.dupsStopped++
synched.Unlock()
routineStatus[routineNumber] = 'D' // url rejected because it's a duplicate. Been there, done that.
return
}
synched.beenThereDoneThat[thisUrl] = true // Requires a write lock.
syUrlFetched := synched.urlsFetched
if syUrlFetched > maxUrlsFetched {
synched.Unlock()
routineStatus[routineNumber] = 'T' // Another case of too many urls.
fmt.Printf("\t ->>2 Too many urls fetched %4d after %v\n", syUrlFetched, time.Since(startTime))
//sendAllDone(routineNumber) // End-of-stream back to caller.
return
}
synched.urlsFetched++ // Requires a write lock. It counts attempts that succeed or fail.
synched.Unlock() // DON'T want to defer this, want to release it asap.
var client *http.Client
if nil != fileRe.FindStringIndex(thisUrl) {
client = fileClient
//fmt.Printf("file: protocol for %s\n", thisUrl)
} else {
client = http.DefaultClient
//fmt.Printf("http: protocol for %s\n", thisUrl)
}
waitGroup.Add(1)
/* Keep track of number of urls we Get(). So we know when we are all done.
* This might not yield correct behavior if we terminate early? Such as
* when exceeding number of urls allowed or any other resource constraint.
*/
routineStatus[routineNumber] = 'F' // Begun Fetching from the web.
fmt.Printf("=Fetching= '%s'\n", thisUrl)
start := time.Now()
getResponse, getErr := client.Get(thisUrl) // This can take a long time. Hundreds of milliseconds.
getElapsed := time.Since(start)
// Check again to see if we are over the limit in the total number of urls fetched:
synched.Lock()
syUrlFetch := synched.urlsFetched // It's been a long time since we last checked this
synched.Unlock()
killit := syUrlFetch > maxUrlsFetched
if killit {
routineStatus[routineNumber] = 'L' // Second attempt to killSelf because too many urls.
//fmt.Printf("=ENDING: after %5d: Too many urls fetched\n", syUrlFetch)
if (nil != getResponse) && (nil != getResponse.Body) {
getResponse.Body.Close()
}
waitGroup.Done() // Balance the Add(1) above; otherwise waitUntilDone() would wait forever.
return
//sendAllDone(routineNumber) // End-of-stream back to caller.
}
if (nil == getResponse) && (nil == getErr) {
e := errors.New("-->> NIL RESP AND NIL ERR RETURN! for '" + thisUrl + "' <<--")
log.Fatalf("Error %v\n", e) // Practically never happens.
return
}
if nil == getErr { // Send back the success, this page before the linked-to pages.
routineStatus[routineNumber] = 'R' // Sending response back to caller.
sendResponse(thisUrl, getResponse, getErr, getElapsed)
routineStatus[routineNumber] = 'S' // Sent response back to caller.
fmt.Printf("\n%35s: %s, stat %3d, len %3d, Elapsed %s\n", thisUrl, getResponse.Status,
getResponse.StatusCode, getResponse.ContentLength, getElapsed.String())
searchBodyForLinks(getResponse, reqChan, routineNumber) // Look for the later, linked-to pages.
} else { // Send back the error
routineStatus[routineNumber] = 'E' // Send error response back to caller.
sendResponse(thisUrl, getResponse, getErr, getElapsed)
}
waitGroup.Done() // This url is all done. Do this AFTER queueing up found links.
// Otherwise the number of outstanding links could become zero, which
// would make the waitGroup think we are all done with work.
// Bug: There is no way to guarantee against this waitGroup error, is there?
if killit {
routineStatus[routineNumber] = 'N' // DONE because of too many urls.
//sendAllDone(routineNumber) // End-of-stream back to caller.
}
if (nil != getResponse) && (nil != getResponse.Body) {
getResponse.Body.Close()
}
}
// Send a special terminating message back along the response channel; After all urls are done.
func sendAllDone(routineNumber int) {
fmt.Printf("SEND ALL DONE by %2d at delta %v\n", routineNumber, time.Since(startTime))
fmt.Println("go Status: ", string(routineStatus))
showStatus()
respChan <- &ResponseFromWeb{"DONE", nil, nil, time.Duration(0)} // back to caller.
}
//show status of each goroutine
func showStatus() {
format := fmt.Sprintf("go Status: %%%ds. len(domainChan) %%3d\n", len(routineStatus))
fmt.Printf(format, string(routineStatus), len(domainChan))
}
//show status of each goroutine
func showStatusOnLog() {
format := fmt.Sprintf("go Status: %%%ds. len(domainChan) %%3d\n", len(routineStatus))
log.Printf(format, string(routineStatus), len(domainChan))
}
// Send the response to the 'caller' along the response channel.
func sendResponse(thisUrl string, getResponse *http.Response, getErr error, elapse time.Duration) {
respChan <- &ResponseFromWeb{thisUrl, getResponse, getErr, elapse}
//Bug: But what about rejected Urls??? How do we .Done() them? We never .Add() them
}
// Use regex to search Body of the web page for links to other web pages, and follow them in turn.
func searchBodyForLinks(httpResp *http.Response, reqChan chan *RequestUrl, rn int) {
body, getErr := ioutil.ReadAll(httpResp.Body)
if getErr != nil {
fmt.Println(getErr) // Really ought to recover from this.
return
}
routineStatus[rn] = 'A' // Searching body for Anchor links to enQueue
links := urlFindRe.FindAllStringSubmatch(string(body), -1)
//fmt.Printf("%3d links found\n", len(links))
for _, alink := range links {
enQueue(alink[1], reqChan, rn)
// alink[1] is just the url. the [0] includes the 'href=' part of the regular expression.
}
}
// Try to enQueue a url, see and count whether it is accepted or rejected according to our rules.
// If okay, send it along the request channel.
func enQueue(thisUrl string, reqChan chan *RequestUrl, rn int) {
rejectThisUrl := nil != urlRejectRe.FindStringIndex(thisUrl) // Don't follow *.css etc
// see samedomain.go
thisDomain := getDomain(thisUrl)
newDomain := thisDomain != currentDomain
if rejectThisUrl {
//fmt.Printf("REJECT1: '%s'\n", thisUrl) // Not a web page.
} else {
if just1Domain {
rejectThisUrl = newDomain
if rejectThisUrl {
//fmt.Printf("REJECT2: '%s'\n", thisUrl) // link goes offsite
}
} else if newDomain {
enQueueNewDomain(thisDomain, rn) // see samedomain.go
return
}
}
synched.Lock() // Write lock shared data. There's just too much of this.
if rejectThisUrl {
synched.rejectCnt++
synched.Unlock() // Write unlock ... This lock logic is getting hairy.
return
} else {
synched.queueCnt++
}
// Heuristic: check this url for the first time. Don't queue it if we have already done it.
if synched.beenThereDoneThat[thisUrl] {
// been there, done that.
synched.dupsStopped++
synched.Unlock()
routineStatus[rn] = 'd' // url rejected because it's a duplicate. Been there, done that.
return
}
synched.Unlock() // Write unlock ...
//fmt.Printf("enQueue '%s'\n", thisUrl)
routineStatus[rn] = 'C' // enQueue blocked waiting on request channel.
reqChan <- &RequestUrl{Url: thisUrl} // Here's the beef; Put it on the channel.
routineStatus[rn] = 'e' // enQueue is returning.
}
// Get the length of the list-of-urls map, with a synchronized (read-locked) read.
func mapLength() int {
synched.RLock()
ml := len(synched.beenThereDoneThat)
synched.RUnlock()
return ml
}