forked from maciekmm/sitemap-generator
/
worker.go
117 lines (107 loc) · 3.02 KB
/
worker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package sitemapgen
import (
"log"
"net/http"
"net/url"
"strings"
"sync"
"github.com/eapache/channels"
"github.com/maciekmm/sitemap-generator/limit"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// Worker consumes URL jobs from the shared work queue, fetches each page,
// reports crawled pages to the sitemap generator, and forwards every link
// found on a page to the validator.
type Worker struct {
workQueue *channels.InfiniteChannel // unbounded queue of *url.URL jobs; also used to re-enqueue jobs on transient errors
validator chan<- *url.URL // links discovered in fetched pages are sent here
waitGroup *sync.WaitGroup // tracks outstanding work items across the pipeline
generator chan<- string // successfully fetched page URLs, consumed by the sitemap writer
httpClients chan *limit.Client // pool of HTTP clients; receiving from it throttles request starts
}
// NewWorker builds a Worker wired to the given work queue, validator and
// generator channels, shared wait group, and HTTP client pool.
func NewWorker(workQueue *channels.InfiniteChannel, validator chan<- *url.URL, waitGroup *sync.WaitGroup, generator chan<- string, httpClients chan *limit.Client) *Worker {
	w := &Worker{}
	w.workQueue = workQueue
	w.validator = validator
	w.waitGroup = waitGroup
	w.generator = generator
	w.httpClients = httpClients
	return w
}
// Start runs the worker loop: it pops *url.URL jobs from the work queue,
// fetches each page, forwards the page URL to the sitemap generator, and
// sends every <a href> link found in the HTML to the validator.
//
// For each job taken from the queue the worker either re-enqueues it (on
// transient proxy/connection errors) or calls waitGroup.Done exactly once.
// Start returns when the work queue's Out channel is closed.
func (w *Worker) Start() {
	for {
		job, ok := <-w.workQueue.Out()
		if !ok {
			return
		}
		stringURL := job.(*url.URL).String()
		req, err := http.NewRequest("GET", stringURL, nil)
		if err != nil {
			log.Println("Worker: Could not parse: ", stringURL, " error: ", err.Error())
			w.waitGroup.Done()
			continue
		}
		// Borrow a client from the pool and hand it straight back: the
		// pool channel acts as a throttle on concurrent request starts.
		client := <-w.httpClients
		w.httpClients <- client
		resp, err := client.Do(req)
		if err != nil {
			log.Println("Worker: Could not connect to: ", stringURL, " error: ", err.Error())
			// Proxy/connection-pool errors are treated as transient:
			// re-enqueue the job instead of dropping it.
			if strings.Contains(err.Error(), "http: error connecting to proxy") || strings.Contains(err.Error(), "while waiting for connection") {
				w.workQueue.In() <- job
			} else {
				w.waitGroup.Done()
			}
			continue
		}
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			log.Println("Worker: Invalid status code for: ", stringURL, " code: ", resp.StatusCode)
			//TODO: return to pool on certain errors
			w.waitGroup.Done()
			continue
		}
		if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
			resp.Body.Close()
			log.Println("Worker: Invalid content-type for: ", stringURL, " content-type: ", resp.Header.Get("Content-Type"))
			w.waitGroup.Done()
			continue
		}
		// Page fetched successfully: push it to sitemap file generation.
		w.waitGroup.Add(1)
		w.generator <- stringURL
		// Scan the response body for <a href="..."> links.
		doc := html.NewTokenizer(resp.Body)
		for {
			// BUG FIX: the previous loop only advanced the tokenizer in the
			// "start tag but not <a>" branch, so any other token (text, end
			// tags, comments, and even processed <a> tags) left tokenType
			// unchanged and spun the loop forever. Advance on every
			// iteration and stop at the error token (EOF or parse error).
			tokenType := doc.Next()
			if tokenType == html.ErrorToken {
				break
			}
			if tokenType != html.StartTagToken {
				continue
			}
			token := doc.Token()
			if token.DataAtom != atom.A {
				continue
			}
			for _, attr := range token.Attr {
				if attr.Key != "href" {
					continue
				}
				parsedURL, err := toAbsURL(job.(*url.URL), attr.Val)
				if err != nil {
					log.Println("Worker: Could not get an absolute path for: ", attr.Val, " error: ", err.Error())
					continue
				}
				w.waitGroup.Add(1)
				w.validator <- parsedURL
			}
		}
		resp.Body.Close()
		w.waitGroup.Done()
	}
}
func toAbsURL(baseurl *url.URL, weburl string) (*url.URL, error) {
relurl, err := url.Parse(weburl)
if err != nil {
return nil, err
}
absurl := baseurl.ResolveReference(relurl)
return absurl, nil
}