/
web-crawler.go
70 lines (62 loc) · 1.17 KB
/
web-crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
package main
import (
"fmt"
"golang.org/x/net/html"
"net/http"
)
// url_prefix is the base URL that every relative crawl path is appended to.
var url_prefix string = "http://stackoverflow.com/"
// isSummary reports whether the token carries a class="summary"
// attribute, i.e. whether it opens a question-summary div.
func isSummary(t html.Token) bool {
	for _, a := range t.Attr {
		if a.Key == "class" && a.Val == "summary" {
			return true
		}
	}
	return false
}
// getQ advances the tokenizer past the markup wrapping a question
// title and sends the title's text on ch.
//
// NOTE(review): the tokenizer is received by value, so this mutates a
// copy that still shares the caller's underlying reader — confirm that
// is intentional before refactoring to *html.Tokenizer.
func getQ(tknzer html.Tokenizer, ch chan string) {
	// Step over four tokens to land on the text node holding the title.
	for i := 0; i < 4; i++ {
		tknzer.Next()
	}
	ch <- string(tknzer.Text())
}
// printQ prints every value received on ch until the "END!" sentinel
// arrives, then returns.
func printQ(ch chan string) {
	for {
		v := <-ch
		if v == "END!" {
			return
		}
		fmt.Println(v)
	}
}
// Crawl fetches url_prefix+url and streams the text of each question
// title (the contents of every <div class="summary">) into ch.
// It always sends the "END!" sentinel before returning — on fetch
// failure as well as at end of page — so the consumer terminates
// instead of blocking forever on the channel.
func Crawl(url string, ch chan string) {
	resp, err := http.Get(url_prefix + url)
	if err != nil {
		// Request failed (resp is nil here); the original code
		// ignored this error and crashed on resp.Body. Deliver the
		// sentinel so printQ still unblocks, then bail out.
		ch <- "END!"
		return
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// End of page (io.EOF) or malformed HTML.
			ch <- "END!"
			return
		case html.StartTagToken:
			token := tokenizer.Token()
			if token.Data == "div" && isSummary(token) {
				getQ(*tokenizer, ch)
			}
		}
	}
}
// main launches the crawler concurrently and prints each question
// title it reports, returning once the crawl signals completion.
func main() {
	ch := make(chan string)
	defer close(ch)

	go Crawl("questions/tagged/go", ch)
	printQ(ch)
}