This repository has been archived by the owner on Apr 23, 2020. It is now read-only.
/
v2ex.go
109 lines (94 loc) · 1.97 KB
/
v2ex.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
package main
import (
"encoding/gob"
"fmt"
"regexp"
"strconv"
"sync"
"github.com/reusee/nw"
)
// init registers *V2exEntry with encoding/gob so that the concrete type can be
// encoded and decoded when it travels inside an interface value.
func init() {
	gob.Register(&V2exEntry{})
}
// V2exEntry is a single topic scraped from a V2ex node listing page.
//
// The field names are part of the gob encoding of stored entries and are kept
// as they are.
type V2exEntry struct {
	// Id is the numeric topic id parsed from the topic link ("/t/<id>").
	Id int

	// Title is the text of the topic link.
	Title string
}
// V2exCollector scrapes topic entries from V2ex node pages.
type V2exCollector struct {
	// ErrorHost is embedded so its methods (the collector uses Err to build
	// errors) are promoted onto the collector.
	*ErrorHost
}
// NewV2exCollector returns a collector whose embedded ErrorHost is created
// with the name "V2ex". The returned error is always nil.
func NewV2exCollector() (*V2exCollector, error) {
	return &V2exCollector{ErrorHost: NewErrorHost("V2ex")}, nil
}
// Collect scrapes pages 1 through 5 of the "share" and "create" nodes and
// returns every entry found on them.
//
// Pages are fetched by CollectPage in separate goroutines, at most two at a
// time. If any page reports an error, the entries are discarded and the first
// error recorded (in completion order) is returned. On success the number of
// collected entries is printed to standard output.
func (v *V2exCollector) Collect() (ret []Entry, err error) {
	const (
		maxPage     = 5
		concurrency = 2
	)
	nodes := []string{"share", "create"}

	var uris []string
	for _, node := range nodes {
		for page := 1; page <= maxPage; page++ {
			uris = append(uris, fmt.Sprintf("http://v2ex.com/go/%s?p=%d", node, page))
		}
	}

	var (
		wg       sync.WaitGroup
		mu       sync.Mutex
		failures []error
		// slots is a counting semaphore: a send blocks while `concurrency`
		// fetches are already in flight.
		slots = make(chan struct{}, concurrency)
	)
	for _, uri := range uris {
		slots <- struct{}{}
		wg.Add(1)
		go func(uri string) {
			// Deferred calls run last-in first-out: unlock, free the slot,
			// then signal completion.
			defer wg.Done()
			defer func() { <-slots }()

			entries, pageErr := v.CollectPage(uri)

			mu.Lock()
			defer mu.Unlock()
			ret = append(ret, entries...)
			if pageErr != nil {
				failures = append(failures, pageErr)
			}
		}(uri)
	}
	wg.Wait()

	if len(failures) > 0 {
		return nil, failures[0]
	}
	fmt.Printf("collect %d entries from V2ex\n", len(ret))
	return ret, nil
}
// v2exPidPattern finds a topic path of the form "/t/<digits>" anywhere in a
// string; capture group 1 holds the digits (the topic id).
var v2exPidPattern = regexp.MustCompile(`/t/([0-9]+)`)
// CollectPage fetches the listing page at uri and returns one *V2exEntry for
// every topic link on it.
//
// Topic links are the nodes selected by "div.cell span.item_title a". The
// topic id is taken from capture group 1 of v2exPidPattern applied to the
// link's href, and the title from the link text.
//
// Errors are built with v.Err. A fetch or parse failure returns no entries.
// A link whose href yields no usable id is skipped and reported as a
// "no post id" error; entries from the other links on the page are still
// returned alongside that error, as before.
func (v *V2exCollector) CollectPage(uri string) (ret []Entry, err error) {
	resp, err := Get(uri)
	if err != nil {
		return nil, v.Err("get %s error: %v", uri, err)
	}
	defer resp.Body.Close()

	root, err := nw.Parse(resp.Body)
	if err != nil {
		return nil, v.Err("parse html %s: %v", uri, err)
	}

	var walkError error
	root.Walk(nw.Css("div.cell span.item_title a", func(n *nw.Node) {
		// FindStringSubmatch returns nil when the href has no "/t/<digits>"
		// part; indexing that result directly would panic, so check first.
		match := v2exPidPattern.FindStringSubmatch(n.Attr["href"])
		if len(match) < 2 {
			walkError = v.Err("no post id: %s", uri)
			return
		}
		// The capture group is digits only, so Atoi can fail here only when
		// the number does not fit in an int.
		id, convErr := strconv.Atoi(match[1])
		if convErr != nil {
			walkError = v.Err("no post id: %s", uri)
			return
		}
		ret = append(ret, &V2exEntry{
			Id:    id,
			Title: n.Text,
		})
	}))
	if walkError != nil {
		return ret, walkError
	}
	return ret, nil
}
// GetKey returns the entry's key: the literal prefix "v2ex " followed by the
// decimal topic id.
func (v *V2exEntry) GetKey() string {
	key := fmt.Sprintf("v2ex %d", v.Id)
	return key
}