crawl-board.go
package main

import (
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strconv"

	"github.com/PuerkitoBio/goquery"
)
const PTT_URL = "https://www.ptt.cc"
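
// BoardIndexPage is one paginated index page of a board,
// e.g. /bbs/<board>/index123.html.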
type BoardIndexPage struct {
	page_number int
	url         string
}
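
// ArticlePage is a single article page, identified by the PTT article id
// embedded in its URL (e.g. M.1234567890.A.ABC).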
type ArticlePage struct {
	id  string
	url string
}
// ptt_get issues a GET request with the over18 cookie set, so that
// age-restricted boards can be fetched without the confirmation page.
func ptt_get(url string) *http.Response {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("Cookie", "over18=1")
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	return res
}
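
// harvest_board_indices fetches the board's landing page, reads the
// pagination links (index<N>.html) to determine the range of page numbers,
// and returns an entry for every index page in that range.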
func harvest_board_indices(board_url string, board_name string) []BoardIndexPage {
	var ret []BoardIndexPage
	doc, err := goquery.NewDocumentFromResponse(ptt_get(board_url))
	if err != nil {
		log.Fatal(err)
	}
	re := regexp.MustCompile("/bbs/" + board_name + "/index([0-9]+)\\.html")
	// The pagination buttons link to numbered index pages; collect them.
	doc.Find("a[href^='/bbs/" + board_name + "/index']").Each(func(_ int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		matched := re.FindStringSubmatch(href)
		if len(matched) == 0 {
			return
		}
		pn, err := strconv.Atoi(matched[1])
		if err != nil {
			log.Fatal(err)
		}
		ret = append(ret, BoardIndexPage{pn, href})
	})
	if len(ret) < 2 {
		log.Fatal("could not find pagination links on ", board_url)
	}
	// The two matched links bound the page-number range; order them and
	// fill in every page number in between.
	if ret[0].page_number > ret[1].page_number {
		ret[0], ret[1] = ret[1], ret[0]
	}
	for i := ret[0].page_number + 1; i < ret[1].page_number; i++ {
		ret = append(ret, BoardIndexPage{i, "/bbs/" + board_name + "/index" + strconv.Itoa(i) + ".html"})
	}
	return ret
}
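
// harvest_articles fetches one index page and returns the article links
// found on it, keyed by the article id parsed from each URL.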
func harvest_articles(url string, board_name string) []ArticlePage {
	var ret []ArticlePage
	doc, err := goquery.NewDocumentFromResponse(ptt_get(url))
	if err != nil {
		log.Fatal(err)
	}
	re := regexp.MustCompile("/(M\\.[0-9]+\\.A\\.[A-Z0-9]{3})\\.html")
	doc.Find("a[href*='/bbs/" + board_name + "/']").Each(func(_ int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		matched := re.FindStringSubmatch(href)
		if len(matched) == 0 {
			return
		}
		ret = append(ret, ArticlePage{matched[1], href})
	})
	return ret
}
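
// download_articles saves each article as <id>.html under the output
// directory, skipping files that already exist on disk.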
func download_articles(articles []ArticlePage, output_board_dir string) {
	for _, article := range articles {
		output_file := output_board_dir + "/" + article.id + ".html"
		if _, err := os.Stat(output_file); os.IsNotExist(err) {
			output, err := os.Create(output_file)
			if err != nil {
				log.Fatal("Error while creating ", output_file, " - ", err)
			}
			res := ptt_get(PTT_URL + article.url)
			_, err = io.Copy(output, res.Body)
			// Close explicitly instead of deferring, so file handles and
			// response bodies are released every iteration rather than
			// piling up until the function returns.
			res.Body.Close()
			output.Close()
			if err != nil {
				log.Fatal("Error while downloading ", article.url, " - ", err)
			}
			log.Println(output_file)
		} else {
			log.Println("SKIP:", output_file, "exists.")
		}
	}
}
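
// main crawls an entire board into OUTPUT_DIR/BOARD_NAME. Example
// invocation (the board name here is only an illustration; any existing
// PTT board works):
//
//	go run crawl-board.go Gossiping ./output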
func main() {
	if len(os.Args) != 3 {
		log.Fatal("Usage: crawl-board BOARD_NAME OUTPUT_DIR")
	}
	board_name := os.Args[1]
	output_dir := os.Args[2]
	board_url := PTT_URL + "/bbs/" + board_name + "/index.html"
	output_board_dir := output_dir + "/" + board_name
	if err := os.MkdirAll(output_board_dir, os.ModeDir|os.ModePerm); err != nil {
		log.Fatal(err)
	}
	board_indices := harvest_board_indices(board_url, board_name)
	for _, board := range board_indices {
		articles := harvest_articles(PTT_URL+board.url, board_name)
		download_articles(articles, output_board_dir)
	}
}