forked from hu17889/go_spider
/
github_repo_page_processor.go
63 lines (57 loc) · 2.08 KB
/
github_repo_page_processor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
//
package main
/*
Packages that must be imported:
    "core/common/page"
    "core/spider"
Packages that may be imported:
    "core/pipeline": persists the crawler results;
    "github.com/PuerkitoBio/goquery": HTML DOM parser.
*/
import (
"github.com/PuerkitoBio/goquery"
"github.com/hu17889/go_spider/core/common/page"
"github.com/hu17889/go_spider/core/pipeline"
"github.com/hu17889/go_spider/core/spider"
"strings"
)
// MyPageProcesser is the page processor used by this example crawler.
// It has no fields; all of its behavior lives in its Process method.
type MyPageProcesser struct{}
// NewMyPageProcesser returns a pointer to a freshly allocated, zero-valued
// MyPageProcesser.
func NewMyPageProcesser() *MyPageProcesser {
	return new(MyPageProcesser)
}
// Process parses the HTML DOM of p and records the results we want on the page.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to
// parse the HTML.
//
// Two things are extracted:
//   - the href of every anchor matched by "h3[class='repo-list-name'] a" is
//     turned into an absolute URL and queued as an "html" target request;
//   - the text of ".entry-title .author" and
//     ".entry-title .js-current-repository" is stored in the "author" and
//     "project" fields.
//
// When no author text is found the page is marked as skipped.
func (m *MyPageProcesser) Process(p *page.Page) {
	query := p.GetHtmlParser()

	var urls []string
	query.Find("h3[class='repo-list-name'] a").Each(func(_ int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok || href == "" {
			// An anchor without a usable href gives us nothing to crawl;
			// queuing it would only re-request the bare site root.
			return
		}
		if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
			// Already absolute: prefixing the host again would corrupt it.
			urls = append(urls, href)
			return
		}
		// Root-relative hrefs start with "/"; drop it so the joined URL does
		// not contain a double slash ("http://github.com//user/repo").
		urls = append(urls, "http://github.com/"+strings.TrimPrefix(href, "/"))
	})
	// These URLs will be saved and crawled by other coroutines.
	p.AddTargetRequests(urls, "html")

	// TrimSpace also removes "\r" and other Unicode whitespace that the
	// surrounding markup may leave around the text.
	name := strings.TrimSpace(query.Find(".entry-title .author").Text())
	repository := strings.TrimSpace(query.Find(".entry-title .js-current-repository").Text())
	//readme, _ := query.Find("#readme").Html()

	if name == "" {
		// No author on this page (e.g. a listing page rather than a
		// repository page), so flag it as skipped.
		p.SetSkip(true)
	}

	// The entity we want to save by Pipeline.
	p.AddField("author", name)
	p.AddField("project", repository)
	//p.AddField("readme", readme)
}
// main builds the spider, configures it and runs it.
//
// spider.NewSpider takes, in order:
//   - the PageProcesser;
//   - the config path (default: WD/etc/main.conf);
//   - the task name used in Pipeline for record.
func main() {
	const (
		configPath = "" // left empty; the note above gives WD/etc/main.conf as the default
		taskName   = "TaskName"
		startURL   = "https://github.com/hu17889?tab=repositories"
		respType   = "html" // response type: "html" or "json"
		threadNum  = 3
	)

	spider.NewSpider(NewMyPageProcesser(), configPath, taskName).
		AddUrl(startURL, respType).                 // start URL and its response type
		AddPipeline(pipeline.NewPipelineConsole()). // print results on screen
		SetThreadnum(threadNum).                    // crawl requests with three coroutines
		Run()
}