func MakeDoubanSpider() *spiders.Spider {
	spider := &spiders.Spider{}
	spider.Name = "douban_img_spider"
	spider.StartUrls = []string{"http://movie.douban.com/"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		// Stop following pagination beyond depth 10.
		if response.Request.Depth > 10 {
			return nil, nil
		}
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			return nil, err
		}
		nodes := doc.Find("#page .n").Nodes
		if len(nodes) == 0 {
			// No "next page" link on this page; nothing more to crawl.
			return nil, nil
		}
		// The last matching node is the "next page" link; pull its href by hand.
		nextNode := nodes[len(nodes)-1]
		var nextPageLink string
		for _, attr := range nextNode.Attr {
			if attr.Key == "href" {
				nextPageLink = attr.Val
				break
			}
		}
		nextPage := "http://movie.douban.com" + nextPageLink
		request, err := http.NewRequest("GET", nextPage, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
		if err != nil {
			return nil, err
		}
		return []*http.Request{request}, nil
	}
	return spider
}
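Note that http here is the framework's own request/response wrapper, not net/http: NewRequest takes a spider name, a parse-callback name, and a final integer that the zhihu code below suggests is a cookie-jar index. A minimal sketch of what those wrapper types might look like, inferred purely from the call sites in this section (every name not visible above is an assumption):

package framework // hypothetical package name

import (
	"io"
	gohttp "net/http"
)

// Sketch only: fields beyond those visible at the call sites
// (Depth, Body, GoResponse, Request, CookieJar) are assumptions.
type Request struct {
	GoRequest  *gohttp.Request // wrapped net/http request
	SpiderName string          // owning spider, used for routing and DB handles
	ParseName  string          // ParseMap key that handles the response
	Depth      int             // crawl depth, incremented by the engine per hop
	CookieJar  int             // cookie-jar index (0 = default; see the zhihu login below)
}

type Response struct {
	GoResponse *gohttp.Response // wrapped net/http response
	Request    *Request         // the request that produced this response
	Body       string           // body already drained into a string
}

// NewRequest mirrors the six-argument calls used throughout this section.
func NewRequest(method, urlStr, spiderName, parseName string, body io.Reader, cookieJar int) (*Request, error) {
	goReq, err := gohttp.NewRequest(method, urlStr, body)
	if err != nil {
		return nil, err
	}
	return &Request{
		GoRequest:  goReq,
		SpiderName: spiderName,
		ParseName:  parseName,
		CookieJar:  cookieJar,
	}, nil
}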
func MakeMuiltiplySpiders() *spiders.Spider {
	spider := spiders.Spider{}
	spider.Name = "muiltiply_spider"
	spider.StartUrls = []string{"http://www.baidu.com/s?wd=1"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			return nil, err
		}
		requestList := make([]*http.Request, 0, 10)
		// Follow every pagination link on the result page.
		doc.Find("#page a").Each(func(index int, hrefNode *goquery.Selection) {
			href, isExist := hrefNode.Attr("href")
			if !isExist {
				return
			}
			nextPage := "http://www.baidu.com" + href
			request, err := http.NewRequest("GET", nextPage, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
			if err != nil {
				log.Println(err)
				return // skip this link instead of appending a nil request
			}
			requestList = append(requestList, request)
		})
		return requestList, nil
	}
	return &spider
}
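Unlike the douban spider, which walks html.Node.Attr by hand, this one uses goquery's Selection.Attr helper; both yield the same value. A self-contained comparison (the HTML fragment is invented for illustration):

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(
		`<div id="page"><a href="/s?wd=1&pn=10" class="n">next</a></div>`))
	if err != nil {
		panic(err)
	}

	// Idiomatic: Selection.Attr does the attribute lookup for you.
	href, ok := doc.Find("#page .n").Attr("href")
	fmt.Println(href, ok) // /s?wd=1&pn=10 true

	// Manual: walk the underlying html.Node attribute list, as the douban spider does.
	for _, attr := range doc.Find("#page .n").Nodes[0].Attr {
		if attr.Key == "href" {
			fmt.Println(attr.Val) // /s?wd=1&pn=10
		}
	}
}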
func Base(response *http.Response) ([]*http.Request, error) {
	requestList := make([]*http.Request, 0)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
	if err != nil {
		return requestList, err
	}
	// Zhihu's login form requires the _xsrf token embedded in the page.
	xsrf, exist := doc.Find(".zu-side-login-box input[name=_xsrf]").Attr("value")
	if !exist {
		return requestList, nil
	}
	userList := [][]string{{"*****@*****.**", ""}, {"*****@*****.**", ""}}
	for index, user := range userList {
		value := make(url.Values)
		value.Set("_xsrf", xsrf)
		value.Set("email", user[0])
		value.Set("password", user[1])
		value.Set("rememberme", "y")
		request, requestErr := http.NewRequest("POST", "http://www.zhihu.com/login", zhihuSpider.Name, "Index", strings.NewReader(value.Encode()), index+2)
		if requestErr != nil {
			log.Println(requestErr)
			continue
		}
		requestList = append(requestList, request)
	}
	return requestList, nil
}
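The login body is an ordinary URL-encoded form produced by net/url's Values.Encode; the index+2 passed as the last argument presumably gives each account its own cookie jar so the two sessions stay separate. A quick standalone illustration of what Encode produces (token and credentials are placeholders):

package main

import (
	"fmt"
	"net/url"
)

func main() {
	value := make(url.Values)
	value.Set("_xsrf", "token123")
	value.Set("email", "user@example.com")
	value.Set("password", "secret")
	value.Set("rememberme", "y")

	// Encode sorts keys and percent-encodes values:
	// _xsrf=token123&email=user%40example.com&password=secret&rememberme=y
	fmt.Println(value.Encode())
}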
func MakeStockSpider() *spiders.Spider {
	spider := spiders.Spider{}
	spider.Name = "stock_spider"
	spider.BeforeMethod = func() {
		db.DefaultMysqlConnectMap.InitConnection(spider.Name, "root:@/wxstock?charset=utf8")
	}
	spider.AfterMethod = func() {
		db.DefaultMysqlConnectMap.CloseContection(spider.Name)
	}
	urlPrefix := "http://money.finance.sina.com.cn/d/api/openapi_proxy.php/?__s=[[%22hq%22,%22hs_a%22,%22%22,0,"
	urlSuffix := ",40]]&callback=FDC_DC.theTableData"
	pageSize := 40
	spider.StartUrls = []string{urlPrefix + "0" + urlSuffix}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		// Strip the JSONP wrapper "FDC_DC.theTableData(...)" to get plain JSON.
		body := response.Body
		startIndex := strings.Index(body, "FDC_DC.theTableData(")
		if startIndex < 0 {
			return nil, nil
		}
		jsonString := body[startIndex+len("FDC_DC.theTableData(") : len(body)-2]
		var jsonResult []Stock
		if err := json.Unmarshal([]byte(jsonString), &jsonResult); err != nil {
			log.Println(err)
			return nil, err // bail out: jsonResult[0] below would panic
		}
		for _, item := range jsonResult[0].Items {
			code := item[1]
			name := item[2]
			rows, err := db.DefaultMysqlConnectMap.Query(spider.Name, "select id from db_stock where code = ?", code)
			if err != nil {
				log.Println(err)
				continue
			}
			// Insert the stock only if it is not already recorded.
			if !rows.Next() {
				rows.Close()
				db.DefaultMysqlConnectMap.Exec(spider.Name, "insert into db_stock(gmt_create,code,name) values(now(), ?, ?)", code, name)
			} else {
				rows.Close()
			}
		}
		// Only the first page (",0,0,40]]") fans out requests for the remaining pages.
		if strings.Contains(response.GoResponse.Request.URL.RawQuery, ",0,0,40]]") {
			totalPage := int(math.Ceil(float64(jsonResult[0].Count) / float64(pageSize)))
			requestList := make([]*http.Request, 0, totalPage)
			for pageNo := 1; pageNo <= totalPage; pageNo++ {
				request, err := http.NewRequest("GET", urlPrefix+strconv.Itoa(pageNo)+urlSuffix, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
				if err != nil {
					log.Println(err)
					continue // skip this page instead of leaving a nil slot
				}
				requestList = append(requestList, request)
			}
			return requestList, nil
		}
		return nil, nil
	}
	return &spider
}
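For json.Unmarshal into []Stock to work, the Stock type must expose the two fields read above. A plausible definition, inferred from jsonResult[0].Count and the item[1]/item[2] accesses (the json tags and the meaning of item[0] are assumptions about Sina's payload):

// Inferred from the accesses above; the json tags assume Sina's lowercase key names.
type Stock struct {
	Count int        `json:"count"` // total number of stocks, used to compute totalPage
	Items [][]string `json:"items"` // per stock: item[1] = code, item[2] = name (item[0] assumed to be market)
}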
func (this *Spider) MakeStartRequests() []*http.Request {
	if this.InitStartUrls != nil {
		this.InitStartUrls()
	}
	startRequestSlice := make([]*http.Request, 0, len(this.StartUrls))
	for _, url := range this.StartUrls {
		request, err := http.NewRequest("GET", url, this.Name, BASE_PARSE_NAME, nil, 0)
		if err != nil {
			log.Println(err)
			continue // skip the bad URL instead of leaving a nil slot in the slice
		}
		startRequestSlice = append(startRequestSlice, request)
	}
	return startRequestSlice
}
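MakeStartRequests is the seam between a spider definition and the crawl loop: the engine seeds its queue with these requests and dispatches each response to the callback named in ParseMap. A minimal sketch of such a loop, under the assumption that a Download helper fetches a request and wraps it into the framework's *http.Response (Download and the queue handling are stand-ins, not the framework's real API):

// Hypothetical driver loop; Download and the FIFO queue are stand-ins.
func Run(spider *Spider) {
	queue := spider.MakeStartRequests()
	for len(queue) > 0 {
		request := queue[0]
		queue = queue[1:]
		response, err := Download(request) // fetch and wrap into *http.Response
		if err != nil {
			log.Println(err)
			continue
		}
		// Route the response to the parse callback named by the request.
		parse := spider.ParseMap[request.ParseName]
		newRequests, err := parse(response)
		if err != nil {
			log.Println(err)
			continue
		}
		queue = append(queue, newRequests...)
	}
}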
func Index(response *http.Response) ([]*http.Request, error) {
	// Reuse the login session's cookie jar for the follow-up request.
	request, err := http.NewRequest("GET", "http://www.zhihu.com", zhihuSpider.Name, "GetId", nil, response.Request.CookieJar)
	if err != nil {
		return nil, err
	}
	return []*http.Request{request}, nil
}
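The zhihu spider chains its callbacks by name: Base posts the login forms and routes their responses to Index, which forwards the authenticated session (via its cookie-jar index) to a GetId parser. Wiring that chain into ParseMap would presumably look like the sketch below; only the routing is taken from the code above, and the GetId body is a placeholder:

// Sketch: registering the named callbacks. GetId's body is hypothetical.
zhihuSpider.ParseMap = map[string]func(response *http.Response) ([]*http.Request, error){
	spiders.BASE_PARSE_NAME: Base,  // extracts _xsrf and posts the login forms
	"Index":                 Index, // runs on each login response, keeps its cookie jar
	"GetId": func(response *http.Response) ([]*http.Request, error) {
		// placeholder: scrape the logged-in page here
		return nil, nil
	},
}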