func (this *ExampleExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) { // fmt.Println("visit url: ", ctx.URL(), "state: ", ctx.State) fmt.Printf("\"%v\",\n", ctx.URL()) fmt.Printf("\t%T\t%+v\n", doc, doc) // urls := processLinks(doc) links := make(map[*url.URL]interface{}) // i, _ := ctx.State.(int) // nextDepth := i - 1 // if nextDepth <= 0 { // return nil, false // } // for _, u := range urls { // links[u] = nextDepth // } return links, false }
func (this *ExampleExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool { // fmt.Println("filter url: ", ctx.URL(), "state: ", ctx.State, "isVisited: ", isVisited, "ctx.IsRobotsURL(): ", ctx.IsRobotsURL()) if ctx.SourceURL() == nil { ctx.State = DEPTH return !isVisited } if ctx.State != nil { i, ok := ctx.State.(int) if ok && i > 0 { return !isVisited } } else { fmt.Println("ctx.state nil, ctx.sourceURL: ", ctx.SourceURL()) } return false }
func (self *CustomExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) { fmt.Println(ctx.NormalizedURL().String()) db := GetConn() mIns, err := db.Prepare("INSERT INTO mz(photo_href, photo_thumb_src, photo_large_src, photo_public_src, people_href) VALUES( ?, ?, ?, ?, ? )") // ? = 占位符 if err != nil { panic(err.Error()) } defer mIns.Close() // main结束是关闭 //fmt.Println(doc.Find(".photo_wrap").Text()) doc.Find(".photo_wrap").Each(func(i int, s *goquery.Selection) { // For each item found, get the band and title // fmt.Println(s.Find("a").First().Attr("title")) // fmt.Println(s.Find("a").First().Attr("href")) // fmt.Println(s.Find("img").First().Attr("src")) var photo_href, photo_thumb_src, photo_large_src, photo_public_src, people_href string photo_href = first(s.Find("a").First().Attr("href")).(string) photo_thumb_src = first(s.Find("img").First().Attr("src")).(string) people_href = first(s.Find("a").First().Attr("title")).(string) _, err = mIns.Exec(photo_href, photo_thumb_src, photo_large_src, photo_public_src, people_href) // 执行插入 if err != nil { panic(err.Error()) } }) // if rxGrep.MatchString(ctx.NormalizedURL().String()) { // // print problem title // fmt.Println(doc.Find("h1").Text()) // } // defer db.Close() return nil, true }
func (e *Ext) Filter(ctx *gocrawl.URLContext, isVisited bool) bool { if isVisited { return false } if ctx.URL().Host == "github.com" || ctx.URL().Host == "golang.org" || ctx.URL().Host == "0value.com" { return true } return false }
// Override Filter for our need. func (x *ExampleExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool { return !isVisited && rxOk.MatchString(ctx.NormalizedURL().String()) }
func (self *CustomExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool { // fmt.Println(ctx.NormalizedURL().String()) return !isVisited && rxOk.MatchString(ctx.NormalizedURL().String()) }
func (e *Ext) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) { fmt.Printf("Visit: %s\n", ctx.URL()) return nil, true }