Golang Page Exemples

Langage de programmation: Golang

Espace de noms/Paquet : github.com/aosen/robot

Class/Type: Page

Exemples sur hotexamples.com : 19

Golang Page - 19 exemples trouvés. Ce sont les exemples réels les mieux notés de github.com/aosen/robot.Page extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

GetHtmlParser(10)

IsSucc(9)

GetRequest(9)

AddTargetRequest(6)

Errormsg(6)

AddField(5)

SetStatus(4)

SetBodyStr(3)

AddTargetRequests(2)

SetSkip(2)

GetBodyStr(1)

AddTargetRequestWithParams(1)

GetUrlTag(1)

AddTargetRequestWithHeaderFile(1)

SetCookies(1)

SetHeader(1)

GetJson(1)

Méthodes fréquemment utilisées

GetHtmlParser (10)

IsSucc (9)

GetRequest (9)

AddTargetRequest (6)

Errormsg (6)

AddField (5)

SetStatus (4)

SetBodyStr (3)

AddTargetRequests (2)

SetSkip (2)

Méthodes fréquemment utilisées

GetBodyStr (1)

AddTargetRequestWithParams (1)

GetUrlTag (1)

AddTargetRequestWithHeaderFile (1)

SetCookies (1)

SetHeader (1)

GetJson (1)

Associées

NewLexOptPattern

Choose

HashNameAndPKI

All

NewInformer

NewEncrypter

LoadExprFromString

HasGoString

TrimNamed

SendRecv

Associés dans d'autres langages

WaperAlipay (PHP)

BDC_CaptchaBase (PHP)

ModelDescriptionGenerator (C#)

ImpulseFormDataStreamProvider (C#)

rb_ary_new2 (C++)

tr_new0 (C++)

TransportRequest (Java)

AbstractFullBox (Java)

getMatrix (Python)

Ui_MainWindow (Python)

Exemple #1

0

Afficher le fichier

Fichier : httpdownloader.go Projet : aosen/robot

// choose http GET/method to download func connectByHttp(p *robot.Page, req *robot.Request) (*http.Response, error) { client := &http.Client{} httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata())) if header := req.GetHeader(); header != nil { httpreq.Header = req.GetHeader() } if cookies := req.GetCookies(); cookies != nil { for i := range cookies { httpreq.AddCookie(cookies[i]) } } var resp *http.Response if resp, err = client.Do(httpreq); err != nil { if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" { // normal } else { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) //fmt.Printf("client do error %v \r\n", err) return nil, err } } return resp, nil }

Exemple #2

0

Afficher le fichier

Fichier : httpdownloader.go Projet : aosen/robot

func (self *HttpDownloader) downloadText(p *robot.Page, req *robot.Request) *robot.Page { p, destbody := self.downloadFile(p, req) if !p.IsSucc() { return p } p.SetBodyStr(destbody).SetStatus(false, "") return p }

Exemple #3

0

Afficher le fichier

Fichier : httpdownloader.go Projet : aosen/robot

func (self *HttpDownloader) downloadHtml(p *robot.Page, req *robot.Request) *robot.Page { var err error p, destbody := self.downloadFile(p, req) //fmt.Printf("Destbody %v \r\n", destbody) if !p.IsSucc() { //fmt.Print("Page error \r\n") return p } bodyReader := bytes.NewReader([]byte(destbody)) var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) return p } var body string if body, err = doc.Html(); err != nil { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) return p } p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") return p }

Exemple #4

0

Afficher le fichier

Fichier : process.go Projet : aosen/robot

//小说内容解析 func (self *Www79xsComProcessor) contentParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]interface{}) //开始解析页面 query := p.GetHtmlParser() html, _ := query.Find(".contentbox").Html() meta["content"] = strings.Replace(strings.Replace(html, "<br/><br/>", "\n", -1), "<br/>", "\n", -1) p.AddField("code", "0") for k, v := range meta { p.AddField(k, v.(string)) } }

Exemple #5

0

Afficher le fichier

Fichier : httpdownloader.go Projet : aosen/robot

func (self *HttpDownloader) downloadJson(p *robot.Page, req *robot.Request) *robot.Page { var err error p, destbody := self.downloadFile(p, req) if !p.IsSucc() { return p } var body []byte body = []byte(destbody) mtype := req.GetResponceType() if mtype == "jsonp" { tmpstr := goutils.JsonpToJson(destbody) body = []byte(tmpstr) } var r *simplejson.Json if r, err = simplejson.NewJson(body); err != nil { mlog.LogInst().LogError(string(body) + "\t" + err.Error()) p.SetStatus(true, err.Error()) return p } // json result p.SetBodyStr(string(body)).SetJson(r).SetStatus(false, "") return p }

Exemple #6

0

Afficher le fichier

Fichier : process.go Projet : aosen/robot

func (self *Www79xsComProcessor) Process(p *robot.Page) { //判断页面是否抓取成功 if !p.IsSucc() { log.Println(p.Errormsg()) return } meta := p.GetRequest().GetMeta() handler, ok := meta.(map[string]interface{})["handler"] //如果meta中没有handler处理方法，则说明是入口页面，否则直接执行对应callback if ok { switch handler { case "mainParse": self.mainParse(p) case "urlListParse": self.urlListParse(p) case "classParse": self.classParse(p) case "introParse": self.introParse(p) case "chaperParse": self.chaperParse(p) case "contentParse": self.contentParse(p) default: return } } }

Exemple #7

0

Afficher le fichier

Fichier : process.go Projet : kjfcpua/robot

//获取分类页面的url list，并解析 func (self *Www79xsComProcessor) urlListParse(p *robot.Page) { meta := p.GetRequest().GetMeta() //开始解析页面 query := p.GetHtmlParser() //获取尾页addr lastaddr, ok := query.Find("tbody a").Last().Attr("href") if ok { //解析addr kv := goutils.GetKVInRelaPath(lastaddr) //url拼接 maxpage, _ := strconv.Atoi(kv["page"]) for i := 1; i <= maxpage; i++ { page := strconv.Itoa(i) p.AddTargetRequest(utils.InitRequest( "http://www.79xs.com/Book/ShowBookList.aspx?tclassid="+kv["tclassid"]+"&page="+page, meta.(map[string]string), self.classParse)) } } else { p.AddTargetRequest(utils.InitRequest(p.GetRequest().GetUrl(), meta.(map[string]string), self.classParse)) } }

Exemple #8

0

Afficher le fichier

Fichier : sohujson.go Projet : aosen/spiders

func (this MyPageProcesser) Process(p *robot.Page) { query := p.GetHtmlParser() if p.GetUrlTag() == "index" { query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) { url, isExsit := s.Attr("href") if isExsit { reg := regexp.MustCompile(`^do not know what is this`) var fmtStr string if rxYule.MatchString(url) { reg = rxYule fmtStr = wkSohuYule } if rxPic.MatchString(url) { reg = rxPic fmtStr = wkSohuPic } regxpArrag := reg.FindStringSubmatch(url) if len(regxpArrag) == 2 { addRequest(p, "changyan", fmt.Sprintf(fmtStr, regxpArrag[1]), "", s.Text()) } } }) } if p.GetUrlTag() == "changyan" { jsonMap := ChangyanJson{} err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap) if err == nil { content, ok := p.GetRequest().GetMeta().(string) if ok { fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum) } } } }

Exemple #9

0

Afficher le fichier

Fichier : process.go Projet : kjfcpua/robot

//小说章节解析 func (self *Www79xsComProcessor) chaperParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() query.Find(".insert_list li").Each(func(i int, s *goquery.Selection) { tmp := utils.MapCopy(meta) tmp["chapter"] = strconv.Itoa(i) tmp["subtitle"] = s.Find("strong a").Text() addr, _ := s.Find("strong a").Attr("href") tmp["contenturl"] = p.GetRequest().GetBaseUrl() + addr //检测contenturl, 如果数据库中存在，则跳过本次抓取，如果不存在则将url加入调度队列 //这个需求有时间再做 if len(tmp["subtitle"]) != 0 { p.AddTargetRequest(utils.InitRequest(tmp["contenturl"], tmp, self.contentParse)) } }) }

Exemple #10

0

Afficher le fichier

Fichier : process.go Projet : kjfcpua/robot

//主页解析 func (self *Www79xsComProcessor) mainParse(p *robot.Page) { //开始解析页面 query := p.GetHtmlParser() query.Find(".subnav ul li a").Each(func(i int, s *goquery.Selection) { addr, _ := s.Attr("href") if addr == utils.GirlUrl { p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+addr, map[string]string{"first": utils.GIRL}, self.urlListParse)) } else { p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+addr, map[string]string{"first": utils.BOY}, self.urlListParse)) } }) }

Exemple #11

0

Afficher le fichier

Fichier : process.go Projet : kjfcpua/robot

//解析小说详情页 func (self *Www79xsComProcessor) introParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() intro := query.Find("#info h3 p").Eq(1).Text() img, _ := query.Find(".img img").Attr("src") // 小说章节列表地址 chaptersource, _ := query.Find(".b1 a").Attr("href") tmp := utils.MapCopy(meta) tmp["introduction"] = intro tmp["img"] = utils.BaseUrl + img tmp["chaptersource"] = utils.BaseUrl + chaptersource p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+chaptersource, tmp, self.chaperParse)) }

Exemple #12

0

Afficher le fichier

Fichier : process.go Projet : kjfcpua/robot

func (self *Www79xsComProcessor) Process(p *robot.Page) { //判断页面是否抓取成功 if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } //如果callback为空，则说明是入口页面，否则直接执行对应callback callback := p.GetRequest().GetCallBack() if callback == nil { self.mainParse(p) } else { callback(p) } }

Exemple #13

0

Afficher le fichier

Fichier : process.go Projet : kjfcpua/robot

//分类列表解析 func (self *Www79xsComProcessor) classParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() query.Find("div .yl_nr_lt2 ul").Each(func(i int, s *goquery.Selection) { //获取二级分类, 小说标题，作者 second := s.Find(".ynl2 a").Text() title := s.Find(".ynl3 a").Eq(1).Text() author := s.Find(".ynl6 a").Text() novelsource := utils.BaseUrl + func() string { addr, _ := s.Find(".ynl3 a").Eq(1).Attr("href") return addr }() tmp := make(map[string]string) tmp["first"] = meta["first"] tmp["second"] = second tmp["title"] = title tmp["author"] = author tmp["novelsource"] = novelsource p.AddTargetRequest(utils.InitRequest(novelsource, tmp, self.introParse)) }) }

Exemple #14

0

Afficher le fichier

Fichier : httpdownloader.go Projet : aosen/robot

// Download file and change the charset of page charset. func (self *HttpDownloader) downloadFile(p *robot.Page, req *robot.Request) (*robot.Page, string) { var err error var urlstr string if urlstr = req.GetUrl(); len(urlstr) == 0 { mlog.LogInst().LogError("url is empty") p.SetStatus(true, "url is empty") return p, "" } var resp *http.Response if proxystr := req.GetProxyHost(); len(proxystr) != 0 { //using http proxy //fmt.Print("HttpProxy Enter ",proxystr,"\n") resp, err = connectByHttpProxy(p, req) } else { //normal http download //fmt.Print("Http Normal Enter \n",proxystr,"\n") resp, err = connectByHttp(p, req) } if err != nil { return p, "" } p.SetHeader(resp.Header) p.SetCookies(resp.Cookies()) // get converter to utf-8 var bodyStr string if resp.Header.Get("Content-Encoding") == "gzip" { bodyStr = self.changeCharsetEncodingAutoGzipSupport(resp.Header.Get("Content-Type"), resp.Body) } else { bodyStr = self.changeCharsetEncodingAuto(resp.Header.Get("Content-Type"), resp.Body) } //fmt.Printf("utf-8 body %v \r\n", bodyStr) defer resp.Body.Close() return p, bodyStr }

Exemple #15

0

Afficher le fichier

Fichier : github.go Projet : aosen/spiders

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }

Exemple #16

0

Afficher le fichier

Fichier : sinajson.go Projet : aosen/spiders

// Parse html dom here and record the parse result that we want to crawl. // Package simplejson (https://github.com/bitly/go-simplejson) is used to parse data of json. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetJson() status, err := query.GetPath("result", "status", "code").Int() if status != 0 || err != nil { log.Panicf("page is crawled error : errorinfo=%s : status=%d : startNewsId=%d", err.Error(), status, this.startNewsId) } num, err := query.GetPath("result", "pageStr", "pageSize").Int() if num == 0 || err != nil { // Add url of next crawl startIdstr := strconv.Itoa(this.startNewsId) p.AddTargetRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="+startIdstr+"&pagesize=10&dire=f", "json") return } var idint, nextid int var nextidstr string query = query.Get("result").Get("data") for i := 0; i < num; i++ { id, err := query.GetIndex(i).Get("id").String() if id == "" || err != nil { continue } idint, err = strconv.Atoi(id) if err != nil { continue } if idint <= this.startNewsId { break } if i == 0 { nextid = idint nextidstr = id } content, err := query.GetIndex(i).Get("content").String() if content == "" || err != nil { continue } time, err := query.GetIndex(i).Get("created_at").String() if err != nil { continue } p.AddField(id+"_id", id) p.AddField(id+"_content", content) p.AddField(id+"_time", time) } // Add url of next crawl this.startNewsId = nextid p.AddTargetRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="+nextidstr+"&pagesize=10&dire=f", "json") //println(p.GetTargetRequests()) }

Exemple #17

0

Afficher le fichier

Fichier : weixin.go Projet : aosen/spiders

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) { name := s.Find("div.txt-box > h3").Text() href, _ := s.Attr("href") fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href) // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("href", href) }) next_page_href, _ := query.Find("#sogou_next").Attr("href") if next_page_href == "" { p.SetSkip(true) } else { p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json") } }

Exemple #18

0

Afficher le fichier

Fichier : mgo.go Projet : aosen/robot

func (self *MyProcessor) Process(p *robot.Page) { if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } u, err := url.Parse(p.GetRequest().GetUrl()) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !strings.HasSuffix(u.Host, "jiexieyin.org") { return } var urls []string query := p.GetHtmlParser() query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") reJavascript := regexp.MustCompile("^javascript\\:") reLocal := regexp.MustCompile("^\\#") reMailto := regexp.MustCompile("^mailto\\:") if reJavascript.MatchString(href) || reLocal.MatchString(href) || reMailto.MatchString(href) { return } //处理相对路径 var absHref string urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { urlPrefix := p.GetRequest().GetUrl() absHref = urlPrefix + href urls = append(urls, absHref) } else { urls = append(urls, href) } }) p.AddTargetRequests(initrequests(urls)) p.AddField("test1", p.GetRequest().GetUrl()) p.AddField("test2", p.GetRequest().GetUrl()) }

Exemple #19

0

Afficher le fichier

Fichier : sohujson.go Projet : aosen/spiders

func addRequest(p *robot.Page, tag, url, cookie, content string) { req := robot.NewRequest(url, "json", tag, "GET", "", nil, nil, nil, content) p.AddTargetRequestWithParams(req) }