Exemplos de Page em Golang

Linguagem de programação: Golang

Espaço para nome / nome do pacote: github.com/aosen/robot

Classe / Tipo: Page

Exemplos em hotexamples.com: 19

Page em Golang - 19 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de github.com/aosen/robot.Page em Golang extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Métodos Frequentes

Exibir Ocultar

GetHtmlParser(10)

IsSucc(9)

GetRequest(9)

AddTargetRequest(6)

Errormsg(6)

AddField(5)

SetStatus(4)

SetBodyStr(3)

AddTargetRequests(2)

SetSkip(2)

GetBodyStr(1)

AddTargetRequestWithParams(1)

GetUrlTag(1)

AddTargetRequestWithHeaderFile(1)

SetCookies(1)

SetHeader(1)

GetJson(1)

Métodos Frequentes

GetHtmlParser (10)

IsSucc (9)

GetRequest (9)

AddTargetRequest (6)

Errormsg (6)

AddField (5)

SetStatus (4)

SetBodyStr (3)

AddTargetRequests (2)

SetSkip (2)

Métodos Frequentes

GetBodyStr (1)

AddTargetRequestWithParams (1)

GetUrlTag (1)

AddTargetRequestWithHeaderFile (1)

SetCookies (1)

SetHeader (1)

GetJson (1)

Relacionados

NewLexOptPattern

Choose

HashNameAndPKI

All

NewInformer

NewEncrypter

LoadExprFromString

HasGoString

TrimNamed

SendRecv

Relacionados em outras linguagens

WaperAlipay (PHP)

BDC_CaptchaBase (PHP)

ModelDescriptionGenerator (C#)

ImpulseFormDataStreamProvider (C#)

rb_ary_new2 (C++)

tr_new0 (C++)

TransportRequest (Java)

AbstractFullBox (Java)

getMatrix (Python)

Ui_MainWindow (Python)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: httpdownloader.go Projeto: aosen/robot

// choose http GET/method to download func connectByHttp(p *robot.Page, req *robot.Request) (*http.Response, error) { client := &http.Client{} httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata())) if header := req.GetHeader(); header != nil { httpreq.Header = req.GetHeader() } if cookies := req.GetCookies(); cookies != nil { for i := range cookies { httpreq.AddCookie(cookies[i]) } } var resp *http.Response if resp, err = client.Do(httpreq); err != nil { if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" { // normal } else { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) //fmt.Printf("client do error %v \r\n", err) return nil, err } } return resp, nil }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: httpdownloader.go Projeto: aosen/robot

func (self *HttpDownloader) downloadText(p *robot.Page, req *robot.Request) *robot.Page { p, destbody := self.downloadFile(p, req) if !p.IsSucc() { return p } p.SetBodyStr(destbody).SetStatus(false, "") return p }

Exemplo n.º 3

0

Exibir arquivo

Arquivo: httpdownloader.go Projeto: aosen/robot

func (self *HttpDownloader) downloadHtml(p *robot.Page, req *robot.Request) *robot.Page { var err error p, destbody := self.downloadFile(p, req) //fmt.Printf("Destbody %v \r\n", destbody) if !p.IsSucc() { //fmt.Print("Page error \r\n") return p } bodyReader := bytes.NewReader([]byte(destbody)) var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) return p } var body string if body, err = doc.Html(); err != nil { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) return p } p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") return p }

Exemplo n.º 4

0

Exibir arquivo

Arquivo: process.go Projeto: aosen/robot

//小说内容解析 func (self *Www79xsComProcessor) contentParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]interface{}) //开始解析页面 query := p.GetHtmlParser() html, _ := query.Find(".contentbox").Html() meta["content"] = strings.Replace(strings.Replace(html, "<br/><br/>", "\n", -1), "<br/>", "\n", -1) p.AddField("code", "0") for k, v := range meta { p.AddField(k, v.(string)) } }

Exemplo n.º 5

0

Exibir arquivo

Arquivo: httpdownloader.go Projeto: aosen/robot

func (self *HttpDownloader) downloadJson(p *robot.Page, req *robot.Request) *robot.Page { var err error p, destbody := self.downloadFile(p, req) if !p.IsSucc() { return p } var body []byte body = []byte(destbody) mtype := req.GetResponceType() if mtype == "jsonp" { tmpstr := goutils.JsonpToJson(destbody) body = []byte(tmpstr) } var r *simplejson.Json if r, err = simplejson.NewJson(body); err != nil { mlog.LogInst().LogError(string(body) + "\t" + err.Error()) p.SetStatus(true, err.Error()) return p } // json result p.SetBodyStr(string(body)).SetJson(r).SetStatus(false, "") return p }

Exemplo n.º 6

0

Exibir arquivo

Arquivo: process.go Projeto: aosen/robot

func (self *Www79xsComProcessor) Process(p *robot.Page) { //判断页面是否抓取成功 if !p.IsSucc() { log.Println(p.Errormsg()) return } meta := p.GetRequest().GetMeta() handler, ok := meta.(map[string]interface{})["handler"] //如果meta中没有handler处理方法，则说明是入口页面，否则直接执行对应callback if ok { switch handler { case "mainParse": self.mainParse(p) case "urlListParse": self.urlListParse(p) case "classParse": self.classParse(p) case "introParse": self.introParse(p) case "chaperParse": self.chaperParse(p) case "contentParse": self.contentParse(p) default: return } } }

Exemplo n.º 7

0

Exibir arquivo

Arquivo: process.go Projeto: kjfcpua/robot

//获取分类页面的url list，并解析 func (self *Www79xsComProcessor) urlListParse(p *robot.Page) { meta := p.GetRequest().GetMeta() //开始解析页面 query := p.GetHtmlParser() //获取尾页addr lastaddr, ok := query.Find("tbody a").Last().Attr("href") if ok { //解析addr kv := goutils.GetKVInRelaPath(lastaddr) //url拼接 maxpage, _ := strconv.Atoi(kv["page"]) for i := 1; i <= maxpage; i++ { page := strconv.Itoa(i) p.AddTargetRequest(utils.InitRequest( "http://www.79xs.com/Book/ShowBookList.aspx?tclassid="+kv["tclassid"]+"&page="+page, meta.(map[string]string), self.classParse)) } } else { p.AddTargetRequest(utils.InitRequest(p.GetRequest().GetUrl(), meta.(map[string]string), self.classParse)) } }

Exemplo n.º 8

0

Exibir arquivo

Arquivo: sohujson.go Projeto: aosen/spiders

func (this MyPageProcesser) Process(p *robot.Page) { query := p.GetHtmlParser() if p.GetUrlTag() == "index" { query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) { url, isExsit := s.Attr("href") if isExsit { reg := regexp.MustCompile(`^do not know what is this`) var fmtStr string if rxYule.MatchString(url) { reg = rxYule fmtStr = wkSohuYule } if rxPic.MatchString(url) { reg = rxPic fmtStr = wkSohuPic } regxpArrag := reg.FindStringSubmatch(url) if len(regxpArrag) == 2 { addRequest(p, "changyan", fmt.Sprintf(fmtStr, regxpArrag[1]), "", s.Text()) } } }) } if p.GetUrlTag() == "changyan" { jsonMap := ChangyanJson{} err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap) if err == nil { content, ok := p.GetRequest().GetMeta().(string) if ok { fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum) } } } }

Exemplo n.º 9

0

Exibir arquivo

Arquivo: process.go Projeto: kjfcpua/robot

//小说章节解析 func (self *Www79xsComProcessor) chaperParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() query.Find(".insert_list li").Each(func(i int, s *goquery.Selection) { tmp := utils.MapCopy(meta) tmp["chapter"] = strconv.Itoa(i) tmp["subtitle"] = s.Find("strong a").Text() addr, _ := s.Find("strong a").Attr("href") tmp["contenturl"] = p.GetRequest().GetBaseUrl() + addr //检测contenturl, 如果数据库中存在，则跳过本次抓取，如果不存在则将url加入调度队列 //这个需求有时间再做 if len(tmp["subtitle"]) != 0 { p.AddTargetRequest(utils.InitRequest(tmp["contenturl"], tmp, self.contentParse)) } }) }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: process.go Projeto: kjfcpua/robot

//主页解析 func (self *Www79xsComProcessor) mainParse(p *robot.Page) { //开始解析页面 query := p.GetHtmlParser() query.Find(".subnav ul li a").Each(func(i int, s *goquery.Selection) { addr, _ := s.Attr("href") if addr == utils.GirlUrl { p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+addr, map[string]string{"first": utils.GIRL}, self.urlListParse)) } else { p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+addr, map[string]string{"first": utils.BOY}, self.urlListParse)) } }) }

Exemplo n.º 11

0

Exibir arquivo

Arquivo: process.go Projeto: kjfcpua/robot

//解析小说详情页 func (self *Www79xsComProcessor) introParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() intro := query.Find("#info h3 p").Eq(1).Text() img, _ := query.Find(".img img").Attr("src") // 小说章节列表地址 chaptersource, _ := query.Find(".b1 a").Attr("href") tmp := utils.MapCopy(meta) tmp["introduction"] = intro tmp["img"] = utils.BaseUrl + img tmp["chaptersource"] = utils.BaseUrl + chaptersource p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+chaptersource, tmp, self.chaperParse)) }

Exemplo n.º 12

0

Exibir arquivo

Arquivo: process.go Projeto: kjfcpua/robot

func (self *Www79xsComProcessor) Process(p *robot.Page) { //判断页面是否抓取成功 if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } //如果callback为空，则说明是入口页面，否则直接执行对应callback callback := p.GetRequest().GetCallBack() if callback == nil { self.mainParse(p) } else { callback(p) } }

Exemplo n.º 13

0

Exibir arquivo

Arquivo: process.go Projeto: kjfcpua/robot

//分类列表解析 func (self *Www79xsComProcessor) classParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() query.Find("div .yl_nr_lt2 ul").Each(func(i int, s *goquery.Selection) { //获取二级分类, 小说标题，作者 second := s.Find(".ynl2 a").Text() title := s.Find(".ynl3 a").Eq(1).Text() author := s.Find(".ynl6 a").Text() novelsource := utils.BaseUrl + func() string { addr, _ := s.Find(".ynl3 a").Eq(1).Attr("href") return addr }() tmp := make(map[string]string) tmp["first"] = meta["first"] tmp["second"] = second tmp["title"] = title tmp["author"] = author tmp["novelsource"] = novelsource p.AddTargetRequest(utils.InitRequest(novelsource, tmp, self.introParse)) }) }

Exemplo n.º 14

0

Exibir arquivo

Arquivo: httpdownloader.go Projeto: aosen/robot

// Download file and change the charset of page charset. func (self *HttpDownloader) downloadFile(p *robot.Page, req *robot.Request) (*robot.Page, string) { var err error var urlstr string if urlstr = req.GetUrl(); len(urlstr) == 0 { mlog.LogInst().LogError("url is empty") p.SetStatus(true, "url is empty") return p, "" } var resp *http.Response if proxystr := req.GetProxyHost(); len(proxystr) != 0 { //using http proxy //fmt.Print("HttpProxy Enter ",proxystr,"\n") resp, err = connectByHttpProxy(p, req) } else { //normal http download //fmt.Print("Http Normal Enter \n",proxystr,"\n") resp, err = connectByHttp(p, req) } if err != nil { return p, "" } p.SetHeader(resp.Header) p.SetCookies(resp.Cookies()) // get converter to utf-8 var bodyStr string if resp.Header.Get("Content-Encoding") == "gzip" { bodyStr = self.changeCharsetEncodingAutoGzipSupport(resp.Header.Get("Content-Type"), resp.Body) } else { bodyStr = self.changeCharsetEncodingAuto(resp.Header.Get("Content-Type"), resp.Body) } //fmt.Printf("utf-8 body %v \r\n", bodyStr) defer resp.Body.Close() return p, bodyStr }

Exemplo n.º 15

0

Exibir arquivo

Arquivo: github.go Projeto: aosen/spiders

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }

Exemplo n.º 16

0

Exibir arquivo

Arquivo: sinajson.go Projeto: aosen/spiders

// Parse html dom here and record the parse result that we want to crawl. // Package simplejson (https://github.com/bitly/go-simplejson) is used to parse data of json. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetJson() status, err := query.GetPath("result", "status", "code").Int() if status != 0 || err != nil { log.Panicf("page is crawled error : errorinfo=%s : status=%d : startNewsId=%d", err.Error(), status, this.startNewsId) } num, err := query.GetPath("result", "pageStr", "pageSize").Int() if num == 0 || err != nil { // Add url of next crawl startIdstr := strconv.Itoa(this.startNewsId) p.AddTargetRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="+startIdstr+"&pagesize=10&dire=f", "json") return } var idint, nextid int var nextidstr string query = query.Get("result").Get("data") for i := 0; i < num; i++ { id, err := query.GetIndex(i).Get("id").String() if id == "" || err != nil { continue } idint, err = strconv.Atoi(id) if err != nil { continue } if idint <= this.startNewsId { break } if i == 0 { nextid = idint nextidstr = id } content, err := query.GetIndex(i).Get("content").String() if content == "" || err != nil { continue } time, err := query.GetIndex(i).Get("created_at").String() if err != nil { continue } p.AddField(id+"_id", id) p.AddField(id+"_content", content) p.AddField(id+"_time", time) } // Add url of next crawl this.startNewsId = nextid p.AddTargetRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="+nextidstr+"&pagesize=10&dire=f", "json") //println(p.GetTargetRequests()) }

Exemplo n.º 17

0

Exibir arquivo

Arquivo: weixin.go Projeto: aosen/spiders

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) { name := s.Find("div.txt-box > h3").Text() href, _ := s.Attr("href") fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href) // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("href", href) }) next_page_href, _ := query.Find("#sogou_next").Attr("href") if next_page_href == "" { p.SetSkip(true) } else { p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json") } }

Exemplo n.º 18

0

Exibir arquivo

Arquivo: mgo.go Projeto: aosen/robot

func (self *MyProcessor) Process(p *robot.Page) { if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } u, err := url.Parse(p.GetRequest().GetUrl()) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !strings.HasSuffix(u.Host, "jiexieyin.org") { return } var urls []string query := p.GetHtmlParser() query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") reJavascript := regexp.MustCompile("^javascript\\:") reLocal := regexp.MustCompile("^\\#") reMailto := regexp.MustCompile("^mailto\\:") if reJavascript.MatchString(href) || reLocal.MatchString(href) || reMailto.MatchString(href) { return } //处理相对路径 var absHref string urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { urlPrefix := p.GetRequest().GetUrl() absHref = urlPrefix + href urls = append(urls, absHref) } else { urls = append(urls, href) } }) p.AddTargetRequests(initrequests(urls)) p.AddField("test1", p.GetRequest().GetUrl()) p.AddField("test2", p.GetRequest().GetUrl()) }

Exemplo n.º 19

0

Exibir arquivo

Arquivo: sohujson.go Projeto: aosen/spiders

func addRequest(p *robot.Page, tag, url, cookie, content string) { req := robot.NewRequest(url, "json", tag, "GET", "", nil, nil, nil, content) p.AddTargetRequestWithParams(req) }