Golang Page.IsSucc Examples

Programming Language: Golang

Namespace/Package Name: github.com/aosen/robot

Class/Type: Page

Method/Function: IsSucc

Examples at hotexamples.com: 9

Golang Page.IsSucc - 9 examples found. These are the top-rated real-world Golang examples of github.com/aosen/robot.Page.IsSucc extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

GetHtmlParser(10)

IsSucc(9)

GetRequest(9)

AddTargetRequest(6)

Errormsg(6)

AddField(5)

SetStatus(4)

SetBodyStr(3)

AddTargetRequests(2)

SetSkip(2)

GetBodyStr(1)

AddTargetRequestWithParams(1)

GetUrlTag(1)

AddTargetRequestWithHeaderFile(1)

SetCookies(1)

SetHeader(1)

GetJson(1)

Example #1

Show file

File: github.go Project: aosen/spiders

// Process inspects a downloaded page and records what should be kept.
// The HTML is queried through goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
//
// When the page did not download successfully its error message is printed
// and nothing else happens. Otherwise every repository link in the listing is
// queued as a new "html" request, and the author / project names found in the
// page title are stored as fields on the page.
func (this *MyPageProcesser) Process(p *robot.Page) {
	if ok := p.IsSucc(); !ok {
		println(p.Errormsg())
		return
	}

	doc := p.GetHtmlParser()

	// Gather the repository links; they are handed back to the crawler so
	// other coroutines can fetch them.
	var links []string
	collect := func(_ int, sel *goquery.Selection) {
		path, _ := sel.Attr("href")
		links = append(links, "http://github.com/"+path)
	}
	doc.Find("h3[class='repo-list-name'] a").Each(collect)
	p.AddTargetRequests(links, "html")

	// Characters stripped from both ends of the extracted texts.
	const blanks = " \t\n"
	author := strings.Trim(doc.Find(".entry-title .author").Text(), blanks)
	project := strings.Trim(doc.Find(".entry-title .js-current-repository").Text(), blanks)
	//readme, _ := doc.Find("#readme").Html()

	// A page without an author name is flagged via SetSkip; the fields are
	// still added afterwards.
	if author == "" {
		p.SetSkip(true)
	}

	// The entity handed on to the Pipeline.
	p.AddField("author", author)
	p.AddField("project", project)
	//p.AddField("readme", readme)
}

Example #2

Show file

File: httpdownloader.go Project: aosen/robot

// downloadJson fetches the request through downloadFile and parses the
// response body as JSON.
//
// If the download itself was not successful the page is returned untouched.
// For requests whose response type is "jsonp" the body is first converted
// with goutils.JsonpToJson. A parse failure is logged together with the body
// and recorded on the page via SetStatus(true, msg); on success the body
// string and the parsed JSON are stored and the status is set to
// (false, "").
func (self *HttpDownloader) downloadJson(p *robot.Page, req *robot.Request) *robot.Page {
	p, raw := self.downloadFile(p, req)
	if !p.IsSucc() {
		return p
	}

	data := []byte(raw)
	if req.GetResponceType() == "jsonp" {
		data = []byte(goutils.JsonpToJson(raw))
	}

	parsed, err := simplejson.NewJson(data)
	if err != nil {
		mlog.LogInst().LogError(string(data) + "\t" + err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	// JSON result.
	p.SetBodyStr(string(data)).SetJson(parsed).SetStatus(false, "")
	return p
}

Example #3

Show file

File: httpdownloader.go Project: aosen/robot

// downloadHtml fetches the request through downloadFile and parses the
// response body into a goquery document.
//
// If the download itself was not successful the page is returned untouched.
// Errors from building the document or rendering it back to HTML are logged
// and recorded on the page via SetStatus(true, msg). On success the rendered
// HTML and the document are stored and the status is set to (false, "").
func (self *HttpDownloader) downloadHtml(p *robot.Page, req *robot.Request) *robot.Page {
	p, raw := self.downloadFile(p, req)
	if !p.IsSucc() {
		return p
	}

	// fail logs err, records it on the page and yields the page so callers
	// can return it directly.
	fail := func(err error) *robot.Page {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(raw)))
	if err != nil {
		return fail(err)
	}

	markup, err := doc.Html()
	if err != nil {
		return fail(err)
	}

	p.SetBodyStr(markup).SetHtmlParser(doc).SetStatus(false, "")
	return p
}

Example #4

Show file

File: weixin.go Project: aosen/spiders

// Process parses the HTML DOM of a downloaded page and records the results on
// the Page. Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery)
// is used to parse the HTML.
//
// When the page did not download successfully its error message is printed
// and nothing else happens. Otherwise, for every matching result item the
// name and href are printed and stored as the "name" and "href" fields. If a
// "next page" link exists it is queued as a new "html" request using the
// weixin.sogou.com.json header file; if not, the page is flagged via SetSkip.
func (this *MyPageProcesser) Process(p *robot.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()

	query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) {
		name := s.Find("div.txt-box > h3").Text()
		href, _ := s.Attr("href")

		// The scheme used to be written twice ("http://http://..."), which
		// made the printed link unusable.
		fmt.Printf("WeName:%v link:http://weixin.sogou.com%v \r\n", name, href)
		// The entity we want to save by Pipeline.
		p.AddField("name", name)
		p.AddField("href", href)
	})

	nextPageHref, _ := query.Find("#sogou_next").Attr("href")
	if nextPageHref == "" {
		// No further result page to follow.
		p.SetSkip(true)
		return
	}
	p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+nextPageHref, "html", "weixin.sogou.com.json")
}

Example #5

Show file

File: process.go Project: aosen/robot

// Process dispatches a downloaded page to the parse method named by the
// "handler" entry of the request's meta map.
//
// When the page did not download successfully its error message is logged
// and nothing else happens. Pages whose meta is not a map[string]interface{},
// or whose meta has no (or an unrecognised) "handler" entry, are left
// untouched.
func (self *Www79xsComProcessor) Process(p *robot.Page) {
	// Check whether the page was fetched successfully.
	if !p.IsSucc() {
		log.Println(p.Errormsg())
		return
	}

	// Use the comma-ok form: an unchecked assertion panics when the request
	// carries no meta at all or meta of another type.
	meta, ok := p.GetRequest().GetMeta().(map[string]interface{})
	if !ok {
		return
	}

	// A missing "handler" key yields a nil interface value, which matches
	// none of the cases below, so such pages fall through without any work.
	switch meta["handler"] {
	case "mainParse":
		self.mainParse(p)
	case "urlListParse":
		self.urlListParse(p)
	case "classParse":
		self.classParse(p)
	case "introParse":
		self.introParse(p)
	case "chaperParse":
		self.chaperParse(p)
	case "contentParse":
		self.contentParse(p)
	}
}

Example #6

Show file

File: httpdownloader.go Project: aosen/robot

// downloadText fetches the request through downloadFile and, when the
// download succeeded, stores the body string on the page and sets its status
// to (false, ""). The page returned by downloadFile is returned in either
// case.
func (self *HttpDownloader) downloadText(p *robot.Page, req *robot.Request) *robot.Page {
	page, text := self.downloadFile(p, req)
	if page.IsSucc() {
		page.SetBodyStr(text).SetStatus(false, "")
	}
	return page
}

Example #7

Show file

File: sinajson.go Project: aosen/spiders

// Process parses the JSON body of a downloaded page and records the news
// items we want to crawl. Package simplejson
// (https://github.com/bitly/go-simplejson) is used to read the JSON data.
//
// When the page did not download successfully its error message is printed
// and nothing else happens. The function panics (via log.Panicf) when the
// result status code is missing or non-zero. Items are read in order until
// one whose id is not greater than this.startNewsId is met; each accepted
// item contributes "<id>_id", "<id>_content" and "<id>_time" fields. Finally
// the next poll request is queued, starting from the newest id seen.
func (this *MyPageProcesser) Process(p *robot.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	const (
		urlPrefix = "http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="
		urlSuffix = "&pagesize=10&dire=f"
	)

	query := p.GetJson()
	status, err := query.GetPath("result", "status", "code").Int()
	if status != 0 || err != nil {
		// err is formatted with %v rather than err.Error(): err is nil when
		// only the status code is non-zero, and calling Error() on it would
		// replace this message with a nil-pointer panic.
		log.Panicf("page is crawled error : errorinfo=%v : status=%d : startNewsId=%d", err, status, this.startNewsId)
	}
	num, err := query.GetPath("result", "pageStr", "pageSize").Int()
	if num == 0 || err != nil {
		// Nothing to read: poll again from the current position.
		p.AddTargetRequest(urlPrefix+strconv.Itoa(this.startNewsId)+urlSuffix, "json")
		return
	}

	var nextid int
	var nextidstr string
	items := query.Get("result").Get("data")
	for i := 0; i < num; i++ {
		item := items.GetIndex(i)

		id, err := item.Get("id").String()
		if id == "" || err != nil {
			continue
		}
		idint, err := strconv.Atoi(id)
		if err != nil {
			continue
		}
		// Stop at the first item that is not newer than the last one seen.
		if idint <= this.startNewsId {
			break
		}
		// The first entry of the list becomes the starting point of the
		// next poll.
		if i == 0 {
			nextid = idint
			nextidstr = id
		}
		content, err := item.Get("content").String()
		if content == "" || err != nil {
			continue
		}
		createdAt, err := item.Get("created_at").String()
		if err != nil {
			continue
		}

		p.AddField(id+"_id", id)
		p.AddField(id+"_content", content)
		p.AddField(id+"_time", createdAt)
	}

	// Add url of next crawl. If no newer first item was found, keep the
	// current position instead of resetting startNewsId to 0 and requesting
	// an empty id.
	if nextidstr == "" {
		nextidstr = strconv.Itoa(this.startNewsId)
	} else {
		this.startNewsId = nextid
	}
	p.AddTargetRequest(urlPrefix+nextidstr+urlSuffix, "json")
	//println(p.GetTargetRequests())
}

Example #8

Show file

File: process.go Project: kjfcpua/robot

// Process hands a downloaded page to the callback attached to its request.
//
// When the page did not download successfully its error message is logged
// and nothing else happens. A request without a callback is treated as the
// entry page and goes to mainParse.
func (self *Www79xsComProcessor) Process(p *robot.Page) {
	// Check whether the page was fetched successfully.
	if ok := p.IsSucc(); !ok {
		mlog.LogInst().LogError(p.Errormsg())
		return
	}

	// A request that carries a callback is dispatched to it directly.
	if cb := p.GetRequest().GetCallBack(); cb != nil {
		cb(p)
		return
	}

	// No callback: this is the entry page.
	self.mainParse(p)
}

Example #9

Show file

File: mgo.go Project: aosen/robot

// skippedHref matches link targets that are never queued: javascript:
// pseudo-URLs, in-page fragments ("#...") and mailto: addresses. It is
// compiled once instead of once per anchor.
var skippedHref = regexp.MustCompile(`^(?:javascript:|#|mailto:)`)

// Process collects the links of a downloaded page and queues them for
// crawling.
//
// When the page did not download successfully its error message is logged
// and nothing else happens. Pages whose URL cannot be parsed, or whose host
// does not end in "jiexieyin.org", are ignored. Every remaining anchor is
// turned into an absolute URL and passed, through initrequests, to
// AddTargetRequests; the page URL is stored in the "test1" and "test2"
// fields.
func (self *MyProcessor) Process(p *robot.Page) {
	if !p.IsSucc() {
		mlog.LogInst().LogError(p.Errormsg())
		return
	}

	pageURL := p.GetRequest().GetUrl()
	base, err := url.Parse(pageURL)
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		return
	}
	if !strings.HasSuffix(base.Host, "jiexieyin.org") {
		return
	}

	var urls []string
	p.GetHtmlParser().Find("a").Each(func(i int, s *goquery.Selection) {
		// Anchors without an href have nothing to follow.
		href, ok := s.Attr("href")
		if !ok || skippedHref.MatchString(href) {
			return
		}

		ref, err := url.Parse(href)
		if err != nil {
			mlog.LogInst().LogError(err.Error())
			return
		}
		if ref.IsAbs() {
			urls = append(urls, href)
			return
		}
		// Resolve relative links against the page URL. Plain string
		// concatenation (pageURL + href) yields broken URLs for anything
		// but a bare directory prefix.
		urls = append(urls, base.ResolveReference(ref).String())
	})

	p.AddTargetRequests(initrequests(urls))
	p.AddField("test1", pageURL)
	p.AddField("test2", pageURL)
}