Golang Page.AddField Exemples

Langage de programmation: Golang

Espace de nommage/Paquet : github.com/hu17889/go_spider/core/common/page

Class/Type: Page

Méthode/Fonction: AddField

Exemples sur hotexamples.com : 10

Golang Page.AddField - 10 exemples trouvés. Ce sont les exemples réels les mieux notés de github.com/hu17889/go_spider/core/common/page.Page.AddField extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

IsSucc(15)

GetHtmlParser(13)

Errormsg(11)

AddField(10)

AddTargetRequests(6)

SetSkip(5)

SetStatus(5)

GetBodyStr(4)

GetRequest(4)

GetUrlTag(3)

SetBodyStr(3)

GetJson(2)

AddTargetRequest(2)

AddTargetRequestWithParams(2)

SetCookies(2)

SetHeader(2)

AddTargetRequestWithHeaderFile(2)

GetCookies(1)

GetJsonMap(1)

GetSkip(1)

GetTargetRequests(1)

GetPageItems(1)

Méthodes fréquemment utilisées

IsSucc (15)

GetHtmlParser (13)

Errormsg (11)

AddField (10)

AddTargetRequests (6)

SetSkip (5)

SetStatus (5)

GetBodyStr (4)

GetRequest (4)

GetUrlTag (3)

Méthodes fréquemment utilisées

SetBodyStr (3)

GetJson (2)

AddTargetRequest (2)

AddTargetRequestWithParams (2)

SetCookies (2)

SetHeader (2)

AddTargetRequestWithHeaderFile (2)

GetCookies (1)

GetJsonMap (1)

GetSkip (1)

GetTargetRequests (1)

GetPageItems (1)

Méthodes fréquemment utilisées

GetTargetRequests (1)

GetPageItems (1)

Associées

LogRecord

gearman_client_add_server

SDL_RWFromMem

WriteMessage

Clock

NewList

Init

NewCounter

Member

FactorInt

Associées dans d'autres langages

thb_post_format_image_markup (PHP)

html_encodeTagged (PHP)

UriHeaderHandler (C#)

Rectangle (C#)

UNIV_UNLIKELY (C++)

kmap (C++)

ViewFileSystemBaseTest (Java)

Toolkit (Java)

convert_into_ids (Python)

warning (Python)

Exemple #1

0

Afficher le fichier

Fichier : main.go Projet : xujb/go_spider

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }

Exemple #2

0

Afficher le fichier

Fichier : main.go Projet : tuyuwei/test

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() var urls []string query.Find("#threadlisttableid tbody").Each(func(i int, s *goquery.Selection) { if s.HasClass("emptb") { return } href, _ := s.Find("tbody tr .icn a").Attr("href") urls = append(urls, href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") title := query.Find("#thread_subject").Text() title = strings.Trim(title, "\t\n\r") author := query.Find("#postlist div .authi").Eq(0).Text() author = strings.Trim(author, "\t\r\n") if title == "" || author == "" { p.SetSkip(true) } p.AddField("title", title) p.AddField("author", author) }

Exemple #3

0

Afficher le fichier

Fichier : processer.go Projet : liulnn/plant-spider

func (this *PlantProcesser) getName(query *goquery.Document, p *page.Page) { name := query.Find(".lemmaWgt-lemmaTitle-title").Find("h1").Text() name = strings.Trim(name, " \t\n") p.AddField("name", name) }

Exemple #4

0

Afficher le fichier

Fichier : main.go Projet : xujb/go_spider

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) { name := s.Find("div.txt-box > h3").Text() href, _ := s.Attr("href") fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href) // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("href", href) }) next_page_href, _ := query.Find("#sogou_next").Attr("href") if next_page_href == "" { p.SetSkip(true) } else { p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json") } }

Exemple #5

0

Afficher le fichier

Fichier : main.go Projet : w3hacker/go_spider

// Parse html dom here and record the parse result that we want to crawl. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() name := query.Find(".lemmaTitleH1").Text() name = strings.Trim(name, " \t\n") summary := query.Find(".card-summary-content .para").Text() summary = strings.Trim(summary, " \t\n") // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("summary", summary) }

Exemple #6

0

Afficher le fichier

Fichier : main.go Projet : CrocdileChan/go_spider

// Parse html dom here and record the parse result that we want to crawl. // Package simplejson (https://github.com/bitly/go-simplejson) is used to parse data of json. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetJson() status, err := query.GetPath("result", "status", "code").Int() if status != 0 || err != nil { log.Panicf("page is crawled error : errorinfo=%s : status=%d : startNewsId=%d", err.Error(), status, this.startNewsId) } num, err := query.GetPath("result", "pageStr", "pageSize").Int() if num == 0 || err != nil { // Add url of next crawl startIdstr := strconv.Itoa(this.startNewsId) p.AddTargetRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="+startIdstr+"&pagesize=10&dire=f", "json") return } var idint, nextid int var nextidstr string query = query.Get("result").Get("data") for i := 0; i < num; i++ { id, err := query.GetIndex(i).Get("id").String() if id == "" || err != nil { continue } idint, err = strconv.Atoi(id) if err != nil { continue } if idint <= this.startNewsId { break } if i == 0 { nextid = idint nextidstr = id } content, err := query.GetIndex(i).Get("content").String() if content == "" || err != nil { continue } time, err := query.GetIndex(i).Get("created_at").String() if err != nil { continue } p.AddField(id+"_id", id) p.AddField(id+"_content", content) p.AddField(id+"_time", time) } // Add url of next crawl this.startNewsId = nextid p.AddTargetRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="+nextidstr+"&pagesize=10&dire=f", "json") //println(p.GetTargetRequests()) }

Exemple #7

0

Afficher le fichier

Fichier : website_crawler.go Projet : wadee/go_proj

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } var fetch_content string query := p.GetHtmlParser() content := p.GetBodyStr() reg := regexp.MustCompile(`class="([0-9a-zA-Z_-]*content[0-9a-zA-Z_-]*)"`) reg_res := reg.FindAllStringSubmatch(content, -1) class_content := make([]string, 0) for _, class := range reg_res { submatch := class[1] class_content = append(class_content, submatch) } removeDuplicate(&class_content) for _, class := range class_content { query.Find("." + class).Each(func(i int, s *goquery.Selection) { text := strings.Trim(s.Text(), " \t\n") text = strings.Replace(text, " ", "", -1) text = strings.Replace(text, "\n", "", -1) text = strings.Replace(text, "\t", "", -1) if text != "" { fetch_content = fetch_content + text } }) } if fetch_content != "" { p.AddField("content", fetch_content) } }

Exemple #8

0

Afficher le fichier

Fichier : main.go Projet : CrocdileChan/go_spider

// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if p.GetUrlTag() == "site_login" { //fmt.Printf("%v\n", p.GetCookies()) this.cookies = p.GetCookies() // AddTargetRequestWithParams Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies // 8. Http redirect function if len(this.cookies) != 0 { p.AddField("info", "get cookies success") req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil) p.AddTargetRequestWithParams(req) } else { p.AddField("info", "get cookies failed") } } else { //fmt.Printf("%v\n", p.GetBodyStr()) query := p.GetHtmlParser() pageTitle := query.Find(".page-content .page-title").Text() if len(pageTitle) != 0 { p.AddField("page_title", pageTitle) p.AddField("info", "login success") } else { p.AddField("info", "login failed") } } return if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. 
p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }

Exemple #9

0

Afficher le fichier

Fichier : processer.go Projet : liulnn/plant-spider

func (this *PlantProcesser) getSummary(query *goquery.Document, p *page.Page) { summary := query.Find(".lemma-summary .para").Text() summary = strings.Trim(summary, " \t\n") p.AddField("summary", summary) }

Exemple #10

0

Afficher le fichier

Fichier : processer.go Projet : liulnn/plant-spider

func (this *PlantProcesser) getCatalog(query *goquery.Document, p *page.Page) { catalog := query.Find(".lemma-catalog").Find("span.text").Text() catalog = strings.Trim(catalog, " \t\n") p.AddField("catalog", catalog) }