// Parse 获取url对应的资源并根据规则进行解析 func (this *RedditLogic) Parse(redditUrl string) error { redditUrl = strings.TrimSpace(redditUrl) if redditUrl == "" { redditUrl = this.domain + this.golang } else if !strings.HasPrefix(redditUrl, "https") { redditUrl = "https://" + redditUrl } var ( doc *goquery.Document err error ) // if doc, err = goquery.NewDocument(redditUrl); err != nil { if doc, err = this.newDocumentFromResp(redditUrl); err != nil { logger.Errorln("goquery reddit newdocument error:", err) return err } // 最后面的先入库处理 resourcesSelection := doc.Find("#siteTable .link") for i := resourcesSelection.Length() - 1; i >= 0; i-- { err = this.dealRedditOneResource(goquery.NewDocumentFromNode(resourcesSelection.Get(i)).Selection) if err != nil { logger.Errorln(err) } } return err }
func GenerateDocument(rawData []byte) *goquery.Document { utf8String := toUtf8(rawData) utf8byteArray := []byte(utf8String) node, err := html.Parse(bytes.NewReader(utf8byteArray)) helper.HandleFatalError("document generation failed:", err) return goquery.NewDocumentFromNode(node) }
// Process the response for a URL. func (this *worker) visitUrl(res *http.Response) []*url.URL { var doc *goquery.Document var harvested []*url.URL var doLinks bool // Load a goquery document and call the visitor function if node, e := html.Parse(res.Body); e != nil { this.logFunc(LogError, "ERROR parsing %s: %s\n", res.Request.URL.String(), e.Error()) } else { doc = goquery.NewDocumentFromNode(node) doc.Url = res.Request.URL } // Visit the document (with nil goquery doc if failed to load) if this.visitor != nil { if harvested, doLinks = this.visitor(res, doc); doLinks && doc != nil { // Links were not processed by the visitor, so process links harvested = this.processLinks(doc) } } else { this.logFunc(LogInfo, "missing visitor function: %s\n", res.Request.URL.String()) } return harvested }
func (md *MangaDownloader) HttpGetHtmlDoc(u *url.URL) (*goquery.Document, error) { node, err := md.HttpGetHtml(u) if err != nil { return nil, err } return goquery.NewDocumentFromNode(node), err }
func parseChapter() { defer chapterWg.Done() for chapter := range chaptersIn { filename := chapter.path b, err := ioutil.ReadFile(filename) var m map[string]*json.RawMessage err = json.Unmarshal(b, &m) if err != nil { panic(err) } var s string json.Unmarshal(*m["content"], &s) node, err := html.Parse(strings.NewReader(s)) if err != nil { panic(err) } doc := gq.NewDocumentFromNode(node) verses := doc.Find(".verse") verses.Each(func(i int, s *gq.Selection) { chaptersOut <- Verse{book: chapter.book, chapter: chapter.chapter, html: s} }) } }
//style="display:none" func clean_display_none(node *html.Node) *html.Node { doc := gq.NewDocumentFromNode(node) doc.Find("div").Each(func(i int, s *gq.Selection) { if style_display_none(s.AttrOr("style", "")) { s.Remove() } }) return node }
func parseHtml(h string) (*goquery.Document, error) { node, err := html.Parse(strings.NewReader(h)) if err != nil { log.Println("parsing failed - %s %s", err.Error()) return nil, err } doc := goquery.NewDocumentFromNode(node) return doc, nil }
func clean_iframe_none(node *html.Node) *html.Node { gq.NewDocumentFromNode(node).Find("iframe").Each(func(i int, s *gq.Selection) { if style := s.AttrOr("style", ""); style_display_none(style) || style_dim_is_small(style) { s.Remove() return } }) return node }
func flattenTable(table *html.Node) (selections []*goquery.Selection) { doc := goquery.NewDocumentFromNode(table) selections = make([]*goquery.Selection, 0) doc.Find("tr").Each(func(i int, row *goquery.Selection) { row.RemoveFiltered("tr") if row.Text() != "" { selections = append(selections, row) } }) return }
func loadDoc(page string) *goquery.Document { if f, e := os.Open(fmt.Sprintf("testdata/%s", page)); e != nil { panic(e.Error()) } else { defer f.Close() if node, e := html.Parse(f); e != nil { panic(e.Error()) } else { return goquery.NewDocumentFromNode(node) } } return nil }
func Print(dao *models.Dao, page models.PageHtml, c *elastigo.Conn) { doc, err := html.Parse(strings.NewReader(page.Html)) if err != nil { log.Fatal(err) } document := goquery.NewDocumentFromNode(doc) if document == nil { } StartParser(dao, document, c) fmt.Println("url:%s", page.Url) }
func generateDoc(htmlStr string, articleUrl string) *goquery.Document { node, _ := html.Parse(strings.NewReader(htmlStr)) doc := goquery.NewDocumentFromNode(node) aUrl, err := url.Parse(articleUrl) if err == nil { doc.Url = aUrl } else { log.Println(err) } return doc }
func LoadDoc(page string) *goquery.Document { base, _ := os.Getwd() if file, e := os.Open(base + "/" + page); e != nil { panic(e.Error()) } else { defer file.Close() if node, e := html.Parse(file); e != nil { panic(e.Error()) } else { return goquery.NewDocumentFromNode(node) } } return nil }
//parse the http resp from Townclient func (g *Ghostparser) ParseReleases() error { log.Info("%s parsing %v", TAG, g.Url) resp, err := g.Gc.Get(g.Url) if err != nil { log.Error("%s %s", TAG, err.Error()) return err } defer resp.Body.Close() respbody, err := html.Parse(resp.Body) doc := goquery.NewDocumentFromNode(respbody) var rel Release doc.Find("table").Each(func(a int, sa *goquery.Selection) { if a == 10 { //get the right table sa.Find("tr").Each(func(b int, sb *goquery.Selection) { sb.Find("td").Each(func(c int, sc *goquery.Selection) { if c == 2 { rel = Release{} g.getUrlAndTagAndName(&rel, sc) if rel.Name != "" { rel.Time = time.Now().Unix() rel.Checksum = g.encodeName(rel.Url) rel.checkQual() if rel.Name != "" { rel.Hits = 0 rel.Rating = 0 g.downloadImage(rel.Url, rel.Checksum) g.addRelease(rel) } } } }) }) } if g.Count == 0 { //get page count if a == 51 { sa.Find("a").Each(func(d int, sd *goquery.Selection) { if d == 3 { g.Count, err = strconv.Atoi(sd.Text()) } }) } } }) return nil }
func RunCodeTests(tests ConversionTests, fn func(*goquery.Document, string)) error { for _, test := range tests { node, _ := html.Parse(strings.NewReader(test.In)) doc := goquery.NewDocumentFromNode(node) fn(doc, "") got := doc.Text() if err := MustBeEqual(got, test.Want); err != nil { return err } } return nil }
func loadPage(t *testing.T, page string) *goquery.Document { var f *os.File var e error if f, e = os.Open(fmt.Sprintf("./test_data/%s", page)); e != nil { t.Fatal(e) } defer f.Close() var node *html.Node if node, e = html.Parse(f); e != nil { t.Fatal(e) } return goquery.NewDocumentFromNode(node) }
func NewDocument(url string) (d *goquery.Document, e error) { client := newHttpClient() res, e := client.Get(url) if e != nil { return } defer res.Body.Close() // Parse the HTML into nodes root, e := html.Parse(res.Body) if e != nil { return } // Create and fill the document d = goquery.NewDocumentFromNode(root) return }
func getResult(rollno int, ch1 chan Student, ch2 chan bool) { if resp, err := http.PostForm("http://jee.iitd.ac.in/resultstatus.php", url.Values{"regno": {strconv.Itoa(rollno)}, "submit": {"Submit"}}); err == nil { defer resp.Body.Close() body, _ := ioutil.ReadAll(resp.Body) if node, e := html.Parse(strings.NewReader(string(body))); e != nil { fmt.Println(e) ch1 <- Student{} ch2 <- false } else { a, b := GetStudent(goquery.NewDocumentFromNode(node), rollno) ch1 <- a ch2 <- b } } ch1 <- Student{} ch2 <- false }
func (a *Article) addInlineArticleImageHTML(title string) { if a.Img == nil { return } if a.TopNode == nil { a.TopNode = goquery.NewDocumentFromNode(&html.Node{ Type: html.ElementNode, DataAtom: atom.Span, Data: "span", }).Selection } a.TopNode.PrependHtml(fmt.Sprintf(imgHeader, html.EscapeString(a.URL), html.EscapeString(title), html.EscapeString(a.Img.Src))) }
func (a *Article) getCCache(n *html.Node) *contentCache { cc, ok := a.cCache[n] if !ok { s := goquery.NewDocumentFromNode(n).Selection cc = &contentCache{ text: strings.TrimSpace(s.Text()), s: s, } ws := splitText(cc.text) cc.wordCount = uint(len(ws)) cc.stopwords = stopwordCountWs(a.Meta.Lang, ws) cc.highLinkDensity = highLinkDensity(cc) a.cCache[n] = cc } return cc }
func Test_getSource_forBBC_findHref(t *testing.T) { tests := testutils.ConversionTests{ { // BBC In: `<html><head></head><body> <p>Test</p><p><div class="inline-media inline-image"> <a data-replace-url="" data-anchor-title="(Credit: iStock)" data-caption="Flexible hours have made working from home possible for many – but how many people actually make the most of it? (Credit: iStock)" data-caption-title="" data-replace-image="true" data-is-portrait="false" class="replace-image" title="(Credit: iStock)" href="http://ichef.bbci.co.uk/wwfeatures/wm/live/624_351/images/live/p0/44/x2/p044x25c.jpg"> View image of (Credit: iStock) </a></div></p><p>Test</p> </body></html>`, Want: `http://ichef.bbci.co.uk/wwfeatures/wm/live/624_351/images/live/p0/44/x2/p044x25c.jpg`, }, } for _, test := range tests { node, _ := html.Parse(strings.NewReader(test.In)) doc := goquery.NewDocumentFromNode(node) imgSelections, _ := guessImgTag(doc) for _, imgSel := range imgSelections { imgSel.Each(func(_ int, s *goquery.Selection) { link := guessSourceURL(s, doc.Url) if link == "" { log.Println("No image urls found.") } got := link if !reflect.DeepEqual(test.Want, got) { _, file, line, _ := runtime.Caller(0) fmt.Printf("%s:%d:\n\ncall base (%#v)\n\texp: %#v\n\n\tgot: %#v\n\n", filepath.Base(file), line, test.In, test.Want, got) t.FailNow() } }) } } }
func (c *Card) inlineSrc(n *html.Node) error { doc := goquery.NewDocumentFromNode(n) doc.Find("img").Each(func(i int, s *goquery.Selection) { src, ok := s.Attr("src") if !ok { log.Print("Found an image with no source!!??") return } log.Debugf("Found image with src of '%s'", src) att, err := c.GetAttachment(src) if err != nil { log.Printf("Error inlining file '%s': %s", src, err) return } s.SetAttr("src", fmt.Sprintf("data:%s;base64,%s", att.ContentType, base64.StdEncoding.EncodeToString(att.Content))) // iframe.Set("src", "data:text/html;charset=utf-8;base64,"+base64.StdEncoding.EncodeToString([]byte(body))) }) return nil }
func Test_Search(t *testing.T) { node, _ := html.Parse(strings.NewReader(source)) doc := goquery.NewDocumentFromNode(node) // Query Attr IsAttr NeedsFirst NeedsParents IsText IsHtml NeedsArray Removals item_title := Item{".title", "", false, true, false, true, false, false, []RemoveItem{}} item_authors := Item{".ti-byline cite", "", false, true, false, true, false, false, []RemoveItem{}} item_htmlBody := Item{".ti-body", "", false, true, false, false, true, false, []RemoveItem{}} item_date := Item{".fltimestamp", "", false, true, false, true, false, false, []RemoveItem{}} scraper := Scraper{ Journal: "The Intercept", Items: map[string]Item{ "title": item_title, "authors": item_authors, "body": item_htmlBody, "date": item_date, }, } wants := []string{ "Hacking Team Emails Expose Proposed Death Squad Deal, Secret U.K. Sales Push and Much More", "By Ryan Gallagher", `<p lang="en-US">Late Sunday, hackers dumped online a massive trove of emails and other documents obtained from the systems of Italian surveillance firm Hacking Team. The company’s controversial <a href="https://firstlook.org/theintercept/2014/10/30/hacking-team/">technology</a> is sold to governments around the world, enabling them to infect smartphones and computers with malware to covertly record conversations and steal data.</p>`, "08 Jul 2015", } got := []string{ Search(doc, scraper.Items["title"]), Search(doc, scraper.Items["authors"]), Search(doc, scraper.Items["body"]), Search(doc, scraper.Items["date"]), } for i, want := range wants { if got[i] != want { t.Errorf("\ngot : |%s|\nwant: |%s|", got[i], want) } } }
// 获取url对应的资源并根据规则进行解析 func ParseReddit(redditUrl string) error { redditUrl = strings.TrimSpace(redditUrl) if redditUrl == "" { redditUrl = Reddit + RedditGolang } else if !strings.HasPrefix(redditUrl, "http") { redditUrl = "http://" + redditUrl } var ( doc *goquery.Document err error ) if doc, err = goquery.NewDocument(redditUrl); err != nil { logger.Errorln("goquery reddit newdocument error:", err) return err } /* doc.Find("#siteTable .link").Each(func(i int, contentSelection *goquery.Selection) { err = dealRedditOneResource(contentSelection) if err != nil { logger.Errorln(err) } }) */ // 最后面的先入库处理 resourcesSelection := doc.Find("#siteTable .link") for i := resourcesSelection.Length() - 1; i >= 0; i-- { err = dealRedditOneResource(goquery.NewDocumentFromNode(resourcesSelection.Get(i)).Selection) if err != nil { logger.Errorln(err) } } return err }
func (u useKnownArticles) run(a *Article) error { for _, m := range knownArticles { for _, n := range a.Doc.FindMatcher(m).Nodes { cc := a.getCCache(n) // Sometimes even "known" articles are wrong if cc.stopwords > 5 && !cc.highLinkDensity { // Remove from document so that memory can be freed if n.Parent != nil { n.Parent.RemoveChild(n) } a.Doc = goquery.NewDocumentFromNode(n) a.TopNode = a.Doc.Selection return nil } } } return nil }
func Test_guessSourceURL_regularImageClass_getCorrectURL(t *testing.T) { tests := testutils.ConversionTests{ { In: `<html><head></head><body> <p>In this figure.</p> <div id="attachment_592" style="width: 690px" class="wp-caption aligncenter"> <a href="http://coding-geek.com/wp-content/uploads/2015/05/sine_wave-min.png"> <img class="wp-image-592" src="http://coding-geek.com/wp-content/uploads/2015/05/sine_wave-min.png.pagespeed.ce.4qmecFS6zo.png" alt="pure sinewave at 20 Hz" width="680" height="431" pagespeed_url_hash="1382721686" data-pagespeed-onload="pagespeed.CriticalImages.checkImageForCriticality(this);" onload="var elem=this;if (this==window) elem=document.body;elem.setAttribute('data-pagespeed-loaded', 1)"/> </a> <p class="wp-caption-text">pure sinewave at 20 Hz</p> </div> <p>In this figure.</p> </body></html>`, Want: "http://coding-geek.com/wp-content/uploads/2015/05/sine_wave-min.png.pagespeed.ce.4qmecFS6zo.png", }, } for _, test := range tests { node, _ := html.Parse(strings.NewReader(test.In)) doc := goquery.NewDocumentFromNode(node) imgSelections, _ := guessImgTag(doc) // initialize with random string so we can also test for empty strings, // when no caption was found got := "mklkmkloijiklj" for _, imgSel := range imgSelections { imgSel.Each(func(_ int, s *goquery.Selection) { got = guessSourceURL(s, doc.Url) }) } if err := testutils.MustBeEqual(got, test.Want); err != nil { t.Errorf("strings don't match") } } }
func (g *Ghostparser) getImageUrl(url string) (url2 string) { resp, err := g.Gc.Get(url) if err != nil { log.Error("%s %s", TAG, err.Error()) return url2 } defer resp.Body.Close() respbody, err := html.Parse(resp.Body) doc := goquery.NewDocumentFromNode(respbody) doc.Find(".resizeImage").Each(func(a int, sa *goquery.Selection) { if a == 0 { if attr, exist := sa.Attr("src"); exist { url2 = attr } } }) return url2 }
func Test_guessSourceURL(t *testing.T) { tests := []struct { in string want string }{ { in: `<html><head></head><body> <p><figure class="e-image"> <span style="'position:relative;'"> <img alt="operationslog" class="vox-lazy-load m-chorus-asset__in-entry-body" data-chorus-asset-id="671860" data-full-size="https://cdn2.vox-cdn.com/thumbor/kA3aLNmpDy8TZ2IOMGh1ysvmi-4=/cdn0.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg" data-original="https://cdn3.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg" src=""> <noscript> <img alt="operationslog" src="https://cdn2.vox-cdn.com/thumbor/kA3aLNmpDy8TZ2IOMGh1ysvmi-4=/cdn0.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg"> </noscript> </span> </figure> </p> </body></html>`, want: "https://cdn2.vox-cdn.com/thumbor/kA3aLNmpDy8TZ2IOMGh1ysvmi-4=/cdn0.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg", }, } for _, test := range tests { node, _ := html.Parse(strings.NewReader(test.in)) doc := goquery.NewDocumentFromNode(node) s := doc.Find("img") got := guessSourceURL(s, nil) if !reflect.DeepEqual(test.want, got) { _, file, line, _ := runtime.Caller(0) fmt.Printf("%s:%d:\n\ncall guessSourceURL(%#v)\n\texp: %#v\n\n\tgot: %#v\n\n", filepath.Base(file), line, s, test.want, got) t.FailNow() } } testutils.Cleanup() }
func getData(easy *curl.CURL, url string, ep string) (link string) { a := "" fmt.Println(url) //calback fooTest := func(buf []byte, userdata interface{}) bool { a = a + string(buf) //fmt.Println(a) nod, err := html.Parse(strings.NewReader(a)) check(err) doc := goquery.NewDocumentFromNode(nod) doc.Find("td").Each(func(i int, s *goquery.Selection) { s.Eq(0).Each(func(k int, bb *goquery.Selection) { bb.Find("img").Each(func(l int, cc *goquery.Selection) { link = cc.AttrOr("src", "12345") }) }) }) return true } // page forward to welcome easy.Setopt(curl.OPT_URL, url) easy.Setopt(curl.OPT_HTTPGET, true) easy.Setopt(curl.OPT_WRITEFUNCTION, fooTest) if err := easy.Perform(); err != nil { println("ERROR: ", err.Error()) } //fmt.Println("success "+link+" ==\n") return link }
// Process the response for a URL. func (this *worker) visitUrl(res *http.Response) []*url.URL { var doc *goquery.Document var harvested []*url.URL var doLinks bool // Load a goquery document and call the visitor function if bd, e := ioutil.ReadAll(res.Body); e != nil { this.extender.Error(newCrawlError(e, CekReadBody, res.Request.URL)) this.logFunc(LogError, "ERROR reading body %s: %s", res.Request.URL.String(), e.Error()) } else { if node, e := html.Parse(bytes.NewBuffer(bd)); e != nil { this.extender.Error(newCrawlError(e, CekParseBody, res.Request.URL)) this.logFunc(LogError, "ERROR parsing %s: %s", res.Request.URL.String(), e.Error()) } else { doc = goquery.NewDocumentFromNode(node) doc.Url = res.Request.URL } // Re-assign the body so it can be consumed by the visitor function res.Body = ioutil.NopCloser(bytes.NewBuffer(bd)) } // Visit the document (with nil goquery doc if failed to load) if harvested, doLinks = this.extender.Visit(res, doc); doLinks { // Links were not processed by the visitor, so process links if doc != nil { harvested = this.processLinks(doc) } else { this.extender.Error(newCrawlErrorMessage("No goquery document to process links.", CekProcessLinks, res.Request.URL)) this.logFunc(LogError, "ERROR processing links %s", res.Request.URL.String()) } } // Notify that this URL has been visited this.extender.Visited(res.Request.URL, harvested) return harvested }