Пример #1
0
// Parse fetches the resource at redditUrl and parses it according to the
// site's scraping rules. An empty URL falls back to the configured Go
// subreddit; a URL without an https prefix gets one prepended.
func (this *RedditLogic) Parse(redditUrl string) error {
	redditUrl = strings.TrimSpace(redditUrl)
	switch {
	case redditUrl == "":
		redditUrl = this.domain + this.golang
	case !strings.HasPrefix(redditUrl, "https"):
		redditUrl = "https://" + redditUrl
	}

	doc, err := this.newDocumentFromResp(redditUrl)
	if err != nil {
		logger.Errorln("goquery reddit newdocument error:", err)
		return err
	}

	// Walk the entries back-to-front so the oldest ones are stored first.
	links := doc.Find("#siteTable .link")
	for i := links.Length() - 1; i >= 0; i-- {
		if err = this.dealRedditOneResource(goquery.NewDocumentFromNode(links.Get(i)).Selection); err != nil {
			logger.Errorln(err)
		}
	}

	return err
}
Пример #2
0
// GenerateDocument converts raw response bytes into a goquery document,
// normalizing the payload to UTF-8 first. A parse failure is fatal.
func GenerateDocument(rawData []byte) *goquery.Document {
	utf8Bytes := []byte(toUtf8(rawData))
	root, err := html.Parse(bytes.NewReader(utf8Bytes))
	helper.HandleFatalError("document generation failed:", err)
	return goquery.NewDocumentFromNode(root)
}
Пример #3
0
// visitUrl loads the response body into a goquery document and hands it to
// the worker's visitor callback, returning whatever URLs were harvested.
// If parsing fails the visitor is still invoked with a nil document.
func (this *worker) visitUrl(res *http.Response) []*url.URL {
	var (
		doc       *goquery.Document
		harvested []*url.URL
		doLinks   bool
	)

	// Build the document; on parse error doc stays nil.
	node, e := html.Parse(res.Body)
	if e != nil {
		this.logFunc(LogError, "ERROR parsing %s: %s\n", res.Request.URL.String(), e.Error())
	} else {
		doc = goquery.NewDocumentFromNode(node)
		doc.Url = res.Request.URL
	}

	if this.visitor == nil {
		this.logFunc(LogInfo, "missing visitor function: %s\n", res.Request.URL.String())
		return harvested
	}

	if harvested, doLinks = this.visitor(res, doc); doLinks && doc != nil {
		// The visitor did not handle links itself, so collect them here.
		harvested = this.processLinks(doc)
	}

	return harvested
}
Пример #4
0
// HttpGetHtmlDoc fetches the HTML at u and wraps the parsed root node in a
// goquery document. Returns a nil document together with the fetch error.
func (md *MangaDownloader) HttpGetHtmlDoc(u *url.URL) (*goquery.Document, error) {
	node, err := md.HttpGetHtml(u)
	if err != nil {
		return nil, err
	}
	// err is known to be nil here; return the literal for clarity.
	return goquery.NewDocumentFromNode(node), nil
}
Пример #5
0
// parseChapter consumes chapter jobs from chaptersIn, parses each chapter's
// stored JSON "content" field as HTML, and emits every ".verse" selection on
// chaptersOut. Signals completion on chapterWg. Any I/O or parse failure is
// fatal (panic), matching the rest of the pipeline.
func parseChapter() {
	defer chapterWg.Done()

	for chapter := range chaptersIn {
		b, err := ioutil.ReadFile(chapter.path)
		if err != nil {
			panic(err) // was silently ignored before; a missing file is fatal
		}

		var m map[string]*json.RawMessage
		if err = json.Unmarshal(b, &m); err != nil {
			panic(err)
		}

		// Guard against a missing/null "content" key, which previously
		// caused a nil-pointer dereference.
		raw, ok := m["content"]
		if !ok || raw == nil {
			panic("chapter file missing \"content\" field: " + chapter.path)
		}

		var s string
		if err = json.Unmarshal(*raw, &s); err != nil {
			panic(err)
		}

		node, err := html.Parse(strings.NewReader(s))
		if err != nil {
			panic(err)
		}

		doc := gq.NewDocumentFromNode(node)
		doc.Find(".verse").Each(func(i int, sel *gq.Selection) {
			chaptersOut <- Verse{book: chapter.book, chapter: chapter.chapter, html: sel}
		})
	}
}
Пример #6
0
// clean_display_none removes every <div> styled with display:none from the
// tree rooted at node and returns the same (mutated) node.
func clean_display_none(node *html.Node) *html.Node {
	divs := gq.NewDocumentFromNode(node).Find("div")
	divs.Each(func(_ int, sel *gq.Selection) {
		if style_display_none(sel.AttrOr("style", "")) {
			sel.Remove()
		}
	})
	return node
}
Пример #7
0
Файл: grab.go Проект: zackb/code
// parseHtml parses an HTML string into a goquery document, logging and
// returning any parse error.
func parseHtml(h string) (*goquery.Document, error) {
	node, err := html.Parse(strings.NewReader(h))
	if err != nil {
		// Printf, not Println: the message carries a format verb (the
		// original also passed one argument for two %s verbs).
		log.Printf("parsing failed - %s", err.Error())
		return nil, err
	}
	return goquery.NewDocumentFromNode(node), nil
}
Пример #8
0
// clean_iframe_none strips iframes that are hidden (display:none) or whose
// styled dimensions are negligibly small, returning the mutated root node.
func clean_iframe_none(node *html.Node) *html.Node {
	doc := gq.NewDocumentFromNode(node)
	doc.Find("iframe").Each(func(_ int, sel *gq.Selection) {
		style := sel.AttrOr("style", "")
		if style_display_none(style) || style_dim_is_small(style) {
			sel.Remove()
		}
	})
	return node
}
Пример #9
0
// flattenTable collects every non-empty top-level row of the given table,
// stripping any nested rows, and returns them as goquery selections.
func flattenTable(table *html.Node) (selections []*goquery.Selection) {
	selections = make([]*goquery.Selection, 0)
	goquery.NewDocumentFromNode(table).Find("tr").Each(func(_ int, row *goquery.Selection) {
		// Drop rows belonging to nested tables before testing for content.
		row.RemoveFiltered("tr")
		if row.Text() != "" {
			selections = append(selections, row)
		}
	})
	return
}
Пример #10
0
// loadDoc opens the named fixture under testdata/ and parses it into a
// goquery document, panicking on any failure (test-helper semantics).
func loadDoc(page string) *goquery.Document {
	f, e := os.Open(fmt.Sprintf("testdata/%s", page))
	if e != nil {
		panic(e.Error())
	}
	defer f.Close()

	node, e := html.Parse(f)
	if e != nil {
		panic(e.Error())
	}
	// The original ended with an unreachable `return nil` after nested
	// if/else chains; the guard-clause form removes the dead code.
	return goquery.NewDocumentFromNode(node)
}
Пример #11
0
// Print parses the stored page HTML and feeds the resulting document to the
// parser pipeline, then reports the processed URL. A parse failure aborts
// the program (log.Fatal), as before.
func Print(dao *models.Dao, page models.PageHtml, c *elastigo.Conn) {
	root, err := html.Parse(strings.NewReader(page.Html))
	if err != nil {
		log.Fatal(err)
	}
	// The original contained an empty `if document == nil {}` block, which
	// did nothing; it has been removed.
	document := goquery.NewDocumentFromNode(root)

	StartParser(dao, document, c)

	// Printf, not Println: the message carries a %s verb (Println would
	// print the verb literally followed by the URL value).
	fmt.Printf("url:%s\n", page.Url)
}
Пример #12
0
// generateDoc parses htmlStr into a goquery document and attaches the
// article's URL to it when articleUrl is valid. Errors are logged but do
// not abort document creation, preserving the original best-effort flow.
func generateDoc(htmlStr string, articleUrl string) *goquery.Document {
	node, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		// Previously discarded with `_`; surface it so malformed input is
		// at least visible in the logs.
		log.Println(err)
	}
	doc := goquery.NewDocumentFromNode(node)

	if aUrl, err := url.Parse(articleUrl); err == nil {
		doc.Url = aUrl
	} else {
		log.Println(err)
	}

	return doc
}
Пример #13
0
// LoadDoc opens page relative to the current working directory and parses
// it into a goquery document, panicking on any failure (test-helper
// semantics).
func LoadDoc(page string) *goquery.Document {
	base, err := os.Getwd()
	if err != nil {
		panic(err.Error()) // previously ignored with `_`
	}

	file, err := os.Open(base + "/" + page)
	if err != nil {
		panic(err.Error())
	}
	defer file.Close()

	node, err := html.Parse(file)
	if err != nil {
		panic(err.Error())
	}
	// The original ended with an unreachable `return nil`; the guard-clause
	// form removes the dead code.
	return goquery.NewDocumentFromNode(node)
}
Пример #14
0
// ParseReleases downloads the release listing at g.Url and scrapes it:
// the table at index 10 holds the release rows; the table at index 51
// holds the pager, from which the total page count is read (only while
// g.Count is still unset).
func (g *Ghostparser) ParseReleases() error {
	log.Info("%s parsing %v", TAG, g.Url)

	resp, err := g.Gc.Get(g.Url)
	if err != nil {
		log.Error("%s %s", TAG, err.Error())
		return err
	}
	defer resp.Body.Close()

	respbody, err := html.Parse(resp.Body)
	if err != nil {
		// Previously ignored; a broken body would have been scraped as a
		// nil document.
		log.Error("%s %s", TAG, err.Error())
		return err
	}
	doc := goquery.NewDocumentFromNode(respbody)

	var rel Release
	doc.Find("table").Each(func(a int, sa *goquery.Selection) {
		if a == 10 { // the release table is the 11th table on the page
			sa.Find("tr").Each(func(b int, sb *goquery.Selection) {
				sb.Find("td").Each(func(c int, sc *goquery.Selection) {
					if c != 2 { // the third cell carries URL/tag/name
						return
					}
					rel = Release{}
					g.getUrlAndTagAndName(&rel, sc)

					if rel.Name == "" {
						return
					}
					rel.Time = time.Now().Unix()
					rel.Checksum = g.encodeName(rel.Url)
					rel.checkQual()
					// checkQual may clear the name to reject the release.
					if rel.Name != "" {
						rel.Hits = 0
						rel.Rating = 0
						g.downloadImage(rel.Url, rel.Checksum)
						g.addRelease(rel)
					}
				})
			})
		}
		if g.Count == 0 && a == 51 { // pager table: read page count once
			sa.Find("a").Each(func(d int, sd *goquery.Selection) {
				if d == 3 {
					g.Count, err = strconv.Atoi(sd.Text())
				}
			})
		}
	})

	return nil
}
Пример #15
0
// RunCodeTests runs fn against the parsed document of each test input and
// checks that the document's text afterwards matches the expected output.
// Returns the first mismatch or parse error encountered.
func RunCodeTests(tests ConversionTests, fn func(*goquery.Document, string)) error {
	for _, test := range tests {
		node, err := html.Parse(strings.NewReader(test.In))
		if err != nil {
			return err // previously discarded with `_`
		}
		doc := goquery.NewDocumentFromNode(node)

		fn(doc, "")
		got := doc.Text()

		if err := MustBeEqual(got, test.Want); err != nil {
			return err
		}
	}
	return nil
}
Пример #16
0
// loadPage opens a fixture from ./test_data and parses it into a goquery
// document, aborting the test on any failure.
func loadPage(t *testing.T, page string) *goquery.Document {
	f, e := os.Open(fmt.Sprintf("./test_data/%s", page))
	if e != nil {
		t.Fatal(e)
	}
	defer f.Close()

	node, e := html.Parse(f)
	if e != nil {
		t.Fatal(e)
	}
	return goquery.NewDocumentFromNode(node)
}
Пример #17
0
// NewDocument fetches url with a fresh HTTP client and parses the response
// body into a goquery document.
func NewDocument(url string) (d *goquery.Document, e error) {
	client := newHttpClient()
	res, e := client.Get(url)
	if e != nil {
		return nil, e
	}
	defer res.Body.Close()

	// Parse the HTML into a node tree.
	root, e := html.Parse(res.Body)
	if e != nil {
		return nil, e
	}

	return goquery.NewDocumentFromNode(root), nil
}
Пример #18
0
// getResult fetches the result page for rollno and delivers the parsed
// Student plus a success flag on ch1/ch2. Exactly one pair of values is
// sent per call; a zero Student with false signals failure.
func getResult(rollno int, ch1 chan Student, ch2 chan bool) {
	resp, err := http.PostForm("http://jee.iitd.ac.in/resultstatus.php",
		url.Values{"regno": {strconv.Itoa(rollno)}, "submit": {"Submit"}})
	if err != nil {
		ch1 <- Student{}
		ch2 <- false
		return
	}
	defer resp.Body.Close()

	body, _ := ioutil.ReadAll(resp.Body)
	node, e := html.Parse(strings.NewReader(string(body)))
	if e != nil {
		fmt.Println(e)
		ch1 <- Student{}
		ch2 <- false
		return
	}

	// BUG FIX: the original fell through to the trailing sends after BOTH
	// the success and parse-error branches, pushing a second (empty)
	// Student onto the channels each time. Each path now sends exactly once.
	a, b := GetStudent(goquery.NewDocumentFromNode(node), rollno)
	ch1 <- a
	ch2 <- b
}
Пример #19
0
// addInlineArticleImageHTML prepends the article's image markup to the top
// node, creating a synthetic <span> top node first when none exists.
// No-op when the article has no image.
func (a *Article) addInlineArticleImageHTML(title string) {
	if a.Img == nil {
		return
	}

	if a.TopNode == nil {
		// Fabricate an empty span to host the injected HTML.
		span := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Span,
			Data:     "span",
		}
		a.TopNode = goquery.NewDocumentFromNode(span).Selection
	}

	markup := fmt.Sprintf(imgHeader,
		html.EscapeString(a.URL),
		html.EscapeString(title),
		html.EscapeString(a.Img.Src))
	a.TopNode.PrependHtml(markup)
}
Пример #20
0
// getCCache returns the memoized content cache for node n, computing and
// storing it on first access.
func (a *Article) getCCache(n *html.Node) *contentCache {
	if cc, ok := a.cCache[n]; ok {
		return cc
	}

	sel := goquery.NewDocumentFromNode(n).Selection
	cc := &contentCache{
		text: strings.TrimSpace(sel.Text()),
		s:    sel,
	}

	words := splitText(cc.text)
	cc.wordCount = uint(len(words))
	cc.stopwords = stopwordCountWs(a.Meta.Lang, words)
	cc.highLinkDensity = highLinkDensity(cc)
	a.cCache[n] = cc

	return cc
}
Пример #21
0
// Test_getSource_forBBC_findHref verifies that for BBC-style markup — an
// <a class="replace-image"> wrapping an inline image — the image URL is
// recovered from the anchor's href attribute.
func Test_getSource_forBBC_findHref(t *testing.T) {
	tests := testutils.ConversionTests{
		{ // BBC
			In: `<html><head></head><body>
<p>Test</p><p><div class="inline-media inline-image">
<a data-replace-url=""
data-anchor-title="(Credit: iStock)"
data-caption="Flexible hours have made working from home possible for many – but how many people actually make the most of it? (Credit: iStock)"
data-caption-title=""
data-replace-image="true"
data-is-portrait="false"
class="replace-image"
title="(Credit: iStock)"
href="http://ichef.bbci.co.uk/wwfeatures/wm/live/624_351/images/live/p0/44/x2/p044x25c.jpg">
View image of (Credit: iStock)
</a></div></p><p>Test</p>
</body></html>`,
			Want: `http://ichef.bbci.co.uk/wwfeatures/wm/live/624_351/images/live/p0/44/x2/p044x25c.jpg`,
		},
	}

	for _, test := range tests {

		// Parse the fixture; guessImgTag yields candidate image selections.
		node, _ := html.Parse(strings.NewReader(test.In))
		doc := goquery.NewDocumentFromNode(node)
		imgSelections, _ := guessImgTag(doc)

		for _, imgSel := range imgSelections {
			imgSel.Each(func(_ int, s *goquery.Selection) {
				// doc.Url is nil here, so only absolute URLs can match.
				link := guessSourceURL(s, doc.Url)
				if link == "" {
					log.Println("No image urls found.")
				}
				got := link
				if !reflect.DeepEqual(test.Want, got) {
					_, file, line, _ := runtime.Caller(0)
					fmt.Printf("%s:%d:\n\ncall base (%#v)\n\texp: %#v\n\n\tgot: %#v\n\n",
						filepath.Base(file), line, test.In, test.Want, got)
					t.FailNow()
				}
			})
		}
	}
}
Пример #22
0
// inlineSrc rewrites every <img> under n to carry its attachment content
// as a base64 data: URI. Images with no src, or whose attachment cannot be
// fetched, are logged and left untouched. Always returns nil.
func (c *Card) inlineSrc(n *html.Node) error {
	doc := goquery.NewDocumentFromNode(n)
	doc.Find("img").Each(func(_ int, img *goquery.Selection) {
		src, ok := img.Attr("src")
		if !ok {
			log.Print("Found an image with no source!!??")
			return
		}
		log.Debugf("Found image with src of '%s'", src)

		att, err := c.GetAttachment(src)
		if err != nil {
			log.Printf("Error inlining file '%s': %s", src, err)
			return
		}

		encoded := base64.StdEncoding.EncodeToString(att.Content)
		img.SetAttr("src", fmt.Sprintf("data:%s;base64,%s", att.ContentType, encoded))
		// iframe.Set("src", "data:text/html;charset=utf-8;base64,"+base64.StdEncoding.EncodeToString([]byte(body)))
	})
	return nil
}
Пример #23
0
// Test_Search exercises Search against the static fixture `source` using an
// Intercept-article scraper configuration: it extracts the title, byline,
// HTML body and timestamp, and compares each against the expected strings.
func Test_Search(t *testing.T) {

	node, _ := html.Parse(strings.NewReader(source))
	doc := goquery.NewDocumentFromNode(node)

	// Item field order, per the legend below.
	//                  Query   Attr IsAttr NeedsFirst NeedsParents IsText IsHtml NeedsArray Removals
	item_title := Item{".title", "", false, true, false, true, false, false, []RemoveItem{}}
	item_authors := Item{".ti-byline cite", "", false, true, false, true, false, false, []RemoveItem{}}
	item_htmlBody := Item{".ti-body", "", false, true, false, false, true, false, []RemoveItem{}}
	item_date := Item{".fltimestamp", "", false, true, false, true, false, false, []RemoveItem{}}

	scraper := Scraper{
		Journal: "The Intercept",
		Items: map[string]Item{
			"title":   item_title,
			"authors": item_authors,
			"body":    item_htmlBody,
			"date":    item_date,
		},
	}

	// Expected values, in the same order as the got slice below.
	wants := []string{
		"Hacking Team Emails Expose Proposed Death Squad Deal, Secret U.K. Sales Push and Much More",
		"By Ryan Gallagher",
		`<p lang="en-US">Late Sunday, hackers dumped online a massive trove of emails and other documents obtained from the systems of Italian surveillance firm Hacking Team. The company’s controversial <a href="https://firstlook.org/theintercept/2014/10/30/hacking-team/">technology</a> is sold to governments around the world, enabling them to infect smartphones and computers with malware to covertly record conversations and steal data.</p>`,
		"08 Jul 2015",
	}

	got := []string{
		Search(doc, scraper.Items["title"]),
		Search(doc, scraper.Items["authors"]),
		Search(doc, scraper.Items["body"]),
		Search(doc, scraper.Items["date"]),
	}

	for i, want := range wants {
		if got[i] != want {
			t.Errorf("\ngot : |%s|\nwant: |%s|", got[i], want)
		}
	}

}
Пример #24
0
// ParseReddit fetches the resource at redditUrl (defaulting to the Go
// subreddit) and parses it according to the scraping rules. A URL without
// an http prefix gets one prepended.
func ParseReddit(redditUrl string) error {
	redditUrl = strings.TrimSpace(redditUrl)
	switch {
	case redditUrl == "":
		redditUrl = Reddit + RedditGolang
	case !strings.HasPrefix(redditUrl, "http"):
		redditUrl = "http://" + redditUrl
	}

	doc, err := goquery.NewDocument(redditUrl)
	if err != nil {
		logger.Errorln("goquery reddit newdocument error:", err)
		return err
	}

	// Walk the entries back-to-front so the oldest ones are stored first.
	links := doc.Find("#siteTable .link")
	for i := links.Length() - 1; i >= 0; i-- {
		if err = dealRedditOneResource(goquery.NewDocumentFromNode(links.Get(i)).Selection); err != nil {
			logger.Errorln(err)
		}
	}

	return err
}
Пример #25
0
// run scans the document for nodes matching any knownArticles matcher. The
// first match that looks like genuine article content (more than 5
// stopwords and not link-dense) is promoted: it is detached from its parent
// and becomes the article's new Doc and TopNode. Always returns nil.
func (u useKnownArticles) run(a *Article) error {
	for _, m := range knownArticles {
		for _, n := range a.Doc.FindMatcher(m).Nodes {
			cc := a.getCCache(n)

			// Sometimes even "known" articles are wrong
			if cc.stopwords > 5 && !cc.highLinkDensity {
				// Remove from document so that memory can be freed
				if n.Parent != nil {
					n.Parent.RemoveChild(n)
				}

				// Re-root the document on the matched node; the rest of the
				// old tree becomes unreachable.
				a.Doc = goquery.NewDocumentFromNode(n)
				a.TopNode = a.Doc.Selection
				return nil
			}
		}
	}

	return nil
}
Пример #26
0
// Test_guessSourceURL_regularImageClass_getCorrectURL checks that for a
// standard WordPress caption block the image URL is taken from the <img>
// src attribute (the pagespeed-rewritten URL), not from the anchor's href.
func Test_guessSourceURL_regularImageClass_getCorrectURL(t *testing.T) {
	tests := testutils.ConversionTests{
		{
			In: `<html><head></head><body>
<p>In this figure.</p>
<div id="attachment_592" style="width: 690px" class="wp-caption aligncenter">
	<a href="http://coding-geek.com/wp-content/uploads/2015/05/sine_wave-min.png">
		<img class="wp-image-592" src="http://coding-geek.com/wp-content/uploads/2015/05/sine_wave-min.png.pagespeed.ce.4qmecFS6zo.png" alt="pure sinewave at 20 Hz" width="680" height="431" pagespeed_url_hash="1382721686" data-pagespeed-onload="pagespeed.CriticalImages.checkImageForCriticality(this);" onload="var elem=this;if (this==window) elem=document.body;elem.setAttribute('data-pagespeed-loaded', 1)"/>
	</a>
	<p class="wp-caption-text">pure sinewave at 20 Hz</p>
</div>
<p>In this figure.</p>
</body></html>`,

			Want: "http://coding-geek.com/wp-content/uploads/2015/05/sine_wave-min.png.pagespeed.ce.4qmecFS6zo.png",
		},
	}

	for _, test := range tests {

		node, _ := html.Parse(strings.NewReader(test.In))
		doc := goquery.NewDocumentFromNode(node)

		imgSelections, _ := guessImgTag(doc)

		// initialize with random string so we can also test for empty strings,
		// when  no caption was found
		got := "mklkmkloijiklj"

		// The last candidate selection wins; doc.Url is nil here.
		for _, imgSel := range imgSelections {
			imgSel.Each(func(_ int, s *goquery.Selection) {
				got = guessSourceURL(s, doc.Url)
			})
		}
		if err := testutils.MustBeEqual(got, test.Want); err != nil {
			t.Errorf("strings don't match")
		}
	}

}
Пример #27
0
// getImageUrl fetches url and returns the src attribute of the first
// ".resizeImage" element on the page; returns the empty string when the
// fetch or parse fails or no such image exists.
func (g *Ghostparser) getImageUrl(url string) (url2 string) {
	resp, err := g.Gc.Get(url)
	if err != nil {
		log.Error("%s %s", TAG, err.Error())
		return url2
	}
	defer resp.Body.Close()

	respbody, err := html.Parse(resp.Body)
	if err != nil {
		// Previously ignored; bail out instead of scraping a nil document.
		log.Error("%s %s", TAG, err.Error())
		return url2
	}
	doc := goquery.NewDocumentFromNode(respbody)

	doc.Find(".resizeImage").Each(func(a int, sa *goquery.Selection) {
		if a == 0 { // only the first matching image counts
			if attr, exist := sa.Attr("src"); exist {
				url2 = attr
			}
		}
	})

	return url2
}
Пример #28
0
// Test_guessSourceURL checks that for a lazy-loaded Vox-style image (src is
// a placeholder data: URI) the real URL is recovered from the element's
// data attributes / noscript fallback rather than the placeholder src.
func Test_guessSourceURL(t *testing.T) {
	tests := []struct {
		in   string
		want string
	}{
		{
			in: `<html><head></head><body>
<p><figure class="e-image">
    <span style="'position:relative;'">
      <img alt="operationslog" class="vox-lazy-load m-chorus-asset__in-entry-body" data-chorus-asset-id="671860" data-full-size="https://cdn2.vox-cdn.com/thumbor/kA3aLNmpDy8TZ2IOMGh1ysvmi-4=/cdn0.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg" data-original="https://cdn3.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==">
      <noscript>
        <img alt="operationslog" src="https://cdn2.vox-cdn.com/thumbor/kA3aLNmpDy8TZ2IOMGh1ysvmi-4=/cdn0.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg">
</noscript>    </span>


</figure>
</p>
</body></html>`,
			want: "https://cdn2.vox-cdn.com/thumbor/kA3aLNmpDy8TZ2IOMGh1ysvmi-4=/cdn0.vox-cdn.com/uploads/chorus_asset/file/671860/Hello_lovely_sloggers_watermarked.0.jpg",
		},
	}

	for _, test := range tests {
		node, _ := html.Parse(strings.NewReader(test.in))
		doc := goquery.NewDocumentFromNode(node)

		// Select all <img> elements; guessSourceURL is called with a nil
		// base URL, so only absolute URLs can be returned.
		s := doc.Find("img")
		got := guessSourceURL(s, nil)
		if !reflect.DeepEqual(test.want, got) {
			_, file, line, _ := runtime.Caller(0)
			fmt.Printf("%s:%d:\n\ncall guessSourceURL(%#v)\n\texp: %#v\n\n\tgot: %#v\n\n",
				filepath.Base(file), line, s, test.want, got)
			t.FailNow()
		}
	}
	testutils.Cleanup()
}
Пример #29
0
// getData performs a libcurl GET of url and scrapes an image src from the
// response, returning it as link ("12345" is the AttrOr fallback when an
// <img> has no src attribute).
// NOTE(review): the write callback re-parses the entire accumulated buffer
// on every received chunk — correct but quadratic over the download size;
// presumably the pages are small. The ep parameter is unused here.
func getData(easy *curl.CURL, url string, ep string) (link string) {
	a := ""
	fmt.Println(url)

	// curl write callback: accumulate the body and re-scrape it each chunk.
	fooTest := func(buf []byte, userdata interface{}) bool {

		a = a + string(buf)
		//fmt.Println(a)
		nod, err := html.Parse(strings.NewReader(a))
		check(err)

		doc := goquery.NewDocumentFromNode(nod)

		// The last <img> found inside the first cell of each <td> wins.
		doc.Find("td").Each(func(i int, s *goquery.Selection) {
			s.Eq(0).Each(func(k int, bb *goquery.Selection) {
				bb.Find("img").Each(func(l int, cc *goquery.Selection) {
					link = cc.AttrOr("src", "12345")
				})
			})
		})

		return true
	}

	// Configure and run the GET; the callback fills in link as data arrives.
	easy.Setopt(curl.OPT_URL, url)
	easy.Setopt(curl.OPT_HTTPGET, true)
	easy.Setopt(curl.OPT_WRITEFUNCTION, fooTest)
	if err := easy.Perform(); err != nil {
		println("ERROR: ", err.Error())
	}
	//fmt.Println("success "+link+" ==\n")

	return link
}
Пример #30
0
// visitUrl reads the full response body, parses it into a goquery document
// (left nil on read/parse failure), restores res.Body so the visitor can
// still consume it, then delegates to the extender's Visit callback. When
// the visitor requests link processing, links are harvested from the
// document. Finally the extender is notified that the URL was visited.
func (this *worker) visitUrl(res *http.Response) []*url.URL {
	var doc *goquery.Document
	var harvested []*url.URL
	var doLinks bool

	// Load a goquery document and call the visitor function
	if bd, e := ioutil.ReadAll(res.Body); e != nil {
		this.extender.Error(newCrawlError(e, CekReadBody, res.Request.URL))
		this.logFunc(LogError, "ERROR reading body %s: %s", res.Request.URL.String(), e.Error())
	} else {
		if node, e := html.Parse(bytes.NewBuffer(bd)); e != nil {
			this.extender.Error(newCrawlError(e, CekParseBody, res.Request.URL))
			this.logFunc(LogError, "ERROR parsing %s: %s", res.Request.URL.String(), e.Error())
		} else {
			doc = goquery.NewDocumentFromNode(node)
			doc.Url = res.Request.URL
		}
		// Re-assign the body so it can be consumed by the visitor function
		res.Body = ioutil.NopCloser(bytes.NewBuffer(bd))
	}

	// Visit the document (with nil goquery doc if failed to load)
	if harvested, doLinks = this.extender.Visit(res, doc); doLinks {
		// Links were not processed by the visitor, so process links
		if doc != nil {
			harvested = this.processLinks(doc)
		} else {
			// doLinks was requested but there is no document to mine.
			this.extender.Error(newCrawlErrorMessage("No goquery document to process links.", CekProcessLinks, res.Request.URL))
			this.logFunc(LogError, "ERROR processing links %s", res.Request.URL.String())
		}
	}
	// Notify that this URL has been visited
	this.extender.Visited(res.Request.URL, harvested)

	return harvested
}