Example #1
0
// Crawl parses the HTML for the current parse candidate and extracts an
// Article from it. The signature has no error return, so goquery parse
// failures are escalated via panic (preserving the original contract);
// an empty fetch yields an empty Article instead.
//
// The receiver is a value, so the c.rawHtml mutations below are local to
// this call.
func (c Crawler) Crawl() *Article {

	article := new(Article)
	c.assignParseCandidate()
	c.assignHtml()

	// Nothing was fetched: return the zero-valued article.
	if c.rawHtml == "" {
		return article
	}

	document, err := goquery.NewDocumentFromReader(strings.NewReader(c.rawHtml))
	if err != nil {
		panic(err.Error())
	}

	// Find <meta http-equiv="Content-Type"> so we can detect a declared
	// non-UTF-8 charset. Returning false stops EachWithBreak on the match.
	selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("http-equiv")
		return !(exists && attr == "Content-Type")
	})

	if selection != nil {
		attr, _ := selection.Attr("content")
		attr = strings.Replace(attr, " ", "", -1)

		if strings.HasPrefix(attr, "text/html;charset=") {
			cs := strings.ToLower(strings.TrimPrefix(attr, "text/html;charset="))

			if cs != "utf-8" {
				// Transcode the body to UTF-8 before re-parsing.
				r, convErr := charset.NewReader(cs, strings.NewReader(c.rawHtml))
				if convErr != nil {
					// On error, skip the read
					c.rawHtml = ""
				} else {
					decoded, _ := ioutil.ReadAll(r)
					c.rawHtml = string(decoded)
				}
				// BUG FIX: the original assigned this error to a variable
				// shadowed inside the charset branch, so a failed re-parse
				// was silently ignored and extraction ran on a stale/nil
				// document. Assigning to the outer err surfaces it here.
				document, err = goquery.NewDocumentFromReader(strings.NewReader(c.rawHtml))
				if err != nil {
					panic(err.Error())
				}
			}
		}
	}

	extractor := NewExtractor(c.config)
	html, _ := document.Html()
	start := TimeInNanoseconds()
	article.RawHtml = html
	article.FinalUrl = c.helper.url
	article.LinkHash = c.helper.linkHash
	article.Doc = document
	article.Title = extractor.getTitle(article)
	article.MetaLang = extractor.getMetaLanguage(article)
	article.MetaFavicon = extractor.getFavicon(article)

	article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)description]")
	article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)keywords]")
	article.CanonicalLink = extractor.getCanonicalLink(article)
	article.Domain = extractor.getDomain(article)
	article.Tags = extractor.getTags(article)

	cleaner := NewCleaner(c.config)
	article.Doc = cleaner.clean(article)

	// Prefer the OpenGraph image; fall back to scanning the page.
	article.TopImage = OpenGraphResolver(article)
	if article.TopImage == "" {
		article.TopImage = WebPageResolver(article)
	}
	article.TopNode = extractor.calculateBestNode(article)
	if article.TopNode != nil {
		article.TopNode = extractor.postCleanup(article.TopNode)

		formatter := new(outputFormatter)
		article.CleanedText = formatter.getFormattedText(article)

		videoExtractor := NewVideoExtractor()
		article.Movies = videoExtractor.GetVideos(article)
	}

	article.Delta = TimeInNanoseconds() - start

	return article
}
Example #2
0
// Crawl fetches the HTML body for the current parse candidate and returns
// the extracted Article, or an error if fetching or parsing fails.
// The receiver is a value, so RawHTML mutations stay local to this call.
func (c Crawler) Crawl() (*Article, error) {

	article := new(Article)
	c.assignParseCandidate()
	if err := c.assignHTML(); err != nil {
		return nil, err
	}

	// assignHTML was supposed to populate RawHTML; if it did not, return
	// the zero-valued article rather than failing.
	if c.RawHTML == "" {
		return article, nil
	}

	document, err := goquery.NewDocumentFromReader(strings.NewReader(c.RawHTML))
	if err != nil {
		return nil, err
	}

	// Locate <meta http-equiv="Content-Type"> to detect a declared charset;
	// returning false breaks the iteration on the matching node.
	selection := document.Find("meta").EachWithBreak(func(_ int, s *goquery.Selection) bool {
		v, ok := s.Attr("http-equiv")
		return !(ok && v == "Content-Type")
	})

	if selection != nil {
		content, _ := selection.Attr("content")
		content = strings.Replace(content, " ", "", -1)

		if strings.HasPrefix(content, "text/html;charset=") {
			cs := strings.ToLower(strings.TrimPrefix(content, "text/html;charset="))

			if cs != "utf-8" {
				// Transcode to UTF-8, then re-parse the document.
				if r, convErr := charset.NewReader(cs, strings.NewReader(c.RawHTML)); convErr != nil {
					// On error, skip the read
					c.RawHTML = ""
				} else {
					decoded, _ := ioutil.ReadAll(r)
					c.RawHTML = string(decoded)
				}
				document, err = goquery.NewDocumentFromReader(strings.NewReader(c.RawHTML))
			}
		}
	}

	if err != nil {
		return nil, err
	}

	extractor := NewExtractor(c.config)
	html, err := document.Html()
	if err != nil {
		return nil, err
	}

	begin := time.Now().UnixNano()
	article.RawHTML = html
	article.FinalURL = c.helper.url
	article.LinkHash = c.helper.linkHash
	article.Doc = document
	article.Title = extractor.getTitle(article)
	article.MetaLang = extractor.getMetaLanguage(article)
	article.MetaFavicon = extractor.getFavicon(article)

	article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^description$]")
	article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^keywords$]")
	article.CanonicalLink = extractor.getCanonicalLink(article)
	article.Domain = extractor.getDomain(article)
	article.Tags = extractor.getTags(article)

	article.Doc = NewCleaner(c.config).clean(article)

	// Prefer the OpenGraph image; otherwise scan the page for one.
	if article.TopImage = OpenGraphResolver(article); article.TopImage == "" {
		article.TopImage = WebPageResolver(article)
	}

	if article.TopNode = extractor.calculateBestNode(article); article.TopNode != nil {
		article.TopNode = extractor.postCleanup(article.TopNode)

		formatter := new(outputFormatter)
		article.CleanedText, article.Links = formatter.getFormattedText(article)

		article.Movies = NewVideoExtractor().GetVideos(article)
	}

	article.Delta = time.Now().UnixNano() - begin

	return article, nil
}