func (this Crawler) Crawl() *Article {
	article := new(Article)
	this.assignParseCandidate()
	this.assignHtml()

	if this.rawHtml == "" {
		return article
	}

	reader := strings.NewReader(this.rawHtml)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		panic(err.Error())
	}

	attr := ""
	selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("http-equiv")
		if exists && attr == "Content-Type" {
			return false
		}
		return true
	})

	if selection != nil {
		attr, _ = selection.Attr("content")
		attr = strings.Replace(attr, " ", "", -1)
		if strings.HasPrefix(attr, "text/html;charset=") {
			cs := strings.TrimPrefix(attr, "text/html;charset=")
			cs = strings.ToLower(cs)
			if cs != "utf-8" {
				r, err := charset.NewReader(cs, strings.NewReader(this.rawHtml))
				if err != nil {
					// On error, skip the read
					this.rawHtml = ""
				} else {
					utf8, _ := ioutil.ReadAll(r)
					this.rawHtml = string(utf8)
				}
				reader = strings.NewReader(this.rawHtml)
				document, err = goquery.NewDocumentFromReader(reader)
			}
		}
	}

	if err == nil {
		extractor := NewExtractor(this.config)
		html, _ := document.Html()

		start := TimeInNanoseconds()

		article.RawHtml = html
		article.FinalUrl = this.helper.url
		article.LinkHash = this.helper.linkHash
		article.Doc = document
		article.Title = extractor.getTitle(article)
		article.MetaLang = extractor.getMetaLanguage(article)
		article.MetaFavicon = extractor.getFavicon(article)
		article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)description]")
		article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)keywords]")
		article.CanonicalLink = extractor.getCanonicalLink(article)
		article.Domain = extractor.getDomain(article)
		article.Tags = extractor.getTags(article)

		cleaner := NewCleaner(this.config)
		article.Doc = cleaner.clean(article)

		article.TopImage = OpenGraphResolver(article)
		if article.TopImage == "" {
			article.TopImage = WebPageResolver(article)
		}

		article.TopNode = extractor.calculateBestNode(article)
		if article.TopNode != nil {
			article.TopNode = extractor.postCleanup(article.TopNode)

			outputFormatter := new(outputFormatter)
			article.CleanedText = outputFormatter.getFormattedText(article)

			videoExtractor := NewVideoExtractor()
			article.Movies = videoExtractor.GetVideos(article)
		}

		stop := TimeInNanoseconds()
		delta := stop - start
		article.Delta = delta
	} else {
		panic(err.Error())
	}

	return article
}
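Because this first version panics when goquery fails to parse the document, a caller that needs to survive malformed input has to guard the call with recover. A minimal sketch of such a wrapper (the wrapper is illustrative, not part of the package, and assumes fmt is imported alongside the existing imports):

// safeCrawl is an illustrative wrapper, not part of the package: it converts a
// panic raised inside the panicking Crawl into an ordinary error.
func safeCrawl(c Crawler) (article *Article, err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("crawl panicked: %v", r)
		}
	}()
	return c.Crawl(), nil
}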
// Crawl fetches the HTML body and returns an Article
func (c Crawler) Crawl() (*Article, error) {
	article := new(Article)
	c.assignParseCandidate()

	err := c.assignHTML()
	if err != nil {
		return nil, err
	}

	// This was supposed to have been set by assignHTML, so something went wrong
	if c.RawHTML == "" {
		return article, nil
	}

	reader := strings.NewReader(c.RawHTML)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}

	selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("http-equiv")
		if exists && attr == "Content-Type" {
			return false
		}
		return true
	})

	if selection != nil {
		attr, _ := selection.Attr("content")
		attr = strings.Replace(attr, " ", "", -1)
		if strings.HasPrefix(attr, "text/html;charset=") {
			cs := strings.TrimPrefix(attr, "text/html;charset=")
			cs = strings.ToLower(cs)
			if cs != "utf-8" {
				r, err1 := charset.NewReader(cs, strings.NewReader(c.RawHTML))
				if err1 != nil {
					// On error, skip the read
					c.RawHTML = ""
				} else {
					utf8, _ := ioutil.ReadAll(r)
					c.RawHTML = string(utf8)
				}
				reader = strings.NewReader(c.RawHTML)
				document, err = goquery.NewDocumentFromReader(reader)
			}
		}
	}
	if err != nil {
		return nil, err
	}

	extractor := NewExtractor(c.config)
	html, err := document.Html()
	if err != nil {
		return nil, err
	}

	startTime := time.Now().UnixNano()

	article.RawHTML = html
	article.FinalURL = c.helper.url
	article.LinkHash = c.helper.linkHash
	article.Doc = document
	article.Title = extractor.getTitle(article)
	article.MetaLang = extractor.getMetaLanguage(article)
	article.MetaFavicon = extractor.getFavicon(article)
	article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^description$]")
	article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^keywords$]")
	article.CanonicalLink = extractor.getCanonicalLink(article)
	article.Domain = extractor.getDomain(article)
	article.Tags = extractor.getTags(article)

	cleaner := NewCleaner(c.config)
	article.Doc = cleaner.clean(article)

	article.TopImage = OpenGraphResolver(article)
	if article.TopImage == "" {
		article.TopImage = WebPageResolver(article)
	}

	article.TopNode = extractor.calculateBestNode(article)
	if article.TopNode != nil {
		article.TopNode = extractor.postCleanup(article.TopNode)

		outputFormatter := new(outputFormatter)
		article.CleanedText, article.Links = outputFormatter.getFormattedText(article)

		videoExtractor := NewVideoExtractor()
		article.Movies = videoExtractor.GetVideos(article)
	}

	article.Delta = time.Now().UnixNano() - startTime

	return article, nil
}
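With the refactored signature, parse failures and failed charset conversions surface as an ordinary error instead of a panic. A minimal caller sketch, assuming a NewCrawler(config, url, rawHTML) constructor and a Configuration type like those used elsewhere in the package (both names are assumptions, not shown in this excerpt):

// Illustrative only: NewCrawler and Configuration are assumed from the surrounding package.
func extractArticle(config Configuration, url, rawHTML string) (*Article, error) {
	cc := NewCrawler(config, url, rawHTML)
	article, err := cc.Crawl()
	if err != nil {
		// Unparsable documents and bad charset conversions now arrive here
		// instead of crashing the process.
		return nil, err
	}
	return article, nil
}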