func score(tag *goquery.Selection) int { src, _ := tag.Attr("src") if src == "" { src, _ = tag.Attr("data-src") } if src == "" { src, _ = tag.Attr("data-lazy-src") } if src == "" { return -1 } tagScore := 0 for rule, score := range rules { if rule.MatchString(src) { tagScore += score } } alt, exists := tag.Attr("alt") if exists { if strings.Contains(alt, "thumbnail") { tagScore-- } } id, exists := tag.Attr("id") if exists { if id == "fbPhotoImage" { tagScore++ } } return tagScore }
func guessSourceURL(s *goquery.Selection, link *url.URL) string { possibleSrcAttr := []string{"src", "data-src", "srcset", "data-full-size", "data-original", "href", // bbc "pagespeed_lazy_src", } var possibleSrcURLs []string for _, attr := range possibleSrcAttr { link, exist := s.Attr(attr) if exist { Debug("possible image src url under: %v", attr) possibleSrcURLs = append(possibleSrcURLs, link) } } if len(possibleSrcURLs) == 0 { return "" } validSrcURLs := removeInvalidURLs(possibleSrcURLs, link) if len(validSrcURLs) == 0 { return "" } indexLongestElement, _ := longestElement(validSrcURLs) return validSrcURLs[indexLongestElement] }
// attributeOrDefault reads an attribute and returns it or the default value when it's empty. func (bow *Browser) attrOrDefault(name, def string, sel *goquery.Selection) string { a, ok := sel.Attr(name) if ok { return a } return def }
// Add street to result and find street information (to WikipediaMoscow.result) func (parser *WikipediaMoscow) processLink(_ int, s *goquery.Selection, done chan<- *StreetInfo) { name := strings.TrimSpace(s.Text()) if len(name) == 0 { done <- parser.getEmptyInfo("") return } href, exists := s.Attr("href") if !exists { done <- parser.getEmptyInfo("") return } var ( class string info *StreetInfo ) class, exists = s.Attr("class") if exists && class == "new" { info = parser.getEmptyInfo(name) } else if resp, err := http.Get(parser.baseURL + href); err != nil { info = parser.getEmptyInfo(name) } else { streetparser := NewWikipediaStreetParser() info, err = streetparser.ParseStreetInfo(name, resp.Body) if err != nil { info = parser.getEmptyInfo(name) } } done <- info }
func (this *parser) name(selector string, selection *goquery.Selection) string { value, exists := selection.Attr(selector) if exists { return value } return "" }
// toPage is a helper function that accepts an anchor // tag referencing a markdown file, parsing the markdown // file and returning a page to be included in our docs. func toPage(site *Site, el *goquery.Selection) (*Page, error) { // follow the link to see if this is a page // that should be added to our documentation. href, ok := el.Attr("href") if !ok || href == "#" { return nil, nil } // read the markdown file, convert to html and // read into a dom element. doc, err := toDocument(filepath.Join(site.base, href)) if err != nil { return nil, err } // convert the extension from markdown to // html, in preparation for type conversion. href = strings.Replace(href, ".md", ".html", -1) el.SetAttr("href", href) page := &Page{} page.Href = href page.html, err = doc.Html() return page, err }
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string { value, exists := node.Attr("src") if exists { return value } return "" }
func convertTagToJqueryFormat(tag string, s *goquery.Selection) string { tagitself := tag pos := strings.Index(tag, " ") if pos > -1 { tagitself = tag[0:pos] } else { return tag } class, found := s.Attr("class") if found && class != "" { pos := strings.Index(class, " ") // leave only a first class from a list if pos > -1 { class = class[0:pos] } tagitself = tagitself + "." + class } return tagitself }
func (d *Document) classWeight(s *goquery.Selection) int { weight := 0 if !d.WeightClasses { return weight } class, _ := s.Attr("class") id, _ := s.Attr("id") if class != "" { if negativeRegexp.MatchString(class) { weight -= 25 } if positiveRegexp.MatchString(class) { weight += 25 } } if id != "" { if negativeRegexp.MatchString(id) { weight -= 25 } if positiveRegexp.MatchString(id) { weight += 25 } } return weight }
// Parse from div.tweet func (tweet *Tweet) Parse(s *goquery.Selection) (err error) { success := false attrs := []string{ "data-item-id", "data-screen-name", "data-name", } data := map[string]string{} for _, attr := range attrs { var value string if value, success = s.Attr(attr); !success { tweet.Success = 0 err = fmt.Errorf("not having %s attribute", attr) return } data[attr] = value } tweet.ItemID = data["data-item-id"] tweet.ScreenName = data["data-screen-name"] tweet.Name = data["data-name"] tweet.Success = 1 // if could get the above attribues, allow the following values to be blank. tweet.Time, _ = s.Find("._timestamp").Attr("data-time") tweet.Text = s.Find(".tweet-text").Text() return }
func ScrapeNotes(s *goquery.Selection) []string { notes := []string{} s.Find("abbr").Not("abbr:first-of-type").Not("abbr.c").Each(func(i int, s *goquery.Selection) { note, _ := s.Attr("title") notes = append(notes, note) }) return notes }
func ScrapeOrigins(s *goquery.Selection) []string { origins := []string{} s.Find("abbr.c").Each(func(i int, s *goquery.Selection) { origin, _ := s.Attr("title") origins = append(origins, origin) }) return origins }
func ScrapeNotes(s *goquery.Selection) []string { notes := []string{} s.Clone().Find("span[title]").First().Remove().End().End().Find("span.d i span.d[title]").Remove().End().Find("span.d[title]").Each(func(i int, s *goquery.Selection) { note, _ := s.Attr("title") notes = append(notes, note) }) return notes }
func attrToUrl(s *goquery.Selection, attr string) (*url.URL, error) { link, exists := s.Attr(attr) if exists { return url.Parse(link) } return nil, errors.New("Attr " + attr + " not found") }
func (ve *VideoExtractor) getHeight(node *goquery.Selection) int { value, exists := node.Attr("height") if exists { nvalue, _ := strconv.Atoi(value) return nvalue } return 0 }
func getNumbericAttribute(s *goquery.Selection, attr string) int { a, f := s.Attr(attr) if f { ai, _ := strconv.Atoi(a) return ai } return 0 }
func getHeight(sel *goquery.Selection) string { heightAttr := "height" height, exist := sel.Attr(heightAttr) if exist { return height } return "" }
func getDataFromDOM(s *gq.Selection, arr []string, code string) string { var dt string if arr[0] == "text" { dt = s.Text() } else { dt, _ = s.Attr(arr[0]) } return encode_string(dt, code) }
func itemURL(s *goquery.Selection) (url string) { s.Closest("tr").Next().Find("a").Each(func(_ int, s *goquery.Selection) { href, _ := s.Attr("href") if strings.HasPrefix(href, "item?id=") { url = hnURL + href } }) return }
// guessCodeLang returns the code language if supported, otherwise and empty // string. func guessCodeLang(sel *goquery.Selection) string { codeLangAttr, _ := sel.Attr("class") if codeLangAttr == "" { codeLangAttr, _ = sel.Attr("lang") } return extractCodeLang(codeLangAttr) }
/** * This function parses and returns the uri associated with the HTML anchor * <a href="http://www..."...> tag * This function assumes that 'href' attribute contains absolute url. * It returns "" empty string if it can't find href attribute from the * goquery.Selection parameter. */ func getUri(sel *goquery.Selection) string { if sel != nil { str, exists := sel.Attr("href") if exists { u, err := url.Parse(str) checkErr(err) return u.String() } } return "" }
func displayDetails(single *goquery.Selection) { text := strings.TrimSpace(single.Text()) href, _ := single.Attr("href") length := utf8.RuneCountInString(text) if ((length > 5) && wordExists(text, "keywords")) || ((length > 5) && wordExists(href, "keywords")) { if wordExists(text, "products") { fmt.Println("Link", single.Text(), "--->", href) } } }
func (this *contentExtractor) getNodeGravityScore(node *goquery.Selection) int { grvScoreString, exists := node.Attr("gravityScore") if !exists { return 0 } grvScore, err := strconv.Atoi(grvScoreString) if err != nil { return 0 } return grvScore }
func addIngredient(ingredients []data.Ingredient, a *goquery.Selection) []data.Ingredient { if href, ok := a.Attr("href"); ok { glog.V(2).Info(" href: " + href) id, err := strconv.Atoi(strings.Split(href, "/")[2]) if err != nil { glog.Errorf("Failed to extract id from %s: %v", href, err) } else { ingredients = append(ingredients, data.Ingredient{Name: a.Text(), Id: id}) } } return ingredients }
func parseResource(s *goquery.Selection) (_production, _stored, _capacity int) { productionStr, _ := s.Attr("title") production, _ := strconv.Atoi(productionStr) status := s.Text() split := strings.Split(status, "/") stored, _ := strconv.Atoi(split[0]) capacity, _ := strconv.Atoi(split[1]) return production, stored, capacity }
func ParseRecipe(sel *goquery.Selection) (*Recipe, error) { if _, exists := sel.Attr("itemscope"); !exists { return nil, ErrMissingItemScope } itemtype, exists := sel.Attr("itemtype") if !exists { return nil, ErrMissingItemType } if itemtype != RecipeSchemaURL { return nil, ErrWrongItemType } recipe := &Recipe{ CreativeWork: CreativeWork{ Thing: Thing{}, }, } nameSel := sel.Find("[itemprop='name']") recipe.Name = strings.TrimSpace(nameSel.Text()) authorSel := sel.Find("[itemprop='author']").First() recipe.Author = strings.TrimSpace(authorSel.Text()) datePublishedSel := sel.Find("[itemprop='datePublished']") datePublishedText, exists := datePublishedSel.Attr("content") if !exists { datePublishedText = datePublishedSel.Text() } var err error if len(datePublishedText) != 0 { recipe.DatePublished, err = time.Parse("2006-01-02", datePublishedText) if err != nil { return nil, err } } nutritionInformationSel := sel.Find(fmt.Sprintf("[itemscope=''][itemtype='%s']", NutritionInformationSchemaURL)) if nutritionInformationSel.Size() > 0 { recipe.Nutrition, err = ParseNutritionInformation(nutritionInformationSel) if err != nil { return nil, err } } imageSel := sel.Find("[itemprop='image']") recipe.Image, _ = imageSel.Attr("src") descriptionSel := sel.Find("[itemprop='description']") recipe.Description = strings.TrimSpace(descriptionSel.Text()) return recipe, nil }
//stores how many decent nodes are under a parent node func (this *contentExtractor) updateNodeCount(node *goquery.Selection, addToCount int) { currentScore := 0 var err error scoreString, _ := node.Attr("gravityNodes") if scoreString != "" { currentScore, err = strconv.Atoi(scoreString) if err != nil { currentScore = 0 } } newScore := currentScore + addToCount this.config.parser.setAttr(node, "gravityNodes", strconv.Itoa(newScore)) }
//adds a score to the gravityScore Attribute we put on divs //we'll get the current score then add the score we're passing in to the current func (extr *ContentExtractor) updateScore(node *goquery.Selection, addToScore int) { currentScore := 0 var err error scoreString, _ := node.Attr("gravityScore") if scoreString != "" { currentScore, err = strconv.Atoi(scoreString) if err != nil { currentScore = 0 } } newScore := currentScore + addToScore extr.config.parser.setAttr(node, "gravityScore", strconv.Itoa(newScore)) }
// filterLITag finds street names func filterLITag(_ int, s *goquery.Selection) bool { _, exists := s.Attr("id") if exists { return false } _, exists = s.Attr("class") if exists { return false } return true }
// attributeToUrl reads an attribute from an element and returns a url. func (bow *Browser) attrToResolvedUrl(name string, sel *goquery.Selection) (*url.URL, error) { src, ok := sel.Attr(name) if !ok { return nil, errors.NewAttributeNotFound( "Attribute '%s' not found.", name) } ur, err := url.Parse(src) if err != nil { return nil, err } return bow.ResolveUrl(ur), nil }