Beispiel #1
0
// score rates an image tag. The source URL is taken from the first non-empty
// attribute among src, data-src and data-lazy-src; when none is set the
// function returns -1. Otherwise the result is the sum of the values in rules
// whose pattern matches the URL, minus one if the alt text contains
// "thumbnail", plus one if the id is "fbPhotoImage".
func score(tag *goquery.Selection) int {
	var src string
	for _, name := range []string{"src", "data-src", "data-lazy-src"} {
		if src, _ = tag.Attr(name); src != "" {
			break
		}
	}
	if src == "" {
		return -1
	}

	total := 0
	for pattern, weight := range rules {
		if pattern.MatchString(src) {
			total += weight
		}
	}

	if alt, ok := tag.Attr("alt"); ok && strings.Contains(alt, "thumbnail") {
		total--
	}
	if id, ok := tag.Attr("id"); ok && id == "fbPhotoImage" {
		total++
	}
	return total
}
Beispiel #2
0
// guessSourceURL gathers the values of every attribute on s that may hold an
// image location, passes them together with link through removeInvalidURLs,
// and returns the entry selected by longestElement. It returns "" when no
// candidate attribute is present or when no candidate survives filtering.
func guessSourceURL(s *goquery.Selection, link *url.URL) string {
	candidateAttrs := []string{
		"src",
		"data-src",
		"srcset",
		"data-full-size",
		"data-original",
		"href", // bbc
		"pagespeed_lazy_src",
	}

	var candidates []string
	for _, name := range candidateAttrs {
		value, found := s.Attr(name)
		if !found {
			continue
		}
		Debug("possible image src url under: %v", name)
		candidates = append(candidates, value)
	}
	if len(candidates) == 0 {
		return ""
	}

	valid := removeInvalidURLs(candidates, link)
	if len(valid) == 0 {
		return ""
	}

	longest, _ := longestElement(valid)
	return valid[longest]
}
Beispiel #3
0
// attrOrDefault reads the attribute name from sel and returns its value, or
// def when the attribute is not present on the selection. An attribute that
// is present but empty is returned as-is.
func (bow *Browser) attrOrDefault(name, def string, sel *goquery.Selection) string {
	value, found := sel.Attr(name)
	if !found {
		return def
	}
	return value
}
Beispiel #4
0
// processLink handles a single street link and sends exactly one *StreetInfo
// on done.
//
// An empty info with an empty name is sent when the link has no text or no
// href. An empty info carrying the street name is sent when the link has
// class "new", when the page cannot be fetched, or when parsing it fails.
// Otherwise the page at parser.baseURL+href is fetched and the info parsed
// from it is sent.
func (parser *WikipediaMoscow) processLink(_ int, s *goquery.Selection, done chan<- *StreetInfo) {
	name := strings.TrimSpace(s.Text())
	if len(name) == 0 {
		done <- parser.getEmptyInfo("")
		return
	}

	href, exists := s.Attr("href")
	if !exists {
		done <- parser.getEmptyInfo("")
		return
	}

	// Links marked with class "new" are answered without any network request.
	if class, ok := s.Attr("class"); ok && class == "new" {
		done <- parser.getEmptyInfo(name)
		return
	}

	resp, err := http.Get(parser.baseURL + href)
	if err != nil {
		done <- parser.getEmptyInfo(name)
		return
	}

	var info *StreetInfo
	info, err = NewWikipediaStreetParser().ParseStreetInfo(name, resp.Body)
	// Close the body as soon as parsing is finished so the connection is
	// released before the (possibly blocking) send on done.
	resp.Body.Close()
	if err != nil {
		info = parser.getEmptyInfo(name)
	}
	done <- info
}
Beispiel #5
0
// name returns the value of the attribute called selector on selection, or
// an empty string when the attribute is absent.
func (this *parser) name(selector string, selection *goquery.Selection) string {
	attr, found := selection.Attr(selector)
	if !found {
		return ""
	}
	return attr
}
Beispiel #6
0
// toPage turns an anchor element into a documentation page.
//
// The anchor's href is joined onto site.base and loaded through toDocument.
// Every ".md" in the href is then replaced by ".html", the anchor itself is
// updated to the new href, and the HTML returned by the document is stored on
// the page. An anchor without an href, or whose href is "#", yields
// (nil, nil).
func toPage(site *Site, el *goquery.Selection) (*Page, error) {
	link, found := el.Attr("href")
	if !found || link == "#" {
		// Nothing to follow, so this anchor does not become a page.
		return nil, nil
	}

	// Load the linked file before the href is rewritten below.
	doc, err := toDocument(filepath.Join(site.base, link))
	if err != nil {
		return nil, err
	}

	// Point the anchor at the html version of the file.
	htmlLink := strings.Replace(link, ".md", ".html", -1)
	el.SetAttr("href", htmlLink)

	page := &Page{}
	page.Href = htmlLink
	page.html, err = doc.Html()
	return page, err
}
Beispiel #7
0
// getSrc returns the src attribute of node, or an empty string when the
// attribute is not present.
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string {
	if src, found := node.Attr("src"); found {
		return src
	}
	return ""
}
Beispiel #8
0
// convertTagToJqueryFormat reduces tag to a selector of the form "name" or
// "name.class".
//
// When tag contains no space it is returned unchanged and s is not consulted.
// Otherwise everything from the first space onward is dropped and, if s has a
// class attribute containing at least one class name, the first class name is
// appended after a dot.
func convertTagToJqueryFormat(tag string, s *goquery.Selection) string {
	pos := strings.Index(tag, " ")
	if pos < 0 {
		return tag
	}
	selector := tag[:pos]

	class, found := s.Attr("class")
	if !found {
		return selector
	}

	// Class names may be separated by any whitespace and the attribute may
	// begin with whitespace. Fields copes with both, so a blank or
	// space-prefixed attribute never produces a dangling "." and only the
	// first real class name is used.
	if names := strings.Fields(class); len(names) > 0 {
		selector += "." + names[0]
	}
	return selector
}
Beispiel #9
0
// classWeight scores s by its class and id attributes. Each non-empty
// attribute loses 25 when it matches negativeRegexp and gains 25 when it
// matches positiveRegexp; both may apply to the same value. The result is 0
// when d.WeightClasses is disabled.
func (d *Document) classWeight(s *goquery.Selection) int {
	if !d.WeightClasses {
		return 0
	}

	class, _ := s.Attr("class")
	id, _ := s.Attr("id")

	weight := 0
	for _, value := range []string{class, id} {
		if value == "" {
			continue
		}
		if negativeRegexp.MatchString(value) {
			weight -= 25
		}
		if positiveRegexp.MatchString(value) {
			weight += 25
		}
	}
	return weight
}
// Parse fills tweet from a div.tweet selection.
//
// The data-item-id, data-screen-name and data-name attributes are required:
// if any of them is missing, Success is set to 0 and an error naming the
// attribute is returned without any other field being written. Otherwise
// Success is set to 1; the timestamp and the text are optional and may end up
// empty.
func (tweet *Tweet) Parse(s *goquery.Selection) (err error) {
	required := [3]string{"data-item-id", "data-screen-name", "data-name"}
	var values [3]string

	for i, name := range required {
		value, found := s.Attr(name)
		if !found {
			tweet.Success = 0
			return fmt.Errorf("not having %s attribute", name)
		}
		values[i] = value
	}

	tweet.ItemID, tweet.ScreenName, tweet.Name = values[0], values[1], values[2]
	tweet.Success = 1

	// The required attributes were found, so the remaining values may be blank.
	tweet.Time, _ = s.Find("._timestamp").Attr("data-time")
	tweet.Text = s.Find(".tweet-text").Text()
	return nil
}
Beispiel #11
0
// ScrapeNotes returns the title attribute of every abbr element under s,
// skipping those that match abbr:first-of-type or abbr.c. An element without
// a title contributes an empty string. The result is never nil.
func ScrapeNotes(s *goquery.Selection) []string {
	abbrs := s.Find("abbr").Not("abbr:first-of-type").Not("abbr.c")

	notes := []string{}
	abbrs.Each(func(_ int, abbr *goquery.Selection) {
		title, _ := abbr.Attr("title")
		notes = append(notes, title)
	})
	return notes
}
Beispiel #12
0
// ScrapeOrigins returns the title attribute of every abbr.c element under s.
// An element without a title contributes an empty string. The result is never
// nil.
func ScrapeOrigins(s *goquery.Selection) []string {
	matches := s.Find("abbr.c")

	origins := []string{}
	matches.Each(func(_ int, abbr *goquery.Selection) {
		title, _ := abbr.Attr("title")
		origins = append(origins, title)
	})
	return origins
}
Beispiel #13
0
// ScrapeNotes returns the title attribute of each span.d[title] element that
// remains under a clone of s after two removals: the first span[title] match,
// and every match of "span.d i span.d[title]". The removals are done on the
// clone, not on s. The result is never nil.
func ScrapeNotes(s *goquery.Selection) []string {
	clone := s.Clone()

	// Drop the first titled span, then the titled spans nested inside an <i>.
	clone.Find("span[title]").First().Remove()
	clone.Find("span.d i span.d[title]").Remove()

	notes := []string{}
	clone.Find("span.d[title]").Each(func(_ int, span *goquery.Selection) {
		title, _ := span.Attr("title")
		notes = append(notes, title)
	})
	return notes
}
// attrToUrl parses the value of attribute attr on s as a URL. It returns an
// error when the attribute is not present, and otherwise whatever url.Parse
// returns for the value.
func attrToUrl(s *goquery.Selection, attr string) (*url.URL, error) {
	raw, found := s.Attr(attr)
	if !found {
		return nil, errors.New("Attr " + attr + " not found")
	}
	return url.Parse(raw)
}
Beispiel #15
0
// getHeight returns the height attribute of node converted with
// strconv.Atoi, ignoring any conversion error. It returns 0 when the
// attribute is not present.
func (ve *VideoExtractor) getHeight(node *goquery.Selection) int {
	raw, found := node.Attr("height")
	if !found {
		return 0
	}
	height, _ := strconv.Atoi(raw)
	return height
}
Beispiel #16
0
// getNumbericAttribute returns attribute attr of s converted with
// strconv.Atoi, ignoring any conversion error. It returns 0 when the
// attribute is not present.
func getNumbericAttribute(s *goquery.Selection, attr string) int {
	raw, found := s.Attr(attr)
	if !found {
		return 0
	}
	number, _ := strconv.Atoi(raw)
	return number
}
Beispiel #17
0
// getHeight returns the raw height attribute of sel, or an empty string when
// the attribute is not present.
func getHeight(sel *goquery.Selection) string {
	if value, found := sel.Attr("height"); found {
		return value
	}
	return ""
}
Beispiel #18
0
// getDataFromDOM extracts one value from s and passes it through
// encode_string together with code. When arr[0] is "text" the value is the
// text of s; otherwise it is the attribute named arr[0] (empty when absent).
// arr[0] is read unconditionally, so arr must not be empty.
func getDataFromDOM(s *gq.Selection, arr []string, code string) string {
	if arr[0] == "text" {
		return encode_string(s.Text(), code)
	}
	value, _ := s.Attr(arr[0])
	return encode_string(value, code)
}
Beispiel #19
0
// itemURL looks at the anchors in the row following the tr that encloses s
// and returns hnURL plus the href of the last anchor whose href starts with
// "item?id=". It returns "" when no anchor matches.
func itemURL(s *goquery.Selection) (url string) {
	anchors := s.Closest("tr").Next().Find("a")
	anchors.Each(func(_ int, a *goquery.Selection) {
		if href, _ := a.Attr("href"); strings.HasPrefix(href, "item?id=") {
			url = hnURL + href
		}
	})
	return url
}
Beispiel #20
0
// guessCodeLang returns the result of extractCodeLang for the class attribute
// of sel, falling back to the lang attribute when class is missing or empty.
func guessCodeLang(sel *goquery.Selection) string {
	hint, _ := sel.Attr("class")
	if hint != "" {
		return extractCodeLang(hint)
	}

	hint, _ = sel.Attr("lang")
	return extractCodeLang(hint)
}
Beispiel #21
0
// getUri returns the href attribute of sel after a round trip through
// url.Parse and String. The href is not resolved against any base, so callers
// are expected to pass anchors whose href is already an absolute URL. It
// returns "" when sel is nil or has no href attribute.
//
// NOTE(review): the parse error is handed to checkErr and the parsed URL is
// used right afterwards; this presumably relies on checkErr not returning for
// a non-nil error. Confirm against checkErr.
func getUri(sel *goquery.Selection) string {
	if sel == nil {
		return ""
	}
	href, found := sel.Attr("href")
	if !found {
		return ""
	}
	parsed, err := url.Parse(href)
	checkErr(err)
	return parsed.String()
}
Beispiel #22
0
// displayDetails prints a link line for single when all of the following
// hold: its trimmed text is longer than five runes, wordExists reports
// "keywords" for the trimmed text or for the href, and wordExists reports
// "products" for the trimmed text. The printed text is the untrimmed text of
// single.
func displayDetails(single *goquery.Selection) {
	label := strings.TrimSpace(single.Text())
	href, _ := single.Attr("href")

	if utf8.RuneCountInString(label) <= 5 {
		return
	}
	if !wordExists(label, "keywords") && !wordExists(href, "keywords") {
		return
	}
	if wordExists(label, "products") {
		fmt.Println("Link", single.Text(), "--->", href)
	}
}
Beispiel #23
0
// getNodeGravityScore returns the integer stored in the gravityScore
// attribute of node. It returns 0 when the attribute is absent or cannot be
// converted by strconv.Atoi.
func (this *contentExtractor) getNodeGravityScore(node *goquery.Selection) int {
	if raw, found := node.Attr("gravityScore"); found {
		if score, err := strconv.Atoi(raw); err == nil {
			return score
		}
	}
	return 0
}
Beispiel #24
0
// addIngredient appends an ingredient built from anchor a to ingredients and
// returns the resulting slice.
//
// The ingredient id is the third "/"-separated segment of the anchor's href
// (index 2 after splitting) and its name is the anchor's text. The slice is
// returned unchanged when the anchor has no href; when the href has fewer
// than three segments or the segment is not an integer, an error is logged
// and the slice is likewise returned unchanged.
func addIngredient(ingredients []data.Ingredient, a *goquery.Selection) []data.Ingredient {
	href, ok := a.Attr("href")
	if !ok {
		return ingredients
	}
	glog.V(2).Info("    href: " + href)

	// Guard the index: an href with fewer than three segments used to panic.
	parts := strings.Split(href, "/")
	if len(parts) < 3 {
		glog.Errorf("Failed to extract id from %s: expected at least 3 path segments, got %d", href, len(parts))
		return ingredients
	}

	id, err := strconv.Atoi(parts[2])
	if err != nil {
		glog.Errorf("Failed to extract id from %s: %v", href, err)
		return ingredients
	}
	return append(ingredients, data.Ingredient{Name: a.Text(), Id: id})
}
Beispiel #25
0
// parseResource reads three integers from s: the production from its title
// attribute, and the stored amount and capacity from its text, which is split
// on "/" into "stored/capacity".
//
// Parsing is best-effort: conversion errors are ignored and the value
// returned by strconv.Atoi is used as-is (0 for non-numeric input). When the
// text contains no "/", the capacity is 0 instead of the function panicking
// on a missing second part.
func parseResource(s *goquery.Selection) (_production, _stored, _capacity int) {
	productionStr, _ := s.Attr("title")
	production, _ := strconv.Atoi(productionStr)

	// Split always yields at least one element, so parts[0] is safe; the
	// second element only exists when the text contains a "/".
	parts := strings.Split(s.Text(), "/")
	stored, _ := strconv.Atoi(parts[0])

	capacity := 0
	if len(parts) > 1 {
		capacity, _ = strconv.Atoi(parts[1])
	}

	return production, stored, capacity
}
Beispiel #26
0
// ParseRecipe builds a Recipe from the microdata found under sel.
//
// sel must carry an itemscope attribute and an itemtype attribute equal to
// RecipeSchemaURL; otherwise ErrMissingItemScope, ErrMissingItemType or
// ErrWrongItemType is returned. The name, author, publication date,
// nutrition information, image and description are then read from the
// matching itemprop elements. The date comes from the content attribute of
// the datePublished element, or from its text when that attribute is absent,
// and must be in 2006-01-02 form when non-empty. A date that fails to parse,
// or an error from ParseNutritionInformation, aborts with (nil, err).
func ParseRecipe(sel *goquery.Selection) (*Recipe, error) {
	_, hasScope := sel.Attr("itemscope")
	if !hasScope {
		return nil, ErrMissingItemScope
	}

	itemtype, hasType := sel.Attr("itemtype")
	switch {
	case !hasType:
		return nil, ErrMissingItemType
	case itemtype != RecipeSchemaURL:
		return nil, ErrWrongItemType
	}

	recipe := &Recipe{
		CreativeWork: CreativeWork{
			Thing: Thing{},
		},
	}

	recipe.Name = strings.TrimSpace(sel.Find("[itemprop='name']").Text())
	recipe.Author = strings.TrimSpace(sel.Find("[itemprop='author']").First().Text())

	// Prefer the content attribute for the date; fall back to the text.
	published := sel.Find("[itemprop='datePublished']")
	dateText, hasContent := published.Attr("content")
	if !hasContent {
		dateText = published.Text()
	}
	if dateText != "" {
		parsed, err := time.Parse("2006-01-02", dateText)
		if err != nil {
			return nil, err
		}
		recipe.DatePublished = parsed
	}

	nutrition := sel.Find(fmt.Sprintf("[itemscope=''][itemtype='%s']", NutritionInformationSchemaURL))
	if nutrition.Size() > 0 {
		info, err := ParseNutritionInformation(nutrition)
		if err != nil {
			return nil, err
		}
		recipe.Nutrition = info
	}

	recipe.Image, _ = sel.Find("[itemprop='image']").Attr("src")
	recipe.Description = strings.TrimSpace(sel.Find("[itemprop='description']").Text())

	return recipe, nil
}
Beispiel #27
0
// updateNodeCount adds addToCount to the integer kept in the gravityNodes
// attribute of node and writes the sum back through the configured parser.
// A missing, empty or non-integer attribute counts as 0.
func (this *contentExtractor) updateNodeCount(node *goquery.Selection, addToCount int) {
	current := 0
	if raw, _ := node.Attr("gravityNodes"); raw != "" {
		if parsed, err := strconv.Atoi(raw); err == nil {
			current = parsed
		}
	}
	this.config.parser.setAttr(node, "gravityNodes", strconv.Itoa(current+addToCount))
}
Beispiel #28
0
// updateScore adds addToScore to the integer kept in the gravityScore
// attribute of node and writes the sum back through the configured parser.
// A missing, empty or non-integer attribute counts as 0.
func (extr *ContentExtractor) updateScore(node *goquery.Selection, addToScore int) {
	current := 0
	if raw, _ := node.Attr("gravityScore"); raw != "" {
		if parsed, err := strconv.Atoi(raw); err == nil {
			current = parsed
		}
	}
	extr.config.parser.setAttr(node, "gravityScore", strconv.Itoa(current+addToScore))
}
Beispiel #29
0
// filterLITag reports whether s has neither an id nor a class attribute.
// The index argument is ignored.
func filterLITag(_ int, s *goquery.Selection) bool {
	if _, hasID := s.Attr("id"); hasID {
		return false
	}
	_, hasClass := s.Attr("class")
	return !hasClass
}
Beispiel #30
0
// attrToResolvedUrl reads the attribute name from sel, parses it as a URL and
// returns the result of passing it through bow.ResolveUrl. It returns an
// attribute-not-found error when the attribute is absent, and the url.Parse
// error when the value cannot be parsed.
func (bow *Browser) attrToResolvedUrl(name string, sel *goquery.Selection) (*url.URL, error) {
	raw, found := sel.Attr(name)
	if !found {
		return nil, errors.NewAttributeNotFound(
			"Attribute '%s' not found.", name)
	}

	parsed, err := url.Parse(raw)
	if err != nil {
		return nil, err
	}
	return bow.ResolveUrl(parsed), nil
}