Example #1
0
// isBoostable reports whether node is followed closely by a paragraph that
// looks like real content.
//
// The first paragraph of a page is often just the caption under an image, so
// before boosting a parent node we make sure the candidate is connected to
// other paragraphs: one of the first three siblings following node must be a
// <p> element containing more than five stop words. As soon as a <p> is found
// further away than that, the search gives up and returns false.
func (this *contentExtractor) isBoostable(node *goquery.Selection) bool {
	const (
		maxStepsAway     = 3 // a paragraph this many siblings away (or more) is too far
		minStopWordCount = 5 // a paragraph needs more stop words than this to count as content
	)

	stepsAway := 0
	// goquery signals "no more siblings" with an empty selection, never with
	// nil, so the walk is bounded by Length() rather than a nil check.
	for next := node.Next(); next.Length() > 0; next = next.Next() {
		// Inspect the sibling currently being visited, not the starting node.
		if next.Get(0).DataAtom.String() == "p" {
			if stepsAway >= maxStepsAway {
				if this.config.debug {
					log.Println("Next paragraph is too far away, not boosting")
				}
				return false
			}

			ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, next.Text())
			if ws.stopWordCount > minStopWordCount {
				if this.config.debug {
					log.Println("We're gonna boost this node, seems content")
				}
				return true
			}
		}

		stepsAway++
	}

	return false
}
Example #2
0
// guessCaption returns the most plausible caption text for an image
// environment, followed by a credit if one is found.
//
// sel is the selection wrapping the image; imgTag is the selector used to find
// the image element inside sel. Caption candidates are gathered from
// attributes (alt, title, data-caption) and from elements matched by a list of
// known caption selectors. Candidates from caption elements are preferred over
// attribute candidates; within the chosen group the longest one wins. The
// result is sanitized, parsed as HTML, stripped of <p> wrappers, and returned
// as trimmed plain text. If the combined string cannot be parsed, the
// sanitized string itself is returned.
//
// Side effect: when the caption lives in the sibling following sel (matched by
// class name), that sibling is appended to sel.
func guessCaption(sel *goquery.Selection, imgTag string) string {
	possibleCapTags := []string{
		"alt",
		"title",
		// bbc.com
		"data-caption",
	}
	// NOTE(review): these are passed to Find as selectors and have no leading
	// '.', so they match elements named <credit>/<caption-credit>, not
	// classes. Left unchanged because enabling them may duplicate credit text
	// already contained in the caption — confirm the intent.
	possibleCreditClasses := []string{
		"credit",
		"caption-credit",
	}
	possibleCapClasses := []string{
		".caption",
		".caption.source",
		"p.caption",
		".media-caption",
		".caption-credit",
		".caption-left",
		".caption-right",
		".caption-center",
		".photoCaption",
		".pb-caption",
		"figurecaption",
		"figcaption",
		// evolution institute
		".imageCaption",
		// mcntyr.com
		"wp-caption-text",
		// quantamagazin.com
		".wp-caption-text",
		// aeon
		".article__image__caption",
		// technologie review
		"p",
	}

	// Collect caption candidates from attributes.
	var captionsFromTags []string
	for _, capTag := range possibleCapTags {
		// First look on the image environment itself, then on the image
		// element inside it.
		caption, exist := sel.Attr(capTag)
		if !exist {
			caption, exist = sel.Find(imgTag).Attr(capTag)
		}
		if !exist {
			continue
		}

		// Per the original author's note, the document-wide LaTeX escaping
		// only visits text nodes, so attribute values are escaped here by
		// hand.
		EscapeLatexMetaChars(&caption)
		if captionIsValid(caption, "") {
			captionsFromTags = append(captionsFromTags, caption)
		}
	}

	// Collect caption candidates from caption elements.
	var captionsFromClass []string
	for _, capClass := range possibleCapClasses {
		// Html errors are discarded; the lookup is best-effort.
		caption, _ := sel.Find(capClass).Html()

		// For aeon, the caption is not inside the figure environment but in
		// the next sibling; in that case pull the sibling into sel.
		if caption == "" {
			next := sel.Next()
			classAttr, _ := next.Attr("class")
			// A class attribute never contains the selector's leading '.',
			// so strip it before comparing; otherwise class selectors such
			// as ".article__image__caption" could never match here.
			className := strings.TrimPrefix(capClass, ".")
			if strings.Contains(classAttr, className) {
				caption, _ = next.Html()
				sel.AppendSelection(next)
			}
		}

		if captionIsValid(caption, capClass) {
			captionsFromClass = append(captionsFromClass, caption)
		}
	}

	// Collect all credits.
	var credits []string
	for _, creditClass := range possibleCreditClasses {
		credit, _ := sel.Find(creditClass).Html()
		if credit != "" {
			credits = append(credits, credit)
		}
	}

	// Prefer captions from a caption element over those from an attribute
	// such as 'alt'.
	captions := captionsFromTags
	if len(captionsFromClass) > 0 {
		captions = captionsFromClass
	}

	_, longestCap := longestElement(captions)
	_, longestCredit := longestElement(credits)
	total := strings.TrimSpace(longestCap) + " " + strings.TrimSpace(longestCredit)
	total = sanatizeCaptions(total)

	capDoc, err := goquery.NewDocumentFromReader(strings.NewReader(total))
	if err != nil {
		Error("Could not parse string %v", total)
		Err(err)
		return total
	}

	// Captions and credits sometimes contain spurious <p> elements and
	// newlines, which make pdflatex fail on the line breaks, so remove them.
	wrapElementsAndKeepLinebreak(capDoc, "p", " ", "")
	convertLinks(capDoc)

	return strings.TrimSpace(capDoc.Text())
}