//a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to //boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs //so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it func (this *contentExtractor) isBoostable(node *goquery.Selection) bool { stepsAway := 0 next := node.Next() for next != nil && stepsAway < node.Siblings().Length() { currentNodeTag := node.Get(0).DataAtom.String() if currentNodeTag == "p" { if stepsAway >= 3 { if this.config.debug { log.Println("Next paragraph is too far away, not boosting") } return false } paraText := node.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, paraText) if ws.stopWordCount > 5 { if this.config.debug { log.Println("We're gonna boost this node, seems content") } return true } } stepsAway++ next = next.Next() } return false }
// guessCaption tries to derive a caption (optionally followed by a credit) for
// the image environment sel.
//
// Candidates are gathered from three places, in this order:
//  1. caption-like attributes on sel itself or, failing that, on the first
//     imgTag element found below sel;
//  2. the inner HTML of elements below sel that match a list of known caption
//     selectors; when nothing matches, the next sibling of sel is considered
//     and, if accepted, moved into sel;
//  3. the inner HTML of elements below sel matching the credit selectors.
//
// Captions found via selectors take precedence over attribute captions. The
// longest caption and the longest credit are joined with a single space,
// passed through sanatizeCaptions, parsed as HTML and returned as trimmed plain
// text. If that parsing fails, the sanitized string is returned as is.
//
// Note that sel may be modified: a following sibling that is accepted as
// caption is appended to sel.
func guessCaption(sel *goquery.Selection, imgTag string) string {
	attrNames := []string{
		"alt",
		"title",
		"data-caption", // bbc.com
	}

	// NOTE(review): these have no leading "." and are therefore used as
	// element selectors by Find — confirm whether class selectors were meant.
	creditSelectors := []string{
		"credit",
		"caption-credit",
	}

	captionSelectors := []string{
		".caption",
		".caption.source",
		"p.caption",
		".media-caption",
		".caption-credit",
		".caption-left",
		".caption-left",
		".caption-right",
		".caption-center",
		".photoCaption",
		".pb-caption",
		"figurecaption",
		"figcaption",
		".imageCaption",            // evolution institute
		"wp-caption-text",          // mcntyr.com
		".wp-caption-text",         // quantamagazin.com
		".article__image__caption", // aeon
		"p",                        // technologie review
	}

	// Step 1: captions stored in attributes. The original author notes that
	// the document-wide LaTeX escaping only looks at text nodes, so attribute
	// values have to be escaped here by hand.
	var attrCaptions []string
	for _, name := range attrNames {
		// Look at the image environment first, then inside the image tag.
		text, found := sel.Attr(name)
		if !found {
			text, found = sel.Find(imgTag).Attr(name)
		}
		if !found {
			continue
		}
		EscapeLatexMetaChars(&text)
		if captionIsValid(text, "") {
			attrCaptions = append(attrCaptions, text)
		}
	}

	// Step 2: captions stored in dedicated elements. Errors from Html() are
	// deliberately dropped; only the returned string is used.
	var classCaptions []string
	for _, selector := range captionSelectors {
		markup, _ := sel.Find(selector).Html()
		if markup == "" {
			// For aeon the caption is not inside the figure environment but in
			// its next sibling; such a sibling is pulled into sel.
			// NOTE(review): this is a substring test of the raw selector
			// (including any leading ".") against the class attribute —
			// verify that this matches what was intended.
			sibling := sel.Next()
			if class, _ := sibling.Attr("class"); strings.Contains(class, selector) {
				markup, _ = sibling.Html()
				sel.AppendSelection(sibling)
			}
		}
		if captionIsValid(markup, selector) {
			classCaptions = append(classCaptions, markup)
		}
	}

	// Step 3: credits.
	var credits []string
	for _, selector := range creditSelectors {
		if markup, _ := sel.Find(selector).Html(); markup != "" {
			credits = append(credits, markup)
		}
	}

	// A caption from a caption element beats one from an attribute like 'alt'.
	captions := attrCaptions
	if len(classCaptions) > 0 {
		captions = classCaptions
	}

	_, bestCaption := longestElement(captions)
	_, bestCredit := longestElement(credits)
	total := sanatizeCaptions(strings.TrimSpace(bestCaption) + " " + strings.TrimSpace(bestCredit))

	capDoc, err := goquery.NewDocumentFromReader(strings.NewReader(total))
	if err != nil {
		Error("Could not parse string %v", total)
		Err(err)
		return total
	}

	// Captions and credits sometimes contain spurious <p> elements and line
	// breaks that make pdflatex fail, so they are flattened before the text is
	// extracted.
	wrapElementsAndKeepLinebreak(capDoc, "p", " ", "")
	convertLinks(capDoc)

	return strings.TrimSpace(capDoc.Text())
}