コード例 #1
0
ファイル: fragments.go プロジェクト: michigan-com/newsfetch
// parseTimingFragment builds a RecipeTimingFragment (tagged with m.TimingTag)
// from text.
//
// text is broken into components by fractionAwareSplitBySlashes; each
// component is whitespace-trimmed and then tried, in order, against the
// serving-size, total-time and preparation-time patterns. The first pattern
// that matches decides which field is filled in. A component matching none
// of them is reported as a warning on msg and otherwise ignored.
func parseTimingFragment(text string, msg *m.Messages) m.RecipeTimingFragment {
	timing := m.RecipeTimingFragment{TagF: m.TimingTag}

	for _, raw := range fractionAwareSplitBySlashes(text) {
		part := strings.TrimSpace(raw)

		if serving, found := extractComponent(part, servesRe); found {
			timing.ServingSize = serving
			continue
		}
		if total, found := extractComponent(part, totalTimeRe); found {
			timing.TotalTime = parseDuration(total)
			continue
		}
		if prep, found := extractComponent(part, prepTimeRe); found {
			timing.PreparationTime = parseDuration(prep)
			continue
		}

		// Nothing recognised this component; surface it rather than drop it silently.
		msg.AddWarningf("Unknown duration component: %#v", part)
	}

	return timing
}
コード例 #2
0
ファイル: parse.go プロジェクト: michigan-com/newsfetch
// ExtractBodyFromDocument assembles the plain-text body of an article from
// doc and returns it together with any recipes extracted from the same
// document.
//
// Paragraph selection:
//   - fromJSON true: every <p> element in doc is a candidate.
//   - fromJSON false: the direct <p> children of div[itemprop=articleBody],
//     or of its .longform-body child when doc contains a .longform-body
//     element.
//
// Endnote contact/contrib spans are removed from the selected paragraphs
// before their text is read. Paragraphs are then walked in order: the first
// one containing a tagline-name or infobox-head span ends the body (it and
// everything after it are dropped); paragraphs that classify.IsWorthyParagraph
// rejects, and paragraphs containing a subhead span, are skipped.
//
// When includeTitle is true the title returned by ExtractTitleFromDocument is
// placed on the first line. Lines are joined with "\n". Messages produced by
// recipe extraction are attached to the returned messages under "recipes".
func ExtractBodyFromDocument(doc *gq.Document, fromJSON bool, includeTitle bool) *m.ExtractedBody {
	msg := new(m.Messages)

	var paragraphs *gq.Selection
	switch {
	case fromJSON:
		paragraphs = doc.Find("p")
	case doc.Find(".longform-body").Length() == 0:
		paragraphs = doc.Find("div[itemprop=articleBody] > p")
	default:
		paragraphs = doc.Find("div[itemprop=articleBody] > .longform-body > p")
	}

	// remove contact info at the end of the article (might not be needed any more when parsing
	// HTML from JSON?)
	paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contact-").Remove()
	paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contrib-").Remove()

	// containsAny reports whether sel has a descendant matching at least one
	// of the given selectors; selectors are tried in order and the search
	// stops at the first hit.
	containsAny := func(sel *gq.Selection, selectors ...string) bool {
		for _, selector := range selectors {
			if sel.Find(selector).Length() > 0 {
				return true
			}
		}
		return false
	}

	// Set once a tagline/infobox paragraph is seen; every later paragraph is
	// then mapped to "" so it gets filtered out below.
	ignoreRemaining := false
	paragraphStrings := paragraphs.Map(func(_ int, paragraph *gq.Selection) string {
		if ignoreRemaining {
			return ""
		}
		if containsAny(paragraph,
			"span.-newsgate-character-cci-tagline-name-",
			"span.-newsgate-paragraph-cci-infobox-head-",
		) {
			ignoreRemaining = true
			return ""
		}

		text := strings.TrimSpace(paragraph.Text())

		// Only the verdict is used; the second return value is deliberately ignored.
		if worthy, _ := classify.IsWorthyParagraph(text); !worthy {
			return ""
		}

		// Subheads are not part of the body text.
		if containsAny(paragraph,
			"span.-newsgate-paragraph-cci-subhead-lead-",
			"span.-newsgate-paragraph-cci-subhead-",
		) {
			return ""
		}

		return text
	})

	// NOTE(review): this targets the first selected paragraph even when it was
	// blanked by the filters above, in which case nothing is stripped from the
	// first surviving paragraph — confirm that is intended.
	if len(paragraphStrings) > 0 {
		paragraphStrings[0] = dateline.RmDateline(paragraphStrings[0])
	}

	content := make([]string, 0, len(paragraphStrings)+1)
	if includeTitle {
		content = append(content, ExtractTitleFromDocument(doc))
	}
	content = append(content, withoutEmptyStrings(paragraphStrings)...)
	body := strings.Join(content, "\n")

	recipeData, recipeMsg := recipe_parsing.ExtractRecipes(doc)
	msg.AddMessages("recipes", recipeMsg)

	return &m.ExtractedBody{body, recipeData, msg}
}