func parseTimingFragment(text string, msg *m.Messages) m.RecipeTimingFragment { result := m.RecipeTimingFragment{TagF: m.TimingTag} for _, component := range fractionAwareSplitBySlashes(text) { component = strings.TrimSpace(component) if value, ok := extractComponent(component, servesRe); ok { result.ServingSize = value } else if value, ok := extractComponent(component, totalTimeRe); ok { result.TotalTime = parseDuration(value) } else if value, ok := extractComponent(component, prepTimeRe); ok { result.PreparationTime = parseDuration(value) } else { msg.AddWarningf("Unknown duration component: %#v", component) } } return result }
func ExtractBodyFromDocument(doc *gq.Document, fromJSON bool, includeTitle bool) *m.ExtractedBody { msg := new(m.Messages) var paragraphs *gq.Selection if fromJSON { paragraphs = doc.Find("p") } else { if len(doc.Find(".longform-body").Nodes) == 0 { paragraphs = doc.Find("div[itemprop=articleBody] > p") } else { paragraphs = doc.Find("div[itemprop=articleBody] > .longform-body > p") } } // remove contact info at the end of the article (might not be needed any more when parsing // HTML from JSON?) paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contact-").Remove() paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contrib-").Remove() ignoreRemaining := false paragraphStrings := paragraphs.Map(func(i int, paragraph *gq.Selection) string { if ignoreRemaining { return "" } for _, selector := range [...]string{"span.-newsgate-character-cci-tagline-name-", "span.-newsgate-paragraph-cci-infobox-head-"} { if el := paragraph.Find(selector); el.Length() > 0 { ignoreRemaining = true return "" } } text := strings.TrimSpace(paragraph.Text()) if worthy, _ := classify.IsWorthyParagraph(text); !worthy { return "" } //marker := "" for _, selector := range [...]string{"span.-newsgate-paragraph-cci-subhead-lead-", "span.-newsgate-paragraph-cci-subhead-"} { if el := paragraph.Find(selector); el.Length() > 0 { //marker = "### " return "" break } } return text }) if len(paragraphStrings) > 0 { paragraphStrings[0] = dateline.RmDateline(paragraphStrings[0]) } content := make([]string, 0, len(paragraphStrings)+1) if includeTitle { title := ExtractTitleFromDocument(doc) content = append(content, title) } content = append(content, withoutEmptyStrings(paragraphStrings)...) body := strings.Join(content, "\n") recipeData, recipeMsg := recipe_parsing.ExtractRecipes(doc) msg.AddMessages("recipes", recipeMsg) extracted := m.ExtractedBody{body, recipeData, msg} return &extracted }