func getBusListfrmMTC(bussite string) ([]string, error) { var b *goquery.Document var bb *goquery.Selection var fn func(int, *goquery.Selection) string b, e := goquery.NewDocument(bussite) if e != nil { return nil, e } bb = b.Find("Option") bb = bb.Not(":empty") fn = func(i int, s *goquery.Selection) string { return s.Text() } return bb.Map(fn), nil }
// TrimmedTexts maps every element of s to its text content with leading and
// trailing whitespace removed (strings.TrimSpace) and returns the resulting
// strings, one per element.
func TrimmedTexts(s *goquery.Selection) []string {
	trimmed := func(_ int, element *goquery.Selection) string {
		return strings.TrimSpace(element.Text())
	}
	return s.Map(trimmed)
}
func ExtractBodyFromDocument(doc *gq.Document, fromJSON bool, includeTitle bool) *m.ExtractedBody { msg := new(m.Messages) var paragraphs *gq.Selection if fromJSON { paragraphs = doc.Find("p") } else { if len(doc.Find(".longform-body").Nodes) == 0 { paragraphs = doc.Find("div[itemprop=articleBody] > p") } else { paragraphs = doc.Find("div[itemprop=articleBody] > .longform-body > p") } } // remove contact info at the end of the article (might not be needed any more when parsing // HTML from JSON?) paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contact-").Remove() paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contrib-").Remove() ignoreRemaining := false paragraphStrings := paragraphs.Map(func(i int, paragraph *gq.Selection) string { if ignoreRemaining { return "" } for _, selector := range [...]string{"span.-newsgate-character-cci-tagline-name-", "span.-newsgate-paragraph-cci-infobox-head-"} { if el := paragraph.Find(selector); el.Length() > 0 { ignoreRemaining = true return "" } } text := strings.TrimSpace(paragraph.Text()) if worthy, _ := classify.IsWorthyParagraph(text); !worthy { return "" } //marker := "" for _, selector := range [...]string{"span.-newsgate-paragraph-cci-subhead-lead-", "span.-newsgate-paragraph-cci-subhead-"} { if el := paragraph.Find(selector); el.Length() > 0 { //marker = "### " return "" break } } return text }) if len(paragraphStrings) > 0 { paragraphStrings[0] = dateline.RmDateline(paragraphStrings[0]) } content := make([]string, 0, len(paragraphStrings)+1) if includeTitle { title := ExtractTitleFromDocument(doc) content = append(content, title) } content = append(content, withoutEmptyStrings(paragraphStrings)...) body := strings.Join(content, "\n") recipeData, recipeMsg := recipe_parsing.ExtractRecipes(doc) msg.AddMessages("recipes", recipeMsg) extracted := m.ExtractedBody{body, recipeData, msg} return &extracted }