Пример #1
0
// getBusListfrmMTC loads the HTML document located at bussite and returns the
// text content of each element matched by the "Option" selector, skipping
// elements that match ":empty". One string is produced per remaining element.
//
// If the document cannot be loaded, the error reported by goquery.NewDocument
// is returned unchanged together with a nil slice.
func getBusListfrmMTC(bussite string) ([]string, error) {
	doc, err := goquery.NewDocument(bussite)
	if err != nil {
		return nil, err
	}

	// Keep only the options that actually carry content.
	options := doc.Find("Option").Not(":empty")

	return options.Map(func(_ int, option *goquery.Selection) string {
		return option.Text()
	}), nil
}
Пример #2
0
// TrimmedTexts returns, for each element of the selection s, that element's
// text content with leading and trailing whitespace removed. The result has
// one entry per element, in the order the selection yields them.
func TrimmedTexts(s *goquery.Selection) []string {
	trimmed := func(_ int, node *goquery.Selection) string {
		raw := node.Text()
		return strings.TrimSpace(raw)
	}
	return s.Map(trimmed)
}
Пример #3
0
// ExtractBodyFromDocument assembles the plain-text body of an article from doc
// and bundles it with the recipe data extracted from the same document.
//
// Paragraph selection:
//   - fromJSON == true: every <p> element in doc.
//   - fromJSON == false: the <p> children of div[itemprop=articleBody], or,
//     when doc contains a .longform-body element, the <p> children of the
//     .longform-body child of div[itemprop=articleBody].
//
// Paragraph filtering, applied in order to each selected paragraph:
//   - once a paragraph contains a tagline-name or infobox-head span, that
//     paragraph and every paragraph after it are dropped;
//   - paragraphs rejected by classify.IsWorthyParagraph are dropped;
//   - paragraphs containing a subhead span are dropped.
//
// The first entry of the resulting list is passed through dateline.RmDateline.
// When includeTitle is true the title returned by ExtractTitleFromDocument is
// placed before the paragraphs. The pieces are joined with "\n".
//
// Side effect: the endnote contact/contrib spans found inside the selected
// paragraphs are removed via Remove(), so doc itself is modified.
//
// The returned ExtractedBody carries the joined body, the recipe data from
// recipe_parsing.ExtractRecipes, and a Messages value holding the recipe
// messages under the key "recipes".
func ExtractBodyFromDocument(doc *gq.Document, fromJSON bool, includeTitle bool) *m.ExtractedBody {
	msg := new(m.Messages)

	var paragraphs *gq.Selection
	if fromJSON {
		paragraphs = doc.Find("p")
	} else {
		if len(doc.Find(".longform-body").Nodes) == 0 {
			paragraphs = doc.Find("div[itemprop=articleBody] > p")
		} else {
			paragraphs = doc.Find("div[itemprop=articleBody] > .longform-body > p")
		}
	}

	// Remove contact info at the end of the article (might not be needed any
	// more when parsing HTML from JSON?).
	paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contact-").Remove()
	paragraphs.Find("span.-newsgate-paragraph-cci-endnote-contrib-").Remove()

	// ignoreRemaining is flipped by the closure below as soon as a tagline or
	// infobox heading is seen; from then on every paragraph maps to "".
	ignoreRemaining := false
	paragraphStrings := paragraphs.Map(func(i int, paragraph *gq.Selection) string {
		if ignoreRemaining {
			return ""
		}
		for _, selector := range [...]string{"span.-newsgate-character-cci-tagline-name-", "span.-newsgate-paragraph-cci-infobox-head-"} {
			if paragraph.Find(selector).Length() > 0 {
				ignoreRemaining = true
				return ""
			}
		}

		text := strings.TrimSpace(paragraph.Text())

		// NOTE(review): the second result of IsWorthyParagraph is discarded;
		// confirm it is not an error that should be surfaced.
		if worthy, _ := classify.IsWorthyParagraph(text); !worthy {
			return ""
		}

		// Sub-headings are skipped entirely rather than being marked up
		// (e.g. with a "### " prefix).
		for _, selector := range [...]string{"span.-newsgate-paragraph-cci-subhead-lead-", "span.-newsgate-paragraph-cci-subhead-"} {
			if paragraph.Find(selector).Length() > 0 {
				return ""
			}
		}

		return text
	})

	// NOTE(review): this targets element 0 even when that paragraph was
	// filtered to ""; verify that the dateline can only appear there.
	if len(paragraphStrings) > 0 {
		paragraphStrings[0] = dateline.RmDateline(paragraphStrings[0])
	}

	// Capacity leaves room for the optional title in front of the paragraphs.
	content := make([]string, 0, len(paragraphStrings)+1)
	if includeTitle {
		title := ExtractTitleFromDocument(doc)
		content = append(content, title)
	}

	content = append(content, withoutEmptyStrings(paragraphStrings)...)

	body := strings.Join(content, "\n")
	recipeData, recipeMsg := recipe_parsing.ExtractRecipes(doc)
	msg.AddMessages("recipes", recipeMsg)
	extracted := m.ExtractedBody{body, recipeData, msg}
	return &extracted
}