Example #1
func parseGamePosition(selection *goquery.Selection) (position int) {
	positionString := strings.TrimSpace(selection.Children().First().Text())
	var err error
	position, err = strconv.Atoi(strings.TrimSpace(positionString))
	helper.HandleFatalError("parsing game position failed:", err)
Example #2
func unpackMission(s *goquery.Selection) *Mission {
	m := Mission{}
	tds := s.Children()
	r, err := tds.First().Html()
	if err != nil {
		log.Printf("Error parsing HTML: %+v\n", err)
	} else {
		m.Division = r
	node := tds.Next().Children()
	name, err := node.Html()
	if err != nil {
		log.Println("Error getting name: ", err)
	m.Name = strings.TrimSpace(name)
	href, ok := node.Attr("href")
	if !ok {
		log.Println("No href")
	m.Url = href
	node = tds.Next()
	desc, err := node.Find(".desc").Children().Html()
	if err != nil {
		log.Println("Err getting desc", err)
	m.Description = strings.TrimSpace(desc)
	node = tds.Next()
	date, err := node.Next().Children().Html()
	m.LaunchDate = date
	date2 := strings.Trim(node.Next().First().Text(), "1234567890")
	m.LaunchDateHuman = strings.TrimSpace(date2)
	m.Phase = strings.TrimLeft(tds.Last().Text(), "1234567890")
	return &m
Example #3
File: scrape.go Project: squat/drae
func ScrapeDefinition(s *goquery.Selection) *Definition {
	category, _ := s.Find("abbr").First().Attr("title")

	return &Definition{
		Category:   category,
		Definition: JoinNodesWithSpace(s.Children().First().NextAll().Not("abbr").Not("span.h")),
		Origin:     ScrapeOrigins(s),
		Notes:      ScrapeNotes(s),
		Examples:   ScrapeExamples(s),
Example #4
// describeSentences tokenizes the text of s into sentences and summarises
// them in a TextDescription. The number of sentences is stored in
// CountSentences; per-sentence word counts are accumulated in AverageWords,
// which is turned into an integer mean before returning.
//
// NOTE(review): several branches below have no visible body and no closing
// brace in this listing, so the block does not parse as shown; single-word
// lines look to have been dropped. Confirm against the original source.
func describeSentences(s *goquery.Selection) TextDescription {
	var d TextDescription

	var text string
	// get the text of this node and then split it into sentences;
	// a node with child elements goes through getTextFromHtml, a leaf uses
	// its plain text
	if s.Children().Length() > 0 {
		text = getTextFromHtml(s)
	} else {
		text = s.Text()

	sentences := tokenizer.Tokenize(text)

	d.CountSentences = len(sentences)
	// note: the loop variable s shadows the selection parameter s
	for _, s := range sentences {
		sentence := s.Text

		if len(sentence) == 0 {
			// NOTE(review): no statement is visible in this branch —
			// presumably empty sentences are skipped; confirm

		c := len(get_words_from(sentence))

		// running total of words; divided by the sentence count after the loop
		d.AverageWords += c

		if c > 3 {
			// presume a normal sentence usually has more than 3 words

			if c < 25 {
				// but a sentence should not have more than 25 words. We will
				// not consider such a sentence a good one

			// last byte of the sentence (sliced by byte, not by rune)
			lastsymbol := sentence[len(sentence)-1:]

			// does the sentence end with terminal punctuation?
			if strings.ContainsAny(lastsymbol, ".?!") {
				// NOTE(review): no statement is visible in this branch; confirm


	if d.CountSentences > 0 {
		// integer division: the fractional part of the mean is discarded
		d.AverageWords = int(d.AverageWords / d.CountSentences)

	return d
Example #5
// hasSingleChildMatching inspects the direct child nodes of the first node
// in s and, if it gets that far, reports whether s.Children() holds exactly
// one element and that element matches selector.
//
// NOTE(review): s.Nodes[0] is indexed without a length check, so an empty
// selection would panic here.
// NOTE(review): childElCount is never incremented in the visible code, and
// the switch, if and for statements have no closing braces; lines appear to
// be missing from this listing. Confirm against the original source.
func hasSingleChildMatching(s *gq.Selection, selector string) bool {
	parent := s.Nodes[0]
	childElCount := 0
	// walk every direct child node, including text and comment nodes
	for child := parent.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.CommentNode:
			// comments are ignored
		case html.TextNode:
			// any text node with non-empty data disqualifies the parent
			if child.Data != "" {
				return false
		case html.ElementNode:
			// NOTE(review): as shown, the first element child returns false;
			// presumably a counter increment and a default label were here
			return false

	// anything other than exactly one element child fails
	if childElCount != 1 {
		return false

	children := s.Children()
	return children.Length() == 1 && children.Is(selector)
Example #6
File: wiki.go Project: mnpk/apex
// nodes returns a string representation of the selection's children.
func nodes(s *goquery.Selection) string {
	return strings.Join(s.Children().Map(node), "")
Example #7
func NumberOfElementChild(s *goquery.Selection) int {
	return s.Children().Length()
	//return s.Children().Size()
Example #8
// This is the core function. It checks a selection object and determines
// whether it is itself the text node or whether we need to go deeper, into
// the child node that holds most of the text.
// findSelectionWithPrimaryText returns s itself when it has no child
// elements. Otherwise it reads each child's "totalcountofcorrectsentences"
// pseudo attribute together with its text and HTML lengths, and then either
// returns s or recurses into the child recorded in sort_by_text_node.
//
// NOTE(review): the closing braces of every if statement and of the Each
// closure, and apparently some one-word statements, are not visible in this
// listing, so the block does not parse as shown. Confirm against the
// original source.
func findSelectionWithPrimaryText(s *goquery.Selection) *goquery.Selection {

	// if there are no children, return this node itself
	if s.Children().Length() == 0 {
		return s

	// highest "totalcountofcorrectsentences" value accepted so far among the children
	sort_by_count_sentences := 0
	// the child that supplied that value; stays nil until a child is accepted
	var sort_by_text_node *goquery.Selection = nil

	// count of child nodes containing more than one correct sentence
	// NOTE(review): no increment of this counter is visible in this listing
	count_of_nodes_with_sentences := 0

	// largest per-child count of correct sentences
	max_count_of_correct_sentences := 0

	// calculate the count of real symbols (runes, not bytes) in the whole node
	node_full_text_len := utf8.RuneCountInString(s.Text())

	// the same pseudo attribute, read from the node itself
	top_total_count_of_correct_sentences := getNumbericAttribute(s, "totalcountofcorrectsentences")

	// text lengths of all subnodes
	tlengths := []int{}
	// HTML-length to text-length ratio of each subnode
	// NOTE(review): densityes is appended to but never read in the visible code
	densityes := []int{}

	s.Children().Each(func(i int, sec *goquery.Selection) {
		totalcountofcorrectsentences := getNumbericAttribute(sec, "totalcountofcorrectsentences")

		if totalcountofcorrectsentences > 1 {
			// NOTE(review): presumably count_of_nodes_with_sentences was
			// incremented here; confirm

			// remember the largest count seen among such children
			if totalcountofcorrectsentences > max_count_of_correct_sentences {
				max_count_of_correct_sentences = totalcountofcorrectsentences

		// node text length in runes
		tlen := utf8.RuneCountInString(sec.Text())

		// the error from Html() is discarded; html is only used for its length
		html, _ := sec.Html()
		hlen := utf8.RuneCountInString(html)

		if tlen == 0 {
			// process next subnode
			// NOTE(review): no statement is visible in this branch; a return
			// is presumably expected, since hlen / tlen below would divide
			// by zero otherwise

		tlengths = append(tlengths, tlen)

		// integer division: how many HTML runes there are per text rune
		density := (hlen / tlen)

		densityes = append(densityes, density)

		// check if this block is better than the previous one;
		// choose a better block only if the previous one is empty or
		// has fewer than 10 real sentences
		if totalcountofcorrectsentences > sort_by_count_sentences && sort_by_count_sentences < 10 {

			sort_by_count_sentences = totalcountofcorrectsentences
			sort_by_text_node = sec


	// if any node with text was found
	if sort_by_count_sentences > 0 {
		// calculate mean deviation of the subnode text lengths
		lvar := getMeanDeviation(tlengths)

		// get the relative value of the mean deviation against the full text length of the node
		lvarproc := (100 * lvar) / float64(node_full_text_len)

		// during tests we found that if this value is small enough
		// then this node is what we are looking for:
		// it is the node with the "main" text of a page
		// NOTE(review): the original comment said "less 5" but the code tests < 15
		// && binds tighter than ||, so this reads (A && B) || (C && D)
		if lvarproc < 15 && len(tlengths) > 3 ||
			(count_of_nodes_with_sentences > 2 &&
				float32(max_count_of_correct_sentences) < float32(top_total_count_of_correct_sentences)*0.8) {

			// we found that the text is equally distributed between subnodes;
			// no need to go deeper

			return s
		// go deeper inside the node with most of the text

		return findSelectionWithPrimaryText(sort_by_text_node)
	// no subnodes found. return the node itself
	return s
Example #9
// describeDocumentNode describes the text inside a node and stores the
// description on the node as pseudo attributes via SetAttr: countsentences,
// averagewords, countgoodsentences, countlongsentences,
// totalcountofgoodsentences, totalcountofcorrectsentences and
// maxcountofflatsentences. It returns s.
//
// NOTE(review): the closing braces of the if statements and of the Each
// closure, and apparently a one-word statement after "go deeper
// recursively", are not visible in this listing, so the block does not
// parse as shown. Confirm against the original source.
func describeDocumentNode(s *goquery.Selection) *goquery.Selection {
	var totalcountofgoodsentences int
	var totalcountofcorrectsentences int
	var maxcountofflatsentences int

	countchildren := s.Children().Length()

	var sd TextDescription

	if countchildren > 0 {
		// for each child node, collect the counters stored on it
		s.Children().Each(func(i int, sec *goquery.Selection) {

			// go deeper recursively
			// NOTE(review): no recursive call is visible here; presumably the
			// child was described first so the attributes read below exist

			// aggregate data to set to a node

			totalcountofgoodsentences += getNumbericAttribute(sec, "totalcountofgoodsentences")
			totalcountofcorrectsentences += getNumbericAttribute(sec, "totalcountofcorrectsentences")

			countsentences := getNumbericAttribute(sec, "maxcountofflatsentences")

			// keep the maximum over all children
			if countsentences > maxcountofflatsentences {
				maxcountofflatsentences = countsentences


		// describe sentences in this html tag only, drop child nodes
		secclone := getSelectionWihoutChildren(s)

		sd = describeSentences(secclone)

		// add this node's own sentences to the totals gathered from the children
		totalcountofgoodsentences += sd.CountGoodSentences
		totalcountofcorrectsentences += sd.CountCorrectSentences

		if sd.CountGoodSentences > maxcountofflatsentences {
			maxcountofflatsentences = sd.CountGoodSentences

	} else {
		// no child nodes: the totals are just this node's own counts

		sd = describeSentences(s)
		totalcountofgoodsentences = sd.CountGoodSentences
		maxcountofflatsentences = sd.CountGoodSentences
		totalcountofcorrectsentences = sd.CountCorrectSentences
	//fmt.Printf("set totalcountofgoodsentences ")
	// set attributes for the node; the first four describe this node's own
	// text (sd), the last three are the aggregated values
	s.SetAttr("countsentences", strconv.Itoa(sd.CountSentences))
	s.SetAttr("averagewords", strconv.Itoa(sd.AverageWords))
	s.SetAttr("countgoodsentences", strconv.Itoa(sd.CountGoodSentences))
	s.SetAttr("countlongsentences", strconv.Itoa(sd.CountLongSentences))
	s.SetAttr("totalcountofgoodsentences", strconv.Itoa(totalcountofgoodsentences))
	s.SetAttr("totalcountofcorrectsentences", strconv.Itoa(totalcountofcorrectsentences))
	s.SetAttr("maxcountofflatsentences", strconv.Itoa(maxcountofflatsentences))

	return s