Beispiel #1
0
// parseGamePosition reads the text of the first child of selection, strips
// surrounding whitespace and converts it to an integer.
//
// Any conversion error is handed to helper.HandleFatalError together with a
// fixed message; what happens afterwards depends on that helper (presumably it
// aborts when err is non-nil — TODO confirm). If the helper returns, the value
// produced by strconv.Atoi is returned as-is.
func parseGamePosition(selection *goquery.Selection) (position int) {
	firstChildText := selection.Children().First().Text()
	position, err := strconv.Atoi(strings.TrimSpace(firstChildText))
	helper.HandleFatalError("parsing game position failed:", err)
	return position
}
Beispiel #2
0
// unpackMission builds a Mission from the child elements of s (the variable
// naming suggests s is a table row and its children are <td> cells — TODO
// confirm against the caller).
//
// Extraction is best-effort: every failure is logged and the corresponding
// field keeps whatever value the failed call produced, so a non-nil *Mission
// is always returned.
func unpackMission(s *goquery.Selection) *Mission {
	m := Mission{}
	tds := s.Children()

	// Division: inner HTML of the first child; only stored on success.
	division, err := tds.First().Html()
	if err != nil {
		log.Printf("Error parsing HTML: %+v\n", err)
	} else {
		m.Division = division
	}

	// Next() yields the following sibling of every element in tds. It does not
	// modify tds, so the result is computed once and reused below.
	rest := tds.Next()

	// Name and URL come from the first element among the children of rest.
	link := rest.Children()
	name, err := link.Html()
	if err != nil {
		log.Println("Error getting name: ", err)
	}
	m.Name = strings.TrimSpace(name)

	href, ok := link.Attr("href")
	if !ok {
		log.Println("No href")
	}
	m.Url = href

	desc, err := rest.Find(".desc").Children().Html()
	if err != nil {
		log.Println("Err getting desc", err)
	}
	m.Description = strings.TrimSpace(desc)

	// Launch date: raw HTML for LaunchDate, digit-trimmed text for the human
	// readable variant.
	afterRest := rest.Next()
	date, err := afterRest.Children().Html()
	if err != nil {
		// Previously this error was silently dropped; report it like the others.
		log.Println("Error getting launch date: ", err)
	}
	m.LaunchDate = date

	humanDate := strings.Trim(afterRest.First().Text(), "1234567890")
	m.LaunchDateHuman = strings.TrimSpace(humanDate)

	// Phase: text of the last child with leading digits removed.
	m.Phase = strings.TrimLeft(tds.Last().Text(), "1234567890")
	return &m
}
Beispiel #3
0
// ScrapeDefinition assembles a Definition from the selection s.
//
// Category is the "title" attribute of the first <abbr> found under s; the
// "attribute exists" flag is discarded, so a missing attribute is not
// reported. Definition is built by JoinNodesWithSpace from every sibling that
// follows the first child of s, excluding elements matching "abbr" or
// "span.h". Origin, Notes and Examples are delegated to ScrapeOrigins,
// ScrapeNotes and ScrapeExamples respectively.
func ScrapeDefinition(s *goquery.Selection) *Definition {
	abbr := s.Find("abbr").First()
	category, _ := abbr.Attr("title")

	body := s.Children().First().NextAll().Not("abbr").Not("span.h")

	def := new(Definition)
	def.Category = category
	def.Definition = JoinNodesWithSpace(body)
	def.Origin = ScrapeOrigins(s)
	def.Notes = ScrapeNotes(s)
	def.Examples = ScrapeExamples(s)
	return def
}
Beispiel #4
0
// describeSentences tokenizes the text of s into sentences and returns
// counters describing them.
//
// The text comes from getTextFromHtml when s has child elements and from
// s.Text() otherwise. For every non-empty sentence the word count (as reported
// by get_words_from) is accumulated. Sentences with more than 3 words count as
// "long"; long sentences with fewer than 25 words additionally count as
// "good"; long sentences ending in '.', '?' or '!' count as "correct".
//
// AverageWords is the integer quotient of the accumulated word count and
// CountSentences. CountSentences is the number of tokens returned by the
// tokenizer, so it includes empty sentences that were skipped while counting.
func describeSentences(s *goquery.Selection) TextDescription {
	var text string
	if s.Children().Length() > 0 {
		text = getTextFromHtml(s)
	} else {
		text = s.Text()
	}

	tokens := tokenizer.Tokenize(text)

	var desc TextDescription
	desc.CountSentences = len(tokens)

	totalWords := 0
	for _, tok := range tokens {
		sentence := tok.Text
		if sentence == "" {
			continue
		}

		words := len(get_words_from(sentence))
		totalWords += words

		// Three words or fewer: too short to be classified any further.
		if words <= 3 {
			continue
		}
		desc.CountLongSentences++

		// Overly long sentences are not considered good ones.
		if words < 25 {
			desc.CountGoodSentences++
		}

		// Only the final byte is inspected for terminating punctuation.
		if strings.ContainsAny(sentence[len(sentence)-1:], ".?!") {
			desc.CountCorrectSentences++
		}
	}

	if desc.CountSentences > 0 {
		desc.AverageWords = totalWords / desc.CountSentences
	}

	return desc
}
Beispiel #5
0
// hasSingleChildMatching reports whether the first node of s has exactly one
// element child, no other significant content, and that child matches
// selector.
//
// Only the children of s.Nodes[0] are walked:
//   - comment nodes are ignored;
//   - a text node with non-empty Data (whitespace included) yields false;
//   - element nodes are counted;
//   - any other node type yields false.
//
// A nil or empty selection yields false instead of panicking on Nodes[0].
func hasSingleChildMatching(s *gq.Selection, selector string) bool {
	if s == nil || len(s.Nodes) == 0 {
		return false
	}

	parent := s.Nodes[0]
	childElCount := 0
	for child := parent.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.CommentNode:
			// Comments carry no content; skip them.
		case html.TextNode:
			if child.Data != "" {
				return false
			}
		case html.ElementNode:
			childElCount++
		default:
			return false
		}
	}

	if childElCount != 1 {
		return false
	}

	// Children() covers every node in s, so the length is re-checked before
	// testing the selector.
	children := s.Children()
	return children.Length() == 1 && children.Is(selector)
}
Beispiel #6
0
Datei: wiki.go Projekt: mnpk/apex
// nodes renders each child element of s with node and concatenates the
// results, in document order, without any separator.
func nodes(s *goquery.Selection) string {
	var out strings.Builder
	s.Children().Each(func(i int, child *goquery.Selection) {
		out.WriteString(node(i, child))
	})
	return out.String()
}
Beispiel #7
0
// NumberOfElementChild returns the number of elements in s.Children(), i.e.
// the child elements of every element contained in s.
func NumberOfElementChild(s *goquery.Selection) int {
	children := s.Children()
	return children.Length()
}
Beispiel #8
0
// findSelectionWithPrimaryText is the core function: it walks down from s
// towards the node that holds the main text.
//
// It relies on the numeric pseudo attribute "totalcountofcorrectsentences"
// being present on s and on its children (describeDocumentNode writes an
// attribute of that name). The decision is made as follows:
//
//   - s has no child elements: s is returned.
//   - No child with non-empty text has a positive sentence count: s is
//     returned.
//   - The text is spread evenly across the children (relative mean deviation
//     of the child text lengths below 15 with more than 3 non-empty children),
//     or more than 2 children hold more than one correct sentence while the
//     best of them has less than 80% of the total of s: s is returned.
//   - Otherwise the function recurses into the chosen child.
func findSelectionWithPrimaryText(s *goquery.Selection) *goquery.Selection {
	// A node without child elements is the text node itself.
	if s.Children().Length() == 0 {
		return s
	}

	var (
		// Sentence count of the currently selected child and the child itself.
		bestSentenceCount int
		bestChild         *goquery.Selection

		// Number of children holding more than one correct sentence, and the
		// largest such count.
		childrenWithSentences int
		maxCorrectSentences   int

		// Text lengths (in runes) of all children with non-empty text.
		textLengths []int
	)

	// Length of the full text of s, counted in runes rather than bytes.
	fullTextLen := utf8.RuneCountInString(s.Text())

	topCorrectSentences := getNumbericAttribute(s, "totalcountofcorrectsentences")

	s.Children().Each(func(_ int, child *goquery.Selection) {
		correct := getNumbericAttribute(child, "totalcountofcorrectsentences")

		if correct > 1 {
			childrenWithSentences++
			if correct > maxCorrectSentences {
				maxCorrectSentences = correct
			}
		}

		textLen := utf8.RuneCountInString(child.Text())
		if textLen == 0 {
			// Nothing to measure; continue with the next child.
			return
		}
		textLengths = append(textLengths, textLen)

		// Prefer this child when it beats the current choice, but stop
		// switching once the current choice already has 10 or more sentences.
		if correct > bestSentenceCount && bestSentenceCount < 10 {
			bestSentenceCount = correct
			bestChild = child
		}
	})

	// No child with text and sentences was found: the node itself is the answer.
	if bestSentenceCount <= 0 {
		return s
	}

	// Mean deviation of the child text lengths, relative to the full text
	// length of s, in percent.
	deviation := getMeanDeviation(textLengths)
	deviationPct := (100 * deviation) / float64(fullTextLen)

	// A relative deviation below 15 means the text is evenly distributed
	// between the children, so s is the node holding the "main" text.
	evenlySpread := deviationPct < 15 && len(textLengths) > 3

	// Several children carry sentences and none of them dominates s.
	noDominantChild := childrenWithSentences > 2 &&
		float32(maxCorrectSentences) < float32(topCorrectSentences)*0.8

	if evenlySpread || noDominantChild {
		// No need to go deeper.
		return s
	}

	// Descend into the chosen child.
	return findSelectionWithPrimaryText(bestChild)
}
Beispiel #9
0
// describeDocumentNode describes the text inside s and stores the result on s
// as pseudo attributes, recursing into every child element first.
//
// Attributes written (all as decimal strings):
//   - countsentences, averagewords, countgoodsentences, countlongsentences:
//     taken from describeSentences for the node's own text (for a node with
//     children, the text of getSelectionWihoutChildren(s));
//   - totalcountofgoodsentences, totalcountofcorrectsentences: the node's own
//     counts plus the totals of all children;
//   - maxcountofflatsentences: the largest good-sentence count found in the
//     node itself or reported by any child.
//
// The same selection s is returned.
func describeDocumentNode(s *goquery.Selection) *goquery.Selection {
	var (
		own          TextDescription
		totalGood    int
		totalCorrect int
		maxFlat      int
	)

	children := s.Children()
	if children.Length() == 0 {
		// Leaf: the node's own text is all there is.
		own = describeSentences(s)
		totalGood = own.CountGoodSentences
		maxFlat = own.CountGoodSentences
		totalCorrect = own.CountCorrectSentences
	} else {
		children.Each(func(_ int, child *goquery.Selection) {
			// Describe the subtree first so its attributes can be aggregated.
			describeDocumentNode(child)

			totalGood += getNumbericAttribute(child, "totalcountofgoodsentences")
			totalCorrect += getNumbericAttribute(child, "totalcountofcorrectsentences")

			if flat := getNumbericAttribute(child, "maxcountofflatsentences"); flat > maxFlat {
				maxFlat = flat
			}
		})

		// Sentences that belong to this tag directly, with child nodes dropped.
		own = describeSentences(getSelectionWihoutChildren(s))

		totalGood += own.CountGoodSentences
		totalCorrect += own.CountCorrectSentences
		if own.CountGoodSentences > maxFlat {
			maxFlat = own.CountGoodSentences
		}
	}

	attrs := []struct {
		name  string
		value int
	}{
		{"countsentences", own.CountSentences},
		{"averagewords", own.AverageWords},
		{"countgoodsentences", own.CountGoodSentences},
		{"countlongsentences", own.CountLongSentences},
		{"totalcountofgoodsentences", totalGood},
		{"totalcountofcorrectsentences", totalCorrect},
		{"maxcountofflatsentences", maxFlat},
	}
	for _, a := range attrs {
		s.SetAttr(a.name, strconv.Itoa(a.value))
	}

	return s
}