Beispiel #1
0
// findP walks up the tree from html and returns the nearest ancestor that
// contains at least one <p> descendant.
//
// The walk stops early and returns the current selection when it reaches the
// <body> element. It also stops when the selection becomes empty, which
// happens once the walk climbs past the root element (for example when the
// starting node is outside <body> and the document has no <p> at all); in
// that case the empty selection is returned. Without that guard the search
// would never terminate, because the parent of an empty selection is again
// an empty selection.
func (b baiduNews) findP(html *goquery.Selection) *goquery.Selection {
	current := html
	for {
		if current.Length() == 0 || current.Is("body") {
			return current
		}
		parent := current.Parent()
		if len(parent.Find("p").Nodes) > 0 {
			return parent
		}
		current = parent
	}
}
Beispiel #2
0
// patronGrupo matches a group label such as "Grupo 12". It is compiled once
// at package scope rather than on every loop iteration.
var patronGrupo = regexp.MustCompile("Grupo [0-9]+")

// encuentraGrupo scans the siblings that precede tabla, nearest first, and
// returns the group number announced by the first one whose single <strong>
// element contains a "Grupo <n>" label.
//
// Every preceding sibling must be a <div>; otherwise the program is
// terminated through log.Fatal. Siblings that do not contain exactly one
// <strong>, or whose <strong> text carries no group label, are skipped.
// The result is 0 when no label is found before the siblings run out.
//
// The number is parsed from the text the pattern actually matched, so
// surrounding text or whitespace around the label (e.g. "Grupo 3\n") no
// longer causes a spurious conversion failure.
func encuentraGrupo(tabla *goquery.Selection) (grupo int) {
	for anterior := tabla.Prev(); anterior.Length() > 0 && grupo == 0; anterior = anterior.Prev() {
		if !anterior.Is("div") {
			log.Fatal(errors.New("No se encontró curso para la tabla"))
		}
		strongs := anterior.Find("strong")
		if strongs.Length() != 1 {
			continue
		}
		etiqueta := patronGrupo.FindString(strongs.Text())
		if etiqueta == "" {
			continue
		}
		// etiqueta is exactly "Grupo <digits>", so only an out-of-range
		// number can make the conversion fail.
		var err error
		grupo, err = strconv.Atoi(strings.TrimPrefix(etiqueta, "Grupo "))
		if err != nil {
			mataPrograma("Morí en encuentraGrupo hayMatch", err)
		}
	}
	return grupo
}
Beispiel #3
0
// scoreNode builds a candidate for s whose score is the class weight of s
// adjusted by a tag-specific bonus or penalty:
//
//	div              +5
//	blockquote, form +3
//	th               -5
//
// Every adjustment is applied on top of the class weight. Previously the
// blockquote/form branch assigned 3 outright, discarding the class weight,
// unlike the other branches.
//
// NOTE(review): some Readability ports penalize <form> instead of rewarding
// it; confirm the intended sign for that tag.
func (d *Document) scoreNode(s *goquery.Selection) *candidate {
	contentScore := d.classWeight(s)

	switch {
	case s.Is("div"):
		contentScore += 5
	case s.Is("blockquote,form"):
		contentScore += 3
	case s.Is("th"):
		contentScore -= 5
	}

	return &candidate{s, float32(contentScore)}
}
Beispiel #4
0
// cleanConditionally removes descendants of s matching selector that look
// like non-content. It does nothing when d.CleanConditionally is false.
//
// For each matching element:
//   - if its class weight plus its recorded candidate score (0 when it has
//     none) is negative, it is removed immediately;
//   - otherwise, if its text has fewer than 10 commas, a series of
//     heuristics (image, list item, input and embed counts, text length and
//     link density) decides whether it is removed.
//
// Each removal is reported through Logger.
func (d *Document) cleanConditionally(s *goquery.Selection, selector string) {
	if !d.CleanConditionally {
		return
	}

	s.Find(selector).Each(func(_ int, el *goquery.Selection) {
		node := el.Get(0)
		weight := float32(d.classWeight(el))

		var contentScore float32
		if cand, found := d.candidates[node]; found {
			contentScore = cand.score
		}

		if weight+contentScore < 0 {
			removeNodes(el)
			Logger.Printf("Conditionally cleaned %s%s with weight %f and content score %f\n", node.Data, getName(el), weight, contentScore)
			return
		}

		// Elements with many commas are kept without further checks.
		text := el.Text()
		if strings.Count(text, ",") >= 10 {
			return
		}

		paragraphs := el.Find("p").Length()
		images := el.Find("img").Length()
		// The list item count is deliberately offset by 100.
		listItems := el.Find("li").Length() - 100
		embeds := el.Find("embed").Length()
		inputs := el.Find("input").Length()

		contentLength := len(strings.TrimSpace(text))
		linkDensity := d.getLinkDensity(el)

		// The first matching rule, in this order, supplies the reason.
		var reason string
		switch {
		case images > paragraphs:
			reason = "too many images"
		case listItems > paragraphs && !el.Is("ul,ol"):
			reason = "more <li>s than <p>s"
		case inputs > paragraphs/3:
			reason = "less than 3x <p>s than <input>s"
		case contentLength < d.MinTextLength && (images == 0 || images > 2):
			reason = "too short content length without a single image"
		case weight < 25 && linkDensity > 0.2,
			weight >= 25 && linkDensity > 0.5:
			reason = fmt.Sprintf("too many links for its weight (%f)", weight)
		case (embeds == 1 && contentLength < 75) || embeds > 1:
			reason = "<embed>s with too short a content length, or too many <embed>s"
		default:
			// No rule fired: keep the element.
			return
		}

		Logger.Printf("Conditionally cleaned %s%s with weight %f and content score %f because it has %s\n", node.Data, getName(el), weight, contentScore, reason)
		removeNodes(el)
	})
}