// findP walks up the ancestors of html looking for the nearest container that
// holds at least one <p> element.
//
// It returns:
//   - html itself when html is the <body> element;
//   - otherwise the first ancestor (starting with the direct parent) whose
//     subtree contains a <p>;
//   - <body> when the walk reaches it without finding such an ancestor;
//   - an empty selection when the walk runs off the top of the tree without
//     meeting <body> (for example a node outside <body>, or a detached
//     fragment).
func (b baiduNews) findP(html *goquery.Selection) *goquery.Selection {
	for {
		// An empty selection has no parent to climb to and never matches
		// "body"; without this guard the walk would never terminate.
		if html.Length() == 0 || html.Is("body") {
			return html
		}

		parent := html.Parent()
		if len(parent.Find("p").Nodes) > 0 {
			return parent
		}
		html = parent
	}
}
func encuentraGrupo(tabla *goquery.Selection) (grupo int) { var anterior *goquery.Selection for anterior = tabla.Prev(); anterior.Length() > 0 && grupo == 0; anterior = anterior.Prev() { if !anterior.Is("div") { log.Fatal(errors.New("No se encontró curso para la tabla")) } strongs := anterior.Find("strong") if strongs.Length() != 1 { continue } hayMatch, err := regexp.MatchString("Grupo [0-9]+", strongs.Text()) if err != nil { mataPrograma("Morí en encuentraGrupo regex", err) } if hayMatch { tokens := strings.Split(strongs.Text(), " ") grupo, err = strconv.Atoi(tokens[1]) if err != nil { mataPrograma("Morí en encuentraGrupo hayMatch", err) } } } return }
func (d *Document) scoreNode(s *goquery.Selection) *candidate { contentScore := d.classWeight(s) if s.Is("div") { contentScore += 5 } else if s.Is("blockquote,form") { contentScore = 3 } else if s.Is("th") { contentScore -= 5 } return &candidate{s, float32(contentScore)} }
func (d *Document) cleanConditionally(s *goquery.Selection, selector string) { if !d.CleanConditionally { return } s.Find(selector).Each(func(i int, s *goquery.Selection) { node := s.Get(0) weight := float32(d.classWeight(s)) contentScore := float32(0) if c, ok := d.candidates[node]; ok { contentScore = c.score } if weight+contentScore < 0 { removeNodes(s) Logger.Printf("Conditionally cleaned %s%s with weight %f and content score %f\n", node.Data, getName(s), weight, contentScore) return } text := s.Text() if strings.Count(text, ",") < 10 { counts := map[string]int{ "p": s.Find("p").Length(), "img": s.Find("img").Length(), "li": s.Find("li").Length() - 100, "a": s.Find("a").Length(), "embed": s.Find("embed").Length(), "input": s.Find("input").Length(), } contentLength := len(strings.TrimSpace(text)) linkDensity := d.getLinkDensity(s) remove := false reason := "" if counts["img"] > counts["p"] { reason = "too many images" remove = true } else if counts["li"] > counts["p"] && !s.Is("ul,ol") { reason = "more <li>s than <p>s" remove = true } else if counts["input"] > int(counts["p"]/3.0) { reason = "less than 3x <p>s than <input>s" remove = true } else if contentLength < d.MinTextLength && (counts["img"] == 0 || counts["img"] > 2) { reason = "too short content length without a single image" remove = true } else if weight < 25 && linkDensity > 0.2 { reason = fmt.Sprintf("too many links for its weight (%f)", weight) remove = true } else if weight >= 25 && linkDensity > 0.5 { reason = fmt.Sprintf("too many links for its weight (%f)", weight) remove = true } else if (counts["embed"] == 1 && contentLength < 75) || counts["embed"] > 1 { reason = "<embed>s with too short a content length, or too many <embed>s" remove = true } if remove { Logger.Printf("Conditionally cleaned %s%s with weight %f and content score %f because it has %s\n", node.Data, getName(s), weight, contentScore, reason) removeNodes(s) } } }) }