func (this *contentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection { currentSibling := node.Prev() b := make([]*goquery.Selection, 0) for currentSibling.Length() != 0 { b = append(b, currentSibling) previousSibling := currentSibling.Prev() currentSibling = previousSibling } return b }
func (extr *ContentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection { currentSibling := node.Prev() var b []*goquery.Selection for currentSibling.Length() != 0 { b = append(b, currentSibling) previousSibling := currentSibling.Prev() currentSibling = previousSibling } return b }
func encuentraGrupo(tabla *goquery.Selection) (grupo int) { var anterior *goquery.Selection for anterior = tabla.Prev(); anterior.Length() > 0 && grupo == 0; anterior = anterior.Prev() { if !anterior.Is("div") { log.Fatal(errors.New("No se encontró curso para la tabla")) } strongs := anterior.Find("strong") if strongs.Length() != 1 { continue } hayMatch, err := regexp.MatchString("Grupo [0-9]+", strongs.Text()) if err != nil { mataPrograma("Morí en encuentraGrupo regex", err) } if hayMatch { tokens := strings.Split(strongs.Text(), " ") grupo, err = strconv.Atoi(tokens[1]) if err != nil { mataPrograma("Morí en encuentraGrupo hayMatch", err) } } } return }
func (e *extractImages) checkLarge(s *goquery.Selection, depth uint) bool { imgs := s.FindMatcher(imgTags).FilterFunction( func(i int, s *goquery.Selection) bool { if i > 30 { return false } src, ok := s.Attr("src") if !ok { return false } for _, s := range badImgNames { if strings.Contains(src, s) { return false } } return true }).FilterFunction( func(i int, s *goquery.Selection) bool { img := e.hitCache(s, "src") if img == nil { return false } return true }) rimgs := e.hitCaches(imgs, "src") if len(rimgs) > 0 { var bestImg *Image cnt := 0 initialArea := 0.0 maxScore := 0.0 if len(rimgs) > 30 { rimgs = rimgs[:30] } for _, i := range rimgs { shouldScore := ((depth >= 1 && i.Width > 300) || depth == 0) && i.Width > minImgWidth && !e.isBannerDims(i) if !shouldScore { continue } area := float64(i.Width * i.Height) score := 0.0 if initialArea == 0.0 { initialArea = area * 1.48 score = 1.0 } else { areaDiff := area / initialArea sequenceScore := 1.0 / float64(cnt) score = sequenceScore * areaDiff } if score > maxScore { maxScore = score bestImg = i } cnt++ } if bestImg != nil { bestImg.Confidence = uint(100 / len(rimgs)) e.a.Img = bestImg return true } } if depth > 2 { return false } prev := s.Prev() if prev.Length() > 0 { return e.checkLarge(prev, depth) } par := s.Parent() if par.Length() > 0 { return e.checkLarge(par, depth+1) } return false }