func (ve *VideoExtractor) getEmbedTag(node *goquery.Selection) video { parent := node.Parent() if parent != nil { parentTag := parent.Get(0).DataAtom.String() if parentTag == "object" { return ve.getObjectTag(node) } } return ve.getVideo(node) }
func removeNodes(s *goquery.Selection) { s.Each(func(i int, s *goquery.Selection) { parent := s.Parent() if parent.Length() == 0 { // TODO??? } else { parent.Get(0).RemoveChild(s.Get(0)) } }) }
// findP climbs from html towards the document root and returns the first
// ancestor that has at least one <p> descendant. The climb stops early and
// returns the current node when that node matches "body".
//
// If the top of the tree is reached without finding either (for example html
// is empty, detached, or the root element), the last node visited is returned
// instead of recursing forever; for an empty input that is the empty
// selection itself.
func (b baiduNews) findP(html *goquery.Selection) *goquery.Selection {
	cur := html
	for {
		if cur.Is("body") {
			return cur
		}
		parent := cur.Parent()
		if parent.Length() == 0 {
			// No further ancestors: the previous recursive form never
			// terminated here because Parent() of an empty selection is
			// again an empty selection.
			return cur
		}
		if parent.Find("p").Length() > 0 {
			return parent
		}
		cur = parent
	}
}
func get_sesiones(selection_strong *goquery.Selection) (hubo bool) { div_selection := selection_strong.Parent() selection := div_selection.Next() var table_selction *goquery.Selection hubo = false for { tipo := goquery.NodeName(selection) if tipo == "table" { hubo = true table_selction = selection break } else if tipo == "div" && len(selection.Find("strong").Nodes) != 0 { return } else if len(selection.Nodes) == 0 { return } selection = selection.Next() } tr_selection := table_selction.Find("tr") for idx_tr := 0; idx_tr < len(tr_selection.Nodes); idx_tr++ { td_selection := tr_selection.Eq(idx_tr).Find("td") // puesto, nombre, dias, horas, salon if len(td_selection.Nodes) > 5 { fmt.Println(td_selection.Text()) hubo = false return } for idx_td := 0; idx_td < len(td_selection.Nodes); idx_td++ { if strings.TrimSpace(td_selection.Eq(idx_td).Text()) == "" { hubo = false return } } } return }
func (e *extractImages) checkLarge(s *goquery.Selection, depth uint) bool { imgs := s.FindMatcher(imgTags).FilterFunction( func(i int, s *goquery.Selection) bool { if i > 30 { return false } src, ok := s.Attr("src") if !ok { return false } for _, s := range badImgNames { if strings.Contains(src, s) { return false } } return true }).FilterFunction( func(i int, s *goquery.Selection) bool { img := e.hitCache(s, "src") if img == nil { return false } return true }) rimgs := e.hitCaches(imgs, "src") if len(rimgs) > 0 { var bestImg *Image cnt := 0 initialArea := 0.0 maxScore := 0.0 if len(rimgs) > 30 { rimgs = rimgs[:30] } for _, i := range rimgs { shouldScore := ((depth >= 1 && i.Width > 300) || depth == 0) && i.Width > minImgWidth && !e.isBannerDims(i) if !shouldScore { continue } area := float64(i.Width * i.Height) score := 0.0 if initialArea == 0.0 { initialArea = area * 1.48 score = 1.0 } else { areaDiff := area / initialArea sequenceScore := 1.0 / float64(cnt) score = sequenceScore * areaDiff } if score > maxScore { maxScore = score bestImg = i } cnt++ } if bestImg != nil { bestImg.Confidence = uint(100 / len(rimgs)) e.a.Img = bestImg return true } } if depth > 2 { return false } prev := s.Prev() if prev.Length() > 0 { return e.checkLarge(prev, depth) } par := s.Parent() if par.Length() > 0 { return e.checkLarge(par, depth+1) } return false }
func findParentWithHref(s *goquery.Selection) *goquery.Selection { // TODO: Do loop to find first parent with link return s.Parent() }