Ejemplo n.º 1
0
// getEmbedTag returns the video value extracted for node.
//
// When node's parent element is an <object>, extraction is delegated to
// getObjectTag; in every other case (including a node without an element
// parent) it is delegated to getVideo.
func (ve *VideoExtractor) getEmbedTag(node *goquery.Selection) video {
	// Parent never returns nil, so emptiness must be tested with Length:
	// calling Get(0) on an empty selection panics with an index out of range.
	parent := node.Parent()
	if parent.Length() > 0 && parent.Get(0).DataAtom.String() == "object" {
		return ve.getObjectTag(node)
	}
	return ve.getVideo(node)
}
Ejemplo n.º 2
0
// removeNodes detaches the nodes in s from their parents.
//
// A node for which Parent() yields an empty selection is left untouched.
func removeNodes(s *goquery.Selection) {
	s.Each(func(_ int, item *goquery.Selection) {
		owner := item.Parent()
		if owner.Length() == 0 {
			// TODO: decide what to do with nodes that have no parent element.
			return
		}
		owner.Get(0).RemoveChild(item.Get(0))
	})
}
Ejemplo n.º 3
0
// findP walks up the tree from html and returns the first parent whose
// subtree contains at least one <p> element.
//
// The walk stops early and returns the current selection when it matches
// "body". If the ancestors run out before either condition is met, the
// resulting empty selection is returned instead of walking up forever.
func (b baiduNews) findP(html *goquery.Selection) *goquery.Selection {
	// An empty selection has no parent either, so without the Length check
	// the walk would never terminate once it climbs past the root element.
	for html.Length() > 0 && !html.Is("body") {
		parent := html.Parent()
		if parent.Find("p").Length() > 0 {
			return parent
		}
		html = parent
	}
	return html
}
Ejemplo n.º 4
0
// get_sesiones reports whether the first <table> following the parent of
// selection_strong looks like a complete sessions table.
//
// The siblings after the parent are scanned in order. The result is false
// when a <div> containing a <strong> is met before any <table>, or when the
// siblings run out. Once a table is found, the result is false if any row has
// more than five <td> cells (their text is printed to stdout) or if any cell
// is blank after trimming whitespace; otherwise it is true.
func get_sesiones(selection_strong *goquery.Selection) (hubo bool) {
	sibling := selection_strong.Parent().Next()

scan:
	for {
		switch name := goquery.NodeName(sibling); {
		case name == "table":
			break scan
		case name == "div" && sibling.Find("strong").Length() != 0:
			// The next titled section starts before any table was seen.
			return false
		case sibling.Length() == 0:
			// Ran out of siblings.
			return false
		}
		sibling = sibling.Next()
	}

	rows := sibling.Find("tr")
	for r := range rows.Nodes {
		cells := rows.Eq(r).Find("td")
		// Expected columns: position, name, days, hours, room.
		if cells.Length() > 5 {
			fmt.Println(cells.Text())
			return false
		}
		for c := range cells.Nodes {
			if strings.TrimSpace(cells.Eq(c).Text()) == "" {
				return false
			}
		}
	}
	return true
}
Ejemplo n.º 5
0
// checkLarge looks for a suitable image among the elements of s matched by
// imgTags. If none qualifies, it retries on the previous sibling of s (same
// depth) and then on the parent of s (depth+1).
//
// On success the chosen image gets its Confidence set, is stored in e.a.Img
// and true is returned. The search gives up with false once depth exceeds 2
// or when s has neither a previous sibling nor a parent.
func (e *extractImages) checkLarge(s *goquery.Selection, depth uint) bool {
	// Keeps elements at index <= 30 whose src attribute exists and contains
	// none of the badImgNames entries.
	acceptable := func(idx int, el *goquery.Selection) bool {
		if idx > 30 {
			return false
		}
		src, found := el.Attr("src")
		if !found {
			return false
		}
		for _, bad := range badImgNames {
			if strings.Contains(src, bad) {
				return false
			}
		}
		return true
	}
	// Keeps elements for which hitCache returns a non-nil result.
	cached := func(_ int, el *goquery.Selection) bool {
		return e.hitCache(el, "src") != nil
	}

	candidates := s.FindMatcher(imgTags).FilterFunction(acceptable).FilterFunction(cached)
	rimgs := e.hitCaches(candidates, "src")
	if len(rimgs) > 30 {
		rimgs = rimgs[:30]
	}

	var best *Image
	var topScore, baseArea float64
	scored := 0
	for _, img := range rimgs {
		// Below the starting depth only images wider than 300 are considered.
		wideEnough := depth == 0 || img.Width > 300
		if !(wideEnough && img.Width > minImgWidth && !e.isBannerDims(img)) {
			continue
		}

		area := float64(img.Width * img.Height)
		score := 1.0
		if baseArea == 0.0 {
			// First scored image: it fixes the reference area for the rest.
			baseArea = area * 1.48
		} else {
			// Later images are weighted down by their position in the
			// sequence and scaled by their area relative to the reference.
			positionWeight := 1.0 / float64(scored)
			score = positionWeight * (area / baseArea)
		}

		if score > topScore {
			topScore = score
			best = img
		}
		scored++
	}

	if best != nil {
		best.Confidence = uint(100 / len(rimgs))
		e.a.Img = best
		return true
	}

	if depth > 2 {
		return false
	}

	if prev := s.Prev(); prev.Length() > 0 {
		return e.checkLarge(prev, depth)
	}
	if parent := s.Parent(); parent.Length() > 0 {
		return e.checkLarge(parent, depth+1)
	}
	return false
}
Ejemplo n.º 6
0
// findParentWithHref returns the closest ancestor of s that carries an href
// attribute.
//
// If no ancestor has an href, the immediate parent of s is returned, which
// may be an empty selection. Attr inspects only the first element of a
// selection, so s is assumed to hold a single node.
func findParentWithHref(s *goquery.Selection) *goquery.Selection {
	parent := s.Parent()
	for anc := parent; anc.Length() > 0; anc = anc.Parent() {
		if _, ok := anc.Attr("href"); ok {
			return anc
		}
	}
	// No linked ancestor: fall back to the immediate parent.
	return parent
}