示例#1
0
文件: extractor.go 项目: ngs/GoOse
// walkSiblings collects the siblings that precede node, nearest first,
// by repeatedly following Prev() until an empty selection is reached.
// The returned slice is always non-nil, even when node has no previous sibling.
func (this *contentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection {
	siblings := []*goquery.Selection{}
	for sib := node.Prev(); sib.Length() != 0; sib = sib.Prev() {
		siblings = append(siblings, sib)
	}
	return siblings
}
示例#2
0
// walkSiblings collects the siblings that precede node, nearest first,
// by repeatedly following Prev() until an empty selection is reached.
// It returns a nil slice when node has no previous sibling.
func (extr *ContentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection {
	var siblings []*goquery.Selection
	for sib := node.Prev(); sib.Length() != 0; sib = sib.Prev() {
		siblings = append(siblings, sib)
	}
	return siblings
}
示例#3
0
// grupoRegexp matches a group label such as "Grupo 12". It is compiled once
// at package initialization rather than on every loop iteration.
var grupoRegexp = regexp.MustCompile("Grupo [0-9]+")

// encuentraGrupo walks the siblings that precede tabla, looking for one that
// contains exactly one <strong> element whose text includes a "Grupo <n>"
// label, and returns <n>.
//
// The walk stops as soon as a non-zero group number has been parsed. If the
// siblings run out first, the result is 0. A preceding sibling that is not a
// <div> terminates the program through log.Fatal.
func encuentraGrupo(tabla *goquery.Selection) (grupo int) {
	for anterior := tabla.Prev(); anterior.Length() > 0 && grupo == 0; anterior = anterior.Prev() {
		if !anterior.Is("div") {
			log.Fatal(errors.New("No se encontró curso para la tabla"))
		}
		strongs := anterior.Find("strong")
		if strongs.Length() != 1 {
			continue
		}
		// Parse the number out of the matched substring itself instead of
		// splitting the whole text on spaces: the pattern is unanchored, so
		// the label may be surrounded by other words or trailing characters.
		etiqueta := grupoRegexp.FindString(strongs.Text())
		if etiqueta == "" {
			continue
		}
		var err error
		grupo, err = strconv.Atoi(strings.TrimPrefix(etiqueta, "Grupo "))
		if err != nil {
			// Only reachable when the digits do not fit in an int.
			mataPrograma("Morí en encuentraGrupo hayMatch", err)
		}
	}
	return grupo
}
示例#4
0
// checkLarge looks for a suitable image among the imgTags matches under s.
// When it finds one, it stores the best-scoring image in e.a.Img and returns
// true. Otherwise it retries on the previous sibling of s (same depth) or,
// when there is none, on the parent of s (depth+1). The search gives up once
// depth exceeds 2.
//
// Candidate images must have a src attribute that contains none of the
// badImgNames fragments and must resolve through e.hitCache. At most 30
// resolved images are scored.
func (e *extractImages) checkLarge(s *goquery.Selection, depth uint) bool {
	// First filter: positional cut-off plus src attribute sanity checks.
	hasUsableSrc := func(idx int, sel *goquery.Selection) bool {
		if idx > 30 {
			return false
		}

		src, found := sel.Attr("src")
		if !found {
			return false
		}

		for _, bad := range badImgNames {
			if strings.Contains(src, bad) {
				return false
			}
		}

		return true
	}

	// Second filter: keep only elements for which hitCache yields an image.
	isCached := func(_ int, sel *goquery.Selection) bool {
		return e.hitCache(sel, "src") != nil
	}

	candidates := s.FindMatcher(imgTags).FilterFunction(hasUsableSrc).FilterFunction(isCached)

	images := e.hitCaches(candidates, "src")
	if len(images) > 30 {
		images = images[:30]
	}

	var (
		best     *Image
		scored   int
		baseArea float64
		topScore float64
	)

	for _, img := range images {
		// Below the starting depth an image must be wider than 300.
		wideEnough := depth == 0 || (depth >= 1 && img.Width > 300)
		if !wideEnough || !(img.Width > minImgWidth) || e.isBannerDims(img) {
			continue
		}

		area := float64(img.Width * img.Height)

		var score float64
		if baseArea == 0.0 {
			// The first scored image sets the reference area.
			baseArea = area * 1.48
			score = 1.0
		} else {
			// Later images are weighted down by how many were scored before them.
			sequenceScore := 1.0 / float64(scored)
			score = sequenceScore * (area / baseArea)
		}

		if score > topScore {
			topScore = score
			best = img
		}

		scored++
	}

	if best != nil {
		best.Confidence = uint(100 / len(images))
		e.a.Img = best
		return true
	}

	if depth > 2 {
		return false
	}

	if prev := s.Prev(); prev.Length() > 0 {
		return e.checkLarge(prev, depth)
	}

	if parent := s.Parent(); parent.Length() > 0 {
		return e.checkLarge(parent, depth+1)
	}

	return false
}