func (e extractContent) noParasWithoutTable(s *goquery.Selection) bool { s.FindMatcher(pTags).Each(func(i int, s *goquery.Selection) { if len(s.Text()) < 25 { s.Remove() } }) return s.FindMatcher(pTags).Length() == 0 && !nodeIs(s.Nodes[0], atom.Td) }
func divToPara(i int, s *goquery.Selection) { if s.FindMatcher(keepTags).Length() == 0 { s.Nodes[0].Data = "p" s.Nodes[0].DataAtom = atom.P } else { ns := getReplacements(s.Empty()) s.AppendNodes(ns...) } }
func (e extractContent) getSiblingContent( a *Article, s *goquery.Selection, baseScore uint) []*html.Node { var ret []*html.Node if nodeIs(s.Nodes[0], atom.P) && len(s.Text()) > 0 { return s.Nodes } ps := s.FindMatcher(pTags) for _, n := range ps.Nodes { cc := a.getCCache(n) if len(cc.text) > 0 { if cc.stopwords > baseScore && !cc.highLinkDensity { ret = append(ret, createNode(atom.P, "p", cc.text)) } } } return ret }
func (e *extractImages) checkLarge(s *goquery.Selection, depth uint) bool { imgs := s.FindMatcher(imgTags).FilterFunction( func(i int, s *goquery.Selection) bool { if i > 30 { return false } src, ok := s.Attr("src") if !ok { return false } for _, s := range badImgNames { if strings.Contains(src, s) { return false } } return true }).FilterFunction( func(i int, s *goquery.Selection) bool { img := e.hitCache(s, "src") if img == nil { return false } return true }) rimgs := e.hitCaches(imgs, "src") if len(rimgs) > 0 { var bestImg *Image cnt := 0 initialArea := 0.0 maxScore := 0.0 if len(rimgs) > 30 { rimgs = rimgs[:30] } for _, i := range rimgs { shouldScore := ((depth >= 1 && i.Width > 300) || depth == 0) && i.Width > minImgWidth && !e.isBannerDims(i) if !shouldScore { continue } area := float64(i.Width * i.Height) score := 0.0 if initialArea == 0.0 { initialArea = area * 1.48 score = 1.0 } else { areaDiff := area / initialArea sequenceScore := 1.0 / float64(cnt) score = sequenceScore * areaDiff } if score > maxScore { maxScore = score bestImg = i } cnt++ } if bestImg != nil { bestImg.Confidence = uint(100 / len(rimgs)) e.a.Img = bestImg return true } } if depth > 2 { return false } prev := s.Prev() if prev.Length() > 0 { return e.checkLarge(prev, depth) } par := s.Parent() if par.Length() > 0 { return e.checkLarge(par, depth+1) } return false }