func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
//adds any siblings that may have a decent score to this node func (this *contentExtractor) addSiblings(topNode *goquery.Selection) *goquery.Selection { if this.config.debug { log.Println("Starting to add siblings") } baselinescoreSiblingsPara := this.getSiblingsScore(topNode) results := this.walkSiblings(topNode) for _, currentNode := range results { ps := this.getSiblingsContent(currentNode, float64(baselinescoreSiblingsPara)) for _, p := range ps { nodes := make([]*html.Node, len(topNode.Nodes)+1) nodes[0] = p.Get(0) for i, node := range topNode.Nodes { nodes[i+1] = node } topNode.Nodes = nodes } } return topNode }
func Tianhuan(templete *Node, src *goquery.Selection, m map[string]interface{}) error { mySelect, properties, err := templete.GetSelect(SELECT) if err != nil { return err } var ser *goquery.Selection if mySelect == "" { fmt.Println("value-of 需要填写select属性") ser = src } else { ser = src.Find(mySelect) } var nodes = make([]*goquery.Selection, 0) if len(ser.Nodes) == 0 { return errors.New(mySelect + ":未搜索到数据") } if properties != nil { ser.EachWithBreak(func(i int, s *goquery.Selection) bool { if properties.Index != 0 { if properties.Index == i+1 { //index nodes = append(nodes, s) return false } return true } var lenn = len(s.Get(0).Attr) // var ma bool = false for b := 0; b < lenn; b++ { if _, ok := properties.Property[s.Get(0).Attr[b].Key]; ok { for _, v := range properties.Property[s.Get(0).Attr[b].Key] { if strings.Contains(v, ".*") { if ok, er := regexp.Match(v, []byte(s.Get(0).Attr[b].Val)); er != nil { return false } else { if ok { if Ifok(properties, s.Get(0), b) { nodes = append(nodes, s) } return true } } } if s.Get(0).Attr[b].Val == v { if Ifok(properties, s.Get(0), b) { nodes = append(nodes, s) return true } } } } } return true }) //abandon _, pro, err := templete.GetSelect(ABANDON) if err != nil { return err } if pro != nil { for k, _ := range pro.Property { nodes = nodesDelete(nodes, GetSelectsByValue(pro.Property[k], nodes, k)) } } } else { ser.Each(func(i int, s *goquery.Selection) { nodes = append(nodes, s) }) } //not 操作符 nodes = notInstructionCharacter(properties, nodes) fmt.Println("***************************") for _, vvv := range nodes { fmt.Println(vvv.Get(0).Attr, vvv.Get(0).Data) } fmt.Println("------------------------------") var index int = 0 for _, matchLabel := range templete.Children { switch matchLabel.LabelName { case STORAGE: //取值 if 0 != MatchMap(nodes, matchLabel, m, &index, nil) { break } case VALUEOF: //查找值 var bb *goquery.Selection var la = make([]*html.Node, 0) for _, ll := range nodes { la = append(la, ll.Get(0)) } bb.Nodes = la if 
err := Tianhuan(matchLabel, bb, m); err != nil { fmt.Println(err) } case LAYER: //给值加层级 if matchLabel.Attr[NAME] != "" { //判断是不是数组 是数组的话 新增一个[]map[string]interface{} 类型 m[matchLabel.Attr[NAME]] = make(map[string]interface{}) if matchLabel.Attr[ARRAY] == "true" || matchLabel.Attr[ARRAYEND] != "" { //判断数组什么时候结束 var property *Properties if matchLabel.Attr[ARRAYEND] != "" { _, property, err = matchLabel.GetSelect(ARRAYEND) if err != nil { inputLog.Println(err) break } arr := make([]map[string]interface{}, 0, 0) ToValueArray(nodes, matchLabel, &index, property, &arr) m[matchLabel.Attr[NAME]] = arr //数组 } //传递参数确认是否数组结束 } else { for i := 0; i < len(matchLabel.Children); i++ { if 0 != MatchMap(nodes, matchLabel.Children[i], m[matchLabel.Attr[NAME]].(map[string]interface{}), &index, nil) { break } } } } else { fmt.Println("未书写层级名称, 跳过此标签 和其他子标签") } default: fmt.Println("模板标签书写错误了吧") } } return nil }
// clear empties the given selection by replacing its node list with a new,
// non-nil, zero-length slice.
func (this *parser) clear(selection *goquery.Selection) {
	selection.Nodes = []*html.Node{}
}