Example #1
0
// getSiblingsContent returns the paragraph selections found in currentSibling
// that qualify for inclusion next to the main content.
//
// If currentSibling's first node is a <p> element with non-empty text, the
// sibling itself is the only result. Otherwise each descendant <p> with text
// is examined: it is kept when the stopWordCount reported by stopWordsCount is
// greater than 30% of baselinescoreSiblingsPara and isHighLinkDensity reports
// false. Every kept paragraph is returned as a new single-node selection that
// wraps a text node holding the paragraph's text and tagged with atom.P.
// The result is never nil; it is empty when nothing qualifies.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return []*goquery.Selection{currentSibling}
	}

	// The acceptance threshold does not depend on the paragraph, so compute
	// it once up front.
	const siblingBaselineScore = 0.30
	threshold := siblingBaselineScore * baselinescoreSiblingsPara

	kept := []*goquery.Selection{}
	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		content := para.Text()
		if len(content) == 0 {
			return
		}
		ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, content)
		linkHeavy := this.isHighLinkDensity(para)
		if linkHeavy || !(threshold < float64(ws.stopWordCount)) {
			return
		}
		textNode := &html.Node{Type: html.TextNode, Data: content, DataAtom: atom.P}
		kept = append(kept, &goquery.Selection{Nodes: []*html.Node{textNode}})
	})
	return kept
}
Example #2
0
// addSiblings prepends to topNode the nodes of sibling paragraphs that
// getSiblingsContent accepts, then returns topNode.
//
// The baseline score is taken from getSiblingsScore(topNode) and the candidate
// siblings from walkSiblings(topNode). Each accepted paragraph's first node is
// inserted at the front of topNode.Nodes, so paragraphs processed later end up
// earlier in the list. topNode is modified in place.
func (this *contentExtractor) addSiblings(topNode *goquery.Selection) *goquery.Selection {
	if this.config.debug {
		log.Println("Starting to add siblings")
	}
	baseline := float64(this.getSiblingsScore(topNode))
	for _, sibling := range this.walkSiblings(topNode) {
		for _, para := range this.getSiblingsContent(sibling, baseline) {
			// Rebuild the node list with the new paragraph in front of the
			// nodes collected so far.
			merged := make([]*html.Node, len(topNode.Nodes)+1)
			merged[0] = para.Get(0)
			copy(merged[1:], topNode.Nodes)
			topNode.Nodes = merged
		}
	}
	return topNode
}
Example #3
0
// Tianhuan applies the template node templete to the selection src and writes
// the extracted values into m.
//
// Processing steps:
//  1. Read the SELECT attribute of templete via GetSelect. A non-empty
//     selector is run against src with Find; an empty one uses src as is.
//  2. When GetSelect returned properties, filter the matched elements: a
//     non-zero properties.Index keeps only the element at that 1-based
//     position; otherwise an element is kept when one of its attributes
//     matches a value listed in properties.Property (values containing ".*"
//     are treated as regular expressions) and Ifok accepts it. Elements
//     matched by the ABANDON properties are then removed. Without properties
//     every matched element is kept.
//  3. Pass the result through notInstructionCharacter and print it for
//     debugging.
//  4. Walk templete.Children and dispatch on LabelName: STORAGE calls
//     MatchMap, VALUEOF recurses into Tianhuan with the current nodes, and
//     LAYER stores a nested map (or, with ARRAYEND set, a slice of maps) in m
//     under the label's NAME attribute.
//
// It returns the error from GetSelect, or an error when the selector matched
// no nodes. Errors from recursive VALUEOF calls are printed, not returned.
func Tianhuan(templete *Node, src *goquery.Selection, m map[string]interface{}) error {

	mySelect, properties, err := templete.GetSelect(SELECT)
	if err != nil {
		return err
	}
	var ser *goquery.Selection
	if mySelect == "" {
		fmt.Println("value-of 需要填写select属性")
		ser = src
	} else {
		ser = src.Find(mySelect)
	}

	var nodes = make([]*goquery.Selection, 0)
	if len(ser.Nodes) == 0 {

		return errors.New(mySelect + ":未搜索到数据")
	}
	if properties != nil {

		ser.EachWithBreak(func(i int, s *goquery.Selection) bool {
			if properties.Index != 0 {
				// Index selection: keep only the element at the requested
				// 1-based position and stop iterating once it is found.
				if properties.Index == i+1 {
					nodes = append(nodes, s)
					return false
				}
				return true
			}
			var lenn = len(s.Get(0).Attr)
			for b := 0; b < lenn; b++ {
				if _, ok := properties.Property[s.Get(0).Attr[b].Key]; ok {
					for _, v := range properties.Property[s.Get(0).Attr[b].Key] {
						if strings.Contains(v, ".*") {
							// An invalid pattern aborts the whole iteration.
							if ok, er := regexp.Match(v, []byte(s.Get(0).Attr[b].Val)); er != nil {
								return false
							} else {
								if ok {
									if Ifok(properties, s.Get(0), b) {
										nodes = append(nodes, s)
									}
									return true
								}
							}
						}

						if s.Get(0).Attr[b].Val == v {
							if Ifok(properties, s.Get(0), b) {
								nodes = append(nodes, s)
								return true
							}
						}
					}
				}

			}
			return true
		})

		// Drop the nodes matched by the ABANDON properties.
		_, pro, err := templete.GetSelect(ABANDON)
		if err != nil {
			return err
		}
		if pro != nil {
			for k := range pro.Property {
				nodes = nodesDelete(nodes, GetSelectsByValue(pro.Property[k], nodes, k))
			}

		}

	} else {
		ser.Each(func(i int, s *goquery.Selection) {
			nodes = append(nodes, s)
		})
	}

	// Apply the "not" operator.
	nodes = notInstructionCharacter(properties, nodes)

	fmt.Println("***************************")
	for _, vvv := range nodes {
		fmt.Println(vvv.Get(0).Attr, vvv.Get(0).Data)
	}
	fmt.Println("------------------------------")
	var index int = 0
	for _, matchLabel := range templete.Children {
		switch matchLabel.LabelName {
		case STORAGE: // extract a value
			// NOTE(review): this break only leaves the switch, so a non-zero
			// MatchMap result does not stop the loop over templete.Children.
			// Confirm whether stopping the loop was intended.
			if 0 != MatchMap(nodes, matchLabel, m, &index, nil) {
				break
			}
		case VALUEOF: // look up values
			var la = make([]*html.Node, 0)
			for _, ll := range nodes {
				la = append(la, ll.Get(0))
			}
			// Allocate the Selection before filling it in: assigning to the
			// Nodes field through a nil *goquery.Selection would panic.
			bb := new(goquery.Selection)
			bb.Nodes = la
			if err := Tianhuan(matchLabel, bb, m); err != nil {
				fmt.Println(err)
			}
		case LAYER: // add a nesting level to the values
			if matchLabel.Attr[NAME] != "" {

				// Start with a nested map; when the label describes an array
				// it is replaced below by a []map[string]interface{}.
				m[matchLabel.Attr[NAME]] = make(map[string]interface{})
				if matchLabel.Attr[ARRAY] == "true" || matchLabel.Attr[ARRAYEND] != "" {
					// Determine when the array ends.
					var property *Properties
					if matchLabel.Attr[ARRAYEND] != "" {
						_, property, err = matchLabel.GetSelect(ARRAYEND)
						if err != nil {
							inputLog.Println(err)
							break
						}
						arr := make([]map[string]interface{}, 0, 0)
						ToValueArray(nodes, matchLabel, &index, property, &arr)
						m[matchLabel.Attr[NAME]] = arr
						// array
					}
					// Pass a parameter to confirm whether the array has ended.

				} else {
					for i := 0; i < len(matchLabel.Children); i++ {
						if 0 != MatchMap(nodes, matchLabel.Children[i], m[matchLabel.Attr[NAME]].(map[string]interface{}), &index, nil) {
							break
						}

					}
				}
			} else {
				fmt.Println("未书写层级名称, 跳过此标签 和其他子标签")
			}

		default:
			fmt.Println("模板标签书写错误了吧")
		}
	}

	return nil
}
Example #4
0
// clear empties selection by replacing its node list with a new, non-nil,
// zero-length slice.
func (this *parser) clear(selection *goquery.Selection) {
	selection.Nodes = []*html.Node{}
}