Пример #1
0
// commonPrase extracts the HTML of the main content region from the
// response document and returns it after cleaning.
//
// It looks for headings inside <body>, trying h1 first, then h2, then h3,
// and uses the first level that occurs at least once. Each heading of that
// level is passed to b.findP, and the selection returned for the last
// heading is used as the content region. When the page has no such heading
// (or findP yields nil), the whole <body> is used instead.
//
// The inner HTML of the chosen selection is passed to CleanHtml with 5 as
// its second argument, and the result is returned.
func (b baiduNews) commonPrase(resp *context.Response) (infoStr string) {
	body := resp.GetDom().Find("body")

	var info *goquery.Selection

	for _, tag := range []string{"h1", "h2", "h3"} {
		headings := body.Find(tag)
		if len(headings.Nodes) == 0 {
			continue
		}
		// findP is invoked for every heading of this level, in the order
		// Find returned them; only the result for the last one is kept.
		for i := range headings.Nodes {
			info = b.findP(headings.Eq(i))
		}
		// Lower heading levels are consulted only when this level is absent.
		break
	}

	// Fall back to the body itself. Find matches descendants only, so
	// body.Find("body") would always be empty and the fallback would
	// silently return no content.
	if info == nil {
		info = body
	}

	// Tag stripping is currently disabled:
	// info.RemoveFiltered("script")
	// info.RemoveFiltered("style")

	// Best effort: a rendering error is deliberately ignored and whatever
	// Html returned is handed to the cleaner.
	infoStr, _ = info.Html()

	// Clean the HTML.
	infoStr = CleanHtml(infoStr, 5)
	return
}
Пример #2
0
// Parse dispatches resp to a parsing routine.
//
// When a non-empty rule name is given as the first variadic argument, that
// name is recorded on resp and the ParseFunc of the rule returned by GetRule
// is invoked. When no name is given, or the first name is empty, the rule
// name on resp is cleared (only if resp is non-nil) and RuleTree.Root is
// invoked instead.
func (self *Spider) Parse(resp *context.Response, ruleName ...string) {
	// Resolve the requested rule; the empty string means "use Root".
	name := ""
	if len(ruleName) > 0 {
		name = ruleName[0]
	}

	if name != "" {
		resp.SetRuleName(name)
		self.GetRule(name).ParseFunc(self, resp)
		return
	}

	// Root may be called with a nil response, so guard the setter.
	if resp != nil {
		resp.SetRuleName("")
	}
	self.RuleTree.Root(self, resp)
}
Пример #3
0
// NewContext builds a Context that bundles the spider with a response.
//
// Spider and Response are always set from the arguments. Request is filled
// from resp.GetRequest() only when resp is non-nil; otherwise it keeps its
// zero value.
func NewContext(sp *Spider, resp *context.Response) *Context {
	ctx := &Context{
		Spider:   sp,
		Response: resp,
	}
	if resp != nil {
		ctx.Request = resp.GetRequest()
	}
	return ctx
}
Пример #4
0
// Output records a text result item on resp.
//
// The rule name is always set on resp first. item may be a
// map[int]interface{}, which is converted with CreatItem before being added,
// or a map[string]interface{}, which is added as is. Values of any other
// type are ignored.
func (self *Spider) Output(ruleName string, resp *context.Response, item interface{}) {
	resp.SetRuleName(ruleName)

	if indexed, ok := item.(map[int]interface{}); ok {
		resp.AddItem(self.CreatItem(ruleName, indexed))
		return
	}
	if named, ok := item.(map[string]interface{}); ok {
		resp.AddItem(named)
	}
}
Пример #5
0
// ExecParse runs the ParseFunc of the rule whose name is stored on resp,
// looking the rule up in RuleTree.Trunk.
//
// According to the original note, this is intended only for the crawl
// module; code inside a Rule should call Parse() instead.
func (self *Spider) ExecParse(resp *context.Response) {
	name := resp.GetRuleName()
	rule := self.RuleTree.Trunk[name]
	rule.ParseFunc(self, resp)
}
Пример #6
0
// Parse parses the response with the ParseFunc() of the named Rule.
//
// It stores ruleName on resp and then delegates to ExecParse, which reads
// the rule name back from resp to pick the rule to run.
func (self *Spider) Parse(ruleName string, resp *context.Response) {
	resp.SetRuleName(ruleName)
	self.ExecParse(resp)
}
Пример #7
0
// FileOutput outputs a file result.
//
// It forwards the optional name arguments unchanged to resp.AddFile.
// NOTE(review): presumably the first name is the file name to save under;
// confirm against AddFile.
func (self *Spider) FileOutput(resp *context.Response, name ...string) {
	resp.AddFile(name...)
}