func (b baiduNews) commonPrase(resp *context.Response) (infoStr string) { body := resp.GetDom().Find("body") var info *goquery.Selection if h1s := body.Find("h1"); len(h1s.Nodes) != 0 { for i := 0; i < len(h1s.Nodes); i++ { info = b.findP(h1s.Eq(i)) } } else if h2s := body.Find("h2"); len(h2s.Nodes) != 0 { for i := 0; i < len(h2s.Nodes); i++ { info = b.findP(h2s.Eq(i)) } } else if h3s := body.Find("h3"); len(h3s.Nodes) != 0 { for i := 0; i < len(h3s.Nodes); i++ { info = b.findP(h3s.Eq(i)) } } else { info = body.Find("body") } // 去除标签 // info.RemoveFiltered("script") // info.RemoveFiltered("style") infoStr, _ = info.Html() // 清洗HTML infoStr = CleanHtml(infoStr, 5) return }
// 指定ruleName时,调用相应ParseFunc()解析响应流 // 未指定ruleName时或ruleName为空时,调用Root() func (self *Spider) Parse(resp *context.Response, ruleName ...string) { if len(ruleName) == 0 || ruleName[0] == "" { if resp != nil { resp.SetRuleName("") } self.RuleTree.Root(self, resp) return } resp.SetRuleName(ruleName[0]) self.GetRule(ruleName[0]).ParseFunc(self, resp) }
func NewContext(sp *Spider, resp *context.Response) *Context { if resp == nil { return &Context{ Spider: sp, Response: resp, } } return &Context{ Spider: sp, Request: resp.GetRequest(), Response: resp, } }
// 输出文本结果 // item允许的类型为map[int]interface{}或map[string]interface{} func (self *Spider) Output(ruleName string, resp *context.Response, item interface{}) { resp.SetRuleName(ruleName) switch item2 := item.(type) { case map[int]interface{}: resp.AddItem(self.CreatItem(ruleName, item2)) case map[string]interface{}: resp.AddItem(item2) } }
// 根据响应流运行指定解析Rule,仅用于crawl模块,Rule中请使用Parse()代替 func (self *Spider) ExecParse(resp *context.Response) { self.RuleTree.Trunk[resp.GetRuleName()].ParseFunc(self, resp) }
// 调用指定Rule下解析函数ParseFunc(),解析响应流 func (self *Spider) Parse(ruleName string, resp *context.Response) { resp.SetRuleName(ruleName) self.ExecParse(resp) }
// 输出文件结果 func (self *Spider) FileOutput(resp *context.Response, name ...string) { resp.AddFile(name...) }