// 下载。 func (sched *myScheduler) download(req base.Request) { defer func() { if p := recover(); p != nil { errMsg := fmt.Sprintf("Fatal Download Error: %s\n", p) log.Error(errMsg) } }() downloader, err := sched.dlpool.Take() if err != nil { errMsg := fmt.Sprintf("Downloader pool error: %s", err) sched.sendError(errors.New(errMsg), SCHEDULER_CODE) return } defer func() { err := sched.dlpool.Return(downloader) if err != nil { errMsg := fmt.Sprintf("Downloader pool error: %s", err) sched.sendError(errors.New(errMsg), SCHEDULER_CODE) } }() code := generateCode(DOWNLOADER_CODE, downloader.Id()) respp, err := downloader.Download(req) if respp != nil { sched.sendResp(*respp, code) } if err != nil { sched.sendError(err, code) } }
func record(level byte, content string) { if content == "" { return } switch level { case 0: log.Info(content) case 1: log.Warnf(content) case 2: log.Error(content) } }
// 分析。 func (sched *myScheduler) analyze(respParsers []anlz.ParseResponse, resp base.Response) { defer func() { if p := recover(); p != nil { errMsg := fmt.Sprintf("Fatal Analysis Error: %s\n", p) log.Error(errMsg) } }() analyzer, err := sched.analyzerPool.Take() if err != nil { errMsg := fmt.Sprintf("Analyzer pool error: %s", err) sched.sendError(errors.New(errMsg), SCHEDULER_CODE) return } defer func() { err := sched.analyzerPool.Return(analyzer) if err != nil { errMsg := fmt.Sprintf("Analyzer pool error: %s", err) sched.sendError(errors.New(errMsg), SCHEDULER_CODE) } }() code := generateCode(ANALYZER_CODE, analyzer.Id()) dataList, errs := analyzer.Analyze(respParsers, resp) if dataList != nil { for _, data := range dataList { if data == nil { continue } switch d := data.(type) { case *base.Request: sched.saveReqToCache(*d, code) case *base.Item: sched.sendItem(*d, code) default: errMsg := fmt.Sprintf("Unsupported data type '%T'! (value=%v)\n", d, d) sched.sendError(errors.New(errMsg), code) } } } if errs != nil { for _, err := range errs { sched.sendError(err, code) } } }
func main() { // 创建调度器 scheduler := sched.NewScheduler() // 准备监控参数 intervalNs := 10 * time.Millisecond maxIdleCount := uint(1000) // 开始监控 checkCountChan := tool.Monitoring( scheduler, intervalNs, maxIdleCount, true, false, record) // 准备启动参数 channelArgs := base.NewChannelArgs(10, 10, 10, 10) poolBaseArgs := base.NewPoolBaseArgs(3, 3) crawlDepth := uint32(1) httpClientGenerator := genHttpClient respParsers := getResponseParsers() itemProcessors := getItemProcessors() startUrl := "http://www.sogou.com" firstHttpReq, err := http.NewRequest("GET", startUrl, nil) if err != nil { log.Error(err) return } // 开启调度器 scheduler.Start( channelArgs, poolBaseArgs, crawlDepth, httpClientGenerator, respParsers, itemProcessors, firstHttpReq) // 等待监控结束 <-checkCountChan }
// 打开条目处理管道。 func (sched *myScheduler) openItemPipeline() { go func() { sched.itemPipeline.SetFailFast(true) code := ITEMPIPELINE_CODE for item := range sched.getItemChan() { go func(item base.Item) { defer func() { if p := recover(); p != nil { errMsg := fmt.Sprintf("Fatal Item Processing Error: %s\n", p) log.Error(errMsg) } }() errs := sched.itemPipeline.Send(item) if errs != nil { for _, err := range errs { sched.sendError(err, code) } } }(item) } }() }
func (sched *myScheduler) Start( channelArgs base.ChannelArgs, poolBaseArgs base.PoolBaseArgs, crawlDepth uint32, httpClientGenerator GenHttpClient, respParsers []anlz.ParseResponse, itemProcessors []ipl.ProcessItem, firstHttpReq *http.Request) (err error) { defer func() { if p := recover(); p != nil { errMsg := fmt.Sprintf("Fatal Scheduler Error: %s\n", p) log.Error(errMsg) err = errors.New(errMsg) } }() if atomic.LoadUint32(&sched.running) == 1 { return errors.New("The scheduler has been started!\n") } atomic.StoreUint32(&sched.running, 1) if err := channelArgs.Check(); err != nil { return err } sched.channelArgs = channelArgs if err := poolBaseArgs.Check(); err != nil { return err } sched.poolBaseArgs = poolBaseArgs sched.crawlDepth = crawlDepth sched.chanman = generateChannelManager(sched.channelArgs) if httpClientGenerator == nil { return errors.New("The HTTP client generator list is invalid!") } dlpool, err := generatePageDownloaderPool( sched.poolBaseArgs.PageDownloaderPoolSize(), httpClientGenerator) if err != nil { errMsg := fmt.Sprintf("Occur error when get page downloader pool: %s\n", err) return errors.New(errMsg) } sched.dlpool = dlpool analyzerPool, err := generateAnalyzerPool(sched.poolBaseArgs.AnalyzerPoolSize()) if err != nil { if err != nil { errMsg := fmt.Sprintf("Occur error when get analyzer pool: %s\n", err) return errors.New(errMsg) } } sched.analyzerPool = analyzerPool if itemProcessors == nil { return errors.New("The item processor list is invalid!") } for i, ip := range itemProcessors { if ip == nil { return errors.New(fmt.Sprintf("The %dth item processor is invalid!", i)) } } sched.itemPipeline = generateItemPipeline(itemProcessors) if sched.stopSign == nil { sched.stopSign = mdw.NewStopSign() } else { sched.stopSign.Reset() } sched.reqCache = newRequestCache() sched.urlMap = make(map[string]bool) sched.startDownloading() sched.activateAnalyzers(respParsers) sched.openItemPipeline() sched.schedule(10 * time.Millisecond) if firstHttpReq == nil { return errors.New("The first HTTP request is invalid!") } pd, err := getPrimaryDomain(firstHttpReq.Host) if err != nil { return err } sched.primaryDomain = pd firstReq := base.NewRequest(firstHttpReq, 0) sched.reqCache.put(firstReq) return nil }