func NewRedirectorHandler() *RedirectorHandler { ret := RedirectorHandler{} ret.metricSender, _ = graphite.New(ConfigInstance().GraphiteHost, "") ret.linksChannel = []chan Link{} for i := 0; i < ConfigInstance().RedirectChanNum*PRIORITY_LEVELS; i++ { ret.linksChannel = append(ret.linksChannel, make(chan Link, ConfigInstance().RedirectChanSize)) } ret.processedLinks = NewBloomFilter() ret.usedChannels = make(map[int]int64) ret.urlFilter = NewURLFilter() ret.linksRecvCount = 0 ret.domainLinksRecvCount = make(map[string]int) ret.ticker = time.NewTicker(time.Second * 3600) go func() { for t := range ret.ticker.C { newRules := GetSitePatterns() for rule, pri := range newRules { ret.urlFilter.ruleMatcher.AddRule(rule, pri) } log.Println(t) } }() for i := 0; i < ConfigInstance().RedirectChanNum*PRIORITY_LEVELS; i++ { go ret.Redirect(i) } return &ret }
func NewDownloadHanler() *DownloadHandler { ret := DownloadHandler{} ret.urlFilter = NewURLFilter() var err error ret.currentPath = strconv.FormatInt(time.Now().UnixNano(), 10) + ".tsv" ret.writer, err = os.Create("./pages/" + ret.currentPath) if err != nil { os.Exit(0) } ret.RtDownloaderAddrs = GetRealtimeDownloaderList() ret.metricSender, _ = graphite.New(ConfigInstance().GraphiteHost, "") ret.LinksChannel = make(chan Link, DOWNLOADER_QUEUE_SIZE) ret.PageChannel = make(chan WebPage, DOWNLOADER_QUEUE_SIZE) ret.ExtractedLinksChannel = make(chan Link, DOWNLOADER_QUEUE_SIZE) ret.Downloader = NewHTTPGetDownloader() ret.processedPageCount = 0 ret.totalDownloadedPageCount = 0 ret.proxyDownloadedPageCount = 0 ret.writePageCount = 0 ret.linkRecvCount = make(map[string]int) ret.pageDownloadCount = make(map[string]int) ret.pageWriteCount = make(map[string]int) for _, proxy := range GetProxyList() { pd := NewHTTPGetProxyDownloader(proxy) if pd == nil { continue } ret.ProxyDownloader = append(ret.ProxyDownloader, pd) } log.Println("proxy downloader count", len(ret.ProxyDownloader)) ret.signals = make(chan os.Signal, 1) signal.Notify(ret.signals, syscall.SIGINT) go func() { <-ret.signals defer ret.writer.Close() os.Exit(0) }() go ret.Download() go ret.ProcExtractedLinks() go ret.FlushPages() return &ret }