Пример #1
0
// NewRedirectorHandler builds a RedirectorHandler, starts its background
// goroutines and returns a pointer to it.
//
// It creates RedirectChanNum*PRIORITY_LEVELS buffered Link channels (each with
// capacity RedirectChanSize), initialises the filters and counters, and then
// launches:
//   - one goroutine that, on every tick of an hourly ticker, feeds the rules
//     returned by GetSitePatterns into the URL filter's rule matcher and logs
//     the tick time;
//   - one Redirect goroutine per link channel, passing the channel's index.
//
// Nothing in this function stops the ticker or the goroutines it starts.
func NewRedirectorHandler() *RedirectorHandler {
	cfg := ConfigInstance()
	ret := RedirectorHandler{}

	var err error
	ret.metricSender, err = graphite.New(cfg.GraphiteHost, "")
	if err != nil {
		// Do not abort construction, but surface the failure instead of
		// dropping it silently.
		log.Println("graphite sender init failed:", err)
	}

	// Compute the channel count once so that the channels created here and
	// the workers started below can never disagree.
	chanCount := cfg.RedirectChanNum * PRIORITY_LEVELS
	ret.linksChannel = []chan Link{}
	for i := 0; i < chanCount; i++ {
		ret.linksChannel = append(ret.linksChannel, make(chan Link, cfg.RedirectChanSize))
	}

	ret.processedLinks = NewBloomFilter()
	ret.usedChannels = make(map[int]int64)
	ret.urlFilter = NewURLFilter()
	ret.linksRecvCount = 0
	ret.domainLinksRecvCount = make(map[string]int)

	ret.ticker = time.NewTicker(time.Hour)
	go func() {
		// NOTE(review): AddRule is called from this goroutine while the
		// Redirect workers are running; confirm ruleMatcher is safe for
		// concurrent use.
		for t := range ret.ticker.C {
			for rule, pri := range GetSitePatterns() {
				ret.urlFilter.ruleMatcher.AddRule(rule, pri)
			}
			log.Println(t)
		}
	}()

	// Exactly one worker per channel that was actually created.
	for i := range ret.linksChannel {
		go ret.Redirect(i)
	}
	return &ret
}
Пример #2
0
// NewDownloadHanler builds a DownloadHandler, starts its background goroutines
// and returns a pointer to it. (The spelling of the name is kept as-is so that
// existing callers continue to compile.)
//
// It creates the output file "./pages/<unix-nanoseconds>.tsv", sets up the
// link/page channels (each buffered to DOWNLOADER_QUEUE_SIZE), the direct
// downloader and one proxy downloader per entry of GetProxyList for which
// NewHTTPGetProxyDownloader returns non-nil.
//
// If the output file cannot be created the process is terminated via
// log.Fatalln (error logged, exit status 1).
//
// A goroutine waits for SIGINT, closes the output file and exits the process
// with status 0. The Download, ProcExtractedLinks and FlushPages methods are
// each started on their own goroutine before returning.
func NewDownloadHanler() *DownloadHandler {
	ret := DownloadHandler{}
	ret.urlFilter = NewURLFilter()

	var err error
	ret.currentPath = strconv.FormatInt(time.Now().UnixNano(), 10) + ".tsv"
	ret.writer, err = os.Create("./pages/" + ret.currentPath)
	if err != nil {
		// Exit with a non-zero status and a diagnostic; a silent exit with
		// status 0 would look like a successful run.
		log.Fatalln("creating page file:", err)
	}

	ret.RtDownloaderAddrs = GetRealtimeDownloaderList()
	ret.metricSender, err = graphite.New(ConfigInstance().GraphiteHost, "")
	if err != nil {
		// Do not abort construction, but surface the failure instead of
		// dropping it silently.
		log.Println("graphite sender init failed:", err)
	}

	ret.LinksChannel = make(chan Link, DOWNLOADER_QUEUE_SIZE)
	ret.PageChannel = make(chan WebPage, DOWNLOADER_QUEUE_SIZE)
	ret.ExtractedLinksChannel = make(chan Link, DOWNLOADER_QUEUE_SIZE)
	ret.Downloader = NewHTTPGetDownloader()

	// Counters start at zero; per-key counters start as empty maps.
	ret.processedPageCount = 0
	ret.totalDownloadedPageCount = 0
	ret.proxyDownloadedPageCount = 0
	ret.writePageCount = 0
	ret.linkRecvCount = make(map[string]int)
	ret.pageDownloadCount = make(map[string]int)
	ret.pageWriteCount = make(map[string]int)

	for _, proxy := range GetProxyList() {
		// Skip proxies for which no downloader could be built.
		if pd := NewHTTPGetProxyDownloader(proxy); pd != nil {
			ret.ProxyDownloader = append(ret.ProxyDownloader, pd)
		}
	}
	log.Println("proxy downloader count", len(ret.ProxyDownloader))

	ret.signals = make(chan os.Signal, 1)
	signal.Notify(ret.signals, syscall.SIGINT)
	go func() {
		<-ret.signals
		// os.Exit does not run deferred calls, so the file must be closed
		// explicitly before exiting.
		if cerr := ret.writer.Close(); cerr != nil {
			log.Println("closing page file:", cerr)
		}
		os.Exit(0)
	}()

	go ret.Download()
	go ret.ProcExtractedLinks()
	go ret.FlushPages()
	return &ret
}