Example #1
func FetchGo(runtimeConfig RuntimeConfig, taskC *chan []byte, quitC *chan bool, offsets *RoutingOffset) {
	shard := offsets.Shard
	log.Info("fetch task started.shard:", shard)
	go func() {
		for {
			url := <-*taskC
			log.Debug("shard:", shard, ",url received:", string(url))

			if !runtimeConfig.Storage.CheckFetchedUrl(url) {
				timeout := 10 * time.Second

				log.Debug("shard:", shard, ",url cool,start fetching:", string(url))
				fetchUrl(url, timeout, runtimeConfig, offsets)
				if runtimeConfig.TaskConfig.FetchDelayThreshold > 0 {
					log.Debug("sleep ", runtimeConfig.TaskConfig.FetchDelayThreshold, "ms to control crawling speed")
					time.Sleep(time.Duration(runtimeConfig.TaskConfig.FetchDelayThreshold) * time.Millisecond)
					log.Debug("wake up now,continue crawing")
				}

			} else {
				log.Debug("hit fetch-bloomfilter,ignore,", string(url))
			}

		}
	}()

	<-*quitC
	log.Info("fetch task exit.shard:", shard)
}
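
A side note on this example: Go channels are already reference types, so the *chan parameters add indirection without benefit, and the inner goroutine never reads quitC, so it keeps looping after the function returns. A minimal sketch of the same worker with plain channels and a select on the quit signal, assuming the same dependencies as above:

func fetchWorker(runtimeConfig RuntimeConfig, taskC chan []byte, quitC chan bool, offsets *RoutingOffset) {
	shard := offsets.Shard
	log.Info("fetch task started,shard:", shard)
	for {
		select {
		case url := <-taskC:
			if !runtimeConfig.Storage.CheckFetchedUrl(url) {
				fetchUrl(url, 10*time.Second, runtimeConfig, offsets)
			}
		case <-quitC:
			//unlike the version above,this worker actually stops on the quit signal
			log.Info("fetch task exit,shard:", shard)
			return
		}
	}
}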
Example #2
func ParsedSavedFileLog2(runtimeConfig RuntimeConfig, pendingFetchUrls chan []byte, url string) {
	if url != "" {
		log.Trace("start parse filelog:", url)

		if runtimeConfig.Storage.CheckFetchedUrl([]byte(url)) {
			log.Debug("hit fetch filter ignore,", url)
			return
		}
		log.Debug("new task extracted from saved page:", url)
		pendingFetchUrls <- []byte(url)
	}
}
Example #3
func FetchFileWithOffset(runtimeConfig RuntimeConfig, path string, skipOffset int64) {

	var offset int64 = 0

	time1, _ := util.FileMTime(path)
	log.Debug("start touch time:", time1)

	f, err := os.Open(path)
	if err != nil {
		log.Debug("error opening file,", path, " ", err)
		return
	}

	r := bufio.NewReader(f)
	s, e := util.Readln(r)
	log.Trace("new offset:", offset)

	for e == nil {
		offset = offset + 1
		//TODO use byte offset instead of lines
		if offset > skipOffset {
			ParsedSavedFileLog(runtimeConfig, s)
		}

		runtimeConfig.Storage.PersistOffset(runtimeConfig.PathConfig.SavedFileLog+".offset", offset)

		s, e = util.Readln(r)
	}
	log.Trace("end offset:", offset, "vs ", skipOffset)

waitUpdate:
	time2, _ := util.FileMTime(path)

	log.Trace("2nd touch time:", time2)

	if time2 > time1 {
		log.Trace("file has been changed,restart parse")
		FetchFileWithOffset(runtimeConfig, path, offset)
	} else {
		log.Trace("waiting file update", path)
		time.Sleep(10 * time.Millisecond)
		goto waitUpdate
	}
}
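
FetchFileWithOffset restarts itself through recursion plus a goto, so every file change adds a stack frame. The same tailing behavior fits in a flat loop; a sketch built from the same helpers (per-line offset persistence omitted for brevity):

func tailFileWithOffset(runtimeConfig RuntimeConfig, path string, skipOffset int64) {
	offset := skipOffset
	for {
		mtime, _ := util.FileMTime(path)
		f, err := os.Open(path)
		if err != nil {
			log.Debug("error opening file,", path, " ", err)
			return
		}
		r := bufio.NewReader(f)
		var line int64
		s, e := util.Readln(r)
		for e == nil {
			line++
			if line > offset {
				ParsedSavedFileLog(runtimeConfig, s)
				offset = line
			}
			s, e = util.Readln(r)
		}
		f.Close()
		//block until the file's mtime advances,then rescan from the last offset
		for {
			if m, _ := util.FileMTime(path); m > mtime {
				break
			}
			time.Sleep(10 * time.Millisecond)
		}
	}
}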
Example #4
//format url into the canonical key used by the bloom filters
func formatUrlForFilter(url []byte) []byte {
	src := string(url)
	log.Debug("start to normalize url:", src)
	src = strings.TrimRight(src, "/")
	src = strings.TrimSpace(src)
	src = strings.ToLower(src)
	return []byte(src)
}
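
For illustration, what the normalization does to a couple of inputs (a hypothetical snippet; note that because TrimSpace runs after the slash trim, a url with trailing whitespace keeps its trailing slash):

key1 := formatUrlForFilter([]byte("HTTP://Example.com/Path/"))
//-> "http://example.com/path"
key2 := formatUrlForFilter([]byte(" http://example.com/path/ "))
//-> "http://example.com/path/" : the trailing space defeats the slash trim
log.Debug(string(key1), " ", string(key2))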
Example #5
func (this *FsStore) LoadOffset(fileName string) int64 {
	log.Debug("start init offsets,", fileName)
	if util.CheckFileExists(fileName) {
		log.Debug("found offset file,start loading,", fileName)
		n, err := ioutil.ReadFile(fileName)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		ret, err := strconv.ParseInt(string(n), 10, 64)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		log.Info("init offsets successfully,", fileName, ":", ret)
		return ret
	}

	return 0
}
Example #6
//parse the task config settings
func parseConfig() *TaskConfig {
	log.Debug("start parsing taskConfig")
	taskConfig := new(TaskConfig)
	taskConfig.LinkUrlExtractRegex = regexp.MustCompile(
		config.GetStringConfig("CrawlerRule", "LinkUrlExtractRegex", "(\\s+(src2|src|href|HREF|SRC))\\s*=\\s*[\"']?(.*?)[\"']"))

	taskConfig.SplitByUrlParameter = config.GetStringConfig("CrawlerRule", "SplitByUrlParameter", "p")

	taskConfig.LinkUrlExtractRegexGroupIndex = config.GetIntConfig("CrawlerRule", "LinkUrlExtractRegexGroupIndex", 3)
	taskConfig.Name = config.GetStringConfig("CrawlerRule", "Name", "GopaTask")

	taskConfig.FollowSameDomain = config.GetBoolConfig("CrawlerRule", "FollowSameDomain", true)
	taskConfig.FollowSubDomain = config.GetBoolConfig("CrawlerRule", "FollowSubDomain", true)
	taskConfig.LinkUrlMustContain = config.GetStringConfig("CrawlerRule", "LinkUrlMustContain", "")
	taskConfig.LinkUrlMustNotContain = config.GetStringConfig("CrawlerRule", "LinkUrlMustNotContain", "")

	taskConfig.SkipPageParsePattern = regexp.MustCompile(config.GetStringConfig("CrawlerRule", "SkipPageParsePattern", ".*?\\.((js)|(css)|(rar)|(gz)|(zip)|(exe)|(bmp)|(jpeg)|(gif)|(png)|(jpg)|(apk))\\b")) //end with js,css,apk,zip,ignore

	taskConfig.FetchUrlPattern = regexp.MustCompile(config.GetStringConfig("CrawlerRule", "FetchUrlPattern", ".*"))
	taskConfig.FetchUrlMustContain = config.GetStringConfig("CrawlerRule", "FetchUrlMustContain", "")
	taskConfig.FetchUrlMustNotContain = config.GetStringConfig("CrawlerRule", "FetchUrlMustNotContain", "")

	taskConfig.SavingUrlPattern = regexp.MustCompile(config.GetStringConfig("CrawlerRule", "SavingUrlPattern", ".*"))
	taskConfig.SavingUrlMustContain = config.GetStringConfig("CrawlerRule", "SavingUrlMustContain", "")
	taskConfig.SavingUrlMustNotContain = config.GetStringConfig("CrawlerRule", "SavingUrlMustNotContain", "")

	taskConfig.Cookie = config.GetStringConfig("CrawlerRule", "Cookie", "")
	taskConfig.FetchDelayThreshold = config.GetIntConfig("CrawlerRule", "FetchDelayThreshold", 0)

	taskConfig.TaskDataPath = config.GetStringConfig("CrawlerRule", "TaskData", runtimeConfig.PathConfig.TaskData+"/"+taskConfig.Name+"/")

	defaultWebDataPath := runtimeConfig.PathConfig.WebData + "/" + taskConfig.Name + "/"
	if runtimeConfig.StoreWebPageTogether {
		defaultWebDataPath = runtimeConfig.PathConfig.WebData
	}

	taskConfig.WebDataPath = config.GetStringConfig("CrawlerRule", "WebData", defaultWebDataPath)

	log.Debug("finished parsing taskConfig")
	return taskConfig
}
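
For reference, a CrawlerRule section these GetStringConfig/GetIntConfig calls could read, assuming an INI-style config file (keys taken from above, values invented):

[CrawlerRule]
Name=GopaTask
FollowSameDomain=true
FollowSubDomain=true
FetchDelayThreshold=500
LinkUrlMustNotContain=logout
FetchUrlPattern=.*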
Example #7
func initBloomFilter(bloomFilterPersistFileName string) *Filter {
	//load a previously persisted bloom filter,or initialize a new one
	if util.CheckFileExists(bloomFilterPersistFileName) {
		log.Debug("found bloomFilter,start reload,", bloomFilterPersistFileName)
		n, err := ioutil.ReadFile(bloomFilterPersistFileName)
		if err == nil {
			bloomFilter := new(Filter)
			if err = bloomFilter.GobDecode(n); err == nil {
				log.Info("bloomFilter successfully reloaded:", bloomFilterPersistFileName)
				return bloomFilter
			}
		}
		//do not report success on a failed reload,fall through to a fresh filter
		log.Error("bloomFilter:", bloomFilterPersistFileName, err)
	}
	probItems := config.GetIntConfig("BloomFilter", "ItemSize", 100000)
	log.Debug("initializing bloom-filter", bloomFilterPersistFileName, ",virtual size is,", probItems)
	bloomFilter := NewFilter(fnv.New64(), probItems)
	log.Info("bloomFilter successfully initialized:", bloomFilterPersistFileName)
	return bloomFilter
}
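
The main example below calls runtimeConfig.Storage.PersistBloomFilter() on shutdown; its core would mirror this loader. A sketch, assuming GobEncode is the encoding counterpart of the GobDecode used above:

func persistBloomFilter(bloomFilter *Filter, fileName string) {
	data, err := bloomFilter.GobEncode()
	if err != nil {
		log.Error("bloomFilter:", fileName, err)
		return
	}
	if err := ioutil.WriteFile(fileName, data, 0644); err != nil {
		log.Error("bloomFilter:", fileName, err)
		return
	}
	log.Info("bloomFilter successfully persisted:", fileName)
}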
Example #8
func ParsedSavedFileLog(runtimeConfig RuntimeConfig, fileLog string) {
	if fileLog != "" {
		log.Debug("start parse filelog:", fileLog)
		//load the file's content and extract links

		//each record is: url|||localpath
		stringArray := strings.Split(fileLog, "|||")
		if len(stringArray) != 2 {
			log.Error("malformed filelog record,ignore,", fileLog)
			return
		}
		fileUrl := stringArray[0]
		fileName := []byte(stringArray[1])

		if runtimeConfig.Storage.CheckParsedFile(fileName) {
			log.Debug("hit parse filter ignore,", string(fileName))
			return
		}

		fileContent := loadFileContent(string(fileName))
		runtimeConfig.Storage.AddParsedFile(fileName)

		if fileContent != nil {

			//extract urls to fetch queue.
			extractLinks(runtimeConfig, fileUrl, fileName, fileContent)
		}
	}
}
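
Each filelog record is a url and a local path joined by |||, which matches the LogSavedFile calls in the fetch example further down. A hypothetical record:

ParsedSavedFileLog(runtimeConfig, "http://example.com/index.html|||data/example.com/index.html")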
Example #9
func (this *FsStore) PersistOffset(fileName string, offset int64) {
	//persist the worker's offset: write to a temp file,then copy it over the target
	path := fileName + ".tmp"
	fout, err := os.Create(path)
	if err != nil {
		log.Error(path, err)
		return
	}

	log.Debug("saving offset:", fileName, ":", offset)
	fout.Write([]byte(strconv.FormatInt(offset, 10)))
	//close the temp file before copying it over the target
	fout.Close()
	util.CopyFile(path, fileName)
}
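
Together with LoadOffset above this gives a simple round trip; a usage sketch (the path is invented):

store := &FsStore{}
store.PersistOffset("tasks/parse.offset", 42)
offset := store.LoadOffset("tasks/parse.offset") //42, or 0 if the file is missing or corrupt
log.Debug("restored offset:", offset)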
Example #10
func shutdown(offsets []*RoutingOffset, quitChannels []*chan bool, offsets2 []*RoutingOffset, quitChannels2 []*chan bool, quit chan bool) {
	log.Debug("start shutting down")
	for i := range quitChannels {
		log.Debug("send exit signal to channel,", i)
		*quitChannels[i] <- true
	}

	for i := range quitChannels2 {
		log.Debug("send exit signal to channel,", i)
		*quitChannels2[i] <- true
	}

	log.Info("sent quit signal to go routings done")

	//TODO persist each shard's offset before exit

	quit <- true
	log.Debug("finished shutting down")
}
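
Because each worker receives exactly once from its quit channel, the same fan-out is often written as a single close on a shared channel, which wakes every receiver at once; a sketch:

quit := make(chan bool)
//each worker: select { case <-quit: return; case task := <-taskC: ... }
close(quit) //one close broadcasts shutdown to all workers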
Example #11
func post(url string, cookie string, postStr string) []byte {

	log.Debug("let's post :" + url)

	client := &http.Client{
		CheckRedirect: nil,
	}

	postBytesReader := bytes.NewReader([]byte(postStr))
	request, err := http.NewRequest("POST", url, postBytesReader)
	if err != nil {
		log.Error(url, err)
		return nil
	}

	reqest.Header.Set("User-Agent", " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
	reqest.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	reqest.Header.Set("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	reqest.Header.Set("Accept-Encoding", "gzip,deflate,sdch")
	//	reqest.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	reqest.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
	reqest.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	reqest.Header.Set("Cache-Control", "max-age=0")
	reqest.Header.Set("Connection", "keep-alive")
	reqest.Header.Set("Referer", url)

	if len(cookie) > 0 {
		log.Debug("dealing with cookie:" + cookie)
		pairs := strings.Split(cookie, ";")
		for _, pair := range pairs {
			kv := strings.Split(strings.TrimSpace(pair), "=")
			if len(kv) == 2 {
				cookieObj := http.Cookie{Name: kv[0], Value: kv[1]}
				request.AddCookie(&cookieObj)
			} else {
				log.Info("malformed cookie pair,ignore:" + pair)
			}
		}
	}

	resp, err := client.Do(request)

	if err != nil {
		log.Error(url, err)
		return nil
	}

	defer resp.Body.Close()

	var reader io.ReadCloser
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		reader, err = gzip.NewReader(resp.Body)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		defer reader.Close()
	default:
		reader = resp.Body
	}

	if reader != nil {
		body, err := ioutil.ReadAll(reader)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		return body
	}
	return nil
}
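
A usage sketch for post (url, cookie, and form body are placeholders):

body := post("http://example.com/login", "sid=abc123;lang=en", "user=foo&pass=bar")
if body != nil {
	log.Info("response size:", len(body))
}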
Example #12
func main() {
	flag.StringVar(&seedUrl, "seed", "http://example.com", "the seed url,where everything starts")
	flag.StringVar(&logLevel, "log", "info", "setting log level,options:trace,debug,info,warn,error")

	flag.Parse()

	defer log.Flush()

	runtime.GOMAXPROCS(runtime.NumCPU())

	log.SetInitLogging(logLevel)

	runtimeConfig.PathConfig = new(PathConfig)
	runtimeConfig.ClusterConfig = new(ClusterConfig)

	runtimeConfig.ClusterConfig.Name = config.GetStringConfig("cluster", "name", "gopa")

	// per cluster:data/gopa/
	runtimeConfig.PathConfig.Home = config.GetStringConfig("path", "home", "cluster/"+runtimeConfig.ClusterConfig.Name+"/")

	runtimeConfig.PathConfig.Data = config.GetStringConfig("path", "data", "")
	if runtimeConfig.PathConfig.Data == "" {
		runtimeConfig.PathConfig.Data = runtimeConfig.PathConfig.Home + "/" + "data/"
	}

	runtimeConfig.PathConfig.Log = config.GetStringConfig("path", "log", "")
	if runtimeConfig.PathConfig.Log == "" {
		runtimeConfig.PathConfig.Log = runtimeConfig.PathConfig.Home + "/" + "log/"
	}

	runtimeConfig.PathConfig.WebData = config.GetStringConfig("path", "webdata", "")
	if runtimeConfig.PathConfig.WebData == "" {
		runtimeConfig.PathConfig.WebData = runtimeConfig.PathConfig.Data + "/" + "webdata/"
	}

	runtimeConfig.PathConfig.TaskData = config.GetStringConfig("path", "taskdata", "")
	if runtimeConfig.PathConfig.TaskData == "" {
		runtimeConfig.PathConfig.TaskData = runtimeConfig.PathConfig.Data + "/" + "taskdata/"
	}

	runtimeConfig.StoreWebPageTogether = config.GetBoolConfig("Global", "StoreWebPageTogether", true)

	runtimeConfig.TaskConfig = parseConfig()

	//set default logging
	logPath := runtimeConfig.PathConfig.Log + "/" + runtimeConfig.TaskConfig.Name + "/gopa.log"
	log.SetLogging(logLevel, logPath)

	runtimeConfig.ParseUrlsFromSavedFileLog = config.GetBoolConfig("Switch", "ParseUrlsFromSavedFileLog", true)
	runtimeConfig.LoadTemplatedFetchJob = config.GetBoolConfig("Switch", "LoadTemplatedFetchJob", true)
	runtimeConfig.LoadRuledFetchJob = config.GetBoolConfig("Switch", "LoadRuledFetchJob", false)
	runtimeConfig.LoadPendingFetchJobs = config.GetBoolConfig("Switch", "LoadPendingFetchJobs", true)
	runtimeConfig.HttpEnabled = config.GetBoolConfig("Switch", "HttpEnabled", true)
	runtimeConfig.ParseUrlsFromPreviousSavedPage = config.GetBoolConfig("Switch", "ParseUrlsFromPreviousSavedPage", false)
	runtimeConfig.ArrayStringSplitter = config.GetStringConfig("CrawlerRule", "ArrayStringSplitter", ",")

	runtimeConfig.GoProfEnabled = config.GetBoolConfig("CrawlerRule", "GoProfEnabled", false)

	runtimeConfig.WalkBloomFilterFileName = config.GetStringConfig("BloomFilter", "WalkBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/walk.bloomfilter")
	runtimeConfig.FetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "FetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/fetch.bloomfilter")
	runtimeConfig.ParseBloomFilterFileName = config.GetStringConfig("BloomFilter", "ParseBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/parse.bloomfilter")
	runtimeConfig.PendingFetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "PendingFetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/pending_fetch.bloomfilter")

	runtimeConfig.PathConfig.SavedFileLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_parse.files"
	runtimeConfig.PathConfig.PendingFetchLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_fetch.urls"
	runtimeConfig.PathConfig.FetchFailedLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/failed_fetch.urls"

	runtimeConfig.MaxGoRoutine = config.GetIntConfig("Global", "MaxGoRoutine", 2)
	if runtimeConfig.MaxGoRoutine < 2 {
		runtimeConfig.MaxGoRoutine = 2
	}

	log.Debug("maxGoRoutine:", runtimeConfig.MaxGoRoutine)
	log.Debug("path.home:", runtimeConfig.PathConfig.Home)

	os.MkdirAll(runtimeConfig.PathConfig.Home, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Data, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Log, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.WebData, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.TaskData, 0777)

	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath, 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/tasks/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/filters/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/urls/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.WebDataPath, 0777)

	runtimeConfig.RuledFetchConfig = new(RuledFetchConfig)
	runtimeConfig.RuledFetchConfig.UrlTemplate = config.GetStringConfig("RuledFetch", "UrlTemplate", "")
	runtimeConfig.RuledFetchConfig.From = config.GetIntConfig("RuledFetch", "From", 0)
	runtimeConfig.RuledFetchConfig.To = config.GetIntConfig("RuledFetch", "To", 10)
	runtimeConfig.RuledFetchConfig.Step = config.GetIntConfig("RuledFetch", "Step", 1)
	runtimeConfig.RuledFetchConfig.LinkExtractPattern = config.GetStringConfig("RuledFetch", "LinkExtractPattern", "")
	runtimeConfig.RuledFetchConfig.LinkTemplate = config.GetStringConfig("RuledFetch", "LinkTemplate", "")

	if seedUrl == "" || seedUrl == "http://example.com" {
		log.Error("no seed was given. type:\"gopa -h\" for help.")
		os.Exit(1)
	}

	log.Info("[gopa] " + runtimeConfig.Version + " is on.")

	runtimeConfig.Storage = &fsstore.FsStore{}

	runtimeConfig.Storage.InitWalkBloomFilter(runtimeConfig.WalkBloomFilterFileName)
	runtimeConfig.Storage.InitFetchBloomFilter(runtimeConfig.FetchBloomFilterFileName)
	runtimeConfig.Storage.InitParseBloomFilter(runtimeConfig.ParseBloomFilterFileName)
	runtimeConfig.Storage.InitPendingFetchBloomFilter(runtimeConfig.PendingFetchBloomFilterFileName)

	//pprof server
	if runtimeConfig.GoProfEnabled {
		go func() {
			log.Info("pprof server is up,http://localhost:6060/debug/pprof")
			log.Info(http.ListenAndServe("localhost:6060", nil))
		}()
	}

	//http server
	if runtimeConfig.HttpEnabled {
		go func() {
			httpServ.Start(runtimeConfig)
		}()
	}

	//adding default http protocol
	if !strings.HasPrefix(seedUrl, "http") {
		seedUrl = "http://" + seedUrl
	}

	maxGoRoutine := runtimeConfig.MaxGoRoutine
	fetchQuitChannels := make([]*chan bool, maxGoRoutine)   //shutdown signals for each goroutine
	fetchTaskChannels := make([]*chan []byte, maxGoRoutine) //fetchTask channels
	fetchOffsets := make([]*RoutingOffset, maxGoRoutine)    //fetch offsets per shard

	parseQuitChannels := make([]*chan bool, 2)           //shutdown signals for each goroutine
	parseOffsets := make([]*RoutingOffset, maxGoRoutine) //parse offsets per shard

	shutdownSignal := make(chan bool, 1)
	finalQuitSignal := make(chan bool, 1)

	//handle exit event
	exitEventChannel := make(chan os.Signal, 1)
	signal.Notify(exitEventChannel, syscall.SIGINT)
	signal.Notify(exitEventChannel, os.Interrupt)
	go func() {
		s := <-exitEventChannel
		log.Debug("got signal:", s)
		if s == os.Interrupt || s == syscall.SIGINT {
			log.Warn("got signal:os.Interrupt,saving data and exit")

			runtimeConfig.Storage.PersistBloomFilter()

			//wait workers to exit
			log.Info("waiting workers exit")
			go shutdown(fetchOffsets, fetchQuitChannels, parseOffsets, parseQuitChannels, shutdownSignal)
			<-shutdownSignal
			log.Info("workers shutdown")
			finalQuitSignal <- true
		}
	}()

	//start fetcher
	for i := 0; i < maxGoRoutine; i++ {
		quitC := make(chan bool, 1)
		taskC := make(chan []byte)

		fetchQuitChannels[i] = &quitC
		fetchTaskChannels[i] = &taskC
		offset := new(RoutingOffset)
		offset.Shard = i
		fetchOffsets[i] = offset

		go task.FetchGo(runtimeConfig, &taskC, &quitC, offset)
	}

	c2 := make(chan bool, 1)
	parseQuitChannels[0] = &c2
	offset2 := new(RoutingOffset)
	offset2.Shard = 0
	parseOffsets[0] = offset2
	pendingFetchUrls := make(chan []byte)

	//fetch rule: all urls -> persisted to store -> fetched from store -> pushed to pendingFetchUrls -> redistributed to sharded goroutines -> fetch -> save webpage to store -> done
	//parse rule: url saved to store -> local path persisted to store -> fetched to pendingParseFiles -> redistributed to sharded goroutines -> parse -> clean urls -> enqueue to url store -> done

	//send the seed to the task queue
	go func() {
		//note: the seed url itself is not persisted
		log.Debug("sending seed to fetch queue,", seedUrl)
		pendingFetchUrls <- []byte(seedUrl)
	}()

	//start local saved file parser
	if runtimeConfig.ParseUrlsFromSavedFileLog {
		go task.ParseGo(pendingFetchUrls, runtimeConfig, &c2, offset2)
	}

	//redistribute pendingFetchUrls to sharded workers
	go func() {
		for {
			url := <-pendingFetchUrls
			if !runtimeConfig.Storage.CheckWalkedUrl(url) {

				if runtimeConfig.Storage.CheckFetchedUrl(url) {
					log.Warn("dont hit walk bloomfilter but hit fetch bloomfilter,also ignore,", string(url))
					runtimeConfig.Storage.AddWalkedUrl(url)
					continue
				}

				//rand.Intn(maxGoRoutine) covers all shards; the old maxGoRoutine-1 bound could never pick the last shard
				randomShard := rand.Intn(maxGoRoutine)
				log.Debug("publish:", string(url), ",shard:", randomShard)
				runtimeConfig.Storage.AddWalkedUrl(url)
				*fetchTaskChannels[randomShard] <- url
			} else {
				log.Trace("hit walk or fetch bloomfilter,just ignore,", string(url))
			}
		}
	}()

	//load predefined fetch jobs
	if runtimeConfig.LoadTemplatedFetchJob {
		go func() {

			if util.CheckFileExists(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt") {

				templates := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt")
				ids := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/id.txt")

				for _, id := range ids {
					for _, template := range templates {
						log.Trace("id:", id)
						log.Trace("template:", template)
						url := strings.Replace(template, "{id}", id, -1)
						log.Debug("new task from template:", url)
						pendingFetchUrls <- []byte(url)
					}
				}
				log.Info("templated download is done.")

			}

		}()
	}

	//fetch urls from saved pages
	if runtimeConfig.LoadPendingFetchJobs {
		c3 := make(chan bool, 1)
		parseQuitChannels[1] = &c3
		offset3 := new(RoutingOffset)
		offset3.Shard = 0
		parseOffsets[1] = offset3
		go task.LoadTaskFromLocalFile(pendingFetchUrls, &runtimeConfig, &c3, offset3)
	}

	//parse fetch-failed jobs,ignoring the walk-filter
	//TODO

	if runtimeConfig.LoadRuledFetchJob {
		log.Debug("start ruled fetch")
		go func() {
			if runtimeConfig.RuledFetchConfig.UrlTemplate != "" {
				for i := runtimeConfig.RuledFetchConfig.From; i <= runtimeConfig.RuledFetchConfig.To; i += runtimeConfig.RuledFetchConfig.Step {
					url := strings.Replace(runtimeConfig.RuledFetchConfig.UrlTemplate, "{id}", strconv.FormatInt(int64(i), 10), -1)
					log.Debug("add ruled url:", url)
					pendingFetchUrls <- []byte(url)
				}
			} else {
				log.Error("ruled template is empty,ignore")
			}
		}()

	}

	<-finalQuitSignal
	log.Info("[gopa] is down")
}
Example #13
func (this *FsStore) CheckSavedFile(file string) bool {
	log.Debug("start check file:", file)
	return util.CheckFileExists(file)
}
Example #14
func extractLinks(runtimeConfig RuntimeConfig, fileUrl string, fileName []byte, body []byte) {

	siteUrlStr := fileUrl
	siteConfig := runtimeConfig.TaskConfig

	siteUrlByte := []byte(siteUrlStr)
	log.Debug("enter links extract,", siteUrlStr)
	if siteConfig.SkipPageParsePattern.Match(siteUrlByte) {
		log.Debug("hit SkipPageParsePattern pattern,", siteUrlStr)
		return
	}

	log.Debug("parsing external links:", siteUrlStr, ",using:", siteConfig.LinkUrlExtractRegex)

	matches := siteConfig.LinkUrlExtractRegex.FindAllSubmatch(body, -1)
	log.Debug("extract links with pattern,total matchs:", len(matches), " match result,", string(fileName))
	xIndex := 0
	for _, match := range matches {
		log.Debug("dealing with match result,", xIndex)
		xIndex++
		url := match[siteConfig.LinkUrlExtractRegexGroupIndex]
		filterUrl := formatUrlForFilter(url)
		log.Debug("url clean result:", string(filterUrl), ",original url:", string(url))
		filteredUrl := string(filterUrl)

		//filter error link
		if filteredUrl == "" {
			log.Debug("filteredUrl is empty,continue")
			continue
		}

		if strings.HasPrefix(filteredUrl, "#") {
			log.Debug("filteredUrl starts with # ,continue")
			continue
		}

		if strings.HasPrefix(filteredUrl, "javascript:") {
			log.Debug("filteredUrl starts with javascript: ,continue")
			continue
		}

		hit := false

		if runtimeConfig.Storage.CheckWalkedUrl(filterUrl) || runtimeConfig.Storage.CheckFetchedUrl(filterUrl) || runtimeConfig.Storage.CheckPendingFetchUrl(filterUrl) {
			log.Debug("hit bloomFilter,continue")
			hit = true
			continue
		}

		if !hit {
			currentUrlStr := string(url)
			currentUrlStr = strings.Trim(currentUrlStr, " ")

			seedUrlStr := siteUrlStr
			seedURI, err := ParseRequestURI(seedUrlStr)

			if err != nil {
				log.Error("ParseSeedURI failed!: ", seedUrlStr, " , ", err)
				continue
			}

			currentURI, err := ParseRequestURI(currentUrlStr)
			if err != nil {
				if strings.Contains(err.Error(), "invalid URI for request") {
					log.Debug("invalid URI for request,fix relative url,original:", currentUrlStr)
					//page based relative urls
					currentUrlStr = "http://" + seedURI.Host + "/" + currentUrlStr
					currentURI, err = ParseRequestURI(currentUrlStr)
					if err != nil {
						log.Error("ParseCurrentURI internal failed!: ", currentUrlStr, " , ", err)
						continue
					}
					log.Debug("new relative url,", currentUrlStr)
				} else {
					log.Error("ParseCurrentURI failed!: ", currentUrlStr, " , ", err)
					continue
				}
			}

			//relative links
			if currentURI == nil || currentURI.Host == "" {
				if strings.HasPrefix(currentURI.Path, "/") {
					//root based relative urls
					log.Debug("old relative url,", currentUrlStr)
					currentUrlStr = "http://" + seedURI.Host + currentUrlStr
					log.Debug("new relative url,", currentUrlStr)
				} else {
					log.Debug("old relative url,", currentUrlStr)
					//page based relative urls
					urlPath := getRootUrl(currentURI)
					currentUrlStr = "http://" + urlPath + currentUrlStr
					log.Debug("new relative url,", currentUrlStr)
				}
			} else {
				log.Debug("host:", currentURI.Host, " ", currentURI.Host == "")

				//resolve domain specific filter
				if siteConfig.FollowSameDomain {

					if siteConfig.FollowSubDomain {

						//TODO handle .com.cn vs .com,using a TLD list

					} else if seedURI.Host != currentURI.Host {
						log.Debug("domain mismatch,", seedURI.Host, " vs ", currentURI.Host)
						//continue
					}
					//TODO follow all or list of domain
				}
			}

			if len(siteConfig.LinkUrlMustContain) > 0 {
				if !util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustContain) {
					log.Debug("link does not hit must-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
					continue
				}
			}

			if len(siteConfig.LinkUrlMustNotContain) > 0 {
				if util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustNotContain) {
					log.Debug("link hit must-not-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
					continue
				}
			}

			//normalize url
			currentUrlStr = MustNormalizeURLString(currentUrlStr, FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes|
				FlagRemoveUnnecessaryHostDots|FlagRemoveDuplicateSlashes|FlagRemoveFragment)
			log.Debug("normalized url:", currentUrlStr)
			currentUrlByte := []byte(currentUrlStr)
			if !(runtimeConfig.Storage.CheckWalkedUrl(currentUrlByte) || runtimeConfig.Storage.CheckFetchedUrl(currentUrlByte) || runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte)) {
				//copied from fetchTask,TODO refactor
				//checking fetchUrlPattern
				log.Debug("started check fetchUrlPattern,", currentUrlStr)
				if siteConfig.FetchUrlPattern.Match(currentUrlByte) {
					log.Debug("match fetch url pattern,", currentUrlStr)
					if len(siteConfig.FetchUrlMustNotContain) > 0 {
						if util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustNotContain) {
							log.Debug("hit FetchUrlMustNotContain,ignore,", currentUrlStr)
							continue
						}
					}

					if len(siteConfig.FetchUrlMustContain) > 0 {
						if !util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustContain) {
							log.Debug("not hit FetchUrlMustContain,ignore,", currentUrlStr)
							continue
						}
					}
				} else {
					log.Debug("does not hit FetchUrlPattern ignoring,", currentUrlStr)
					continue
				}

				if !runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte) {
					log.Debug("log new pendingFetch url", currentUrlStr)
					runtimeConfig.Storage.LogPendingFetchUrl(runtimeConfig.PathConfig.PendingFetchLog, currentUrlStr)
					runtimeConfig.Storage.AddPendingFetchUrl(currentUrlByte)
				} else {
					log.Debug("hit new pendingFetch filter,ignore:", currentUrlStr)
				}
			} else {
				log.Debug("hit bloom filter,ignore:", currentUrlStr)
			}
		} else {
			log.Debug("hit bloom filter,ignore,", string(url))
		}
		log.Debug("exit links extract,", siteUrlStr)

	}

	//TODO handle ruled fetch pattern

	log.Info("all links within ", siteUrlStr, " is done")
}
Example #15
//fetch the url's content
func fetchUrl(url []byte, timeout time.Duration, runtimeConfig RuntimeConfig, offsets *RoutingOffset) {
	t := time.NewTimer(timeout)
	defer t.Stop()

	resource := string(url)

	log.Debug("enter fetchUrl method:", resource)

	config := runtimeConfig.TaskConfig

	if runtimeConfig.Storage.CheckFetchedUrl(url) {
		return
	}

	path := getSavedPath(runtimeConfig, url)

	if runtimeConfig.Storage.CheckSavedFile(path) {
		log.Warn("file is already saved,skip fetch.", path)
		runtimeConfig.Storage.AddSavedUrl(url)

		//re-parse local's previous saved page
		if runtimeConfig.ParseUrlsFromPreviousSavedPage {
			if !runtimeConfig.Storage.CheckParsedFile([]byte(path)) {
				log.Debug("previous saved page send to parse-queue:", path)
				runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
			}
		}
		return
	}

	//checking fetchUrlPattern
	log.Debug("started check fetchUrlPattern,", config.FetchUrlPattern, ",", resource)
	if config.FetchUrlPattern.Match(url) {
		log.Debug("match fetch url pattern,", resource)
		if len(config.FetchUrlMustNotContain) > 0 {
			if util.ContainStr(resource, config.FetchUrlMustNotContain) {
				log.Debug("hit FetchUrlMustNotContain,ignore,", resource, " , ", config.FetchUrlMustNotContain)
				return
			}
		}

		if len(config.FetchUrlMustContain) > 0 {
			if !util.ContainStr(resource, config.FetchUrlMustContain) {
				log.Debug("not hit FetchUrlMustContain,ignore,", resource, " , ", config.FetchUrlMustContain)
				return
			}
		}
	} else {
		log.Debug("does not hit FetchUrlPattern ignoring,", resource)
		return
	}

	log.Debug("start fetch url,", resource)
	flg := make(chan bool, 1)

	go func() {

		body, err := HttpGetWithCookie(resource, config.Cookie)

		if err == nil {
			if body != nil {
				//todo parse urls from this page
				log.Debug("started check savingUrlPattern,", config.SavingUrlPattern, ",", string(url))
				if config.SavingUrlPattern.Match(url) {
					log.Debug("match saving url pattern,", resource)
					if len(config.SavingUrlMustNotContain) > 0 {
						if util.ContainStr(resource, config.SavingUrlMustNotContain) {
							log.Debug("hit SavingUrlMustNotContain,ignore,", resource, " , ", config.SavingUrlMustNotContain)
							goto exitPage
						}
					}

					if len(config.SavingUrlMustContain) > 0 {
						if !util.ContainStr(resource, config.SavingUrlMustContain) {
							log.Debug("not hit SavingUrlMustContain,ignore,", resource, " , ", config.SavingUrlMustContain)
							goto exitPage
						}
					}

					_, err := Save(runtimeConfig, path, body)
					if err == nil {
						log.Info("saved:", path)
						//todo saved per shard
						runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
					} else {
						log.Info("error while saved:", path, ",", err)
						goto exitPage
					}

				} else {
					log.Debug("does not hit SavingUrlPattern ignoring,", resource)
				}
			}
			runtimeConfig.Storage.AddFetchedUrl(url)
		exitPage:
			log.Debug("exit fetchUrl method:", resource)
		} else {
			runtimeConfig.Storage.LogFetchFailedUrl(runtimeConfig.PathConfig.FetchFailedLog, resource)
		}
		flg <- true
	}()

	//wait on both channels; with the timeout in place this select always returns, so nothing leaks here
	select {
	case <-t.C:
		log.Error("fetching url time out,", resource)
	case <-flg:
		log.Debug("fetching url normal exit,", resource)
		return
	}

}
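
Note that the select above only abandons the result on timeout; the goroutine and its HTTP request keep running until HttpGetWithCookie returns. With the standard library, the deadline is usually pushed into the client itself so the request is actually cancelled, e.g. (a sketch independent of the helpers above):

client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Get(resource)
if err != nil {
	log.Error("fetch failed or timed out,", resource, ",", err)
	return
}
defer resp.Body.Close()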