Example #1
0
func FetchGo(runtimeConfig RuntimeConfig, taskC *chan []byte, quitC *chan bool, offsets *RoutingOffset) {
	shard := offsets.Shard
	log.Info("fetch task started.shard:", shard)
	go func() {
		for {
			url := <-*taskC
			log.Debug("shard:", shard, ",url received:", string(url))

			if !runtimeConfig.Storage.CheckFetchedUrl(url) {
				timeout := 10 * time.Second

				//					if(fetchFilter.Lookup(url)){
				//						log.Debug("hit fetch filter ,ignore,",string(url))
				//						continue
				//					}
				//					fetchFilter.Add(url)

				log.Debug("shard:", shard, ",url cool,start fetching:", string(url))
				fetchUrl(url, timeout, runtimeConfig, offsets)
				if runtimeConfig.TaskConfig.FetchDelayThreshold > 0 {
					log.Debug("sleep ", runtimeConfig.TaskConfig.FetchDelayThreshold, "ms to control crawling speed")
					time.Sleep(time.Duration(runtimeConfig.TaskConfig.FetchDelayThreshold) * time.Millisecond)
					log.Debug("wake up now,continue crawing")
				}

			} else {
				log.Debug("hit fetch-bloomfilter,ignore,", string(url))
			}

		}
	}()

	<-*quitC
	log.Info("fetch task exit.shard:", shard)
}
Example #2
0
func DeleteHandler(w http.ResponseWriter, r *http.Request) {
	n := new(storage.Needle)
	vid, fid, _ := parseURLPath(r.URL.Path)
	volumeId, _ := strconv.ParseUint(vid, 10, 64)
	n.ParsePath(fid)

	if *IsDebug {
		log.Info("deleting", n)
	}

	cookie := n.Cookie
	count, ok := store.Read(volumeId, n)

	if ok != nil {
		m := make(map[string]uint32)
		m["size"] = 0
		writeJson(w, r, m)
		return
	}

	if n.Cookie != cookie {
		log.Info("delete with unmaching cookie from ", r.RemoteAddr, "agent", r.UserAgent())
		return
	}

	n.Size = 0
	store.Delete(volumeId, n)
	m := make(map[string]uint32)
	m["size"] = uint32(count)
	writeJson(w, r, m)
}
Example #3
0
func main() {
	flag.Parse()

	//volume master block
	log.Info("Volume Size Limit is", *volumeSizeLimitMB, "MB")
	mapper = directory.NewMapper(*metaFolder, "directory", uint64(*volumeSizeLimitMB)*1024*1024)

	//weedfs master handler
	http.HandleFunc("/dir/assign", dirAssignHandler)
	http.HandleFunc("/dir/lookup", dirLookupHandler)
	http.HandleFunc("/dir/join", dirJoinHandler)
	http.HandleFunc("/dir/status", dirStatusHandler)

	//start weedfs master
	go func() {
		log.Info("Start directory service at http://127.0.0.1:" + strconv.Itoa(*port))
		e := http.ListenAndServe(":"+strconv.Itoa(*port), nil)
		if e != nil {
			log.Error("Fail to start:", e)
		}
	}()

	//volume block
	//TODO: now default to 1G, this value should come from server?
	store = storage.NewStore(*storePort, *publicUrl, *chunkFolder, *volumes)
	defer store.Close()

	//weedfs volume handler
	http.HandleFunc("/", storeHandler)
	http.HandleFunc("/status", statusHandler)
	http.HandleFunc("/add_volume", addVolumeHandler)

	go func() {
		for {
			store.Join(*metaServer)
			time.Sleep(time.Duration(float32(*pulse*1e3)*(1+rand.Float32())) * time.Millisecond)
		}
	}()
	log.Info("store joined at", *metaServer)

	//start weedfs volume
	go func() {
		log.Info("Start storage service at http://127.0.0.1:"+strconv.Itoa(*storePort), "public url", *publicUrl)
		e := http.ListenAndServe(":"+strconv.Itoa(*storePort), nil)
		if e != nil {
			log.Error("Fail to start:", e)
		}
	}()

}
Example #4
0
func Start(runtimeConfig RuntimeConfig) {
	config = runtimeConfig
	http.HandleFunc("/", index)
	http.ListenAndServe(":8001", nil)
	log.Info("http server is up,http://localhost:8001/")

}
Example #5
0
func NewStore(port int, publicUrl, dirname string, volumeListString string) (s *Store) {
	s = &Store{Port: port, PublicUrl: publicUrl, dir: dirname}
	s.volumes = make(map[uint64]*Volume)

	s.AddVolume(volumeListString)

	log.Info("Store started on dir:", dirname, "with", len(s.volumes), "volumes")
	return
}
Example #6
0
func (m *Mapper) saveSequence() {
	log.Info("Saving file id sequence", m.FileIdSequence, "to", path.Join(m.dir, m.fileName+".seq"))
	seqFile, e := os.OpenFile(path.Join(m.dir, m.fileName+".seq"), os.O_CREATE|os.O_WRONLY, 0644)
	if e != nil {
		log.Error("Sequence File Save [ERROR] %s,", e)
	}
	defer seqFile.Close()
	encoder := gob.NewEncoder(seqFile)
	encoder.Encode(m.FileIdSequence)
}
Example #7
0
func dirJoinHandler(w http.ResponseWriter, r *http.Request) {
	s := r.RemoteAddr[0:strings.Index(r.RemoteAddr, ":")+1] + r.FormValue("port")
	publicUrl := r.FormValue("publicUrl")
	volumes := new([]storage.VolumeInfo)
	json.Unmarshal([]byte(r.FormValue("volumes")), volumes)
	if *IsDebug {
		log.Info(s, "volumes", r.FormValue("volumes"))
	}
	mapper.Add(*directory.NewMachine(s, publicUrl, *volumes))
}
Example #8
0
func NewMapper(dirname string, filename string, volumeSizeLimit uint64) (m *Mapper) {
	m = &Mapper{dir: dirname, fileName: filename}
	m.vid2machineId = make(map[uint32]int)
	m.volumeSizeLimit = volumeSizeLimit
	m.Writers = *new([]uint32)
	m.Machines = *new([]*Machine)

	seqFile, se := os.OpenFile(path.Join(m.dir, m.fileName+".seq"), os.O_RDONLY, 0644)
	if se != nil {
		m.FileIdSequence = FileIdSaveInterval
		log.Info("Setting file id sequence", m.FileIdSequence)
	} else {
		decoder := gob.NewDecoder(seqFile)
		defer seqFile.Close()
		decoder.Decode(&m.FileIdSequence)
		log.Info("Loading file id sequence", m.FileIdSequence, "=>", m.FileIdSequence+FileIdSaveInterval)
		//in case the server stops between intervals
		m.FileIdSequence += FileIdSaveInterval
	}
	return
}
Example #9
0
File: fs.go Project: xujianhai/gopa
func initBloomFilter(bloomFilterPersistFileName string) *Filter {
	var bloomFilter = new(Filter)
	//loading or initializing bloom filter
	if util.CheckFileExists(bloomFilterPersistFileName) {
		log.Debug("found bloomFilter,start reload,", bloomFilterPersistFileName)
		n, err := ioutil.ReadFile(bloomFilterPersistFileName)
		if err != nil {
			log.Error("bloomFilter:", bloomFilterPersistFileName, err)
		}
		if err := bloomFilter.GobDecode(n); err != nil {
			log.Error("bloomFilter:", bloomFilterPersistFileName, err)
		}
		log.Info("bloomFilter successfully reloaded:", bloomFilterPersistFileName)
	} else {
		probItems := config.GetIntConfig("BloomFilter", "ItemSize", 100000)
		log.Debug("initializing bloom-filter", bloomFilterPersistFileName, ",virual size is,", probItems)
		bloomFilter = NewFilter(fnv.New64(), probItems)
		log.Info("bloomFilter successfully initialized:", bloomFilterPersistFileName)
	}
	return bloomFilter
}
Example #10
0
func GetHandler(w http.ResponseWriter, r *http.Request) {
	n := new(storage.Needle)
	vid, fid, ext := parseURLPath(r.URL.Path)
	volumeId, _ := strconv.ParseUint(vid, 10, 64)
	n.ParsePath(fid)

	if *IsDebug {
		log.Info("volume", volumeId, "reading", n)
	}
	cookie := n.Cookie
	count, e := store.Read(volumeId, n)
	if *IsDebug {
		log.Info("read bytes", count, "error", e)
	}
	if n.Cookie != cookie {
		log.Info("request with unmaching cookie from ", r.RemoteAddr, "agent", r.UserAgent())
		return
	}
	if ext != "" {
		w.Header().Set("Content-Type", mime.TypeByExtension(ext))
	}
	w.Write(n.Data)
}
Example #11
0
File: fs.go Project: xujianhai/gopa
func persistBloomFilter(bloomFilterPersistFileName string, bloomFilter *Filter) {

	//save bloom-filter
	m, err := bloomFilter.GobEncode()
	if err != nil {
		log.Error(err)
		return
	}
	err = ioutil.WriteFile(bloomFilterPersistFileName, m, 0600)
	if err != nil {
		panic(err)
		return
	}
	log.Info("bloomFilter safety persisted.")
}
Example #12
0
func ParseGo(pendingUrls chan []byte, runtimeConfig RuntimeConfig, quit *chan bool, offsets *RoutingOffset) {
	log.Info("parsing task started.")
	path := runtimeConfig.PathConfig.SavedFileLog
	//touch local's file
	//read all of line
	//if hit the EOF,will wait 2s,and then reopen the file,and try again,may be check the time of last modified

waitFile:
	if !util.CheckFileExists(path) {
		log.Trace("waiting file create", path)
		time.Sleep(10 * time.Millisecond)
		goto waitFile
	}

	var offset int64 = runtimeConfig.Storage.LoadOffset(runtimeConfig.PathConfig.SavedFileLog + ".offset")
	FetchFileWithOffset(runtimeConfig, path, offset)
}
Example #13
0
func (m *Mapper) PickForWrite(c string) (string, int, MachineInfo, error) {
	len_writers := len(m.Writers)
	if len_writers <= 0 {
		log.Info("No more writable volumes!")
		return "", 0, m.Machines[rand.Intn(len(m.Machines))].Server, errors.New("No more writable volumes!")
	}
	vid := m.Writers[rand.Intn(len_writers)]
	machine_id := m.vid2machineId[vid]
	if machine_id > 0 {
		machine := m.Machines[machine_id-1]
		fileId, count := m.NextFileId(c)
		if count == 0 {
			return "", 0, m.Machines[rand.Intn(len(m.Machines))].Server, errors.New("Strange count:" + c)
		}
		return NewFileId(vid, fileId, rand.Uint32()).String(), count, machine.Server, nil
	}
	return "", 0, m.Machines[rand.Intn(len(m.Machines))].Server, errors.New("Strangely vid " + strconv.FormatUint(uint64(vid), 10) + " is on no machine!")
}
Example #14
0
func parseURLPath(path string) (vid, fid, ext string) {
	sepIndex := strings.LastIndex(path, "/")
	commaIndex := strings.LastIndex(path[sepIndex:], ",")
	if commaIndex <= 0 {
		if "favicon.ico" != path[sepIndex+1:] {
			log.Info("unknown file id", path[sepIndex+1:])
		}
		return
	}
	dotIndex := strings.LastIndex(path[sepIndex:], ".")
	vid = path[sepIndex+1 : commaIndex]
	fid = path[commaIndex+1:]
	ext = ""
	if dotIndex > 0 {
		fid = path[commaIndex+1 : dotIndex]
		ext = path[dotIndex+1:]
	}
	return
}
Example #15
0
File: fs.go Project: xujianhai/gopa
func (this *FsStore) LoadOffset(fileName string) int64 {
	log.Debug("start init offsets,", fileName)
	if util.CheckFileExists(fileName) {
		log.Debug("found offset file,start loading,", fileName)
		n, err := ioutil.ReadFile(fileName)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		ret, err := strconv.ParseInt(string(n), 10, 64)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		log.Info("init offsets successfully,", fileName, ":", ret)
		return int64(ret)
	}

	return 0
}
Example #16
0
func LoadNeedleMap(file *os.File) *NeedleMap {
	nm := NewNeedleMap(file)
	bytes := make([]byte, 16*RowsToRead)
	count, e := nm.indexFile.Read(bytes)
	if count > 0 {
		fstat, _ := file.Stat()
		log.Info("Loading index file", fstat.Name(), "size", fstat.Size())
	}
	for count > 0 && e == nil {
		for i := 0; i < count; i += 16 {
			key := util.BytesToUint64(bytes[i : i+8])
			offset := util.BytesToUint32(bytes[i+8 : i+12])
			size := util.BytesToUint32(bytes[i+12 : i+16])
			if offset > 0 {
				nm.m[key] = &NeedleValue{Offset: offset, Size: size}
			} else {
				delete(nm.m, key)
			}
		}

		count, e = nm.indexFile.Read(bytes)
	}
	return nm
}
Example #17
0
func shutdown(offsets []*RoutingOffset, quitChannels []*chan bool, offsets2 []*RoutingOffset, quitChannels2 []*chan bool, quit chan bool) {
	log.Debug("start shutting down")
	for i := range quitChannels {
		log.Debug("send exit signal to channel,", i)
		*quitChannels[i] <- true
	}

	for i := range quitChannels2 {
		log.Debug("send exit signal to channel,", i)
		*quitChannels2[i] <- true
	}

	log.Info("sent quit signal to go routings done")

	//	for i:=range offsets{
	//		//TODO
	//		log.Info("persist offset,",i,":",offsets[i].Offset,",",offsets[i].shard)
	//	}

	//	log.Info("persist kafka offsets done")

	quit <- true
	log.Debug("finished shutting down")
}
Example #18
0
func post(url string, cookie string, postStr string) []byte {

	log.Debug("let's post :" + url)

	client := &http.Client{
		CheckRedirect: nil,
	}

	postBytesReader := bytes.NewReader([]byte(postStr))
	reqest, _ := http.NewRequest("POST", url, postBytesReader)

	reqest.Header.Set("User-Agent", " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
	reqest.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	reqest.Header.Set("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	reqest.Header.Set("Accept-Encoding", "gzip,deflate,sdch")
	//	reqest.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	reqest.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
	reqest.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	reqest.Header.Set("Cache-Control", "max-age=0")
	reqest.Header.Set("Connection", "keep-alive")
	reqest.Header.Set("Referer", url)

	if len(cookie) > 0 {
		log.Debug("dealing with cookie:" + cookie)
		array := strings.Split(cookie, ";")
		for item := range array {
			array2 := strings.Split(array[item], "=")
			if len(array2) == 2 {
				cookieObj := http.Cookie{}
				cookieObj.Name = array2[0]
				cookieObj.Value = array2[1]
				reqest.AddCookie(&cookieObj)
			} else {
				log.Info("error,index out of range:" + array[item])
			}
		}
	}

	resp, err := client.Do(reqest)

	if err != nil {
		log.Error(url, err)
		return nil
	}

	defer resp.Body.Close()

	var reader io.ReadCloser
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		reader, err = gzip.NewReader(resp.Body)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		defer reader.Close()
	default:
		reader = resp.Body
	}

	if reader != nil {
		body, err := ioutil.ReadAll(reader)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		return body
	}
	return nil
}
Example #19
0
//fetch url's content
func fetchUrl(url []byte, timeout time.Duration, runtimeConfig RuntimeConfig, offsets *RoutingOffset) {
	t := time.NewTimer(timeout)
	defer t.Stop()

	resource := string(url)

	log.Debug("enter fetchUrl method:", resource)

	config := runtimeConfig.TaskConfig

	if runtimeConfig.Storage.CheckFetchedUrl(url) {
		return
	}

	path := getSavedPath(runtimeConfig, url)

	if runtimeConfig.Storage.CheckSavedFile(path) {
		log.Warn("file is already saved,skip fetch.", path)
		runtimeConfig.Storage.AddSavedUrl(url)

		//re-parse local's previous saved page
		if runtimeConfig.ParseUrlsFromPreviousSavedPage {
			if !runtimeConfig.Storage.CheckParsedFile([]byte(path)) {
				log.Debug("previous saved page send to parse-queue:", path)
				runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
			}
		}
		return
	}

	//checking fetchUrlPattern
	log.Debug("started check fetchUrlPattern,", config.FetchUrlPattern, ",", resource)
	if config.FetchUrlPattern.Match(url) {
		log.Debug("match fetch url pattern,", resource)
		if len(config.FetchUrlMustNotContain) > 0 {
			if util.ContainStr(resource, config.FetchUrlMustNotContain) {
				log.Debug("hit FetchUrlMustNotContain,ignore,", resource, " , ", config.FetchUrlMustNotContain)
				return
			}
		}

		if len(config.FetchUrlMustContain) > 0 {
			if !util.ContainStr(resource, config.FetchUrlMustContain) {
				log.Debug("not hit FetchUrlMustContain,ignore,", resource, " , ", config.FetchUrlMustContain)
				return
			}
		}
	} else {
		log.Debug("does not hit FetchUrlPattern ignoring,", resource)
		return
	}

	log.Debug("start fetch url,", resource)
	flg := make(chan bool, 1)

	go func() {

		body, err := HttpGetWithCookie(resource, config.Cookie)

		if err == nil {
			if body != nil {
				//todo parse urls from this page
				log.Debug("started check savingUrlPattern,", config.SavingUrlPattern, ",", string(url))
				if config.SavingUrlPattern.Match(url) {
					log.Debug("match saving url pattern,", resource)
					if len(config.SavingUrlMustNotContain) > 0 {
						if util.ContainStr(resource, config.SavingUrlMustNotContain) {
							log.Debug("hit SavingUrlMustNotContain,ignore,", resource, " , ", config.SavingUrlMustNotContain)
							goto exitPage
						}
					}

					if len(config.SavingUrlMustContain) > 0 {
						if !util.ContainStr(resource, config.SavingUrlMustContain) {
							log.Debug("not hit SavingUrlMustContain,ignore,", resource, " , ", config.SavingUrlMustContain)
							goto exitPage
						}
					}

					_, err := Save(runtimeConfig, path, body)
					if err == nil {
						log.Info("saved:", path)
						//todo saved per shard
						runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
					} else {
						log.Info("error while saved:", path, ",", err)
						goto exitPage
					}

				} else {
					log.Debug("does not hit SavingUrlPattern ignoring,", resource)
				}
			}
			runtimeConfig.Storage.AddFetchedUrl(url)
		exitPage:
			log.Debug("exit fetchUrl method:", resource)
		} else {
			//			runtimeConfig.Storage.AddFetchFailedUrl(url)
			runtimeConfig.Storage.LogFetchFailedUrl(runtimeConfig.PathConfig.FetchFailedLog, resource)
		}
		flg <- true
	}()

	//监听通道,由于设有超时,不可能泄露
	select {
	case <-t.C:
		log.Error("fetching url time out,", resource)
	case <-flg:
		log.Debug("fetching url normal exit,", resource)
		return
	}

}
Example #20
0
func main() {
	flag.StringVar(&seedUrl, "seed", "http://example.com", "the seed url,where everything starts")
	flag.StringVar(&logLevel, "log", "info", "setting log level,options:trace,debug,info,warn,error")

	flag.Parse()

	defer log.Flush()

	runtime.GOMAXPROCS(runtime.NumCPU())

	log.SetInitLogging(logLevel)

	runtimeConfig.PathConfig = new(PathConfig)
	runtimeConfig.ClusterConfig = new(ClusterConfig)

	runtimeConfig.ClusterConfig.Name = config.GetStringConfig("cluster", "name", "gopa")

	// per cluster:data/gopa/
	runtimeConfig.PathConfig.Home = config.GetStringConfig("path", "home", "cluster/"+runtimeConfig.ClusterConfig.Name+"/")

	runtimeConfig.PathConfig.Data = config.GetStringConfig("path", "data", "")
	if runtimeConfig.PathConfig.Data == "" {
		runtimeConfig.PathConfig.Data = runtimeConfig.PathConfig.Home + "/" + "data/"
	}

	runtimeConfig.PathConfig.Log = config.GetStringConfig("path", "log", "")
	if runtimeConfig.PathConfig.Log == "" {
		runtimeConfig.PathConfig.Log = runtimeConfig.PathConfig.Home + "/" + "log/"
	}

	runtimeConfig.PathConfig.WebData = config.GetStringConfig("path", "webdata", "")
	if runtimeConfig.PathConfig.WebData == "" {
		runtimeConfig.PathConfig.WebData = runtimeConfig.PathConfig.Data + "/" + "webdata/"
	}

	runtimeConfig.PathConfig.TaskData = config.GetStringConfig("path", "taskdata", "")
	if runtimeConfig.PathConfig.TaskData == "" {
		runtimeConfig.PathConfig.TaskData = runtimeConfig.PathConfig.Data + "/" + "taskdata/"
	}

	runtimeConfig.StoreWebPageTogether = config.GetBoolConfig("Global", "StoreWebPageTogether", true)

	runtimeConfig.TaskConfig = parseConfig()

	//set default logging
	logPath := runtimeConfig.PathConfig.Log + "/" + runtimeConfig.TaskConfig.Name + "/gopa.log"
	log.SetLogging(logLevel, logPath)

	runtimeConfig.ParseUrlsFromSavedFileLog = config.GetBoolConfig("Switch", "ParseUrlsFromSavedFileLog", true)
	runtimeConfig.LoadTemplatedFetchJob = config.GetBoolConfig("Switch", "LoadTemplatedFetchJob", true)
	runtimeConfig.LoadRuledFetchJob = config.GetBoolConfig("Switch", "LoadRuledFetchJob", false)
	runtimeConfig.LoadPendingFetchJobs = config.GetBoolConfig("Switch", "LoadPendingFetchJobs", true)
	runtimeConfig.HttpEnabled = config.GetBoolConfig("Switch", "HttpEnabled", true)
	runtimeConfig.ParseUrlsFromPreviousSavedPage = config.GetBoolConfig("Switch", "ParseUrlsFromPreviousSavedPage", false)
	runtimeConfig.ArrayStringSplitter = config.GetStringConfig("CrawlerRule", "ArrayStringSplitter", ",")

	runtimeConfig.GoProfEnabled = config.GetBoolConfig("CrawlerRule", "GoProfEnabled", false)

	runtimeConfig.WalkBloomFilterFileName = config.GetStringConfig("BloomFilter", "WalkBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/walk.bloomfilter")
	runtimeConfig.FetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "FetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/fetch.bloomfilter")
	runtimeConfig.ParseBloomFilterFileName = config.GetStringConfig("BloomFilter", "ParseBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/parse.bloomfilter")
	runtimeConfig.PendingFetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "PendingFetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/pending_fetch.bloomfilter")

	runtimeConfig.PathConfig.SavedFileLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_parse.files"
	runtimeConfig.PathConfig.PendingFetchLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_fetch.urls"
	runtimeConfig.PathConfig.FetchFailedLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/failed_fetch.urls"

	runtimeConfig.MaxGoRoutine = config.GetIntConfig("Global", "MaxGoRoutine", 2)
	if runtimeConfig.MaxGoRoutine < 2 {
		runtimeConfig.MaxGoRoutine = 2
	}

	log.Debug("maxGoRoutine:", runtimeConfig.MaxGoRoutine)
	log.Debug("path.home:", runtimeConfig.PathConfig.Home)

	os.MkdirAll(runtimeConfig.PathConfig.Home, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Data, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Log, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.WebData, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.TaskData, 0777)

	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath, 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/tasks/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/filters/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/urls/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.WebDataPath, 0777)

	runtimeConfig.RuledFetchConfig = new(RuledFetchConfig)
	runtimeConfig.RuledFetchConfig.UrlTemplate = config.GetStringConfig("RuledFetch", "UrlTemplate", "")
	runtimeConfig.RuledFetchConfig.From = config.GetIntConfig("RuledFetch", "From", 0)
	runtimeConfig.RuledFetchConfig.To = config.GetIntConfig("RuledFetch", "To", 10)
	runtimeConfig.RuledFetchConfig.Step = config.GetIntConfig("RuledFetch", "Step", 1)
	runtimeConfig.RuledFetchConfig.LinkExtractPattern = config.GetStringConfig("RuledFetch", "LinkExtractPattern", "")
	runtimeConfig.RuledFetchConfig.LinkTemplate = config.GetStringConfig("RuledFetch", "LinkTemplate", "")

	if seedUrl == "" || seedUrl == "http://example.com" {
		log.Error("no seed was given. type:\"gopa -h\" for help.")
		os.Exit(1)
	}

	log.Info("[gopa] " + runtimeConfig.Version + " is on.")

	runtimeConfig.Storage = &fsstore.FsStore{}

	//	if(runtimeConfig.)

	runtimeConfig.Storage.InitWalkBloomFilter(runtimeConfig.WalkBloomFilterFileName)
	runtimeConfig.Storage.InitFetchBloomFilter(runtimeConfig.FetchBloomFilterFileName)
	runtimeConfig.Storage.InitParseBloomFilter(runtimeConfig.ParseBloomFilterFileName)
	runtimeConfig.Storage.InitPendingFetchBloomFilter(runtimeConfig.PendingFetchBloomFilterFileName)

	//	atr:="AZaz"
	//	btr:=[]byte(atr)
	//	fmt.Println(btr)
	//
	//	id:= getSeqStr([]byte("AA"),[]byte("ZZ"),false)
	//	fmt.Println(id)

	//pprof serves
	if runtimeConfig.GoProfEnabled {
		go func() {
			log.Info(http.ListenAndServe("localhost:6060", nil))
			log.Info("pprof server is up,http://localhost:6060/debug/pprof")
		}()
	}

	//http serves
	if runtimeConfig.HttpEnabled {
		go func() {
			httpServ.Start(runtimeConfig)
		}()
	}

	//adding default http protocol
	if !strings.HasPrefix(seedUrl, "http") {
		seedUrl = "http://" + seedUrl
	}

	maxGoRoutine := runtimeConfig.MaxGoRoutine
	fetchQuitChannels := make([]*chan bool, maxGoRoutine)   //shutdownSignal signals for each go routing
	fetchTaskChannels := make([]*chan []byte, maxGoRoutine) //fetchTask channels
	fetchOffsets := make([]*RoutingOffset, maxGoRoutine)    //kafka fetchOffsets

	parseQuitChannels := make([]*chan bool, 2) //shutdownSignal signals for each go routing
	//	parseQuitChannels := make([]*chan bool, MaxGoRoutine) //shutdownSignal signals for each go routing
	parseOffsets := make([]*RoutingOffset, maxGoRoutine) //kafka fetchOffsets

	shutdownSignal := make(chan bool, 1)
	finalQuitSignal := make(chan bool, 1)

	//handle exit event
	exitEventChannel := make(chan os.Signal, 1)
	signal.Notify(exitEventChannel, syscall.SIGINT)
	signal.Notify(exitEventChannel, os.Interrupt)
	go func() {
		s := <-exitEventChannel
		log.Debug("got signal:", s)
		if s == os.Interrupt || s.(os.Signal) == syscall.SIGINT {
			log.Warn("got signal:os.Interrupt,saving data and exit")
			//			defer  os.Exit(0)

			runtimeConfig.Storage.PersistBloomFilter()

			//wait workers to exit
			log.Info("waiting workers exit")
			go shutdown(fetchOffsets, fetchQuitChannels, parseOffsets, parseQuitChannels, shutdownSignal)
			<-shutdownSignal
			log.Info("workers shutdown")
			finalQuitSignal <- true
		}
	}()

	//start fetcher
	for i := 0; i < maxGoRoutine; i++ {
		quitC := make(chan bool, 1)
		taskC := make(chan []byte)

		fetchQuitChannels[i] = &quitC
		fetchTaskChannels[i] = &taskC
		offset := new(RoutingOffset)
		//		offset.Offset = initOffset(runtimeConfig, "fetch", i)
		offset.Shard = i
		fetchOffsets[i] = offset

		go task.FetchGo(runtimeConfig, &taskC, &quitC, offset)
	}

	c2 := make(chan bool, 1)
	parseQuitChannels[0] = &c2
	offset2 := new(RoutingOffset)
	//	offset2.Offset = initOffset(runtimeConfig, "parse", 0)
	offset2.Shard = 0
	parseOffsets[0] = offset2
	pendingFetchUrls := make(chan []byte)

	//fetch rule:all urls -> persisted to sotre -> fetched from store -> pushed to pendingFetchUrls -> redistributed to sharded goroutines -> fetch -> save webpage to store -> done
	//parse rule:url saved to store -> local path persisted to store -> fetched to pendingParseFiles -> redistributed to sharded goroutines -> parse -> clean urls -> enqueue to url store ->done

	//sending feed to task queue
	go func() {
		//notice seed will not been persisted
		log.Debug("sending feed to fetch queue,", seedUrl)
		pendingFetchUrls <- []byte(seedUrl)
	}()

	//start local saved file parser
	if runtimeConfig.ParseUrlsFromSavedFileLog {
		go task.ParseGo(pendingFetchUrls, runtimeConfig, &c2, offset2)
	}

	//redistribute pendingFetchUrls to sharded workers
	go func() {
		for {
			url := <-pendingFetchUrls
			if !runtimeConfig.Storage.CheckWalkedUrl(url) {

				if runtimeConfig.Storage.CheckFetchedUrl(url) {
					log.Warn("dont hit walk bloomfilter but hit fetch bloomfilter,also ignore,", string(url))
					runtimeConfig.Storage.AddWalkedUrl(url)
					continue
				}

				randomShard := 0
				if maxGoRoutine > 1 {
					randomShard = rand.Intn(maxGoRoutine - 1)
				}
				log.Debug("publish:", string(url), ",shard:", randomShard)
				runtimeConfig.Storage.AddWalkedUrl(url)
				*fetchTaskChannels[randomShard] <- url
			} else {
				log.Trace("hit walk or fetch bloomfilter,just ignore,", string(url))
			}
		}
	}()

	//load predefined fetch jobs
	if runtimeConfig.LoadTemplatedFetchJob {
		go func() {

			if util.CheckFileExists(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt") {

				templates := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt")
				ids := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/id.txt")

				for _, id := range ids {
					for _, template := range templates {
						log.Trace("id:", id)
						log.Trace("template:", template)
						url := strings.Replace(template, "{id}", id, -1)
						log.Debug("new task from template:", url)
						pendingFetchUrls <- []byte(url)
					}
				}
				log.Info("templated download is done.")

			}

		}()
	}

	//fetch urls from saved pages
	if runtimeConfig.LoadPendingFetchJobs {
		c3 := make(chan bool, 1)
		parseQuitChannels[1] = &c3
		offset3 := new(RoutingOffset)
		//		offset3.Offset = initOffset(runtimeConfig, "fetch_from_saved", 0)
		offset3.Shard = 0
		parseOffsets[1] = offset3
		go task.LoadTaskFromLocalFile(pendingFetchUrls, &runtimeConfig, &c3, offset3)
	}

	//parse fetch failed jobs,and will ignore the walk-filter
	//TODO

	if runtimeConfig.LoadRuledFetchJob {
		log.Debug("start ruled fetch")
		go func() {
			if runtimeConfig.RuledFetchConfig.UrlTemplate != "" {
				for i := runtimeConfig.RuledFetchConfig.From; i <= runtimeConfig.RuledFetchConfig.To; i += runtimeConfig.RuledFetchConfig.Step {
					url := strings.Replace(runtimeConfig.RuledFetchConfig.UrlTemplate, "{id}", strconv.FormatInt(int64(i), 10), -1)
					log.Debug("add ruled url:", url)
					pendingFetchUrls <- []byte(url)
				}
			} else {
				log.Error("ruled template is empty,ignore")
			}
		}()

	}

	<-finalQuitSignal
	log.Info("[gopa] is down")
}
Example #21
0
File: fs.go Project: xujianhai/gopa
func (this *FsStore) TaskEnqueue(url []byte) {
	log.Info("task enqueue:", string(url))
}
Example #22
0
func extractLinks(runtimeConfig RuntimeConfig, fileUrl string, fileName []byte, body []byte) {

	//	siteUrlStr := string(fileName)
	//	siteUrlStr = strings.TrimLeft(siteUrlStr, "data/")
	//	siteUrlStr = "http://" + siteUrlStr
	//	log.Debug("fileName to Url:", string(fileName), ",", siteUrlStr)

	siteUrlStr := fileUrl
	siteConfig := runtimeConfig.TaskConfig

	siteUrlByte := []byte(siteUrlStr)
	log.Debug("enter links extract,", siteUrlStr)
	if siteConfig.SkipPageParsePattern.Match(siteUrlByte) {
		log.Debug("hit SkipPageParsePattern pattern,", siteUrlStr)
		return
	}

	log.Debug("parsing external links:", siteUrlStr, ",using:", siteConfig.LinkUrlExtractRegex)

	matches := siteConfig.LinkUrlExtractRegex.FindAllSubmatch(body, -1)
	log.Debug("extract links with pattern,total matchs:", len(matches), " match result,", string(fileName))
	xIndex := 0
	for _, match := range matches {
		log.Debug("dealing with match result,", xIndex)
		xIndex = xIndex + 1
		url := match[siteConfig.LinkUrlExtractRegexGroupIndex]
		filterUrl := formatUrlForFilter(url)
		log.Debug("url clean result:", string(filterUrl), ",original url:", string(url))
		filteredUrl := string(filterUrl)

		//filter error link
		if filteredUrl == "" {
			log.Debug("filteredUrl is empty,continue")
			continue
		}

		result1 := strings.HasPrefix(filteredUrl, "#")
		if result1 {
			log.Debug("filteredUrl started with: # ,continue")
			continue
		}

		result2 := strings.HasPrefix(filteredUrl, "javascript:")
		if result2 {
			log.Debug("filteredUrl started with: javascript: ,continue")
			continue
		}

		hit := false

		//		l.Lock();
		//		defer l.Unlock();

		if runtimeConfig.Storage.CheckWalkedUrl(filterUrl) || runtimeConfig.Storage.CheckFetchedUrl(filterUrl) || runtimeConfig.Storage.CheckPendingFetchUrl(filterUrl) {
			log.Debug("hit bloomFilter,continue")
			hit = true
			continue
		}

		if !hit {
			currentUrlStr := string(url)
			currentUrlStr = strings.Trim(currentUrlStr, " ")

			seedUrlStr := siteUrlStr
			seedURI, err := ParseRequestURI(seedUrlStr)

			if err != nil {
				log.Error("ParseSeedURI failed!: ", seedUrlStr, " , ", err)
				continue
			}

			currentURI1, err := ParseRequestURI(currentUrlStr)
			currentURI := currentURI1
			if err != nil {
				if strings.Contains(err.Error(), "invalid URI for request") {
					log.Debug("invalid URI for request,fix relative url,original:", currentUrlStr)
					//					log.Debug("old relatived url,", currentUrlStr)
					//page based relative urls

					currentUrlStr = "http://" + seedURI.Host + "/" + currentUrlStr
					currentURI1, err = ParseRequestURI(currentUrlStr)
					currentURI = currentURI1
					if err != nil {
						log.Error("ParseCurrentURI internal failed!: ", currentUrlStr, " , ", err)
						continue
					}

					log.Debug("new relatived url,", currentUrlStr)

				} else {
					log.Error("ParseCurrentURI failed!: ", currentUrlStr, " , ", err)
					continue
				}
			}

			//			relative links
			if currentURI == nil || currentURI.Host == "" {
				if strings.HasPrefix(currentURI.Path, "/") {
					//root based relative urls
					log.Debug("old relatived url,", currentUrlStr)
					currentUrlStr = "http://" + seedURI.Host + currentUrlStr
					log.Debug("new relatived url,", currentUrlStr)
				} else {
					log.Debug("old relatived url,", currentUrlStr)
					//page based relative urls
					urlPath := getRootUrl(currentURI)
					currentUrlStr = "http://" + urlPath + currentUrlStr
					log.Debug("new relatived url,", currentUrlStr)
				}
			} else {
				log.Debug("host:", currentURI.Host, " ", currentURI.Host == "")

				//resolve domain specific filter
				if siteConfig.FollowSameDomain {

					if siteConfig.FollowSubDomain {

						//TODO handler com.cn and .com,using a TLC-domain list

					} else if seedURI.Host != currentURI.Host {
						log.Debug("domain mismatch,", seedURI.Host, " vs ", currentURI.Host)
						//continue
					}
					//TODO follow all or list of domain
				}
			}

			if len(siteConfig.LinkUrlMustContain) > 0 {
				if !util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustContain) {
					log.Debug("link does not hit must-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
					continue
				}
			}

			if len(siteConfig.LinkUrlMustNotContain) > 0 {
				if util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustNotContain) {
					log.Debug("link hit must-not-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
					continue
				}
			}

			//normalize url
			currentUrlStr = MustNormalizeURLString(currentUrlStr, FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes|
				FlagRemoveUnnecessaryHostDots|FlagRemoveDuplicateSlashes|FlagRemoveFragment)
			log.Debug("normalized url:", currentUrlStr)
			currentUrlByte := []byte(currentUrlStr)
			if !(runtimeConfig.Storage.CheckWalkedUrl(currentUrlByte) || runtimeConfig.Storage.CheckFetchedUrl(currentUrlByte) || runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte)) {
				//bloomFilter.Lookup(currentUrlByte) {

				//								if(CheckIgnore(currentUrlStr)){}

				//				log.Info("enqueue fetch: ", currentUrlStr)

				//				broker.Publish(kafka.NewMessage(currentUrlByte))

				//copied form fetchTask,TODO refactor
				//checking fetchUrlPattern
				log.Debug("started check fetchUrlPattern,", currentUrlStr)
				if siteConfig.FetchUrlPattern.Match(currentUrlByte) {
					log.Debug("match fetch url pattern,", currentUrlStr)
					if len(siteConfig.FetchUrlMustNotContain) > 0 {
						if util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustNotContain) {
							log.Debug("hit FetchUrlMustNotContain,ignore,", currentUrlStr)
							continue
						}
					}

					if len(siteConfig.FetchUrlMustContain) > 0 {
						if !util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustContain) {
							log.Debug("not hit FetchUrlMustContain,ignore,", currentUrlStr)
							continue
						}
					}
				} else {
					log.Debug("does not hit FetchUrlPattern ignoring,", currentUrlStr)
					continue
				}

				if !runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte) {
					log.Debug("log new pendingFetch url", currentUrlStr)
					runtimeConfig.Storage.LogPendingFetchUrl(runtimeConfig.PathConfig.PendingFetchLog, currentUrlStr)
					runtimeConfig.Storage.AddPendingFetchUrl(currentUrlByte)
				} else {
					log.Debug("hit new pendingFetch filter,ignore:", currentUrlStr)
				}
				//				pendingUrls <- currentUrlByte

				//	TODO pendingFetchFilter			bloomFilter.Add(currentUrlByte)
			} else {
				log.Debug("hit bloom filter,ignore:", currentUrlStr)
			}
			//			bloomFilter.Add([]byte(filterUrl))
		} else {
			log.Debug("hit bloom filter,ignore,", string(url))
		}
		log.Debug("exit links extract,", siteUrlStr)

	}

	//TODO 处理ruled fetch pattern

	log.Info("all links within ", siteUrlStr, " is done")
}