Example #1
0
func main() {
	flag.Parse()

	//volume master block
	log.Info("Volume Size Limit is", *volumeSizeLimitMB, "MB")
	mapper = directory.NewMapper(*metaFolder, "directory", uint64(*volumeSizeLimitMB)*1024*1024)

	//weedfs master handler
	http.HandleFunc("/dir/assign", dirAssignHandler)
	http.HandleFunc("/dir/lookup", dirLookupHandler)
	http.HandleFunc("/dir/join", dirJoinHandler)
	http.HandleFunc("/dir/status", dirStatusHandler)

	//start weedfs master
	go func() {
		log.Info("Start directory service at http://127.0.0.1:" + strconv.Itoa(*port))
		e := http.ListenAndServe(":"+strconv.Itoa(*port), nil)
		if e != nil {
			log.Error("Fail to start:", e)
		}
	}()

	//volume block
	//TODO: now default to 1G, this value should come from server?
	store = storage.NewStore(*storePort, *publicUrl, *chunkFolder, *volumes)
	defer store.Close()

	//weedfs volume handler
	http.HandleFunc("/", storeHandler)
	http.HandleFunc("/status", statusHandler)
	http.HandleFunc("/add_volume", addVolumeHandler)

	go func() {
		for {
			store.Join(*metaServer)
			time.Sleep(time.Duration(float32(*pulse*1e3)*(1+rand.Float32())) * time.Millisecond)
		}
	}()
	log.Info("store joined at", *metaServer)

	//start weedfs volume
	go func() {
		log.Info("Start storage service at http://127.0.0.1:"+strconv.Itoa(*storePort), "public url", *publicUrl)
		e := http.ListenAndServe(":"+strconv.Itoa(*storePort), nil)
		if e != nil {
			log.Error("Fail to start:", e)
		}
	}()

}
Example #2
0
func Post(url string, values url.Values) []byte {
	r, err := http.PostForm(url, values)
	if err != nil {
		log.Error("post:", err)
		return nil
	}
	defer r.Body.Close()
	b, err := ioutil.ReadAll(r.Body)
	if err != nil {
		log.Error("post:", err)
		return nil
	}
	return b
}
Example #3
0
File: fs.go Project: xujianhai/gopa
func (this *FsStore) Get(key string) []byte {
	file, error := util.FileGetContent(key)
	if error != nil {
		log.Error("get file:", key, error)
	}
	return file
}
Example #4
0
func NewNeedle(r *http.Request) (n *Needle, e error) {

	n = new(Needle)
	form, fe := r.MultipartReader()
	if fe != nil {
		log.Error("MultipartReader [ERROR] %s\n", fe)
		e = fe
		return
	}
	part, _ := form.NextPart()
	//log.Println("uploading file " + part.FileName())
	data, _ := ioutil.ReadAll(part)
	n.Data = data

	commaSep := strings.LastIndex(r.URL.Path, ",")
	dotSep := strings.LastIndex(r.URL.Path, ".")
	fid := r.URL.Path[commaSep+1:]
	if dotSep > 0 {
		fid = r.URL.Path[commaSep+1 : dotSep]
	}

	n.ParsePath(fid)

	return
}
Example #5
0
func NewVolume(dirname string, id uint32) (v *Volume) {
	var e error
	v = &Volume{dir: dirname, Id: id}
	fileName := strconv.FormatUint(uint64(v.Id), 10)
	v.dataFile, e = os.OpenFile(path.Join(v.dir, fileName+".dat"), os.O_RDWR|os.O_CREATE, 0644)
	if e != nil {
		log.Error("New Volume [ERROR] %s\n", e)
	}
	v.maybeWriteSuperBlock()
	indexFile, ie := os.OpenFile(path.Join(v.dir, fileName+".idx"), os.O_RDWR|os.O_CREATE, 0644)
	if ie != nil {
		log.Error("Write Volume Index [ERROR] %s\n", ie)
	}
	v.nm = LoadNeedleMap(indexFile)

	return
}
Example #6
0
func CopyFile(src, dst string) (w int64, err error) {
	srcFile, err := os.Open(src)
	if err != nil {
		log.Error(err.Error())
		return
	}
	defer srcFile.Close()

	dstFile, err := os.Create(dst)

	if err != nil {
		log.Error(err.Error())
		return
	}

	defer dstFile.Close()

	return io.Copy(dstFile, srcFile)
}
Example #7
0
func dirAssignHandler(w http.ResponseWriter, r *http.Request) {
	c := r.FormValue("count")
	fid, count, machine, err := mapper.PickForWrite(c)
	if err == nil {
		writeJson(w, r, map[string]string{"fid": fid, "url": machine.Url, "publicUrl": machine.PublicUrl, "count": strconv.Itoa(count)})
	} else {
		log.Error(err)
		writeJson(w, r, map[string]string{"error": err.Error()})
	}
}
Example #8
0
func (m *Mapper) saveSequence() {
	log.Info("Saving file id sequence", m.FileIdSequence, "to", path.Join(m.dir, m.fileName+".seq"))
	seqFile, e := os.OpenFile(path.Join(m.dir, m.fileName+".seq"), os.O_CREATE|os.O_WRONLY, 0644)
	if e != nil {
		log.Error("Sequence File Save [ERROR] %s,", e)
	}
	defer seqFile.Close()
	encoder := gob.NewEncoder(seqFile)
	encoder.Encode(m.FileIdSequence)
}
Example #9
0
File: fs.go Project: xujianhai/gopa
func (this *FsStore) LoadOffset(fileName string) int64 {
	log.Debug("start init offsets,", fileName)
	if util.CheckFileExists(fileName) {
		log.Debug("found offset file,start loading,", fileName)
		n, err := ioutil.ReadFile(fileName)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		ret, err := strconv.ParseInt(string(n), 10, 64)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		log.Info("init offsets successfully,", fileName, ":", ret)
		return int64(ret)
	}

	return 0
}
Example #10
0
func ParseFileId(fid string) *FileId {
	a := strings.Split(fid, ",")
	if len(a) != 2 {
		log.Error("Invalid fid", fid, ", split length", len(a))
		return nil
	}
	vid_string, key_hash_string := a[0], a[1]
	vid, _ := strconv.ParseUint(vid_string, 10, 64)
	key, hash := storage.ParseKeyHash(key_hash_string)
	return &FileId{VolumeId: uint32(vid), Key: key, Hashcode: hash}
}
Example #11
0
func ParseKeyHash(key_hash_string string) (uint64, uint32) {
	key_hash_bytes, khe := hex.DecodeString(key_hash_string)
	key_hash_len := len(key_hash_bytes)
	if khe != nil || key_hash_len <= 4 {
		log.Error("Invalid key_hash", key_hash_string, "length:", key_hash_len, "error", khe)
		return 0, 0
	}
	key := util.BytesToUint64(key_hash_bytes[0 : key_hash_len-4])
	hash := util.BytesToUint32(key_hash_bytes[key_hash_len-4 : key_hash_len])
	return key, hash
}
Example #12
0
File: fs.go Project: xujianhai/gopa
func initBloomFilter(bloomFilterPersistFileName string) *Filter {
	var bloomFilter = new(Filter)
	//loading or initializing bloom filter
	if util.CheckFileExists(bloomFilterPersistFileName) {
		log.Debug("found bloomFilter,start reload,", bloomFilterPersistFileName)
		n, err := ioutil.ReadFile(bloomFilterPersistFileName)
		if err != nil {
			log.Error("bloomFilter:", bloomFilterPersistFileName, err)
		}
		if err := bloomFilter.GobDecode(n); err != nil {
			log.Error("bloomFilter:", bloomFilterPersistFileName, err)
		}
		log.Info("bloomFilter successfully reloaded:", bloomFilterPersistFileName)
	} else {
		probItems := config.GetIntConfig("BloomFilter", "ItemSize", 100000)
		log.Debug("initializing bloom-filter", bloomFilterPersistFileName, ",virual size is,", probItems)
		bloomFilter = NewFilter(fnv.New64(), probItems)
		log.Info("bloomFilter successfully initialized:", bloomFilterPersistFileName)
	}
	return bloomFilter
}
Example #13
0
func loadFileContent(fileName string) []byte {
	if util.CheckFileExists(fileName) {
		log.Trace("found fileName,start loading:", fileName)
		n, err := ioutil.ReadFile(fileName)
		if err != nil {
			log.Error("loadFile", err, ",", fileName)
			return nil
		}
		return n
	}
	return nil
}
Example #14
0
File: fs.go Project: xujianhai/gopa
func (this *FsStore) PersistOffset(fileName string, offset int64) {
	//persist worker's offset
	path := fileName + ".tmp"
	fout, error := os.Create(path)
	if error != nil {
		log.Error(path, error)
		return
	}

	defer fout.Close()
	log.Debug("saved offset:", fileName, ":", offset)
	fout.Write([]byte(strconv.FormatInt(offset, 10)))
	util.CopyFile(path, fileName)
}
Example #15
0
File: fs.go Project: xujianhai/gopa
func persistBloomFilter(bloomFilterPersistFileName string, bloomFilter *Filter) {

	//save bloom-filter
	m, err := bloomFilter.GobEncode()
	if err != nil {
		log.Error(err)
		return
	}
	err = ioutil.WriteFile(bloomFilterPersistFileName, m, 0600)
	if err != nil {
		panic(err)
		return
	}
	log.Info("bloomFilter safety persisted.")
}
Example #16
0
//weedfs master
func dirLookupHandler(w http.ResponseWriter, r *http.Request) {
	vid := r.FormValue("volumeId")
	commaSep := strings.Index(vid, ",")
	if commaSep > 0 {
		vid = vid[0:commaSep]
	}
	volumeId, _ := strconv.ParseUint(vid, 10, 64)
	machine, e := mapper.Get(uint32(volumeId))
	if e == nil {
		writeJson(w, r, machine.Server)
	} else {
		log.Error("Invalid volume id", volumeId)
		writeJson(w, r, map[string]string{"error": "volume id " + strconv.FormatUint(volumeId, 10) + " not found"})
	}
}
Example #17
0
func ReadAllLines(file string) []string {
	lines := []string{}
	f, err := os.Open(file)
	if err != nil {
		log.Error("error opening file,", file, " ", err)
		os.Exit(1)
	}

	r := bufio.NewReader(f)
	s, e := Readln(r)
	lines = append(lines, s)
	for e == nil {
		s, e = Readln(r)
		if s != "" {
			lines = append(lines, s)
		}
	}

	return lines
}
Example #18
0
func (n *Needle) ParsePath(fid string) {
	length := len(fid)
	if length <= 8 {
		if length > 0 {
			log.Error("Invalid fid", fid, "length", length)
		}
		return
	}
	delta := ""
	deltaIndex := strings.LastIndex(fid, "_")
	if deltaIndex > 0 {
		fid, delta = fid[0:deltaIndex], fid[deltaIndex+1:]
	}
	n.Key, n.Cookie = ParseKeyHash(fid)
	if delta != "" {
		d, e := strconv.ParseUint(delta, 10, 64)
		if e == nil {
			n.Key += d
		}
	}
}
Example #19
0
func HttpGet(resource string) (msg []byte, err error) {

	//	//validate url
	//	host, err := ParseRequestURI(resource)
	//	if err != nil {
	//		log.Error(resource,err)
	//		return nil,err
	//	}

	//	//check domain
	//	_, err =net.LookupIP(host.Host)
	//	if err != nil {
	//		log.Error(resource,err)
	//		return nil,err
	//	}

	client := &http.Client{
		Transport: &http.Transport{
			Dial: func(netw, addr string) (net.Conn, error) {
				deadline := time.Now().Add(10 * time.Second)
				c, err := net.DialTimeout(netw, addr, 5*time.Second) //连接超时时间
				if err != nil {
					log.Error(resource, err)
					return nil, err
				}

				c.SetDeadline(deadline)
				return c, nil
			},
		},
	}

	req, err := http.NewRequest("GET", resource, nil)

	if err != nil {
		log.Error(resource, err)
		return nil, err
	}

	//support gzip
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; gopa/0.1; +http://infinitbyte.com/gopa)")
	req.Header.Set("Accept-Encoding", "gzip")

	resp, err := client.Do(req)
	if err != nil {
		log.Error(resource, err)
		return nil, err
	}

	defer resp.Body.Close()

	var reader io.ReadCloser
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		reader, err = gzip.NewReader(resp.Body)
		if err != nil {
			log.Error(resource, err)
			return nil, err
		}
		defer reader.Close()
	default:
		reader = resp.Body
	}
	if reader != nil {
		body, err := ioutil.ReadAll(reader)
		if err != nil {
			log.Error(resource, err)
			return nil, err
		}
		return body, nil
	}
	return nil, http.ErrNotSupported
}
Example #20
0
func post(url string, cookie string, postStr string) []byte {

	log.Debug("let's post :" + url)

	client := &http.Client{
		CheckRedirect: nil,
	}

	postBytesReader := bytes.NewReader([]byte(postStr))
	reqest, _ := http.NewRequest("POST", url, postBytesReader)

	reqest.Header.Set("User-Agent", " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
	reqest.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	reqest.Header.Set("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	reqest.Header.Set("Accept-Encoding", "gzip,deflate,sdch")
	//	reqest.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	reqest.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
	reqest.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	reqest.Header.Set("Cache-Control", "max-age=0")
	reqest.Header.Set("Connection", "keep-alive")
	reqest.Header.Set("Referer", url)

	if len(cookie) > 0 {
		log.Debug("dealing with cookie:" + cookie)
		array := strings.Split(cookie, ";")
		for item := range array {
			array2 := strings.Split(array[item], "=")
			if len(array2) == 2 {
				cookieObj := http.Cookie{}
				cookieObj.Name = array2[0]
				cookieObj.Value = array2[1]
				reqest.AddCookie(&cookieObj)
			} else {
				log.Info("error,index out of range:" + array[item])
			}
		}
	}

	resp, err := client.Do(reqest)

	if err != nil {
		log.Error(url, err)
		return nil
	}

	defer resp.Body.Close()

	var reader io.ReadCloser
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		reader, err = gzip.NewReader(resp.Body)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		defer reader.Close()
	default:
		reader = resp.Body
	}

	if reader != nil {
		body, err := ioutil.ReadAll(reader)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		return body
	}
	return nil
}
Example #21
0
//fetch url's content
func fetchUrl(url []byte, timeout time.Duration, runtimeConfig RuntimeConfig, offsets *RoutingOffset) {
	t := time.NewTimer(timeout)
	defer t.Stop()

	resource := string(url)

	log.Debug("enter fetchUrl method:", resource)

	config := runtimeConfig.TaskConfig

	if runtimeConfig.Storage.CheckFetchedUrl(url) {
		return
	}

	path := getSavedPath(runtimeConfig, url)

	if runtimeConfig.Storage.CheckSavedFile(path) {
		log.Warn("file is already saved,skip fetch.", path)
		runtimeConfig.Storage.AddSavedUrl(url)

		//re-parse local's previous saved page
		if runtimeConfig.ParseUrlsFromPreviousSavedPage {
			if !runtimeConfig.Storage.CheckParsedFile([]byte(path)) {
				log.Debug("previous saved page send to parse-queue:", path)
				runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
			}
		}
		return
	}

	//checking fetchUrlPattern
	log.Debug("started check fetchUrlPattern,", config.FetchUrlPattern, ",", resource)
	if config.FetchUrlPattern.Match(url) {
		log.Debug("match fetch url pattern,", resource)
		if len(config.FetchUrlMustNotContain) > 0 {
			if util.ContainStr(resource, config.FetchUrlMustNotContain) {
				log.Debug("hit FetchUrlMustNotContain,ignore,", resource, " , ", config.FetchUrlMustNotContain)
				return
			}
		}

		if len(config.FetchUrlMustContain) > 0 {
			if !util.ContainStr(resource, config.FetchUrlMustContain) {
				log.Debug("not hit FetchUrlMustContain,ignore,", resource, " , ", config.FetchUrlMustContain)
				return
			}
		}
	} else {
		log.Debug("does not hit FetchUrlPattern ignoring,", resource)
		return
	}

	log.Debug("start fetch url,", resource)
	flg := make(chan bool, 1)

	go func() {

		body, err := HttpGetWithCookie(resource, config.Cookie)

		if err == nil {
			if body != nil {
				//todo parse urls from this page
				log.Debug("started check savingUrlPattern,", config.SavingUrlPattern, ",", string(url))
				if config.SavingUrlPattern.Match(url) {
					log.Debug("match saving url pattern,", resource)
					if len(config.SavingUrlMustNotContain) > 0 {
						if util.ContainStr(resource, config.SavingUrlMustNotContain) {
							log.Debug("hit SavingUrlMustNotContain,ignore,", resource, " , ", config.SavingUrlMustNotContain)
							goto exitPage
						}
					}

					if len(config.SavingUrlMustContain) > 0 {
						if !util.ContainStr(resource, config.SavingUrlMustContain) {
							log.Debug("not hit SavingUrlMustContain,ignore,", resource, " , ", config.SavingUrlMustContain)
							goto exitPage
						}
					}

					_, err := Save(runtimeConfig, path, body)
					if err == nil {
						log.Info("saved:", path)
						//todo saved per shard
						runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
					} else {
						log.Info("error while saved:", path, ",", err)
						goto exitPage
					}

				} else {
					log.Debug("does not hit SavingUrlPattern ignoring,", resource)
				}
			}
			runtimeConfig.Storage.AddFetchedUrl(url)
		exitPage:
			log.Debug("exit fetchUrl method:", resource)
		} else {
			//			runtimeConfig.Storage.AddFetchFailedUrl(url)
			runtimeConfig.Storage.LogFetchFailedUrl(runtimeConfig.PathConfig.FetchFailedLog, resource)
		}
		flg <- true
	}()

	//监听通道,由于设有超时,不可能泄露
	select {
	case <-t.C:
		log.Error("fetching url time out,", resource)
	case <-flg:
		log.Debug("fetching url normal exit,", resource)
		return
	}

}
Example #22
0
func extractLinks(runtimeConfig RuntimeConfig, fileUrl string, fileName []byte, body []byte) {

	//	siteUrlStr := string(fileName)
	//	siteUrlStr = strings.TrimLeft(siteUrlStr, "data/")
	//	siteUrlStr = "http://" + siteUrlStr
	//	log.Debug("fileName to Url:", string(fileName), ",", siteUrlStr)

	siteUrlStr := fileUrl
	siteConfig := runtimeConfig.TaskConfig

	siteUrlByte := []byte(siteUrlStr)
	log.Debug("enter links extract,", siteUrlStr)
	if siteConfig.SkipPageParsePattern.Match(siteUrlByte) {
		log.Debug("hit SkipPageParsePattern pattern,", siteUrlStr)
		return
	}

	log.Debug("parsing external links:", siteUrlStr, ",using:", siteConfig.LinkUrlExtractRegex)

	matches := siteConfig.LinkUrlExtractRegex.FindAllSubmatch(body, -1)
	log.Debug("extract links with pattern,total matchs:", len(matches), " match result,", string(fileName))
	xIndex := 0
	for _, match := range matches {
		log.Debug("dealing with match result,", xIndex)
		xIndex = xIndex + 1
		url := match[siteConfig.LinkUrlExtractRegexGroupIndex]
		filterUrl := formatUrlForFilter(url)
		log.Debug("url clean result:", string(filterUrl), ",original url:", string(url))
		filteredUrl := string(filterUrl)

		//filter error link
		if filteredUrl == "" {
			log.Debug("filteredUrl is empty,continue")
			continue
		}

		result1 := strings.HasPrefix(filteredUrl, "#")
		if result1 {
			log.Debug("filteredUrl started with: # ,continue")
			continue
		}

		result2 := strings.HasPrefix(filteredUrl, "javascript:")
		if result2 {
			log.Debug("filteredUrl started with: javascript: ,continue")
			continue
		}

		hit := false

		//		l.Lock();
		//		defer l.Unlock();

		if runtimeConfig.Storage.CheckWalkedUrl(filterUrl) || runtimeConfig.Storage.CheckFetchedUrl(filterUrl) || runtimeConfig.Storage.CheckPendingFetchUrl(filterUrl) {
			log.Debug("hit bloomFilter,continue")
			hit = true
			continue
		}

		if !hit {
			currentUrlStr := string(url)
			currentUrlStr = strings.Trim(currentUrlStr, " ")

			seedUrlStr := siteUrlStr
			seedURI, err := ParseRequestURI(seedUrlStr)

			if err != nil {
				log.Error("ParseSeedURI failed!: ", seedUrlStr, " , ", err)
				continue
			}

			currentURI1, err := ParseRequestURI(currentUrlStr)
			currentURI := currentURI1
			if err != nil {
				if strings.Contains(err.Error(), "invalid URI for request") {
					log.Debug("invalid URI for request,fix relative url,original:", currentUrlStr)
					//					log.Debug("old relatived url,", currentUrlStr)
					//page based relative urls

					currentUrlStr = "http://" + seedURI.Host + "/" + currentUrlStr
					currentURI1, err = ParseRequestURI(currentUrlStr)
					currentURI = currentURI1
					if err != nil {
						log.Error("ParseCurrentURI internal failed!: ", currentUrlStr, " , ", err)
						continue
					}

					log.Debug("new relatived url,", currentUrlStr)

				} else {
					log.Error("ParseCurrentURI failed!: ", currentUrlStr, " , ", err)
					continue
				}
			}

			//			relative links
			if currentURI == nil || currentURI.Host == "" {
				if strings.HasPrefix(currentURI.Path, "/") {
					//root based relative urls
					log.Debug("old relatived url,", currentUrlStr)
					currentUrlStr = "http://" + seedURI.Host + currentUrlStr
					log.Debug("new relatived url,", currentUrlStr)
				} else {
					log.Debug("old relatived url,", currentUrlStr)
					//page based relative urls
					urlPath := getRootUrl(currentURI)
					currentUrlStr = "http://" + urlPath + currentUrlStr
					log.Debug("new relatived url,", currentUrlStr)
				}
			} else {
				log.Debug("host:", currentURI.Host, " ", currentURI.Host == "")

				//resolve domain specific filter
				if siteConfig.FollowSameDomain {

					if siteConfig.FollowSubDomain {

						//TODO handler com.cn and .com,using a TLC-domain list

					} else if seedURI.Host != currentURI.Host {
						log.Debug("domain mismatch,", seedURI.Host, " vs ", currentURI.Host)
						//continue
					}
					//TODO follow all or list of domain
				}
			}

			if len(siteConfig.LinkUrlMustContain) > 0 {
				if !util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustContain) {
					log.Debug("link does not hit must-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
					continue
				}
			}

			if len(siteConfig.LinkUrlMustNotContain) > 0 {
				if util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustNotContain) {
					log.Debug("link hit must-not-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
					continue
				}
			}

			//normalize url
			currentUrlStr = MustNormalizeURLString(currentUrlStr, FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes|
				FlagRemoveUnnecessaryHostDots|FlagRemoveDuplicateSlashes|FlagRemoveFragment)
			log.Debug("normalized url:", currentUrlStr)
			currentUrlByte := []byte(currentUrlStr)
			if !(runtimeConfig.Storage.CheckWalkedUrl(currentUrlByte) || runtimeConfig.Storage.CheckFetchedUrl(currentUrlByte) || runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte)) {
				//bloomFilter.Lookup(currentUrlByte) {

				//								if(CheckIgnore(currentUrlStr)){}

				//				log.Info("enqueue fetch: ", currentUrlStr)

				//				broker.Publish(kafka.NewMessage(currentUrlByte))

				//copied form fetchTask,TODO refactor
				//checking fetchUrlPattern
				log.Debug("started check fetchUrlPattern,", currentUrlStr)
				if siteConfig.FetchUrlPattern.Match(currentUrlByte) {
					log.Debug("match fetch url pattern,", currentUrlStr)
					if len(siteConfig.FetchUrlMustNotContain) > 0 {
						if util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustNotContain) {
							log.Debug("hit FetchUrlMustNotContain,ignore,", currentUrlStr)
							continue
						}
					}

					if len(siteConfig.FetchUrlMustContain) > 0 {
						if !util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustContain) {
							log.Debug("not hit FetchUrlMustContain,ignore,", currentUrlStr)
							continue
						}
					}
				} else {
					log.Debug("does not hit FetchUrlPattern ignoring,", currentUrlStr)
					continue
				}

				if !runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte) {
					log.Debug("log new pendingFetch url", currentUrlStr)
					runtimeConfig.Storage.LogPendingFetchUrl(runtimeConfig.PathConfig.PendingFetchLog, currentUrlStr)
					runtimeConfig.Storage.AddPendingFetchUrl(currentUrlByte)
				} else {
					log.Debug("hit new pendingFetch filter,ignore:", currentUrlStr)
				}
				//				pendingUrls <- currentUrlByte

				//	TODO pendingFetchFilter			bloomFilter.Add(currentUrlByte)
			} else {
				log.Debug("hit bloom filter,ignore:", currentUrlStr)
			}
			//			bloomFilter.Add([]byte(filterUrl))
		} else {
			log.Debug("hit bloom filter,ignore,", string(url))
		}
		log.Debug("exit links extract,", siteUrlStr)

	}

	//TODO 处理ruled fetch pattern

	log.Info("all links within ", siteUrlStr, " is done")
}
Example #23
0
File: fs.go Project: xujianhai/gopa
func (this *FsStore) AddFetchFailedUrl(url []byte) {
	//TODO
	log.Error("fetch failed url:", string(url))
}
Example #24
0
func main() {
	flag.StringVar(&seedUrl, "seed", "http://example.com", "the seed url,where everything starts")
	flag.StringVar(&logLevel, "log", "info", "setting log level,options:trace,debug,info,warn,error")

	flag.Parse()

	defer log.Flush()

	runtime.GOMAXPROCS(runtime.NumCPU())

	log.SetInitLogging(logLevel)

	runtimeConfig.PathConfig = new(PathConfig)
	runtimeConfig.ClusterConfig = new(ClusterConfig)

	runtimeConfig.ClusterConfig.Name = config.GetStringConfig("cluster", "name", "gopa")

	// per cluster:data/gopa/
	runtimeConfig.PathConfig.Home = config.GetStringConfig("path", "home", "cluster/"+runtimeConfig.ClusterConfig.Name+"/")

	runtimeConfig.PathConfig.Data = config.GetStringConfig("path", "data", "")
	if runtimeConfig.PathConfig.Data == "" {
		runtimeConfig.PathConfig.Data = runtimeConfig.PathConfig.Home + "/" + "data/"
	}

	runtimeConfig.PathConfig.Log = config.GetStringConfig("path", "log", "")
	if runtimeConfig.PathConfig.Log == "" {
		runtimeConfig.PathConfig.Log = runtimeConfig.PathConfig.Home + "/" + "log/"
	}

	runtimeConfig.PathConfig.WebData = config.GetStringConfig("path", "webdata", "")
	if runtimeConfig.PathConfig.WebData == "" {
		runtimeConfig.PathConfig.WebData = runtimeConfig.PathConfig.Data + "/" + "webdata/"
	}

	runtimeConfig.PathConfig.TaskData = config.GetStringConfig("path", "taskdata", "")
	if runtimeConfig.PathConfig.TaskData == "" {
		runtimeConfig.PathConfig.TaskData = runtimeConfig.PathConfig.Data + "/" + "taskdata/"
	}

	runtimeConfig.StoreWebPageTogether = config.GetBoolConfig("Global", "StoreWebPageTogether", true)

	runtimeConfig.TaskConfig = parseConfig()

	//set default logging
	logPath := runtimeConfig.PathConfig.Log + "/" + runtimeConfig.TaskConfig.Name + "/gopa.log"
	log.SetLogging(logLevel, logPath)

	runtimeConfig.ParseUrlsFromSavedFileLog = config.GetBoolConfig("Switch", "ParseUrlsFromSavedFileLog", true)
	runtimeConfig.LoadTemplatedFetchJob = config.GetBoolConfig("Switch", "LoadTemplatedFetchJob", true)
	runtimeConfig.LoadRuledFetchJob = config.GetBoolConfig("Switch", "LoadRuledFetchJob", false)
	runtimeConfig.LoadPendingFetchJobs = config.GetBoolConfig("Switch", "LoadPendingFetchJobs", true)
	runtimeConfig.HttpEnabled = config.GetBoolConfig("Switch", "HttpEnabled", true)
	runtimeConfig.ParseUrlsFromPreviousSavedPage = config.GetBoolConfig("Switch", "ParseUrlsFromPreviousSavedPage", false)
	runtimeConfig.ArrayStringSplitter = config.GetStringConfig("CrawlerRule", "ArrayStringSplitter", ",")

	runtimeConfig.GoProfEnabled = config.GetBoolConfig("CrawlerRule", "GoProfEnabled", false)

	runtimeConfig.WalkBloomFilterFileName = config.GetStringConfig("BloomFilter", "WalkBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/walk.bloomfilter")
	runtimeConfig.FetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "FetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/fetch.bloomfilter")
	runtimeConfig.ParseBloomFilterFileName = config.GetStringConfig("BloomFilter", "ParseBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/parse.bloomfilter")
	runtimeConfig.PendingFetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "PendingFetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/pending_fetch.bloomfilter")

	runtimeConfig.PathConfig.SavedFileLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_parse.files"
	runtimeConfig.PathConfig.PendingFetchLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_fetch.urls"
	runtimeConfig.PathConfig.FetchFailedLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/failed_fetch.urls"

	runtimeConfig.MaxGoRoutine = config.GetIntConfig("Global", "MaxGoRoutine", 2)
	if runtimeConfig.MaxGoRoutine < 2 {
		runtimeConfig.MaxGoRoutine = 2
	}

	log.Debug("maxGoRoutine:", runtimeConfig.MaxGoRoutine)
	log.Debug("path.home:", runtimeConfig.PathConfig.Home)

	os.MkdirAll(runtimeConfig.PathConfig.Home, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Data, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Log, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.WebData, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.TaskData, 0777)

	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath, 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/tasks/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/filters/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/urls/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.WebDataPath, 0777)

	runtimeConfig.RuledFetchConfig = new(RuledFetchConfig)
	runtimeConfig.RuledFetchConfig.UrlTemplate = config.GetStringConfig("RuledFetch", "UrlTemplate", "")
	runtimeConfig.RuledFetchConfig.From = config.GetIntConfig("RuledFetch", "From", 0)
	runtimeConfig.RuledFetchConfig.To = config.GetIntConfig("RuledFetch", "To", 10)
	runtimeConfig.RuledFetchConfig.Step = config.GetIntConfig("RuledFetch", "Step", 1)
	runtimeConfig.RuledFetchConfig.LinkExtractPattern = config.GetStringConfig("RuledFetch", "LinkExtractPattern", "")
	runtimeConfig.RuledFetchConfig.LinkTemplate = config.GetStringConfig("RuledFetch", "LinkTemplate", "")

	if seedUrl == "" || seedUrl == "http://example.com" {
		log.Error("no seed was given. type:\"gopa -h\" for help.")
		os.Exit(1)
	}

	log.Info("[gopa] " + runtimeConfig.Version + " is on.")

	runtimeConfig.Storage = &fsstore.FsStore{}

	//	if(runtimeConfig.)

	runtimeConfig.Storage.InitWalkBloomFilter(runtimeConfig.WalkBloomFilterFileName)
	runtimeConfig.Storage.InitFetchBloomFilter(runtimeConfig.FetchBloomFilterFileName)
	runtimeConfig.Storage.InitParseBloomFilter(runtimeConfig.ParseBloomFilterFileName)
	runtimeConfig.Storage.InitPendingFetchBloomFilter(runtimeConfig.PendingFetchBloomFilterFileName)

	//	atr:="AZaz"
	//	btr:=[]byte(atr)
	//	fmt.Println(btr)
	//
	//	id:= getSeqStr([]byte("AA"),[]byte("ZZ"),false)
	//	fmt.Println(id)

	//pprof serves
	if runtimeConfig.GoProfEnabled {
		go func() {
			log.Info(http.ListenAndServe("localhost:6060", nil))
			log.Info("pprof server is up,http://localhost:6060/debug/pprof")
		}()
	}

	//http serves
	if runtimeConfig.HttpEnabled {
		go func() {
			httpServ.Start(runtimeConfig)
		}()
	}

	//adding default http protocol
	if !strings.HasPrefix(seedUrl, "http") {
		seedUrl = "http://" + seedUrl
	}

	maxGoRoutine := runtimeConfig.MaxGoRoutine
	fetchQuitChannels := make([]*chan bool, maxGoRoutine)   //shutdownSignal signals for each go routing
	fetchTaskChannels := make([]*chan []byte, maxGoRoutine) //fetchTask channels
	fetchOffsets := make([]*RoutingOffset, maxGoRoutine)    //kafka fetchOffsets

	parseQuitChannels := make([]*chan bool, 2) //shutdownSignal signals for each go routing
	//	parseQuitChannels := make([]*chan bool, MaxGoRoutine) //shutdownSignal signals for each go routing
	parseOffsets := make([]*RoutingOffset, maxGoRoutine) //kafka fetchOffsets

	shutdownSignal := make(chan bool, 1)
	finalQuitSignal := make(chan bool, 1)

	//handle exit event
	exitEventChannel := make(chan os.Signal, 1)
	signal.Notify(exitEventChannel, syscall.SIGINT)
	signal.Notify(exitEventChannel, os.Interrupt)
	go func() {
		s := <-exitEventChannel
		log.Debug("got signal:", s)
		if s == os.Interrupt || s.(os.Signal) == syscall.SIGINT {
			log.Warn("got signal:os.Interrupt,saving data and exit")
			//			defer  os.Exit(0)

			runtimeConfig.Storage.PersistBloomFilter()

			//wait workers to exit
			log.Info("waiting workers exit")
			go shutdown(fetchOffsets, fetchQuitChannels, parseOffsets, parseQuitChannels, shutdownSignal)
			<-shutdownSignal
			log.Info("workers shutdown")
			finalQuitSignal <- true
		}
	}()

	//start fetcher
	for i := 0; i < maxGoRoutine; i++ {
		quitC := make(chan bool, 1)
		taskC := make(chan []byte)

		fetchQuitChannels[i] = &quitC
		fetchTaskChannels[i] = &taskC
		offset := new(RoutingOffset)
		//		offset.Offset = initOffset(runtimeConfig, "fetch", i)
		offset.Shard = i
		fetchOffsets[i] = offset

		go task.FetchGo(runtimeConfig, &taskC, &quitC, offset)
	}

	c2 := make(chan bool, 1)
	parseQuitChannels[0] = &c2
	offset2 := new(RoutingOffset)
	//	offset2.Offset = initOffset(runtimeConfig, "parse", 0)
	offset2.Shard = 0
	parseOffsets[0] = offset2
	pendingFetchUrls := make(chan []byte)

	//fetch rule:all urls -> persisted to sotre -> fetched from store -> pushed to pendingFetchUrls -> redistributed to sharded goroutines -> fetch -> save webpage to store -> done
	//parse rule:url saved to store -> local path persisted to store -> fetched to pendingParseFiles -> redistributed to sharded goroutines -> parse -> clean urls -> enqueue to url store ->done

	//sending feed to task queue
	go func() {
		//notice seed will not been persisted
		log.Debug("sending feed to fetch queue,", seedUrl)
		pendingFetchUrls <- []byte(seedUrl)
	}()

	//start local saved file parser
	if runtimeConfig.ParseUrlsFromSavedFileLog {
		go task.ParseGo(pendingFetchUrls, runtimeConfig, &c2, offset2)
	}

	//redistribute pendingFetchUrls to sharded workers
	go func() {
		for {
			url := <-pendingFetchUrls
			if !runtimeConfig.Storage.CheckWalkedUrl(url) {

				if runtimeConfig.Storage.CheckFetchedUrl(url) {
					log.Warn("dont hit walk bloomfilter but hit fetch bloomfilter,also ignore,", string(url))
					runtimeConfig.Storage.AddWalkedUrl(url)
					continue
				}

				randomShard := 0
				if maxGoRoutine > 1 {
					randomShard = rand.Intn(maxGoRoutine - 1)
				}
				log.Debug("publish:", string(url), ",shard:", randomShard)
				runtimeConfig.Storage.AddWalkedUrl(url)
				*fetchTaskChannels[randomShard] <- url
			} else {
				log.Trace("hit walk or fetch bloomfilter,just ignore,", string(url))
			}
		}
	}()

	//load predefined fetch jobs
	if runtimeConfig.LoadTemplatedFetchJob {
		go func() {

			if util.CheckFileExists(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt") {

				templates := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt")
				ids := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/id.txt")

				for _, id := range ids {
					for _, template := range templates {
						log.Trace("id:", id)
						log.Trace("template:", template)
						url := strings.Replace(template, "{id}", id, -1)
						log.Debug("new task from template:", url)
						pendingFetchUrls <- []byte(url)
					}
				}
				log.Info("templated download is done.")

			}

		}()
	}

	//fetch urls from saved pages
	if runtimeConfig.LoadPendingFetchJobs {
		c3 := make(chan bool, 1)
		parseQuitChannels[1] = &c3
		offset3 := new(RoutingOffset)
		//		offset3.Offset = initOffset(runtimeConfig, "fetch_from_saved", 0)
		offset3.Shard = 0
		parseOffsets[1] = offset3
		go task.LoadTaskFromLocalFile(pendingFetchUrls, &runtimeConfig, &c3, offset3)
	}

	//parse fetch failed jobs,and will ignore the walk-filter
	//TODO

	if runtimeConfig.LoadRuledFetchJob {
		log.Debug("start ruled fetch")
		go func() {
			if runtimeConfig.RuledFetchConfig.UrlTemplate != "" {
				for i := runtimeConfig.RuledFetchConfig.From; i <= runtimeConfig.RuledFetchConfig.To; i += runtimeConfig.RuledFetchConfig.Step {
					url := strings.Replace(runtimeConfig.RuledFetchConfig.UrlTemplate, "{id}", strconv.FormatInt(int64(i), 10), -1)
					log.Debug("add ruled url:", url)
					pendingFetchUrls <- []byte(url)
				}
			} else {
				log.Error("ruled template is empty,ignore")
			}
		}()

	}

	<-finalQuitSignal
	log.Info("[gopa] is down")
}