Example #1
func (self *Spider) RunPusher() error {
	log.Printf("Spider.RunPusher()\n")
	for {
		// try to grab URLs that need to be crawled
		urls := self.urls.DequeueN(self.pushCnt)
		if urls == nil {
			time.Sleep(self.interval)
			continue
		}

		log.Printf("Spider.RunPusher(): push urls %#v\n", urls)

		// tell the download manager to crawl them
		if err := self.cg.PushUrls(urls); err != nil {
			return err
		}

		// put them into the waiting list
		self.waitMutex.Lock()
		for _, url := range urls {
			self.waitUrls[url] = struct{}{}
		}
		self.waitMutex.Unlock()
		log.Printf("Spider.RunPusher(): pushed urls %#v\n", urls)
	}
}
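Note: the queue type behind self.urls is not shown in these examples. Below is a minimal sketch of what EnqueueAll/DequeueN/Len could look like, assuming a mutex-protected FIFO of strings that returns nil when empty ("sync" is assumed to be imported; the real implementation may differ).

// Hypothetical sketch of the string queue used by Spider and Caregiver.
type Queue struct {
	mutex sync.Mutex
	items []string
}

// EnqueueAll appends all given URLs to the tail of the queue.
func (q *Queue) EnqueueAll(urls ...string) {
	q.mutex.Lock()
	defer q.mutex.Unlock()
	q.items = append(q.items, urls...)
}

// DequeueN removes and returns up to n URLs from the head of the queue,
// or nil if the queue is empty.
func (q *Queue) DequeueN(n int) []string {
	q.mutex.Lock()
	defer q.mutex.Unlock()
	if len(q.items) == 0 {
		return nil
	}
	if n > len(q.items) {
		n = len(q.items)
	}
	res := make([]string, n)
	copy(res, q.items)
	q.items = q.items[n:]
	return res
}

// Len reports the number of queued URLs.
func (q *Queue) Len() int {
	q.mutex.Lock()
	defer q.mutex.Unlock()
	return len(q.items)
}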
Example #2
func (self *Caregiver) PushUrls(urls []string) error {
	log.Printf("Caregiver.PushUrls(%#v)\n", urls)
	data := map[string][]string{}
	for _, u := range urls {
		parsed, err := url.Parse(u)
		if err != nil {
			return errors.NewErr(err)
		}

		path := parsed.Path
		if len(path) != 0 && path[0] == '/' {
			path = path[1:]
		}
		if parsed.RawQuery != "" {
			path += "?"
			path += parsed.RawQuery
		}
		data[parsed.Host] = append(data[parsed.Host], path)
	}

	for host, paths := range data {
		self.mutex.Lock()
		hosts, ok := self.hosts[host]
		if !ok {
			hosts = self.getData(host)
			self.hosts[host] = hosts
		}
		self.mutex.Unlock()
		hosts.urls.EnqueueAll(paths...)
	}
	log.Printf("Caregiver.PushUrls(%#v) OK\n", urls)
	return nil
}
Example #3
func (self *Gatekeeper) Write(url, key string, data []byte) (Value, error) {
	log.Printf("Gatekeeper.Write(%v, %v)\n", url, key)

	if self.file.file == nil {
		f, err := os.Create(self.dir + "/" + strconv.Itoa(int(self.fNum)))
		if err != nil {
			return Value{}, errors.NewErr(err)
		}

		self.file = gkFile{
			file:   f,
			offset: 0,
			end:    time.Now().Add(self.maxTime),
		}
	} else if self.file.offset >= self.maxFileSize {
		if err := self.nextFile(); err != nil {
			return Value{}, err
		}
	} else if self.file.end.Before(time.Now()) {
		if err := self.file.file.Sync(); err != nil {
			return Value{}, errors.NewErr(err)
		}
		self.file.end = time.Now().Add(self.maxTime)
	}

	offset := self.file.offset

	cnt := 0
	n, err := self.file.WriteLenval([]byte(url))
	if err != nil {
		return Value{}, err
	}
	cnt += n

	n, err = self.file.WriteLenval(data)
	if err != nil {
		return Value{}, err
	}
	cnt += n

	self.file.offset += uint64(cnt)

	res := Value{
		FNum:   self.fNum,
		Offset: offset,
		Len:    uint64(cnt),
	}

	self.trie.Add([]byte(key), res)

	// TODO: remove
	self.file.file.Sync()

	log.Printf("Gatekeeper.Write(%v, %v) OK (%+v)\n", url, key, res)
	return res, nil
}
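WriteLenval is not shown in these examples; judging by its use above, it appends one length-prefixed record to the current data file and reports how many bytes it wrote. A minimal sketch under that assumption, using an 8-byte big-endian length prefix and the gkFile.file field from this example ("encoding/binary" is assumed to be imported; the real on-disk encoding may differ):

// Hypothetical sketch: append one length-prefixed record (8-byte
// big-endian length, then the raw bytes) and return the total number
// of bytes written.
func (f *gkFile) WriteLenval(data []byte) (int, error) {
	var lenBuf [8]byte
	binary.BigEndian.PutUint64(lenBuf[:], uint64(len(data)))
	n1, err := f.file.Write(lenBuf[:])
	if err != nil {
		return n1, err
	}
	n2, err := f.file.Write(data)
	return n1 + n2, err
}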
Example #4
func (self *Gatekeeper) Find(key string) (Value, bool) {
	log.Printf("Gatekeeper.Find(%+v)\n", key)
	res, ok := self.trie.Find([]byte(key))
	log.Printf("Gatekeeper.Find(%+v) OK (%v, %v)\n", key, res, ok)
	if !ok {
		return Value{}, false
	}

	return res.(Value), true
}
Example #5
// TODO: this should distribute URLs across shards and hand them out over the network
func (self *Spider) AddUrls(urls []string) {
	log.Printf("Spider.AddUrls(%#v)\n", urls)
	self.urls.EnqueueAll(urls...)
	// put them into the waiting list
	self.waitMutex.Lock()
	for _, url := range urls {
		self.waitUrls[url] = struct{}{}
	}
	self.waitMutex.Unlock()
	log.Printf("Spider.AddUrls(%#v) OK\n", urls)
}
Example #6
func (self *Resolver) ResolveAll(hosts []string) ([][]string, error) {
	log.Printf("Resolver.ResolveAll(%#v)\n", hosts)
	res := make([][]string, len(hosts))
	for i, host := range hosts {
		b, err := self.Resolve(host)
		if err != nil {
			return nil, err
		}

		res[i] = b
	}
	log.Printf("Resolver.ResolveAll(%#v) OK\n", hosts)
	return res, nil
}
Example #7
func (self *Downloader) DownloadAll(urls []string) ([]string, error) {
	log.Printf("Downloader.DownloadAll(%#v)!\n", urls)
	res := make([]string, len(urls))
	for i, url := range urls {
		b, err := self.Download(url)
		if err != nil {
			return nil, err
		}

		res[i] = b
	}
	log.Printf("Downloader.DownloadAll(%#v) OK!\n", urls)
	return res, nil
}
Example #8
func main() {
	var help = flag.Bool("help", false, "print help")
	var addr = flag.String("addr", "", "address to dial")
	var method = flag.String("method", "", "rpc method")
	var parg = flag.String("arg", "", "method argument")
	flag.Parse()

	if *help || *addr == "" || *method == "" || *parg == "" {
		flag.PrintDefaults()
		return
	}

	client, err := util.JsonRpcDial(*addr)
	if err != nil {
		log.Fatal(err)
	}

	var arg interface{}
	if err := json.Unmarshal([]byte(*parg), &arg); err != nil {
		log.Fatal(errors.NewErr(err))
	}

	var res interface{}
	if err := client.Call(*method, arg, &res); err != nil {
		log.Fatal(errors.NewErr(err))
	}

	log.Printf("%+v\n", res)
}
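util.JsonRpcDial is not shown here; given that client.Call takes a method name, an argument, and a reply pointer, it presumably wraps the standard net/rpc/jsonrpc client. A minimal sketch under that assumption ("net/rpc" and "net/rpc/jsonrpc" assumed imported):

// Hypothetical sketch of util.JsonRpcDial, assuming it simply wraps the
// standard net/rpc/jsonrpc package (the real helper may differ).
func JsonRpcDial(addr string) (*rpc.Client, error) {
	// jsonrpc.Dial returns a *rpc.Client that speaks JSON-RPC over TCP,
	// so calls like client.Call("Service.Method", arg, &res) work as in main above.
	return jsonrpc.Dial("tcp", addr)
}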
Example #9
func (self *Downloader) Download(url string) (string, error) {
	log.Printf("Downloader.Download(%s)\n", url)
	resp, err := http.Get(url)
	if err != nil {
		return "", errors.NewErr(err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", errors.NewErr(err)
	}

	log.Printf("Downloader.Download(%s) OK!\n", url)
	return string(body), nil
}
Example #10
func (self *Resolver) Resolve(host string) ([]string, error) {
	log.Printf("Resolver.Resolve(%s)\n", host)
	self.mutex.RLock()
	r, cacheHit := self.cache[host]
	self.mutex.RUnlock()

	var res []string
	if cacheHit && time.Now().Before(r.end) {
		self.load(&r, &res)
		log.Printf("Resolver.Resolve(%s) OK (cache)!\n", host)
		return res, nil
	}

	val, err := net.LookupIP(host)
	if err != nil {
		err = errors.NewErr(err)
		if cacheHit {
			self.load(&r, &res)
			log.Errorln(err)
			log.Printf("Resolver.Resolve(%s) OK (cache, but error)!\n", host)
			return res, nil
		}
		return nil, err
	}

	d := dataT{
		ips: make([]net.IP, 0, len(val)),
		end: time.Now().Add(self.cacheTime),
	}
	for _, v := range val {
		// if ip4 := v.To4(); len(ip4) == net.IPv4len {
		d.ips = append(d.ips, v)
		// }
	}

	self.mutex.Lock()
	self.cache[host] = d
	self.mutex.Unlock()

	self.load(&d, &res)
	log.Printf("Resolver.Resolve(%s) OK!\n", host)
	return res, nil
}
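The load helper is not shown; from its call sites it appears to copy the cached IPs into the caller's result slice as strings. A minimal sketch under that assumption:

// Hypothetical sketch: materialize the cached net.IP values as strings
// into the caller's result slice (the real helper may do more, e.g.
// filtering or rotating the addresses).
func (self *Resolver) load(d *dataT, res *[]string) {
	*res = make([]string, 0, len(d.ips))
	for _, ip := range d.ips {
		*res = append(*res, ip.String())
	}
}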
Example #11
func (self *Caregiver) PullUrls() map[string]string {
	log.Printf("Caregiver.PullUrls()\n")
	for {
		self.dataMutex.RLock()
		sleep := len(self.data) == 0
		self.dataMutex.RUnlock()
		if !sleep {
			break
		}
		time.Sleep(self.pullTimeout)
	}

	self.dataMutex.Lock()
	defer self.dataMutex.Unlock()
	res := self.data
	self.data = map[string]string{}
	log.Printf("Caregiver.PullUrls() OK (%v)\n", len(res))
	return res
}
Example #12
func (self *Gatekeeper) Read(val Value) (string, error) {
	log.Printf("Gatekeeper.Read(%+v)\n", val)
	f, err := util.Open(self.dir + "/" + strconv.Itoa(int(val.FNum)))
	if err != nil {
		return "", err
	}

	if _, err := f.Seek(int64(val.Offset), 0); err != nil {
		return "", err
	}

	if _, err := f.SkipLenval(); err != nil {
		return "", err
	}

	_, res, err := f.ReadLenval()
	if err != nil {
		return "", err
	}

	log.Printf("Gatekeeper.Read(%+v) OK\n", val)
	return string(res), nil
}
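The reading side of this format (util.Open, ReadLenval, SkipLenval) is not shown either. Continuing the assumption of an 8-byte big-endian length prefix from the WriteLenval sketch above, the counterparts might look like this; the returned counts include the prefix, which matches how load in the next example sums them into Value.Len ("encoding/binary", "io" and "os" assumed imported):

// Hypothetical wrapper returned by util.Open; the embedded *os.File
// provides Seek.
type File struct {
	*os.File
}

// Open is assumed to wrap os.Open with the lenval helpers.
func Open(name string) (*File, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	return &File{f}, nil
}

// ReadLenval reads one record and returns the total number of bytes
// consumed (prefix + payload) together with the payload itself.
// At end of file it returns io.EOF with nothing read.
func (f *File) ReadLenval() (uint64, []byte, error) {
	var lenBuf [8]byte
	if _, err := io.ReadFull(f.File, lenBuf[:]); err != nil {
		return 0, nil, err
	}
	size := binary.BigEndian.Uint64(lenBuf[:])
	data := make([]byte, size)
	if _, err := io.ReadFull(f.File, data); err != nil {
		return 0, nil, err
	}
	return uint64(len(lenBuf)) + size, data, nil
}

// SkipLenval seeks past one record and returns the number of bytes skipped.
func (f *File) SkipLenval() (uint64, error) {
	var lenBuf [8]byte
	if _, err := io.ReadFull(f.File, lenBuf[:]); err != nil {
		return 0, err
	}
	size := binary.BigEndian.Uint64(lenBuf[:])
	if _, err := f.Seek(int64(size), io.SeekCurrent); err != nil {
		return 0, err
	}
	return uint64(len(lenBuf)) + size, nil
}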
Example #13
func (self *Gatekeeper) load(name string, num uint, counts map[uint]uint) error {
	log.Printf("Gatekeeper.load(%v, %v)\n", name, num)
	file, err := util.Open(name)
	if err != nil {
		return err
	}

	cnt := uint64(0)
	for {
		n1, key, err := file.ReadLenval()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		u, err := UrlTransform(string(key))
		if err != nil {
			return err
		}

		n2, err := file.SkipLenval()
		if err != nil {
			return err
		}

		nv := Value{
			FNum:   num,
			Offset: cnt,
			Len:    n1 + n2,
		}
		old := self.trie.Add([]byte(u), nv)
		if old != nil {
			old := old.(Value)
			if old.Len != 0 {
				counts[old.FNum] -= 1
			}
		}

		cnt += n1 + n2
		counts[num] += 1
	}
	return nil
}
Example #14
func (self *Spider) RunPuller() error {
	log.Printf("Spider.RunPuller()\n")
	for {
		now := time.Now()
		// fetch the downloaded documents
		urls, err := self.cg.PullUrls()
		if err != nil {
			return err
		}

		log.Printf("Spider.RunPuller(): pulled urls\n")

		// write them to the storage
		toDel := []string{}
		for url, v := range urls {
			_, err := self.gk.Write(url, v)
			if err != nil {
				toDel = append(toDel, url)
			}
		}

		if len(toDel) != 0 {
			// ask to re-download the failed ones later
			log.Printf("Spider.RunPuller(): reenqueue urls %#v\n", toDel)
			self.urls.EnqueueAll(toDel...)
			for _, url := range toDel {
				delete(urls, url)
			}
		}

		// remove the successfully stored ones from the waiting list
		self.waitMutex.Lock()
		for url := range urls {
			delete(self.waitUrls, url)
		}
		self.waitMutex.Unlock()

		// and mark them as downloaded
		for url := range urls {
			self.doneUrls[url] = struct{}{}
		}

		log.Printf("Spider.RunPuller(): find urls\n")

		// now parse the documents for URLs and add them to the list of wanted URLs
		newUrls := make([]string, 0, 100 /*TODO: adjust multiplier at runtime?*/ *len(urls))
		for k, v := range urls {
			curr, err := url.Parse(k)
			if err != nil {
				return err
			}

			matches := urlRegex.FindAllStringSubmatch(v, -1)
			for _, v := range matches {
				u := v[1]
				if parsed, err := url.Parse(u); err == nil {
					parsed.Fragment = ""
					if parsed.Scheme == "" {
						parsed.Scheme = curr.Scheme
					}
					if parsed.Host == "" {
						parsed.Host = curr.Host
					}
					newUrls = append(newUrls, parsed.String())
				}
			}
		}

		// check whether any of these URLs are already waiting; if so, drop them
		self.waitMutex.Lock()
		for i := 0; i < len(newUrls); i += 1 {
			if _, ok := self.waitUrls[newUrls[i]]; ok {
				newUrls[i] = ""
			}
		}
		self.waitMutex.Unlock()
		// check whether any of these URLs have already been downloaded; if so, drop them
		for i := 0; i < len(newUrls); i += 1 {
			if newUrls[i] == "" {
				continue
			}
			if _, ok := self.doneUrls[newUrls[i]]; ok {
				newUrls[i] = ""
			}
		}

		// TODO: also filter URLs against the host's robots.txt here

		// check whether any of these URLs are already in storage; if so, drop them
		for i := 0; i < len(newUrls); i += 1 {
			if newUrls[i] == "" {
				continue
			}

			_, ok, err := self.gk.Find(newUrls[i])
			if err != nil {
				return err
			}

			if ok {
				newUrls[i] = ""
			}
		}

		tmp := map[string]struct{}{}
		for _, url := range newUrls {
			if url != "" {
				tmp[url] = struct{}{}
			}
		}

		// enqueue the remaining ones
		toQ := make([]string, 0, len(tmp))
		for url := range tmp {
			toQ = append(toQ, url)
		}

		if len(toQ) != 0 {
			self.AddUrls(toQ)
		}

		passed := time.Since(now)
		if self.interval > passed {
			time.Sleep(self.interval - passed)
		}
	}
}
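urlRegex is not defined in these examples. Since the loop above only uses the first capture group of each match, any link-extracting pattern with a single group fits; a plausible (assumed) definition extracting href attributes would be ("regexp" assumed imported):

// Hypothetical definition of urlRegex; the real pattern is not shown here.
var urlRegex = regexp.MustCompile(`href="([^"]+)"`)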
Example #15
func (self *Caregiver) Start() error {
	log.Printf("Caregiver.Start()\n")
	for {
		// pick the hosts whose per-host timeout allows downloading now
		data := map[string]*hostData{}
		self.mutex.Lock()
		now := time.Now()
		for k, v := range self.hosts {
			if v.urls.Len() != 0 && v.end.Before(now) {
				data[k] = v
			}
		}
		self.mutex.Unlock()
		if len(data) == 0 {
			time.Sleep(self.WorkTimeout)
			continue
		}

		log.Printf("Caregiver.Start(): start downloading\n")

		// resolve DNS
		hosts := make([]string, 0, len(data))
		for k := range data {
			hosts = append(hosts, k)
		}

		var err error
		ips := map[string][]string{}
		// if self.dns != nil {
		// 	ips, err = self.dns.ResolveAll(hosts)
		// 	if err != nil {
		// 		return err
		// 	}
		// }

		now = time.Now()
		// ready to download!
		urls := []string{}
		for k, v := range data {
			/*
				TODO: requests by IP do not work for some reason.
				I suspect the server reads the RequestURL, so a custom HTTP stack would be needed to make this work.
			*/

			ip := k
			if v, ok := ips[k]; ok && len(v) != 0 {
				ip = v[0]
				if strings.Contains(ip, ":") {
					ip = "[" + ip + "]:80"
				}
			}
			host := "http://" + ip
			us := v.urls.DequeueN(v.maxCount)
			for i := 0; i < len(us); i += 1 {
				if len(us[i]) != 0 {
					us[i] = host + "/" + us[i]
				} else {
					us[i] = host
				}
			}
			urls = append(urls, us...)
		}

		log.Printf("Caregiver.Start(): collected urls %#v\n", urls)
		docs, err := self.downloader.DownloadAll(urls)
		if err != nil {
			log.Errorln(err)
		}

		now = time.Now()
		for _, v := range data {
			v.end = now.Add(v.timeout)
		}

		for i, v := range docs {
			if v != "" {
				self.dataMutex.Lock()
				self.data[urls[i]] = v
				self.dataMutex.Unlock()
			} else {
				log.Errorln("Couldn't download url "+urls[i]+",", v)
			}
		}

		log.Printf("Caregiver.Start(): downloaded urls %#v\n", urls)
	}
}
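The hostData type is not shown; from its use in PushUrls and Start it carries a per-host queue of paths plus the politeness state. A minimal sketch under that assumption (Queue refers to the hypothetical queue sketched after Example #1; "time" assumed imported):

// Hypothetical sketch of the per-host state kept by Caregiver, inferred
// from the fields accessed in PushUrls and Start.
type hostData struct {
	urls     Queue         // paths queued for this host
	end      time.Time     // earliest moment this host may be hit again
	timeout  time.Duration // politeness delay between download batches
	maxCount int           // how many URLs to dequeue per batch
}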