func (self *Spider) RunPusher() error {
	log.Printf("Spider.RunPusher()\n")
	for {
		// try to fetch URLs that need to be crawled
		urls := self.urls.DequeueN(self.pushCnt)
		if urls == nil {
			time.Sleep(self.interval)
			continue
		}
		log.Printf("Spider.RunPusher(): push urls %#v\n", urls)
		// tell the download manager to crawl them
		if err := self.cg.PushUrls(urls); err != nil {
			return err
		}
		// put them into the waiting set
		self.waitMutex.Lock()
		for _, url := range urls {
			self.waitUrls[url] = struct{}{}
		}
		self.waitMutex.Unlock()
		log.Printf("Spider.RunPusher(): pushed urls %#v\n", urls)
	}
}
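The `urls` field is a concurrency-safe queue with `EnqueueAll`/`DequeueN`/`Len`; its implementation isn't part of this listing. A minimal sketch of what such a queue could look like, assuming a plain mutex-guarded slice (the shape is inferred from the call sites, not the original implementation):

import "sync"

// Queue is a minimal mutex-guarded FIFO of strings (assumed shape).
type Queue struct {
	mu    sync.Mutex
	items []string
}

// EnqueueAll appends all items to the tail of the queue.
func (q *Queue) EnqueueAll(items ...string) {
	q.mu.Lock()
	defer q.mu.Unlock()
	q.items = append(q.items, items...)
}

// DequeueN removes and returns up to n items from the head,
// or nil if the queue is empty.
func (q *Queue) DequeueN(n int) []string {
	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.items) == 0 {
		return nil
	}
	if n > len(q.items) {
		n = len(q.items)
	}
	res := q.items[:n]
	q.items = q.items[n:]
	return res
}

// Len reports the current number of queued items.
func (q *Queue) Len() int {
	q.mu.Lock()
	defer q.mu.Unlock()
	return len(q.items)
}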
func (self *Caregiver) PushUrls(urls []string) error {
	log.Printf("Caregiver.PushUrls(%#v)\n", urls)
	data := map[string][]string{}
	for _, u := range urls {
		parsed, err := url.Parse(u)
		if err != nil {
			return errors.NewErr(err)
		}
		path := parsed.Path
		if len(path) != 0 && path[0] == '/' {
			path = path[1:]
		}
		if parsed.RawQuery != "" {
			path += "?" + parsed.RawQuery
		}
		data[parsed.Host] = append(data[parsed.Host], path)
	}
	for host, hostUrls := range data {
		self.mutex.Lock()
		hd, ok := self.hosts[host]
		if !ok {
			hd = self.getData(host)
			self.hosts[host] = hd
		}
		self.mutex.Unlock()
		hd.urls.EnqueueAll(hostUrls...)
	}
	log.Printf("Caregiver.PushUrls(%#v) OK\n", urls)
	return nil
}
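`self.hosts` maps a hostname to its per-host crawl state, and `getData` builds a fresh entry; neither is shown. A plausible sketch, with field names inferred from how `Start()` below uses them and with defaults that are purely assumptions (it reuses the queue sketched above):

import "time"

// hostData is the per-host crawl state (shape inferred from usage;
// the concrete defaults are assumptions).
type hostData struct {
	urls     *Queue        // URL paths queued for this host
	end      time.Time     // the host may not be crawled again before this time
	timeout  time.Duration // politeness delay between batches
	maxCount int           // max URLs fetched from this host per batch
}

// getData builds fresh per-host state with default limits.
func (self *Caregiver) getData(host string) *hostData {
	return &hostData{
		urls:     &Queue{},
		timeout:  time.Second, // assumed default politeness delay
		maxCount: 10,          // assumed default batch size
	}
}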
func (self *Gatekeeper) Write(url, key string, data []byte) (Value, error) {
	log.Printf("Gatekeeper.Write(%v, %v)\n", url, key)
	if self.file.file == nil {
		f, err := os.Create(self.dir + "/" + strconv.Itoa(int(self.fNum)))
		if err != nil {
			return Value{}, errors.NewErr(err)
		}
		self.file = gkFile{
			file:   f,
			offset: 0,
			end:    time.Now().Add(self.maxTime),
		}
	} else if self.file.offset >= self.maxFileSize {
		if err := self.nextFile(); err != nil {
			return Value{}, err
		}
	} else if self.file.end.Before(time.Now()) {
		if err := self.file.file.Sync(); err != nil {
			return Value{}, errors.NewErr(err)
		}
		self.file.end = time.Now().Add(self.maxTime)
	}
	offset := self.file.offset
	cnt := 0
	n, err := self.file.WriteLenval([]byte(url))
	if err != nil {
		return Value{}, err
	}
	cnt += n
	n, err = self.file.WriteLenval(data)
	if err != nil {
		return Value{}, err
	}
	cnt += n
	self.file.offset += uint64(cnt)
	res := Value{
		FNum:   self.fNum,
		Offset: offset,
		Len:    uint64(cnt),
	}
	self.trie.Add([]byte(key), res)
	// TODO: remove
	self.file.file.Sync()
	log.Printf("Gatekeeper.Write(%v, %v) OK (%+v)\n", url, key, res)
	return res, nil
}
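Each record in a storage file is a `(url, data)` pair of length-prefixed byte strings, which is what `Read` and `load` below rely on when they skip and re-read fields. `WriteLenval` itself isn't shown; a minimal sketch, assuming a fixed 8-byte big-endian length prefix (the prefix width and encoding are my assumption, only the lenval framing is implied by the surrounding code):

import "encoding/binary"

// WriteLenval writes data as an 8-byte big-endian length followed by
// the raw bytes, returning the total number of bytes written.
func (self *gkFile) WriteLenval(data []byte) (int, error) {
	var lenBuf [8]byte
	binary.BigEndian.PutUint64(lenBuf[:], uint64(len(data)))
	if _, err := self.file.Write(lenBuf[:]); err != nil {
		return 0, err
	}
	if _, err := self.file.Write(data); err != nil {
		return 0, err
	}
	return len(lenBuf) + len(data), nil
}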
func (self *Gatekeeper) Find(key string) (Value, bool) {
	log.Printf("Gatekeeper.Find(%+v)\n", key)
	res, ok := self.trie.Find([]byte(key))
	log.Printf("Gatekeeper.Find(%+v) OK (%v, %v)\n", key, res, ok)
	if !ok {
		return Value{}, false
	}
	return res.(Value), true
}
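The index is a trie keyed by the transformed URL. Its implementation isn't part of this listing; from the calls in `Write`, `Find`, and `load`, its interface looks roughly like this (a sketch of the contract, not the original code):

// Trie is the interface the Gatekeeper relies on, inferred from usage:
// Add stores val under key and returns the previously stored value
// (or nil); Find returns the stored value and whether it was present.
type Trie interface {
	Add(key []byte, val interface{}) interface{}
	Find(key []byte) (interface{}, bool)
}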
// TODO: this should distribute URLs across shards and hand them out over the network
func (self *Spider) AddUrls(urls []string) {
	log.Printf("Spider.AddUrls(%#v)\n", urls)
	self.urls.EnqueueAll(urls...)
	// put them into the waiting set
	self.waitMutex.Lock()
	for _, url := range urls {
		self.waitUrls[url] = struct{}{}
	}
	self.waitMutex.Unlock()
	log.Printf("Spider.AddUrls(%#v) OK\n", urls)
}
func (self *Resolver) ResolveAll(hosts []string) ([][]string, error) {
	log.Printf("Resolver.ResolveAll(%#v)\n", hosts)
	res := make([][]string, len(hosts))
	for i, host := range hosts {
		b, err := self.Resolve(host)
		if err != nil {
			return nil, err
		}
		res[i] = b
	}
	log.Printf("Resolver.ResolveAll(%#v) OK\n", hosts)
	return res, nil
}
func (self *Downloader) DownloadAll(urls []string) ([]string, error) {
	log.Printf("Downloader.DownloadAll(%#v)!\n", urls)
	res := make([]string, len(urls))
	for i, url := range urls {
		b, err := self.Download(url)
		if err != nil {
			return nil, err
		}
		res[i] = b
	}
	log.Printf("Downloader.DownloadAll(%#v) OK!\n", urls)
	return res, nil
}
func main() {
	var help = flag.Bool("help", false, "print help")
	var addr = flag.String("addr", "", "address to dial")
	var method = flag.String("method", "", "rpc method")
	var parg = flag.String("arg", "", "method argument")
	flag.Parse()
	if *help || *addr == "" || *method == "" || *parg == "" {
		flag.PrintDefaults()
		return
	}
	client, err := util.JsonRpcDial(*addr)
	if err != nil {
		log.Fatal(err)
	}
	var arg interface{}
	if err := json.Unmarshal([]byte(*parg), &arg); err != nil {
		log.Fatal(errors.NewErr(err))
	}
	var res interface{}
	if err := client.Call(*method, arg, &res); err != nil {
		log.Fatal(errors.NewErr(err))
	}
	log.Printf("%+v\n", res)
}
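`util.JsonRpcDial` isn't shown; given that the result supports `client.Call`, it is presumably a thin wrapper over the standard `net/rpc/jsonrpc` package, roughly:

import (
	"net/rpc"
	"net/rpc/jsonrpc"
)

// JsonRpcDial connects to a JSON-RPC server over TCP (a sketch;
// the real helper may add options or error wrapping).
func JsonRpcDial(addr string) (*rpc.Client, error) {
	return jsonrpc.Dial("tcp", addr)
}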
func (self *Downloader) Download(url string) (string, error) {
	log.Printf("Downloader.Download(%s)\n", url)
	resp, err := http.Get(url)
	if err != nil {
		return "", errors.NewErr(err)
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", errors.NewErr(err)
	}
	log.Printf("Downloader.Download(%s) OK!\n", url)
	return string(body), nil
}
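`http.Get` uses `http.DefaultClient`, which has no timeout, so a single slow server can stall a crawl batch indefinitely. If that matters, a per-`Downloader` client with a deadline is a small change; this sketch is illustrative (the `client` field and the 30-second value are hypothetical, not part of the original):

import (
	"net/http"
	"time"
)

// A dedicated client with a deadline; Download would then call
// self.client.Get(url) instead of http.Get(url).
type Downloader struct {
	client http.Client
}

func NewDownloader() *Downloader {
	return &Downloader{
		client: http.Client{Timeout: 30 * time.Second},
	}
}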
func (self *Resolver) Resolve(host string) ([]string, error) {
	log.Printf("Resolver.Resolve(%s)\n", host)
	self.mutex.RLock()
	r, cacheHit := self.cache[host]
	self.mutex.RUnlock()
	var res []string
	if cacheHit && time.Now().Before(r.end) {
		self.load(&r, &res)
		log.Printf("Resolver.Resolve(%s) OK (cache)!\n", host)
		return res, nil
	}
	val, err := net.LookupIP(host)
	if err != nil {
		err = errors.NewErr(err)
		if cacheHit {
			// the lookup failed, but a stale cache entry is better than nothing
			self.load(&r, &res)
			log.Errorln(err)
			log.Printf("Resolver.Resolve(%s) OK (cache, but error)!\n", host)
			return res, nil
		}
		return nil, err
	}
	d := dataT{
		ips: make([]net.IP, 0, len(val)),
		end: time.Now().Add(self.cacheTime),
	}
	for _, v := range val {
		// if ip4 := v.To4(); len(ip4) == net.IPv4len {
		d.ips = append(d.ips, v)
		// }
	}
	self.mutex.Lock()
	self.cache[host] = d
	self.mutex.Unlock()
	self.load(&d, &res)
	log.Printf("Resolver.Resolve(%s) OK!\n", host)
	return res, nil
}
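`load` copies a cached `dataT` entry into the caller's string slice. It isn't shown; given the `dataT` fields above, the obvious implementation would be something like this (a sketch; the original may differ):

// load renders the cached IPs as strings into *res.
func (self *Resolver) load(d *dataT, res *[]string) {
	*res = make([]string, 0, len(d.ips))
	for _, ip := range d.ips {
		*res = append(*res, ip.String())
	}
}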
func (self *Caregiver) PullUrls() map[string]string {
	log.Printf("Caregiver.PullUrls()\n")
	for {
		self.dataMutex.RLock()
		sleep := len(self.data) == 0
		self.dataMutex.RUnlock()
		if !sleep {
			break
		}
		time.Sleep(self.pullTimeout)
	}
	self.dataMutex.Lock()
	defer self.dataMutex.Unlock()
	res := self.data
	self.data = map[string]string{}
	log.Printf("Caregiver.PullUrls() OK (%v)\n", len(res))
	return res
}
func (self *Gatekeeper) Read(val Value) (string, error) {
	log.Printf("Gatekeeper.Read(%+v)\n", val)
	f, err := util.Open(self.dir + "/" + strconv.Itoa(int(val.FNum)))
	if err != nil {
		return "", err
	}
	if _, err := f.Seek(int64(val.Offset), 0); err != nil {
		return "", err
	}
	// the record is a (url, data) pair: skip the url, read the data
	if _, err := f.SkipLenval(); err != nil {
		return "", err
	}
	_, res, err := f.ReadLenval()
	if err != nil {
		return "", err
	}
	log.Printf("Gatekeeper.Read(%+v) OK\n", val)
	return string(res), nil
}
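Together, `Find` and `Read` form the read path: the trie lookup returns a `Value` locating the record, and `Read` seeks to it and returns the document body. A hypothetical caller (the `gk` and `key` names are illustrative):

// Look up a document by its (already transformed) key and print it.
if val, ok := gk.Find(key); ok {
	doc, err := gk.Read(val)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(doc)
}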
func (self *Gatekeeper) load(name string, num uint, counts map[uint]uint) error {
	log.Printf("Gatekeeper.load(%v, %v)\n", name, num)
	file, err := util.Open(name)
	if err != nil {
		return err
	}
	cnt := uint64(0)
	for {
		n1, key, err := file.ReadLenval()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		u, err := UrlTransform(string(key))
		if err != nil {
			return err
		}
		n2, err := file.SkipLenval()
		if err != nil {
			return err
		}
		nv := Value{
			FNum:   num,
			Offset: cnt,
			Len:    n1 + n2,
		}
		old := self.trie.Add([]byte(u), nv)
		if old != nil {
			// a newer record supersedes an older one:
			// decrement the old file's live-record count
			old := old.(Value)
			if old.Len != 0 {
				counts[old.FNum] -= 1
			}
		}
		cnt += n1 + n2
		counts[num] += 1
	}
	return nil
}
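The stored record holds the full URL, while the trie is keyed by a transformed version of it, produced by `UrlTransform`. That function isn't shown; one plausible normalization, offered purely as an assumption, would drop the scheme and fragment and keep host, path, and query:

import "net/url"

// UrlTransform normalizes a URL into a trie key. This particular
// normalization is an assumption about the original.
func UrlTransform(u string) (string, error) {
	parsed, err := url.Parse(u)
	if err != nil {
		return "", err
	}
	key := parsed.Host + parsed.Path
	if parsed.RawQuery != "" {
		key += "?" + parsed.RawQuery
	}
	return key, nil
}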
func (self *Spider) RunPuller() error {
	log.Printf("Spider.RunPuller()\n")
	for {
		now := time.Now()
		// fetch the downloaded documents
		urls, err := self.cg.PullUrls()
		if err != nil {
			return err
		}
		log.Printf("Spider.RunPuller(): pulled urls\n")
		// write them to storage
		toDel := []string{}
		for url, v := range urls {
			_, err := self.gk.Write(url, v)
			if err != nil {
				toDel = append(toDel, url)
			}
		}
		if len(toDel) != 0 {
			// re-enqueue the ones that failed so they are downloaded again later
			log.Printf("Spider.RunPuller(): reenqueue urls %#v\n", toDel)
			self.urls.EnqueueAll(toDel...)
			for _, url := range toDel {
				delete(urls, url)
			}
		}
		// remove the successful ones from the waiting set
		self.waitMutex.Lock()
		for url := range urls {
			delete(self.waitUrls, url)
		}
		self.waitMutex.Unlock()
		// and mark them as downloaded
		for url := range urls {
			self.doneUrls[url] = struct{}{}
		}
		log.Printf("Spider.RunPuller(): find urls\n")
		// now parse the documents for URLs and add them to the wanted set
		newUrls := make([]string, 0, 100*len(urls) /* TODO: adjust multiplier at runtime? */)
		for k, v := range urls {
			curr, err := url.Parse(k)
			if err != nil {
				return err
			}
			matches := urlRegex.FindAllStringSubmatch(v, -1)
			for _, v := range matches {
				u := v[1]
				if parsed, err := url.Parse(u); err == nil {
					parsed.Fragment = ""
					if parsed.Scheme == "" {
						parsed.Scheme = curr.Scheme
					}
					if parsed.Host == "" {
						parsed.Host = curr.Host
					}
					newUrls = append(newUrls, parsed.String())
				}
			}
		}
		// drop URLs that are already in the waiting set
		self.waitMutex.Lock()
		for i := 0; i < len(newUrls); i++ {
			if _, ok := self.waitUrls[newUrls[i]]; ok {
				newUrls[i] = ""
			}
		}
		self.waitMutex.Unlock()
		// drop URLs that are already downloaded
		for i := 0; i < len(newUrls); i++ {
			if newUrls[i] == "" {
				continue
			}
			if _, ok := self.doneUrls[newUrls[i]]; ok {
				newUrls[i] = ""
			}
		}
		// TODO: also filter URLs against the host's robots.txt
		// drop URLs that are already in storage
		for i := 0; i < len(newUrls); i++ {
			if newUrls[i] == "" {
				continue
			}
			_, ok, err := self.gk.Find(newUrls[i])
			if err != nil {
				return err
			}
			if ok {
				newUrls[i] = ""
			}
		}
		// deduplicate the survivors
		tmp := map[string]struct{}{}
		for _, url := range newUrls {
			if url != "" {
				tmp[url] = struct{}{}
			}
		}
		// enqueue the remaining ones
		toQ := make([]string, 0, len(tmp))
		for url := range tmp {
			toQ = append(toQ, url)
		}
		if len(toQ) != 0 {
			self.AddUrls(toQ)
		}
		passed := time.Since(now)
		if self.interval > passed {
			time.Sleep(self.interval - passed)
		}
	}
}
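`urlRegex` extracts candidate link targets from a document body, capturing the URL itself in group 1 (as the `v[1]` access above requires). A minimal pattern that fits that usage (the exact regex is an assumption; the real one may also cover `src` attributes or unquoted values):

import "regexp"

// Captures the href value of anchor tags; group 1 is the URL.
var urlRegex = regexp.MustCompile(`href="([^"]+)"`)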
func (self *Caregiver) Start() error {
	log.Printf("Caregiver.Start()\n")
	for {
		// pick the hosts that may be crawled again (their politeness timeout has expired)
		data := map[string]*hostData{}
		self.mutex.Lock()
		now := time.Now()
		for k, v := range self.hosts {
			if v.urls.Len() != 0 && v.end.Before(now) {
				data[k] = v
			}
		}
		self.mutex.Unlock()
		if len(data) == 0 {
			time.Sleep(self.WorkTimeout)
			continue
		}
		log.Printf("Caregiver.Start(): start downloading\n")
		// resolve DNS
		hosts := make([]string, 0, len(data))
		for k := range data {
			hosts = append(hosts, k)
		}
		var err error
		ips := map[string][]string{}
		// if self.dns != nil {
		// 	ips, err = self.dns.ResolveAll(hosts)
		// 	if err != nil {
		// 		return err
		// 	}
		// }
		now = time.Now()
		// ready to download!
		urls := []string{}
		for k, v := range data {
			/* TODO: requests by IP don't work for some reason. I suspect
			the server reads the request URL (the Host header), so a custom
			HTTP stack would be needed to make this work. */
			ip := k
			if v, ok := ips[k]; ok && len(v) != 0 {
				ip = v[0]
				if strings.Contains(ip, ":") {
					ip = "[" + ip + "]:80"
				}
			}
			host := "http://" + ip
			us := v.urls.DequeueN(v.maxCount)
			for i := 0; i < len(us); i++ {
				if len(us[i]) != 0 {
					us[i] = host + "/" + us[i]
				} else {
					us[i] = host
				}
			}
			urls = append(urls, us...)
		}
		log.Printf("Caregiver.Start(): collected urls %#v\n", urls)
		docs, err := self.downloader.DownloadAll(urls)
		if err != nil {
			log.Errorln(err)
		}
		now = time.Now()
		for _, v := range data {
			v.end = now.Add(v.timeout)
		}
		for i, v := range docs {
			if v != "" {
				self.dataMutex.Lock()
				self.data[urls[i]] = v
				self.dataMutex.Unlock()
			} else {
				log.Errorln("Couldn't download url "+urls[i]+",", v)
			}
		}
		log.Printf("Caregiver.Start(): downloaded urls %#v\n", urls)
	}
}