Exemple #1
0
// To parses url with urlx.Parse and returns a freshly allocated crawler.URL
// whose URL field is the parsed value and whose Depth, Done, Last, Status,
// NumVisit and NumRetry fields are copied from w. If parsing fails, the
// result is nil and the parse error is returned as-is.
func (w *wrapper) To(url string) (*crawler.URL, error) {
	parsed, err := urlx.Parse(url)
	if err != nil {
		return nil, err
	}

	out := new(crawler.URL)
	out.URL = *parsed
	out.Depth, out.Done, out.Last = w.Depth, w.Done, w.Last
	out.Status, out.NumVisit, out.NumRetry = w.Status, w.NumVisit, w.NumRetry
	return out, nil
}
Exemple #2
0
// Enqueue adds urls to the queue.
//
// Each string is parsed with urlx.Parse (passing cw.normalize) and handed to
// the store's PutNX wrapped in a URL value. When PutNX reports ok, the parsed
// URL is sent on cw.scheduler.NewIn. Processing stops at the first parse or
// store error, which is returned unchanged; entries handled before the
// failure are not rolled back.
func (cw *Crawler) Enqueue(urls ...string) error {
	for i := range urls {
		parsed, err := urlx.Parse(urls[i], cw.normalize)
		if err != nil {
			return err
		}

		added, err := cw.store.PutNX(&URL{URL: *parsed})
		if err != nil {
			return err
		}
		if added {
			cw.scheduler.NewIn <- parsed
		}
	}
	return nil
}
Exemple #3
0
// Recover walks every entry of the bkURL bucket inside an s.DB.View
// transaction. Each value is decoded into a wrapper with s.codec; entries
// whose wrapper is marked Done are skipped, and for the rest the key is
// parsed as a URL and sent on ch. The first decode or parse error aborts the
// walk and is returned.
//
// The send on ch happens while the transaction is still open.
// NOTE(review): tx.Bucket's result is used without a nil check; presumably
// the bucket is created when the store is opened — confirm.
func (s *BoltStore) Recover(ch chan<- *url.URL) error {
	walk := func(tx *bolt.Tx) error {
		cur := tx.Bucket(bkURL).Cursor()
		for key, val := cur.First(); key != nil; key, val = cur.Next() {
			rec := new(wrapper)
			if err := s.codec.Unmarshal(val, rec); err != nil {
				return err
			}
			if rec.Done {
				continue
			}

			parsed, err := urlx.Parse(string(key))
			if err != nil {
				return err
			}
			ch <- parsed
		}
		return nil
	}
	return s.DB.View(walk)
}
Exemple #4
0
// addSeeds parses each seed with urlx.Parse (passing cw.normalize), offers it
// to the store through PutNX and, when PutNX reports ok, sends the parsed URL
// on cw.scheduler.NewIn.
//
// n is the number of seeds for which PutNX reported ok. An empty seeds list
// yields an error. On a parse or store error the function returns
// immediately with that error and the count accumulated so far.
func (cw *Crawler) addSeeds(seeds ...string) (n int, err error) {
	if len(seeds) == 0 {
		return 0, errors.New("crawler: no seed provided")
	}

	for _, raw := range seeds {
		var parsed *url.URL
		parsed, err = urlx.Parse(raw, cw.normalize)
		if err != nil {
			return n, err
		}

		var added bool
		added, err = cw.store.PutNX(&URL{URL: *parsed})
		if err != nil {
			return n, err
		}
		if !added {
			continue
		}

		cw.scheduler.NewIn <- parsed
		n++
	}
	return n, err
}