示例#1
0
func (spider *spider) parseList(ctx context.Context, resp *crawl.Response) error {
	defer spider.c.Close()

	var currentTitle string

	resp.Find("div#unterMenu a").Each(func(_ int, s *goquery.Selection) {
		c, _ := s.Attr("class")
		switch c {
		case "unterMenuTitel":
			currentTitle = strings.ToLower(s.Text())
		case "unterMenuName":
			ctx = metadata.NewContext(ctx, metadata.Pairs(
				"type", currentTitle,
				"title", s.Text(),
			))
			href, _ := s.Attr("href")
			spider.c.Execute(ctx, &crawl.Request{
				URL:       strings.TrimSpace(href),
				Referer:   resp.URL().String(),
				Callbacks: crawl.Callbacks("user-agents"),
			})
		}
	})
	close(spider.results)
	return nil
}
示例#2
0
func main() {
	defer glog.Flush()
	flag.Parse()

	if *outputFile == "" {
		glog.Fatal("--output flag cannot be empty")
	}

	c := crawl.New(
		crawl.WithConcurrency(1),
		crawl.WithQueue(crawl.NewQueue(1000)),
	)

	spider := &spider{c: c, results: make(chan *userAgent, 10000)}
	c.Register("list", spider.parseList)
	c.Register("user-agents", spider.parseUserAgents)

	ctx := context.Background()
	ctx, cancel := context.WithDeadline(ctx, time.Now().Add(time.Second*30))
	defer cancel()

	if err := c.Schedule(ctx, &crawl.Request{
		URL:       "http://www.useragentstring.com/pages/useragentstring.php",
		Callbacks: crawl.Callbacks("list"),
	}); err != nil {
		glog.Fatal(err)
	}

	glog.Info("Starting crawl")

	go func() {
		for err := range c.Errors() {
			glog.Infof("Crawl error: %v", err)
		}
	}()

	go c.Start()

	f, err := os.Create(*outputFile)
	if err != nil {
		glog.Fatal(err)
	}
	defer f.Close()

	var results []*userAgent
	for result := range spider.results {
		results = append(results, result)
	}

	b, err := json.MarshalIndent(results, "", "  ")
	if err != nil {
		glog.Fatal(err)
	}
	if _, err := f.Write(b); err != nil {
		glog.Fatal(err)
	}
	glog.Infof("Done (%d user agents)", len(results))
}