示例#1
0
文件: wiki.go 项目: gnagel/spider
// Spin performs the request attached to ctx, parses the response as HTML and
// prints the text of the first paragraph found under #mw-content-text to
// standard output.
//
// It returns the error from the request or from the HTML parser, whichever
// fails first, and nil otherwise.
func (w *WikipediaHTMLSpider) Spin(ctx *spider.Context) error {
	if _, err := ctx.DoRequest(); err != nil {
		return err
	}

	// Report a parse failure instead of discarding it: without this check the
	// selector call below would run against a document that was never built.
	html, err := ctx.HTMLParser()
	if err != nil {
		return err
	}
	summary := html.Find("#mw-content-text p").First().Text()

	fmt.Println(summary)
	return nil
}
示例#2
0
// SaveAll gathers the "lang"/"repos" pair stored on ctx and on each of its
// children into a single map, wraps that map in a snapshot and saves it to
// h.DB. On success it logs how many languages were saved.
//
// The stored values are read with unchecked type assertions, so a context
// whose "lang" is not a string or whose "repos" is not a Repos makes this
// method panic.
func (h *HubSpider) SaveAll(ctx *spider.Context) error {
	byLang := map[string]Repos{}

	// The context passed in contributes an entry of its own.
	ownLang := ctx.Get("lang").(string)
	byLang[ownLang] = ctx.Get("repos").(Repos)

	// Each child context contributes one more entry.
	for _, child := range ctx.Children {
		childLang := child.Get("lang").(string)
		byLang[childLang] = child.Get("repos").(Repos)
	}

	snapshot := NewSnapshot(byLang)
	err := snapshot.Save(h.DB)
	if err != nil {
		return err
	}

	log.Println("Saved", len(byLang), "languages.")
	return nil
}
示例#3
0
文件: wiki.go 项目: gnagel/spider
// Spin performs the request attached to ctx, parses the response as JSON and
// prints the "extract" field of every entry under query.pages to standard
// output.
//
// It returns an error when the request fails, when the JSON parser cannot be
// obtained, when query.pages is not a map, or when one of the page entries is
// not a JSON object.
func (w *WikipediaJSONSpider) Spin(ctx *spider.Context) error {
	if _, err := ctx.DoRequest(); err != nil {
		return err
	}
	// Surface a parser failure rather than calling GetPath on an unusable parser.
	jsonparser, err := ctx.JSONParser()
	if err != nil {
		return err
	}
	pages, err := jsonparser.GetPath("query", "pages").Map()
	if err != nil {
		return err
	}
	for id, p := range pages {
		// A checked assertion turns a malformed entry into an error instead of
		// a runtime panic.
		page, ok := p.(map[string]interface{})
		if !ok {
			return fmt.Errorf("page %v: unexpected value of type %T", id, p)
		}
		fmt.Println(page["extract"])
	}
	return nil
}
示例#4
0
// Spin fetches the page for h.Lang, parses it into repos and stores the result
// on ctx under the "lang" and "repos" keys.
//
// A non-parent spider stops there. A parent spider first returns early when
// FindSnapshotByTime reports an existing snapshot for the current time;
// otherwise, after handling its own page, it sets up one language spider per
// entry of langs, runs them concurrently and, once all of them succeed, calls
// SaveAll.
//
// The first error reported by a child is returned immediately; children that
// are still running are not cancelled but are able to finish on their own.
func (h *HubSpider) Spin(ctx *spider.Context) error {
	if h.Parent {
		// A failed lookup is not fatal here: any error simply falls through to
		// a normal fetch.
		snapshots, err := FindSnapshotByTime(h.DB, time.Now())
		if err == nil && snapshots != nil {
			fmt.Println("Checked today snapshot has already been saved.")
			return nil
		}
	}

	fmt.Println("Fetching ", h.Lang, "at: ", h.URL)
	if _, err := ctx.DoRequest(); err != nil {
		return err
	}
	fmt.Println("Status:", ctx.Response().Status, "for", h.Lang)

	doc, err := ctx.HTMLParser()
	if err != nil {
		return err
	}

	repos, err := ParseHTMLToRepos(doc)
	if err != nil {
		return err
	}

	// SaveAll later reads these two keys back from this context.
	ctx.Set("lang", h.Lang)
	ctx.Set("repos", repos)

	if !h.Parent {
		return nil
	}

	// NOTE(review): langs is not declared in this function; presumably it is a
	// package-level list of languages — confirm. The disabled code below shows
	// how it could be scraped from the page instead.
	//
	// Code to find all languages
	// langs := make([]langWithURL, 0)
	//
	// doc.Find(".select-menu .select-menu-list .select-menu-item[role=\"menuitem\"]").Each(func(i int, s *goquery.Selection) {
	// 	a := s.Find("a.select-menu-item-text")
	// 	lang := a.Text()
	// 	url, _ := a.Attr("href")
	// 	langs = append(langs, langWithURL{lang, url})
	// })

	allDoneChan := make(chan struct{})
	// One slot per child: a failing child can always deliver its error and
	// exit, even when nobody is receiving any more (after the first error has
	// been returned, or after an early return from a failed Setup below).
	// With an unbuffered channel those goroutines would block forever.
	errChan := make(chan error, len(langs))
	// Fetch each language.
	var wg sync.WaitGroup
	for i, l := range langs {
		ls := NewLanguageSpider(l)
		newCtx, err := ls.Setup(ctx)
		if err != nil {
			return err
		}
		wg.Add(1)
		go func(ls spider.Spider, newCtx *spider.Context, i int) {
			defer wg.Done()
			// Stagger the children: the i-th one waits i*800ms before starting.
			time.Sleep(time.Duration(i) * 800 * time.Millisecond)
			if err := ls.Spin(newCtx); err != nil {
				errChan <- err
			}
		}(ls, newCtx, i)

	}

	// Signal completion once every child has returned.
	go func() {
		wg.Wait()
		close(allDoneChan)
	}()

	select {
	case err := <-errChan:
		return err
	case <-allDoneChan:
		// Because sends no longer block, an error may be sitting in the buffer
		// at the moment completion is signalled. Check for it so SaveAll never
		// runs after a child has failed.
		select {
		case err := <-errChan:
			return err
		default:
		}
		return h.SaveAll(ctx)
	}
}