示例#1
0
// fetchCategory downloads the page at url and returns a Category whose Name
// is the last "/"-separated segment of url and whose Urls are the non-empty,
// whitespace-trimmed lines found in the page's "#urls-text" element.
//
// Lines that match blekkoSubCat are treated as sub-categories: they are
// fetched recursively and their Urls are merged into the result.
//
// Failures (bad URL, transport error, unparsable HTML) are logged and an
// empty or partially filled Category is returned; a non-200 response yields
// an empty Category. A category whose name is already recorded in categorySet
// is returned with only its Name set.
func fetchCategory(url string) Category {
	var output = Category{}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// NewRequest returns a nil request on failure; bail out instead of
		// dereferencing it below.
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return output
	}

	doc, err := transform.NewDocFromReader(resp.Body)
	if err != nil {
		log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}

	pathFragments := strings.Split(url, "/")
	output.Name = pathFragments[len(pathFragments)-1]
	log.Println("Processing", output.Name)

	if categorySet[output.Name] {
		return output
	}
	// Mark the category as visited BEFORE descending into sub-categories;
	// marking it afterwards would let a cycle (A -> B -> A) recurse forever.
	categorySet[output.Name] = true

	t := transform.NewTransform(doc)
	var getUrls = func(n *h5.Node) {
		// An empty "#urls-text" element has no child to read from.
		if len(n.Children) == 0 {
			return
		}
		urls := strings.Split(n.Children[0].Data(), "\n")
		for _, item := range urls {
			item = strings.TrimSpace(item)
			// if we encounter a subcategory, recurse
			if blekkoSubCat.MatchString(item) {
				subCatUrl := fmt.Sprintf("https://blekko.com/ws/+/view+%s", item)
				subCat := fetchCategory(subCatUrl)
				output.Urls = append(output.Urls, subCat.Urls...)
			} else if item != "" {
				output.Urls = append(output.Urls, item)
			}
		}
	}
	t.Apply(getUrls, "#urls-text")

	return output
}
示例#2
0
// fetchTagUrls downloads the page at url and returns the href attribute value
// of every anchor matched by the selector chain "#tags-directory ul li a".
//
// Failures (bad URL, transport error, unparsable HTML) are logged. In those
// cases, and for any non-200 response, the returned slice is empty but
// non-nil.
func fetchTagUrls(url string) []string {
	var output = []string{}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// NewRequest returns a nil request on failure; bail out instead of
		// dereferencing it below.
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return output
	}

	doc, err := transform.NewDocFromReader(resp.Body)
	if err != nil {
		log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}

	t := transform.NewTransform(doc)
	// Record the first href attribute of each matched node.
	var collectHref = func(n *h5.Node) {
		for _, a := range n.Attr {
			if a.Name == "href" {
				output = append(output, a.Value)
				break
			}
		}
	}
	t.Apply(collectHref, "#tags-directory", "ul", "li", "a")

	return output
}