func fetchCategory(url string) Category { req, err := http.NewRequest("GET", url, nil) req.Header.Set("User-Agent", scraperConfig.UserAgent) httpClient := http.Client{ Transport: &http.Transport{ Dial: timeoutDialler(time.Duration(10 * time.Second)), DisableKeepAlives: true, }, } var output = Category{} resp, err := httpClient.Do(req) if err != nil { log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err) return output } defer resp.Body.Close() if resp.StatusCode == 200 { doc, err := transform.NewDocFromReader(resp.Body) if err != nil { log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err) return output } pathFragments := strings.Split(url, "/") output.Name = pathFragments[len(pathFragments)-1] log.Println("Processing", output.Name) if !categorySet[output.Name] { // prevent cycles. this is wonky, but will do for now t := transform.NewTransform(doc) var getUrls = func(n *h5.Node) { urls := strings.Split(n.Children[0].Data(), "\n") for _, item := range urls { item = strings.TrimSpace(item) // if we encounter a subcategory, recurse if blekkoSubCat.MatchString(item) { subCatUrl := fmt.Sprintf("https://blekko.com/ws/+/view+%s", item) subCat := fetchCategory(subCatUrl) for _, subUrl := range subCat.Urls { output.Urls = append(output.Urls, subUrl) } } else if item != "" { output.Urls = append(output.Urls, item) } } } t.Apply(getUrls, "#urls-text") categorySet[output.Name] = true } } return output }
func fetchTagUrls(url string) []string { req, err := http.NewRequest("GET", url, nil) req.Header.Set("User-Agent", scraperConfig.UserAgent) httpClient := http.Client{ Transport: &http.Transport{ Dial: timeoutDialler(time.Duration(10 * time.Second)), DisableKeepAlives: true, }, } var output = []string{} resp, err := httpClient.Do(req) if err != nil { log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err) return output } defer resp.Body.Close() if resp.StatusCode == 200 { doc, err := transform.NewDocFromReader(resp.Body) if err != nil { log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err) return output } t := transform.NewTransform(doc) var GetUrls = func(n *h5.Node) { for _, a := range n.Attr { if a.Name == "href" { output = append(output, a.Value) break } } } t.Apply(GetUrls, "#tags-directory", "ul", "li", "a") } return output }