func (sn *SksNode) Fetch() error {
	sn.Normalize()
	req, err := http.NewRequest("GET", sn.uri, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", "sks_peers/0.2 (SKS mesh spidering)")
	resp, err := HttpDoWithTimeout(http.DefaultClient, req, *flHttpFetchTimeout)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	sn.Status = resp.Status
	Log.Printf("[%s] Response status: %s", sn.Hostname, sn.Status)
	sn.ServerHeader = resp.Header.Get("Server")
	sn.ViaHeader = resp.Header.Get("Via")
	//doc, err := ehtml.Parse(resp.Body)
	buf, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	doc, err := htmlp.Parse(buf, htmlp.DefaultEncodingBytes, nil, htmlp.DefaultParseOption, htmlp.DefaultEncodingBytes)
	if err != nil {
		return err
	}
	sn.pageContent = doc
	return nil
}
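// HttpDoWithTimeout is called above but not defined in this excerpt. A
// minimal sketch of what such a helper might look like, assuming it simply
// bounds client.Do with a deadline (uses net/http, time, and fmt); the real
// implementation may differ.
func HttpDoWithTimeout(client *http.Client, req *http.Request, timeout time.Duration) (*http.Response, error) {
	type result struct {
		resp *http.Response
		err  error
	}
	ch := make(chan result, 1)
	go func() {
		resp, err := client.Do(req)
		ch <- result{resp, err}
	}()
	select {
	case r := <-ch:
		return r.resp, r.err
	case <-time.After(timeout):
		// Abandon the in-flight request; the goroutine drains into the
		// buffered channel and can be garbage collected.
		return nil, fmt.Errorf("request to %s timed out after %s", req.URL, timeout)
	}
}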
func fetchCardNames(names *[]string) {
	res, err := http.Get(ravURL)
	if err != nil {
		fmt.Println(err)
		return
	}
	response, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		fmt.Println(err)
		return
	}
	doc, err := html.Parse(response, html.DefaultEncodingBytes, nil, html.DefaultParseOption, html.DefaultEncodingBytes)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer doc.Free()
	// Renamed from "html" to avoid shadowing the gokogiri html package.
	root := doc.Root().FirstChild()
	results, err := root.Search("//tr[@class='cardItem']")
	if err != nil {
		fmt.Println(err)
		return
	}
	for _, row := range results {
		name, err := row.Search("./td[@class='name']")
		if err != nil {
			fmt.Println(err)
			continue
		}
		if len(name) == 0 {
			continue
		}
		stringName := name[0].Content()
		*names = append(*names, stringName)
	}
}
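// A minimal usage sketch for fetchCardNames; ravURL is assumed to be a
// package-level URL constant defined elsewhere. Passing *[]string lets the
// function append in place, though returning ([]string, error) would be the
// more conventional Go shape.
func main() {
	var names []string
	fetchCardNames(&names)
	fmt.Printf("fetched %d card names\n", len(names))
	for _, n := range names {
		fmt.Println(n)
	}
}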
/*
ParseHtml parses a UTF-8 encoded byte array and returns an html.HtmlDocument.
It uses default parsing options that ignore errors and warnings, making it
suitable for the poorly-formed "tag soup" often found on the web. If the
content is not UTF-8 encoded, or you want to customize the parsing options,
call html.Parse directly.
*/
func ParseHtml(content []byte) (doc *html.HtmlDocument, err error) {
	return html.Parse(content, html.DefaultEncodingBytes, nil, html.DefaultParseOption, html.DefaultEncodingBytes)
}
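// A usage sketch for ParseHtml, assuming the gokogiri html package: parse a
// fragment, query it with XPath, and free the libxml2-backed document when
// done (uses fmt and log).
func ExampleParseHtml() {
	doc, err := ParseHtml([]byte(`<html><body><p class="x">hello</p></body></html>`))
	if err != nil {
		log.Fatal(err)
	}
	defer doc.Free() // gokogiri documents wrap C memory and must be freed
	nodes, err := doc.Search("//p[@class='x']")
	if err != nil {
		log.Fatal(err)
	}
	for _, n := range nodes {
		fmt.Println(n.Content()) // prints "hello"
	}
}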
func TiendaInglesa(url string) (data ProductData, err error) {
	// Find productId
	urlObj, err := net_url.Parse(url)
	if err != nil {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Invalid url. Could not parse."}
		return
	}
	productIdList, present := urlObj.Query()["idarticulo"]
	if !present || len(productIdList) != 1 {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Invalid url. Could not find idarticulo param."}
		return
	}
	productId, err := strconv.Atoi(productIdList[0])
	if err != nil {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Invalid url. idarticulo param is not integer."}
		return
	}

	// Fetch the product image concurrently while the page is scraped.
	imgChan := make(chan string, 1)
	go getImage(productId, imgChan)

	resp, err := http.Get(url)
	if err != nil {
		log.Printf("Error getting site: %v", err)
		return
	}
	defer resp.Body.Close()
	if finalUrl := resp.Request.URL.String(); strings.Contains(finalUrl, "articulo_no_habilitado") {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Product not found"}
		return
	}
	if resp.StatusCode != http.StatusOK {
		errorCode := int(resp.StatusCode/100) * 100
		err = &ScrapeError{errorCode, "Request error. Invalid StatusCode"}
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		err = &ScrapeError{PARSING_ERROR, "Could not read response body."}
		return
	}
	doc, err := html.Parse(body, []byte("iso-8859-1"), nil, html.DefaultParseOption, html.DefaultEncodingBytes)
	if err != nil {
		err = &ScrapeError{PARSING_ERROR, "Parsing error."}
		return
	}
	// Free only after a successful parse; doc may be nil on error.
	defer doc.Free()

	// Find Title
	results, err := doc.Search("//h1[@class='titulo_producto_top']")
	if err != nil {
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No H1."}
		return
	}
	if len(results) == 0 {
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No product title."}
		return
	}
	name := strings.TrimSpace(results[0].Content())

	// Find Price
	results, err = doc.Search("//div[@class='contendor_precio']//td[@class='precio']")
	if err != nil || len(results) == 0 {
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No Price"}
		return
	}
	priceStr := results[0].Content()
	priceSplitList := strings.Fields(priceStr)
	if len(priceSplitList) == 0 {
		err = &ScrapeError{PARSING_ERROR, "Parsing error. Empty price."}
		return
	}
	price, err := strconv.ParseFloat(priceSplitList[len(priceSplitList)-1], 64)
	if err != nil {
		log.Printf("Error parsing price: %v", err)
		err = &ScrapeError{PARSING_ERROR, "Parsing error. Could not parse price."}
		return
	}

	// Find description
	results, err = doc.Search("//div[@class='contenido_descripcion']")
	var description string
	if err == nil && len(results) > 0 {
		description = strings.TrimSpace(results[0].Content())
	}

	// Find categories
	results, err = doc.Search("//div[@class='navegacion']/a")
	var categories []string
	if err == nil && len(results) > 1 {
		// Remove "Home" category.
		results = results[1:]
		categories = make([]string, len(results))
		for i := range results {
			categories[i] = strings.TrimSpace(results[i].Content())
		}
	}

	// Image Url
	imageUrl := <-imgChan

	data = ProductData{
		Name:        name,
		Url:         url,
		Price:       price,
		Description: description,
		Categories:  categories,
		Fetched:     time.Now().UTC().Format("2006-01-02T15:04Z"),
		ImageUrl:    imageUrl,
	}
	return
}
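// A hedged usage sketch for TiendaInglesa. The domain and idarticulo value
// are placeholders, not real product URLs; the ProductData fields printed
// here are the ones set in the literal above (uses fmt and log).
func main() {
	data, err := TiendaInglesa("http://www.tiendainglesa.com.uy/producto.aspx?idarticulo=12345")
	if err != nil {
		log.Fatalf("scrape failed: %v", err)
	}
	fmt.Printf("%s: %.2f\n", data.Name, data.Price)
	fmt.Println("categories:", data.Categories)
}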