Example #1
0
// Fetch normalizes the node's URI, performs an HTTP GET against it with a
// spidering User-Agent, records the response status and identifying headers
// on the node, and parses the body into sn.pageContent.
func (sn *SksNode) Fetch() error {
	sn.Normalize()

	req, err := http.NewRequest("GET", sn.uri, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", "sks_peers/0.2 (SKS mesh spidering)")

	resp, err := HttpDoWithTimeout(http.DefaultClient, req, *flHttpFetchTimeout)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	sn.Status = resp.Status
	Log.Printf("[%s] Response status: %s", sn.Hostname, sn.Status)
	sn.ServerHeader = resp.Header.Get("Server")
	sn.ViaHeader = resp.Header.Get("Via")

	//doc, err := ehtml.Parse(resp.Body)
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	parsed, err := htmlp.Parse(body, htmlp.DefaultEncodingBytes, nil, htmlp.DefaultParseOption, htmlp.DefaultEncodingBytes)
	if err != nil {
		return err
	}
	sn.pageContent = parsed
	return nil
}
Example #2
0
// fetchCardNames downloads the card list page at ravURL and appends the text
// of each card row's name cell to *names. Errors are printed to stdout and
// abort the fetch; any names already appended are kept.
func fetchCardNames(names *[]string) {
	res, err := http.Get(ravURL)
	if err != nil {
		fmt.Println(err)
		return // res is nil on error; touching res.Body would panic
	}
	response, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		fmt.Println(err)
		return
	}

	doc, err := html.Parse(response, html.DefaultEncodingBytes, nil, html.DefaultParseOption, html.DefaultEncodingBytes)
	if err != nil {
		fmt.Println(err)
		return // doc is unusable on a parse failure
	}
	defer doc.Free()

	// Don't shadow the html package with a local variable named "html".
	root := doc.Root().FirstChild()

	// Check the search error before ranging, not after the loop.
	results, err := root.Search("//tr[@class='cardItem']")
	if err != nil {
		fmt.Println(err)
		return
	}

	for _, row := range results {
		name, err := row.Search("./td[@class='name']")
		if err != nil {
			fmt.Println(err)
			continue
		}
		if len(name) == 0 {
			continue // row has no name cell; name[0] would panic
		}
		*names = append(*names, name[0].Content())
	}
}
Example #3
0
/*
ParseHtml parses a UTF-8 encoded byte array and returns an html.HtmlDocument.
It uses default parse options that ignore errors and warnings, which makes it
suitable for the poorly-formed 'tag soup' commonly found on the web.

If the content is not UTF-8 encoded, or you need to customize the parsing
options, call html.Parse directly instead.
*/
func ParseHtml(content []byte) (doc *html.HtmlDocument, err error) {
	doc, err = html.Parse(content, html.DefaultEncodingBytes, nil, html.DefaultParseOption, html.DefaultEncodingBytes)
	return
}
// TiendaInglesa scrapes a product page from the Tienda Inglesa web store and
// returns its name, price, description, categories and image URL.
//
// The url must carry an "idarticulo" query parameter identifying the product;
// the image URL is fetched concurrently (by product id) while the page itself
// is downloaded and parsed. On failure err is a *ScrapeError (except for the
// raw price-parse error) and data is the zero value.
func TiendaInglesa(url string) (data ProductData, err error) {
	// Find productId
	urlObj, err := net_url.Parse(url)
	if err != nil {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Invalid url. Could not parse."}
		return
	}

	productIdList, present := urlObj.Query()["idarticulo"]
	if !present || len(productIdList) != 1 {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Invalid url. Could not find idarticulo param."}
		return
	}

	productId, err := strconv.Atoi(productIdList[0])
	if err != nil {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Invalid url. idarticulo param is not integer."}
		return
	}

	// The image lookup runs concurrently with the page download; the channel
	// is buffered so getImage never blocks even if we return early.
	imgChan := make(chan string, 1)
	go getImage(productId, imgChan)

	resp, err := http.Get(url)
	if err != nil {
		log.Printf("Error getting site: %v", err) // was a vet-invalid Printf with a trailing arg
		return
	}
	defer resp.Body.Close() // previously leaked: the body was never closed

	// The store redirects unavailable products to a sentinel page.
	if finalUrl := resp.Request.URL.String(); strings.Contains(finalUrl, "articulo_no_habilitado") {
		err = &ScrapeError{INVALID_PRODUCT_URL, "Product not found"}
		return
	}

	if resp.StatusCode != http.StatusOK {
		// Collapse the status to its class (e.g. 4xx -> 400, 5xx -> 500).
		errorCode := int(resp.StatusCode/100) * 100
		err = &ScrapeError{errorCode, "Request error. Invalid StatusCode"}
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil { // previously ignored with `_`
		err = &ScrapeError{PARSING_ERROR, "Parsing error."}
		return
	}
	// The site serves ISO-8859-1; the parser re-encodes to the default output.
	doc, err := html.Parse(body, []byte("iso-8859-1"), nil, html.DefaultParseOption, html.DefaultEncodingBytes)
	if err != nil {
		err = &ScrapeError{PARSING_ERROR, "Parsing error."}
		return
	}
	// Free only after a successful parse: doc may be nil on error, and the
	// original deferred Free before checking err.
	defer doc.Free()

	// Find Title
	results, err := doc.Search("//h1[@class='titulo_producto_top']")
	if err != nil {
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No H1."}
		return
	}
	if len(results) == 0 {
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No product title."}
		return
	}
	name := strings.TrimSpace(results[0].Content())

	// Find Price: the amount is the last whitespace-separated field of the
	// cell text (e.g. "$ 123.45").
	results, err = doc.Search("//div[@class='contendor_precio']//td[@class='precio']")
	if err != nil || len(results) == 0 { // search error was previously unchecked
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No Price"}
		return
	}
	priceSplitList := strings.Fields(results[0].Content())
	if len(priceSplitList) == 0 { // guard the index below against an empty cell
		err = &ScrapeError{PARSING_ERROR, "Parsing error. No Price"}
		return
	}
	price, err := strconv.ParseFloat(priceSplitList[len(priceSplitList)-1], 64)
	if err != nil {
		log.Printf("Error parsing price")
		return
	}

	// Find description (optional: a missing description is not an error).
	var description string
	if descResults, descErr := doc.Search("//div[@class='contenido_descripcion']"); descErr == nil && len(descResults) > 0 {
		description = strings.TrimSpace(descResults[0].Content())
	}

	// Find categories (optional). The first breadcrumb entry is "Home" and is
	// dropped.
	var categories []string
	if catResults, catErr := doc.Search("//div[@class='navegacion']/a"); catErr == nil && len(catResults) > 1 {
		catResults = catResults[1:]
		categories = make([]string, len(catResults))
		for i := range catResults {
			categories[i] = strings.TrimSpace(catResults[i].Content())
		}
	}

	// Image Url (blocks until getImage delivers its result).
	imageUrl := <-imgChan

	data = ProductData{
		Name:        name,
		Url:         url,
		Price:       price,
		Description: description,
		Categories:  categories,
		Fetched:     time.Now().UTC().Format("2006-01-02T15:04Z"),
		ImageUrl:    imageUrl,
	}
	return
}