// getSelectionSignature builds a textual signature for s so that the same
// selection can be located again quickly on a later run.
//
// The opening tag of s (the rendered outer HTML between the leading "<" and
// the first ">") is handed to convertTagToJqueryFormat. The same is then done
// for every selection yielded by s.Parents(), and each ancestor's result is
// prepended to the signature, separated by a single space.
//
// It returns "" when the rendered HTML of s contains no ">" at all.
func getSelectionSignature(s *goquery.Selection) string {
	// openingTag returns the text between the first character and the first
	// ">" of markup. When there is no ">", markup is returned untouched and
	// the boolean is false.
	openingTag := func(markup string) (string, bool) {
		end := strings.Index(markup, ">")
		if end < 0 {
			return markup, false
		}
		return markup[1:end], true
	}

	// Render errors are deliberately dropped: a failed render is treated the
	// same as empty markup.
	selfHTML, _ := goquery.OuterHtml(s)
	selfTag, found := openingTag(selfHTML)
	if !found {
		return ""
	}

	path := convertTagToJqueryFormat(selfTag, s)

	s.Parents().Each(func(_ int, ancestor *goquery.Selection) {
		markup, _ := goquery.OuterHtml(ancestor)
		ancestorTag, _ := openingTag(markup)

		// Prepend, so ancestors visited later end up further to the left.
		path = convertTagToJqueryFormat(ancestorTag, ancestor) + " " + path
	})

	return path
}
Example #2
0
// run loads an HTML document, selects nodes with the package-level selector
// and prints one line per matched node to standard output.
//
// The document source is chosen from the *url flag:
//   - "-" reads the document from standard input;
//   - a value that can be opened as a local file is read from that file;
//   - anything else is fetched over HTTP, with "http://" prepended when the
//     value does not already start with an http:// or https:// scheme.
//
// What is printed for each node depends on the package-level fun value:
// "html" (inner HTML), "ohtml" (outer HTML), "text" (text content) or "attr"
// (the comma-separated names in *attr, each value followed by a tab). Any
// other value prints an empty line per node.
//
// Unless *noenc is set, the encoding reported by get_html_enc is used to run
// every output line through iconv before printing. Any failure terminates the
// process through log.Fatal.
func run() {
	var (
		doc    *goquery.Document
		docErr error
	)
	if *url == "-" {
		doc, docErr = goquery.NewDocumentFromReader(os.Stdin)
	} else if fd, err := os.Open(*url); err == nil {
		defer fd.Close()
		doc, docErr = goquery.NewDocumentFromReader(fd)
	} else {
		target := *url
		// Match a real scheme rather than a bare "http" prefix, so that hosts
		// such as "httpbin.org" still get a scheme prepended. URL schemes are
		// case-insensitive, hence the lower-cased comparison.
		lower := strings.ToLower(target)
		if !strings.HasPrefix(lower, "http://") && !strings.HasPrefix(lower, "https://") {
			target = "http://" + target
		}
		doc, docErr = goquery.NewDocument(target)
	}
	if docErr != nil {
		log.Fatal("goquery NewDocument err:", docErr)
	}

	if *debug {
		log.Printf("tag=[%s]\n", selector)
	}

	// Decide once whether output needs converting; neither the flag nor the
	// detected encoding changes while the nodes are being printed.
	var (
		fileEncoding string
		convert      bool
	)
	if !*noenc {
		fileEncoding = get_html_enc(doc)
		convert = fileEncoding != "" && fileEncoding != "utf8"
	}

	// The attribute list is the same for every node, so split it once.
	attrNames := strings.Split(*attr, ",")

	// Each is used instead of Map: the callback only prints, and no result
	// slice is needed.
	doc.Find(selector).Each(func(_ int, sel *goquery.Selection) {
		var (
			output string
			err    error
		)
		switch fun {
		case "html":
			if output, err = sel.Html(); err != nil {
				log.Fatal("select err:", err)
			}
			output = strings.TrimSpace(output)
		case "ohtml":
			if output, err = goquery.OuterHtml(sel); err != nil {
				log.Fatal("select err:", err)
			}
			output = strings.TrimSpace(output)
		case "text":
			output = strings.TrimSpace(sel.Text())
		case "attr":
			for _, name := range attrNames {
				var value string
				// "text" is a hard-coded pseudo-attribute for convenience when
				// scraping. It is compared before trimming, so only an exact
				// "text" entry selects the node's text content.
				if name == "text" {
					value = strings.TrimSpace(sel.Text())
				} else {
					value = sel.AttrOr(strings.TrimSpace(name), "-")
				}
				// Missing and empty values are both shown as "-" so the
				// tab-separated columns stay aligned.
				if value == "" {
					value = "-"
				}
				output += value + "\t"
			}
		}

		if convert {
			if output, err = iconv.ConvertString(output, fileEncoding, "utf8"); err != nil {
				// log.Fatal inserts no space next to a string operand, so the
				// separator has to be part of the message.
				log.Fatal("encoding invalid: ", err)
			}
		}
		fmt.Println(output)
	})
}