// this function returns some specific signature of a selection // so it can be easy found to get data quickly next time func getSelectionSignature(s *goquery.Selection) string { var signature string tag, _ := goquery.OuterHtml(s) pos := strings.Index(tag, ">") if pos > -1 { tag = tag[1:pos] } else { return "" } signature = convertTagToJqueryFormat(tag, s) s.Parents().Each(func(i int, sec *goquery.Selection) { ohtml, _ := goquery.OuterHtml(sec) pos := strings.Index(ohtml, ">") if pos > -1 { ohtml = ohtml[1:pos] } tag := convertTagToJqueryFormat(ohtml, sec) signature = tag + " " + signature }) return signature }
func run() { var ( doc *goquery.Document doc_err, err error fd *os.File file_encoding string = "-" ) if *url == "-" { doc, doc_err = goquery.NewDocumentFromReader(os.Stdin) } else if fd, err = os.Open(*url); err == nil { doc, doc_err = goquery.NewDocumentFromReader(fd) defer fd.Close() } else { tmp_url := *url if !strings.HasPrefix(tmp_url, "http") { tmp_url = "http://" + tmp_url } doc, doc_err = goquery.NewDocument(tmp_url) } if doc_err != nil { log.Fatal("goquery NewDocument err:", doc_err) } if *debug { log.Printf("tag=[%s]\n", selector) } if !*noenc { file_encoding = get_html_enc(doc) } doc.Find(selector).Map(func(i int, sel *goquery.Selection) string { output := "" switch fun { case "html": if output, err = sel.Html(); err != nil { log.Fatal("select err:", err) } output = strings.TrimSpace(output) case "ohtml": if output, err = goquery.OuterHtml(sel); err != nil { log.Fatal("select err:", err) } output = strings.TrimSpace(output) case "text": output = strings.TrimSpace(sel.Text()) case "attr": attr_list := strings.Split(*attr, ",") for _, attr_i := range attr_list { var output_i string if attr_i == "text" { // a hardcode case for convenience scrapy output_i = strings.TrimSpace(sel.Text()) } else { output_i = sel.AttrOr(strings.TrimSpace(attr_i), "-") } if output_i == "" { output_i = "-" } output += output_i + "\t" } } if !*noenc && file_encoding != "" && file_encoding != "utf8" { if output, err = iconv.ConvertString(output, file_encoding, "utf8"); err != nil { log.Fatal("encoding invalid", err) } } fmt.Println(output) return "" }) }