// loadXpath reads the response body, parses and re-renders it with the HTML
// parser so that broken markup becomes well-formed, then evaluates the given
// XPath against the result.
func loadXpath(response *http.Response, xpath string) ([]byte, error) {
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return nil, err
	}
	// Parse the body (e.g. to see if a login worked) and re-render it so
	// xmlpath receives well-formed HTML.
	root, err := html.Parse(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	var b bytes.Buffer
	if err := html.Render(&b, root); err != nil {
		return nil, err
	}
	xmlroot, err := xmlpath.ParseHTML(bytes.NewReader(b.Bytes()))
	if err != nil {
		return nil, err
	}
	path := xmlpath.MustCompile(xpath)
	if value, ok := path.Bytes(xmlroot); ok {
		return value, nil
	}
	return nil, errors.New("could not find xpath")
}
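// A minimal usage sketch for loadXpath (not part of the original source): the
// URL and XPath below are illustrative placeholders.
func loadXpathExample() {
	resp, err := http.Get("https://example.com/login")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	// Probe the page title of the returned page.
	title, err := loadXpath(resp, "/html/head/title")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("page title: %s\n", title)
}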
// doc_parse takes one document page off the channel, converts it to UTF-8,
// strips <noscript> blocks, repairs the markup, and parses it with xmlpath.
func doc_parse(c_doc_page chan []byte, c_documents chan Lbc_doc, wg *sync.WaitGroup) {
	doc_page := <-c_doc_page
	utf8_reader := decode_utf8(string(doc_page))
	doc_page_noscript := remove_noscript(utf8_reader)
	fix_html := fix_broken_html(doc_page_noscript)
	_, err := xmlpath.ParseHTML(strings.NewReader(fix_html))
	if err != nil {
		log.Println("could not parse document page:", err)
		return
	}
	/*
		// Title extraction, kept for later use:
		title_xpath := xmlpath.MustCompile("/html/body/div/div[2]/div/div[3]/div/div[1]/div[1]/h1/text()")
		if doc_title, ok := title_xpath.String(root); ok {
			log.Println("DOC Title:", doc_title)
		}
	*/
}
// tarballsFrom downloads the page at source.url, clears scripts from the
// body, and collects every tarball URL matched by source.xpath.
func tarballsFrom(source tarballSource) ([]*Tarball, error) {
	resp, err := http.Get(source.url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("cannot read http response: %v", err)
	}
	clearScripts(data)
	root, err := xmlpath.ParseHTML(bytes.NewBuffer(data))
	if err != nil {
		return nil, err
	}
	var tbs []*Tarball
	iter := xmlpath.MustCompile(source.xpath).Iter(root)
	for iter.Next() {
		s := iter.Node().String()
		// Protocol-relative links ("//host/path") are made absolute.
		if strings.HasPrefix(s, "//") {
			s = "https:" + s
		}
		if tb, ok := parseURL(s); ok {
			tbs = append(tbs, tb)
		}
	}
	if len(tbs) == 0 {
		return nil, fmt.Errorf("no downloads available at %s", source.url)
	}
	return tbs, nil
}
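// A hedged usage sketch (not in the original source). It assumes tarballSource
// carries only the url and xpath fields used above; the values here are
// illustrative placeholders.
func tarballsFromExample() {
	src := tarballSource{
		url:   "https://example.com/downloads",
		xpath: "//a/@href",
	}
	tbs, err := tarballsFrom(src)
	if err != nil {
		log.Fatal(err)
	}
	for _, tb := range tbs {
		fmt.Println(tb)
	}
}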
func (s *BasicSuite) TestHTML(c *C) {
	node, err := xmlpath.ParseHTML(bytes.NewBuffer(trivialHtml))
	c.Assert(err, IsNil)
	path := xmlpath.MustCompile("/root/foo")
	result, ok := path.String(node)
	c.Assert(ok, Equals, true)
	c.Assert(result, Equals, "<a>")
}
func main() {
	nextIn := 0 * time.Second
	for {
		<-time.After(nextIn)
		resp, err := http.Get(RP_URL)
		if err != nil {
			log.Println(err)
			nextIn = 1 * time.Second
			continue
		}
		var b bytes.Buffer
		_, err = io.Copy(&b, resp.Body)
		// Close the body here rather than deferring: a defer inside this
		// infinite loop would never run and would leak connections.
		resp.Body.Close()
		if err != nil {
			log.Println(err)
			nextIn = 1 * time.Second
			continue
		}
		// The response is "<milliseconds until next poll>|<html>".
		parts := strings.Split(b.String(), "|")
		if len(parts) != 2 {
			log.Printf("Expected 2 parts, got %d", len(parts))
			nextIn = 1 * time.Second
			continue
		}
		root, err := xmlpath.ParseHTML(strings.NewReader(parts[1]))
		if err != nil {
			log.Println(err)
			nextIn = 1 * time.Second
			continue
		}
		current, ok := current_playing_path.String(root)
		if !ok {
			log.Println("Couldn't find currently playing")
			nextIn = 1 * time.Second
			continue
		}
		nextTick, err := strconv.Atoi(parts[0])
		if err != nil {
			log.Printf("Couldn't get int value out of %q", parts[0])
			nextIn = 1 * time.Second
			continue
		}
		nextIn = time.Duration(nextTick) * time.Millisecond
		fmt.Println(current)
	}
}
// front_parse reads one listing page from the channel, pushes every document
// URL to the document workers, and follows the link to the next listing page
// by spawning another front_worker.
func front_parse(c_front_urls chan string, c_front_page chan []byte, c_doc_urls chan string, wg *sync.WaitGroup) {
	front_page := <-c_front_page

	// Document links, and the paging <li> elements that hold the next-page link.
	doc_urls_xpath := xmlpath.MustCompile(`/html/body/div[@id="page_align"]/div[@id="page_width"]/div[@id="ContainerMain"]/div[@class="content-border list"]/div[@class="content-color"]/div[@class="list-lbc"]//a/@href`)
	next_front_urls_xpath := xmlpath.MustCompile(`/html/body/div[@id="page_align"]/div[@id="page_width"]/div[@id="ContainerMain"]/nav/ul[@id="paging"]/li[@class="page"]`)

	// Normalize the page before parsing: decode to UTF-8, drop <noscript>
	// blocks, and repair the markup.
	utf8_reader := decode_utf8(string(front_page))
	doc_page_noscript := remove_noscript(utf8_reader)
	fix_html := fix_broken_html(doc_page_noscript)
	root, err := xmlpath.ParseHTML(strings.NewReader(fix_html))
	if err != nil {
		log.Fatal("front page: ", err)
	}

	doc_urls := doc_urls_xpath.Iter(root)
	for doc_urls.Next() {
		c_doc_urls <- doc_urls.Node().String()
	}

	// The last matching <li class="page"> holds the "next page" link.
	prev_next_front_urls := next_front_urls_xpath.Iter(root)
	var node *xmlpath.Node
	for prev_next_front_urls.Next() {
		node = prev_next_front_urls.Node()
	}
	if node == nil {
		log.Println("No Next Front URL")
		log.Println("Front DONE")
		return
	}
	href_xpath := xmlpath.MustCompile("a/@href")
	if next_front_url, ok := href_xpath.String(node); ok {
		c_front_urls <- next_front_url
		log.Println("Next Front URL:", next_front_url)
		wg.Add(1)
		go front_worker(c_front_urls, c_front_page, c_doc_urls, wg)
	} else {
		log.Println("No Next Front URL")
		log.Println("Front DONE")
	}
}
// doCompXPath parses the response body and reports whether the value found at
// c.Path matches the comparison's expected data.
func (c *Comp) doCompXPath(r *http.Response) bool {
	path := xmlpath.MustCompile(c.Path)
	root, err := xmlpath.ParseHTML(r.Body)
	if err != nil {
		fmt.Printf("doCompXPath: %v\n", err)
		return false
	}
	value, ok := path.String(root)
	if !ok {
		return false
	}
	return c.dataMatch(value)
}
// ParseTorumemo extracts the date, title, and body paragraphs from a memo
// page. It returns "date title" as the first value and the concatenated body
// text as the second.
func ParseTorumemo(content io.ReadCloser) (string, string, error) {
	datePath := xmlpath.MustCompile(`//div[@class="date"]`)
	titlePath := xmlpath.MustCompile(`//div[@class="title"]`)
	contentPath := xmlpath.MustCompile(`//div[@class="body"]/p`)
	root, err := xmlpath.ParseHTML(content)
	if err != nil {
		return "", "", err
	}
	date, _ := datePath.String(root)
	title, _ := titlePath.String(root)
	date = strings.TrimSpace(date)
	title = strings.TrimSpace(title)
	var body string
	iter := contentPath.Iter(root)
	for iter.Next() {
		body += iter.Node().String()
	}
	return date + " " + title, body, nil
}
// reparseHtml decodes a cp932-encoded page to UTF-8, runs it through the HTML
// parser and renderer to repair broken markup, and returns an xmlpath root
// node for querying.
func reparseHtml(s string) (*xmlpath.Node, error) {
	content := mahonia.NewDecoder("cp932").ConvertString(s)
	// Keep a short prefix of the content for error messages; slicing the full
	// string directly would panic on very short pages.
	snip := content
	if len(snip) > 30 {
		snip = snip[:30]
	}
	doc, err := xhtml.Parse(strings.NewReader(content))
	if err != nil {
		return nil, fmt.Errorf("could not parse HTML for %s ...(snip): %v", snip, err)
	}
	var b bytes.Buffer
	if err := xhtml.Render(&b, doc); err != nil {
		return nil, fmt.Errorf("could not render HTML for %s ...(snip): %v", snip, err)
	}
	root, err := xmlpath.ParseHTML(strings.NewReader(b.String()))
	if err != nil {
		return nil, fmt.Errorf("could not rebuild HTML for %s ...(snip): %v", snip, err)
	}
	return root, nil
}
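// A hedged usage sketch for reparseHtml (not part of the original source): the
// XPath is an illustrative placeholder for whatever value the caller needs.
func reparseHtmlExample(page string) (string, bool) {
	root, err := reparseHtml(page)
	if err != nil {
		log.Println(err)
		return "", false
	}
	return xmlpath.MustCompile("/html/head/title").String(root)
}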
// crawle fetches one quest listing page, extracts the quest names with the
// XPath below, and sends them to the quest channel as JSON fragments.
func crawle(rank, url string, quest chan string, wg *sync.WaitGroup) {
	// Original XPath of the quest name:
	// '//table/tr[(position() mod 2) = 1]/td[1]/a[1]/text()'
	defer wg.Done()
	path := xmlpath.MustCompile("//table/tr/td[1]/span/../a[1]/text()")
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("Error while fetching", url, ":", err)
		return
	}
	defer resp.Body.Close()
	root, err := xmlpath.ParseHTML(resp.Body)
	if err != nil {
		fmt.Println("Error while parsing", url, ":", err)
		return
	}
	iter := path.Iter(root)
	for iter.Next() {
		quest <- fmt.Sprintf(`{"rank": "%s", "name": "%s"},`, rank, iter.Node().String())
	}
}
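// A minimal sketch of driving crawle (not in the original source): the rank
// label and URL are placeholders, and a separate goroutine drains the channel.
func crawleExample() {
	quest := make(chan string)
	done := make(chan struct{})
	go func() {
		for q := range quest {
			fmt.Println(q)
		}
		close(done)
	}()
	var wg sync.WaitGroup
	wg.Add(1)
	go crawle("G1", "https://example.com/quests/g1", quest, &wg)
	wg.Wait()
	close(quest)
	<-done
}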
// http://www.jma.go.jp/jp/amedas_h/today-46211.html
// http://www6.kaiho.mlit.go.jp/03kanku/shimoda/ shift-jis
// http://www6.kaiho.mlit.go.jp/03kanku/yokosuka/kisyou.html
func getWindData(c appengine.Context) (*WindData, error) {
	windData := &WindData{}
	client := urlfetch.Client(c)
	resp, err := client.Get(MICS_URL)
	if err != nil {
		return nil, fmt.Errorf("could not get %s: %v", MICS_URL, err)
	}
	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("server responded non-200: %s, %s", MICS_URL, resp.Status)
	}
	defer resp.Body.Close()

	// Repair the broken, cp932-encoded page before parsing it with xmlpath.
	// http://stackoverflow.com/questions/24101721/parse-broken-html-with-golang
	buf := new(bytes.Buffer)
	buf.ReadFrom(resp.Body)
	content := mahonia.NewDecoder("cp932").ConvertString(buf.String())
	doc, err := xhtml.Parse(strings.NewReader(content)) // https://godoc.org/golang.org/x/net/html
	if err != nil {
		return nil, fmt.Errorf("could not parse HTML for %s: %v", MICS_URL, err)
	}
	var b bytes.Buffer
	xhtml.Render(&b, doc)
	fixed := strings.NewReader(b.String())
	root, err := xmlpath.ParseHTML(fixed)
	if err != nil {
		return nil, fmt.Errorf("could not parse HTML: %s\n Error: %v", content, err)
	}

	// Pull the observation table and flatten its line breaks into spaces.
	path := xmlpath.MustCompile(MICS_TABLE_XPATH)
	table, ok := path.String(root)
	if !ok {
		return nil, fmt.Errorf("could not find table path")
	}
	re := regexp.MustCompile("([^\n])\n")
	windData.Table = re.ReplaceAllString(table, "$1 ")

	path = xmlpath.MustCompile(MICS_DATE_XPATH)
	date, ok := path.String(root)
	if !ok {
		return nil, fmt.Errorf("could not find date")
	}
	windData.Date = date

	// Fetch the wind chart image.
	imgResp, err := client.Get(MICS_SHIMODA_IMG_URL)
	if err != nil {
		return nil, fmt.Errorf("unable to get img from %s: %v", MICS_SHIMODA_IMG_URL, err)
	}
	if imgResp.StatusCode != 200 {
		return nil, fmt.Errorf("img server responded non-200: %s, %s", MICS_SHIMODA_IMG_URL, imgResp.Status)
	}
	defer imgResp.Body.Close()

	// XXX need to resize the image for Gratina2.
	// JPG is more widely supported: http://media.kddi.com/app/publish/torisetsu/pdf/gratina2_torisetsu_shousai.pdf
	// Go image packages: image/gif, image/jpeg: http://golang.org/pkg/image/gif/#Encode
	pngImg, err := png.Decode(imgResp.Body)
	if err != nil {
		// We can do with text info only.
		c.Infof("No image attached. Could not decode png: %v", err)
		return windData, nil
	}
	buf.Reset()
	err = jpeg.Encode(buf, pngImg, &jpeg.Options{Quality: 75})
	if err != nil {
		// We can do with text info only.
		c.Infof("No image attached. Could not encode to jpeg: %v", err)
		return windData, nil
	}
	windData.Img = buf.Bytes()
	return windData, nil
}