Пример #1
0
func loadXpath(response *http.Response, xpath string) ([]byte, error) {
	body, err := ioutil.ReadAll(response.Body)
	panicError(err)

	// Parse body to see if login worked
	//	reader := strings.NewReader(body)
	root, err := html.Parse(bytes.NewBuffer(body))
	if err != nil {
		return nil, err
	}

	var b bytes.Buffer
	html.Render(&b, root)
	fixedHtml := b

	//	body = bytes.NewReader(fixedHtml)
	xmlroot, xmlerr := xmlpath.ParseHTML(bytes.NewReader(fixedHtml.Bytes()))

	if xmlerr != nil {
		return nil, xmlerr
	}

	path := xmlpath.MustCompile(xpath)
	if value, ok := path.Bytes(xmlroot); ok {
		return value, nil
	}

	return nil, errors.New("Could not find xpath")
}
Пример #2
0
// doc_parse consumes one raw document page from c_doc_page, normalizes it
// (UTF-8 decoding, <noscript> stripping, HTML repair — project helpers) and
// verifies that the result parses as HTML. The parsed tree is currently
// unused: the XPath title extraction is still disabled below.
// NOTE(review): wg and c_documents are never used in this body — confirm
// wg.Done() is handled by the caller/spawner before relying on it.
func doc_parse(c_doc_page chan []byte, c_documents chan Lbc_doc, wg *sync.WaitGroup) {
	doc_page := <-c_doc_page

	utf8_reader := decode_utf8(string(doc_page))
	doc_page_noscript := remove_noscript(utf8_reader)
	fix_html := fix_broken_html(doc_page_noscript)

	_, err := xmlpath.ParseHTML(strings.NewReader(fix_html))
	if err != nil {
		// Log the failure once through the logger instead of duplicating
		// the same message on stdout and the log.
		log.Println("!!!!!!!!!!!! BUG DOC page", err)
		return
	}
	/*
		title_xpath := xmlpath.MustCompile("/html/body/div/div[2]/div/div[3]/div/div[1]/div[1]/h1/text()") //doc urls

		if doc_title, ok := title_xpath.String(root); ok {
			log.Println("##### DOC Title:", doc_title)
		}
	*/
}
Пример #3
0
// tarballsFrom downloads the page at source.url, strips script blocks, and
// collects every URL matched by source.xpath as a *Tarball.
// Protocol-relative links ("//host/...") are forced to https before parsing.
// It returns an error on any fetch/parse failure, or when no usable
// download links are found.
func tarballsFrom(source tarballSource) ([]*Tarball, error) {
	resp, err := http.Get(source.url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	// Fail early on non-200 responses instead of trying to parse an error page.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("cannot fetch %s: %s", source.url, resp.Status)
	}
	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("cannot read http response: %v", err)
	}
	// clearScripts mutates data in place to remove <script> content that
	// would confuse the HTML parser.
	clearScripts(data)
	root, err := xmlpath.ParseHTML(bytes.NewBuffer(data))
	if err != nil {
		return nil, err
	}
	var tbs []*Tarball
	iter := xmlpath.MustCompile(source.xpath).Iter(root)
	for iter.Next() {
		s := iter.Node().String()
		if strings.HasPrefix(s, "//") {
			s = "https:" + s
		}
		if tb, ok := parseURL(s); ok {
			tbs = append(tbs, tb)
		}
	}
	if len(tbs) == 0 {
		// Use a format verb rather than string concatenation inside Errorf
		// (go vet: non-constant format string / missing args).
		return nil, fmt.Errorf("no downloads available at %s", source.url)
	}
	return tbs, nil
}
Пример #4
0
// TestHTML verifies that ParseHTML accepts the trivial fixture document and
// that a simple absolute path resolves to the expected (entity-decoded) text.
func (s *BasicSuite) TestHTML(c *C) {
	root, err := xmlpath.ParseHTML(bytes.NewReader(trivialHtml))
	c.Assert(err, IsNil)
	got, ok := xmlpath.MustCompile("/root/foo").String(root)
	c.Assert(ok, Equals, true)
	c.Assert(got, Equals, "<a>")
}
Пример #5
0
// main polls RP_URL in a loop. Each response has the form "<millis>|<html>":
// the HTML fragment yields the currently-playing track (printed to stdout)
// and the integer prefix is the delay before the next poll. Any failure
// retries after one second.
func main() {
	nextIn := 0 * time.Second
	for {
		<-time.After(nextIn)
		// Default to the one-second retry delay; overwritten on success.
		nextIn = 1 * time.Second

		resp, err := http.Get(RP_URL)
		if err != nil {
			log.Println(err)
			continue
		}

		var b bytes.Buffer
		_, err = io.Copy(&b, resp.Body)
		// Close immediately: a defer inside an infinite loop never runs,
		// so deferring here would leak one connection per iteration.
		resp.Body.Close()
		if err != nil {
			log.Println(err)
			continue
		}

		parts := strings.Split(b.String(), "|")
		if len(parts) != 2 {
			log.Printf("Expected 2 parts, got %d", len(parts))
			continue
		}

		root, err := xmlpath.ParseHTML(strings.NewReader(parts[1]))
		if err != nil {
			log.Println(err)
			continue
		}

		current, ok := current_playing_path.String(root)
		if !ok {
			log.Println("Couldn't find currently playing")
			continue
		}

		nextTick, err := strconv.Atoi(parts[0])
		if err != nil {
			// %q, not %d: parts[0] is a string (go vet flags %d here).
			log.Printf("Couldn't get int value out of %q", parts[0])
			continue
		}

		nextIn = time.Duration(nextTick) * time.Millisecond
		fmt.Println(current)
	}
}
Пример #6
0
// front_parse consumes one listing page from c_front_page, pushes every
// document URL it contains onto c_doc_urls, and — when a "next page" link
// exists — queues it on c_front_urls and spawns another front_worker.
// A parse failure of the page itself is fatal (log.Fatal).
func front_parse(c_front_urls chan string, c_front_page chan []byte, c_doc_urls chan string, wg *sync.WaitGroup) {
	front_page := <-c_front_page
	doc_urls_xpath := xmlpath.MustCompile("/html/body/div[@id=\"page_align\"]/div[@id=\"page_width\"]/div[@id=\"ContainerMain\"]/div[@class=\"content-border list\"]/div[@class=\"content-color\"]/div[@class=\"list-lbc\"]//a/@href") //doc urls
	next_front_urls_xpath := xmlpath.MustCompile("/html/body/div[@id=\"page_align\"]/div[@id=\"page_width\"]/div[@id=\"ContainerMain\"]/nav/ul[@id=\"paging\"]/li[@class=\"page\"]")                                                   //next url

	// Normalize the page before parsing: UTF-8 decode, drop <noscript>,
	// repair broken markup (project helpers).
	utf8_reader := decode_utf8(string(front_page))
	doc_page_noscript := remove_noscript(utf8_reader)
	fix_html := fix_broken_html(doc_page_noscript)

	root, err := xmlpath.ParseHTML(strings.NewReader(fix_html))
	if err != nil {
		log.Fatal("FRONT PAGE", err)
	}

	doc_urls := doc_urls_xpath.Iter(root)
	for doc_urls.Next() {
		c_doc_urls <- doc_urls.Node().String()
	}

	// Keep only the last pagination <li class="page">: that one carries the
	// "next page" anchor.
	var node *xmlpath.Node
	prev_next_front_urls := next_front_urls_xpath.Iter(root)
	for prev_next_front_urls.Next() {
		node = prev_next_front_urls.Node()
	}

	// Guard: with no pagination nodes, node stays nil and evaluating the
	// href path against it would dereference nil.
	if node == nil {
		log.Println("No Next Front URL")
		log.Println("Front DONE")
		return
	}

	href_xpath := xmlpath.MustCompile("a/@href")
	next_front_url, ok := href_xpath.String(node)
	if !ok {
		log.Println("No Next Front URL")
		log.Println("Front DONE")
		return
	}

	c_front_urls <- next_front_url
	log.Println("Next Front URL:", next_front_url)
	wg.Add(1)
	go front_worker(c_front_urls, c_front_page, c_doc_urls, wg)
}
Пример #7
0
// doCompXPath parses the response body as HTML, evaluates c.Path against it,
// and feeds the first matching string to c.dataMatch.
// It returns false when the body fails to parse or the path matches nothing.
// Note: MustCompile panics on an invalid c.Path.
func (c *Comp) doCompXPath(r *http.Response) bool {
	compiled := xmlpath.MustCompile(c.Path)
	root, err := xmlpath.ParseHTML(r.Body)
	if err != nil {
		fmt.Printf("doCompXPath: %v\n", err)
		return false
	}

	match, found := compiled.String(root)
	if !found {
		return false
	}
	return c.dataMatch(match)
}
Пример #8
0
// ParseTorumemo extracts a diary entry from the given HTML stream.
// It returns "<date> <title>" (both whitespace-trimmed) and the entry body
// (all //div[@class="body"]/p nodes concatenated), or an error when the
// stream does not parse as HTML.
// NOTE(review): content is an io.ReadCloser but is never closed here —
// confirm the caller is responsible for closing it.
func ParseTorumemo(content io.ReadCloser) (string, string, error) {
	datePath := xmlpath.MustCompile(`//div[@class="date"]`)
	titlePath := xmlpath.MustCompile(`//div[@class="title"]`)
	contentPath := xmlpath.MustCompile(`//div[@class="body"]/p`)
	root, err := xmlpath.ParseHTML(content)
	if err != nil {
		return "", "", err
	}

	// Missing date/title nodes simply yield the empty string.
	date, _ := datePath.String(root)
	title, _ := titlePath.String(root)
	date = strings.TrimSpace(date)
	title = strings.TrimSpace(title)

	// Accumulate the body with a Builder instead of quadratic string +=.
	var body strings.Builder
	for iter := contentPath.Iter(root); iter.Next(); {
		body.WriteString(iter.Node().String())
	}
	// err is necessarily nil at this point; return nil explicitly.
	return date + " " + title, body.String(), nil
}
Пример #9
0
// reparseHtml decodes a CP932 (Shift-JIS) string to UTF-8, round-trips it
// through the lenient x/net/html parser and renderer to repair broken
// markup, and parses the normalized output with xmlpath.
func reparseHtml(s string) (*xmlpath.Node, error) {
	content := mahonia.NewDecoder("cp932").ConvertString(s)

	// Snippet for error messages; guard against inputs shorter than 30
	// bytes, where content[:30] would panic.
	snip := content
	if len(snip) > 30 {
		snip = snip[:30]
	}

	doc, err := xhtml.Parse(strings.NewReader(content))
	if err != nil {
		return nil, fmt.Errorf("could not parse HTML for %s ...(snip): %v",
			snip, err)
	}

	var b bytes.Buffer
	// Check the render error instead of silently ignoring it.
	if err := xhtml.Render(&b, doc); err != nil {
		return nil, fmt.Errorf("could not render HTML for %s ...(snip): %v",
			snip, err)
	}

	root, err := xmlpath.ParseHTML(strings.NewReader(b.String()))
	if err != nil {
		return nil, fmt.Errorf("could not rebuild HTML for %s ...(snip): %v",
			snip, err)
	}

	return root, nil
}
Пример #10
0
// crawle fetches url, extracts quest names with a fixed XPath, and emits
// each as a JSON-shaped fragment tagged with rank on the quest channel.
// wg is signaled on every exit path.
// XPATH of quest name
// '//table/tr[(position() mod 2) = 1]/td[1]/a[1]/text()'
func crawle(rank, url string, quest chan string, wg *sync.WaitGroup) {
	defer wg.Done()
	path := xmlpath.MustCompile("//table/tr/td[1]/span/../a[1]/text()")
	resp, err := http.Get(url)
	if err != nil {
		// Include the underlying error so failures are diagnosable.
		fmt.Println("Error while fetching", url, err)
		return
	}
	defer resp.Body.Close()
	root, err := xmlpath.ParseHTML(resp.Body)
	if err != nil {
		fmt.Println("Error while parsing", url, err)
		return
	}
	iter := path.Iter(root)
	for iter.Next() {
		// NOTE(review): node text is interpolated unescaped; a quote in a
		// quest name would break the JSON fragment.
		quest <- fmt.Sprintf("{\"rank\": \"%s\", \"name\": \"%s\"},", rank, iter.Node().String())
	}
}
Пример #11
0
// http://www.jma.go.jp/jp/amedas_h/today-46211.html
// http://www6.kaiho.mlit.go.jp/03kanku/shimoda/ shift-jis
// http://www6.kaiho.mlit.go.jp/03kanku/yokosuka/kisyou.html
//
// getWindData fetches the MICS page (CP932/Shift-JIS encoded), repairs its
// HTML, and extracts the wind observation table and date via XPath. It then
// downloads the wind chart PNG and re-encodes it as JPEG into windData.Img.
// Image failures are deliberately non-fatal: the textual data is returned
// without an image.
func getWindData(c appengine.Context) (*WindData, error) {
	windData := &WindData{}
	// urlfetch is App Engine's outbound HTTP client for this request context.
	client := urlfetch.Client(c)
	resp, err := client.Get(MICS_URL)
	if err != nil {
		return nil, fmt.Errorf("could not get %s: %v", MICS_URL, err)
	}
	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("server responded non-200: %s, %s", MICS_URL, resp.Status)
	}

	defer resp.Body.Close()
	// http://stackoverflow.com/questions/24101721/parse-broken-html-with-golang
	// NOTE(review): buf.ReadFrom error is ignored — a truncated body would
	// only surface later as a parse/XPath failure.
	buf := new(bytes.Buffer)
	buf.ReadFrom(resp.Body)
	// The page is CP932 (Shift-JIS); convert to UTF-8 before parsing.
	content := mahonia.NewDecoder("cp932").ConvertString(buf.String())

	// Round-trip through the lenient x/net/html parser and renderer so
	// xmlpath receives well-formed markup.
	doc, err := xhtml.Parse(strings.NewReader(content))
	// https://godoc.org/golang.org/x/net/html
	if err != nil {
		return nil, fmt.Errorf("could not parse HTML for %s: %v", MICS_URL, err)
	}
	var b bytes.Buffer
	// NOTE(review): xhtml.Render error is ignored; rendering into a
	// bytes.Buffer is unlikely to fail, but consider checking it.
	xhtml.Render(&b, doc)
	fixed := strings.NewReader(b.String())

	root, err := xmlpath.ParseHTML(fixed)
	if err != nil {
		return nil, fmt.Errorf("could not parse HTML: %s\n Error: %v", content, err)
	}

	path := xmlpath.MustCompile(MICS_TABLE_XPATH)
	table, ok := path.String(root)
	if !ok {
		return nil, fmt.Errorf("could not find table path")
	}
	// Fold single line breaks into spaces ("x\n" -> "x ") so the extracted
	// table text flows on one line.
	re := regexp.MustCompile("([^\n])\n")
	windData.Table = re.ReplaceAllString(table, "$1 ")

	path = xmlpath.MustCompile(MICS_DATE_XPATH)
	date, ok := path.String(root)
	if !ok {
		return nil, fmt.Errorf("could not find date")
	}
	windData.Date = date

	imgResp, err := client.Get(MICS_SHIMODA_IMG_URL)
	if err != nil {
		return nil, fmt.Errorf("unable to get img from %s: %v", MICS_SHIMODA_IMG_URL, err)
	}
	if imgResp.StatusCode != 200 {
		return nil, fmt.Errorf("img server responded non-200: %s, %s", MICS_SHIMODA_IMG_URL, imgResp.Status)
	}
	defer imgResp.Body.Close()

	// XXX need to resize the image for Gratina2
	// JPG is more available: http://media.kddi.com/app/publish/torisetsu/pdf/gratina2_torisetsu_shousai.pdf
	// go image packages
	// image/gif, image/jpeg: http://golang.org/pkg/image/gif/#Encode

	pngImg, err := png.Decode(imgResp.Body)
	if err != nil {
		// we can do with only text info
		c.Infof("No image attached. Could not decode png: %v", err)
		return windData, nil
	}
	// Reuse the download buffer as the JPEG encoding destination.
	buf.Reset()
	err = jpeg.Encode(buf, pngImg, &jpeg.Options{Quality: 75})
	if err != nil {
		// we can do with text info only
		c.Infof("No image attached. Could not encode to jpeg: %v", err)
		return windData, nil
	}
	windData.Img = buf.Bytes()
	return windData, nil
}