Exemplo n.º 1
0
// fetchMoreHistory fetches the page at "https://www.youtube.com" + moreHref
// through y.s and decodes the JSON response, which carries two HTML
// fragments: "content_html" and "load_more_widget_html".
//
// Both fragments are parsed; the first is returned as content and the second
// as more. On any failure both nodes are nil and err describes the problem.
func (y *Youtube) fetchMoreHistory(moreHref string) (more, content *html.Node, err error) {
	req, err := http.NewRequest("GET", "https://www.youtube.com"+moreHref, nil)
	if err != nil {
		return nil, nil, err
	}
	resp, err := y.s.Do(req)
	if err != nil {
		return nil, nil, err
	}
	defer resp.Body.Close()

	var payload struct {
		Content string `json:"content_html"`
		More    string `json:"load_more_widget_html"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, nil, err
	}

	content, err = html.Parse(bytes.NewBufferString(payload.Content))
	if err != nil {
		return nil, nil, err
	}
	// Report a failure to parse the "load more" fragment instead of silently
	// handing the caller a nil node.
	more, err = html.Parse(bytes.NewBufferString(payload.More))
	if err != nil {
		return nil, nil, err
	}
	return more, content, nil
}
Exemplo n.º 2
0
// scrape fetches the MIDI index page, and for every option returned by
// get_options whose get_value is non-empty it fetches the option's page,
// locates the embed element with get_embed and passes the embed's "src"
// attribute to download_file together with cliaoke_dir.
//
// The first failure stops the walk and is returned; nil means every option
// was processed.
func scrape(cliaoke_dir string) error {
	baseURI := "http://www.albinoblacksheep.com/audio/midi/"
	response, err := http.Get(baseURI)
	if err != nil {
		return errors.New("Couldn't fetch base content: " + err.Error())
	}
	defer response.Body.Close()

	doc, err := html.Parse(response.Body)
	if err != nil {
		return errors.New("Couldn't understand document body: " + err.Error())
	}

	for _, option := range get_options(doc) {
		slug := get_value(option)
		if slug == "" {
			continue
		}

		// NOTE(review): baseURI already ends in "/", so this produces a
		// double slash in the path; kept unchanged to preserve the URLs
		// that are requested.
		page := baseURI + "/" + slug

		// The closure scopes the deferred Close to a single page, so bodies
		// do not pile up until scrape returns.
		embed, err := func() (*html.Node, error) {
			response, err := http.Get(page)
			if err != nil {
				return nil, errors.New("Error fetching page " + page + ": " + err.Error())
			}
			defer response.Body.Close()
			slugdoc, err := html.Parse(response.Body)
			if err != nil {
				return nil, errors.New("Error parsing page " + page + ": " + err.Error())
			}
			return get_embed(slugdoc), nil
		}()
		if err != nil {
			return err
		}
		// Guard against a page without an embed element rather than
		// dereferencing a nil node below.
		if embed == nil {
			return errors.New("No embed found in page " + page)
		}

		fileURL, found := "", false
		for _, attr := range embed.Attr {
			if attr.Key == "src" {
				fileURL, found = attr.Val, true
				break
			}
		}
		if !found {
			return errors.New("No src attribute found in embed in page " + page)
		}

		if err := download_file(cliaoke_dir, fileURL); err != nil {
			return err
		}
	}
	return nil
}
Exemplo n.º 3
0
// getNodeWithCharset parses source as HTML and inspects the resulting tree
// with getCharset. When the reported charset is CS_UTF8 the tree is returned
// as is; otherwise source is passed through convertString (from the reported
// charset to CS_UTF8) and the converted text is parsed again.
func getNodeWithCharset(source []byte) (*html.Node, error) {
	root, err := html.Parse(bytes.NewReader(source))
	if err != nil {
		return nil, err
	}

	detected := getCharset(root)
	if detected != CS_UTF8 {
		converted := convertString(string(source), detected, CS_UTF8)
		return html.Parse(bytes.NewReader([]byte(converted)))
	}
	return root, nil
}
Exemplo n.º 4
0
// Parse reads an HTML document from r and returns the result of running
// Convert on the parsed tree. A parser error is returned unchanged together
// with a nil node.
func Parse(r io.Reader) (*Node, error) {
	parsed, parseErr := html.Parse(r)
	if parseErr == nil {
		return Convert(parsed), nil
	}
	return nil, parseErr
}
Exemplo n.º 5
0
// urlToNode fetches url with client, parses the body as HTML and returns the
// root node. If getCanonicalUrl reports a non-empty URL different from url,
// that URL is fetched instead, recursively; redirect counts how many such
// hops have been taken so far.
//
// Results:
//   - (nil, error) when redirect has reached MAX_REDIRECT, the request or the
//     parse fails, or the server answers with a status of 500 or above.
//   - (nil, nil) for any other non-200 status.
//   - (root, nil) on success.
func urlToNode(url string, client *http.Client, redirect int) (*html.Node, error) {
	if redirect >= MAX_REDIRECT {
		return nil, fmt.Errorf("Too many canonical redirection at %s", url)
	}
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}

	if resp.StatusCode != 200 {
		resp.Body.Close()
		if resp.StatusCode < 500 {
			return nil, nil
		}
		return nil, fmt.Errorf("Server returns %d code (url: %s).", resp.StatusCode, url)
	}

	root, err := html.Parse(resp.Body)
	// Close right away instead of deferring: a deferred Close would keep this
	// body open for the whole chain of recursive canonical fetches below.
	resp.Body.Close()
	if err != nil {
		return nil, err
	}

	if canonical := getCanonicalUrl(root); canonical != "" && canonical != url {
		return urlToNode(canonical, client, redirect+1)
	}
	return root, nil
}
Exemplo n.º 6
0
Arquivo: links.go Projeto: thbf/marina
// Extract makes an HTTP GET request to url, parses the response as HTML and
// returns the href values of all <a> elements, each resolved against the URL
// of the request that produced the response. Href values that fail to
// resolve are skipped.
func Extract(url string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}

	doc, parseErr := html.Parse(resp.Body)
	resp.Body.Close()
	if parseErr != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", url, parseErr)
	}

	var links []string
	collect := func(n *html.Node) {
		if n.Type != html.ElementNode || n.Data != "a" {
			return
		}
		for _, attr := range n.Attr {
			if attr.Key == "href" {
				// Resolve relative to the request URL; drop unparsable values.
				if abs, err := resp.Request.URL.Parse(attr.Val); err == nil {
					links = append(links, abs.String())
				}
			}
		}
	}
	forEachNode(doc, collect, nil)
	return links, nil
}
Exemplo n.º 7
0
// title fetches url, checks that the response is declared as text/html,
// parses it and prints the title obtained from soleTitle to standard output.
// It returns an error when the request fails, the content type is not HTML,
// the document cannot be parsed, or soleTitle reports an error.
func title(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}

	// Accept a bare "text/html" or one followed by parameters,
	// e.g. "text/html; charset=utf-8".
	contentType := resp.Header.Get("Content-Type")
	isHTML := contentType == "text/html" || strings.HasPrefix(contentType, "text/html;")
	if !isHTML {
		resp.Body.Close()
		return fmt.Errorf("%s has type %s, not text/html", url, contentType)
	}

	doc, parseErr := html.Parse(resp.Body)
	resp.Body.Close()
	if parseErr != nil {
		return fmt.Errorf("parsing %s as HTML: %v", url, parseErr)
	}

	text, err := soleTitle(doc)
	if err != nil {
		return err
	}
	fmt.Println(text)
	return nil
}
// TestWhitelist parses a small document and expects Whitelist, given a list
// containing only a "blockquote" entry, to write exactly
// `<blockquote></blockquote>`.
func TestWhitelist(t *testing.T) {
	const want = `<blockquote></blockquote>`

	source := `<html><head></head><body>0&lt;1<p id="A" foo="abc&#34;def">` +
		`2<b empty="">3</b><i backslash="\">&amp;4</i></p>` +
		`5<blockquote></blockquote><br/>6</body></html>`

	root, err := html.Parse(bytes.NewBufferString(source))
	if err != nil {
		t.Fatal(err)
	}

	allowed := []htmlrender.MinimalHtmlNode{{Data: "blockquote"}}

	var out bytes.Buffer
	if err := Whitelist(&out, root, allowed); err != nil {
		t.Fatal(err)
	}
	if got := out.String(); got != want {
		t.Errorf("got vs want:\n%s\n%s\n", got, want)
	}
}
Exemplo n.º 9
0
// outline fetches url, parses the body as HTML and prints an indented outline
// of its element tree: an opening tag when an element is entered and a
// closing tag when it is left, indented two spaces per nesting level.
func outline(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	// level is shared by both callbacks and tracks the current nesting depth.
	level := 0
	enter := func(n *html.Node) {
		if n.Type != html.ElementNode {
			return
		}
		fmt.Printf("%*s<%s>\n", level*2, "", n.Data)
		level++
	}
	leave := func(n *html.Node) {
		if n.Type != html.ElementNode {
			return
		}
		level--
		fmt.Printf("%*s</%s>\n", level*2, "", n.Data)
	}
	forEachNode(doc, enter, leave)

	return nil
}
Exemplo n.º 10
0
// Parse reads HTML from r and wraps the parsed root with NewNode.
// If parsing fails, the parser's error is returned with a nil node.
func Parse(r io.Reader) (*Node, error) {
	root, err := html.Parse(r)
	switch {
	case err != nil:
		return nil, err
	default:
		return NewNode(root), nil
	}
}
Exemplo n.º 11
0
// getRootNode downloads urlToOpen, parses the body with html.Parse,
// re-renders the tree with html.Render and feeds the rendered markup to
// xmlpath.ParseHTML.
//
// It returns the xmlpath root node, or nil if any step fails; failures are
// reported through log and never returned to the caller.
func getRootNode(urlToOpen string) *xmlpath.Node {
	req, err := http.NewRequest("GET", urlToOpen, nil)
	if err != nil {
		log("error1: " + err.Error())
		return nil
	}

	response, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		log("Error 3: " + err.Error())
		return nil
	}
	defer response.Body.Close()

	content, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log("Error 2: " + err.Error())
		return nil
	}

	root, err := html.Parse(bytes.NewReader(content))
	if err != nil {
		// Stop here: rendering a nil tree below would panic.
		log("Parse error: " + err.Error())
		return nil
	}

	var b bytes.Buffer
	if err := html.Render(&b, root); err != nil {
		log("Render error: " + err.Error())
		return nil
	}

	rootNode, err := xmlpath.ParseHTML(strings.NewReader(b.String()))
	if err != nil {
		log("Error 4: " + err.Error())
		return nil
	}
	return rootNode
}
Exemplo n.º 12
0
// TestHtmlTagCount runs htmlTagCount over a few fixed documents and compares
// the resulting tag-name counts with the expected maps. On a mismatch both
// the actual and the expected counts are reported entry by entry.
func TestHtmlTagCount(t *testing.T) {
	type testCase struct {
		args string
		want map[string]int
	}
	cases := []testCase{
		{`<html><head></head><body><a href="foo">Foo</a></body></html>`, map[string]int{"html": 1, "head": 1, "body": 1, "a": 1}},
		{`<html><head></head><body><ul><li><a href="/foo">Foo</a></li><li><a href="/bar">Bar</a></li></ul></body></html>`, map[string]int{"html": 1, "head": 1, "body": 1, "a": 2, "ul": 1, "li": 2}},
		{`<html><head></head><body><ul><li><a href="/foo">Foo</a></li><li><a href="/bar">Bar</a></li></ul><ul><li><a href="/hoge">Hoge</a></li><li><a href="/piyo">Piyo</a></li></ul></body></html>`, map[string]int{"html": 1, "head": 1, "body": 1, "a": 4, "ul": 2, "li": 4}},
	}

	for _, tc := range cases {
		doc, err := html.Parse(strings.NewReader(tc.args))
		if err != nil {
			// NOTE(review): log.Fatal terminates the whole test binary;
			// t.Fatal would be the usual choice here.
			log.Fatal(err)
		}

		got := make(map[string]int)
		htmlTagCount(got, doc)
		if reflect.DeepEqual(got, tc.want) {
			continue
		}

		descr := fmt.Sprintf("htmlTagCount(%q)", tc.args)
		t.Errorf("%s", descr)
		t.Errorf("got")
		for name, count := range got {
			t.Errorf("tagName = %s, tagCount = %d", name, count)
		}
		t.Errorf("expect")
		for name, count := range tc.want {
			t.Errorf("tagName = %s, tagCount = %d", name, count)
		}
	}
}
Exemplo n.º 13
0
// GetParameters requests the "/parameters/profile/all" page of site through
// DoRequest, parses the response body as HTML and validates it with
// CheckHtml against PARAMETERS_PAGE_TITLE. When verbose is set, the type and
// data of the parsed root node are printed.
//
// It returns the first error encountered, or nil when the page passes the
// check.
func GetParameters(client *http.Client, site string) error {
	target, err := url.ParseRequestURI(site)
	if err != nil {
		return err
	}
	target.Path = "/parameters/profile/all"

	body, err := DoRequest(client, target, "GET", nil, nil)
	if err != nil {
		return err
	}
	defer body.Close()

	doc, err := html.Parse(body)
	if err != nil {
		return err
	}
	if verbose {
		fmt.Println("HTML doc parsed ok", "type:", doc.Type, "data:", doc.Data)
	}

	return CheckHtml(doc, PARAMETERS_PAGE_TITLE)
}
Exemplo n.º 14
0
// ParseMessage extracts the first match of u.Re from msg.Text, fetches it as
// a URL and returns "Link Title: " followed by the text found inside the
// page's <title> element.
//
// It returns a fixed error string when the URL cannot be fetched or the body
// cannot be parsed, and an empty string when no title text is found. The
// channel argument c is not used.
func (u *URLMod) ParseMessage(msg *gochat.Message, c *gochat.Channel) string {
	url := u.Re.FindString(msg.Text)
	response, err := http.Get(url)
	if err != nil {
		return "Error, could not get URL!"
	}
	defer response.Body.Close()

	doc, err := html.Parse(response.Body)
	if err != nil {
		// Without this check a failed parse would hand a nil tree to the
		// walker below and panic.
		return "Error, could not parse URL!"
	}

	var reply string
	var walk func(n *html.Node, inTitle bool)
	walk = func(n *html.Node, inTitle bool) {
		if inTitle && n.Type == html.TextNode {
			reply = "Link Title: " + n.Data
			return
		}
		// Once inside <title>, every descendant is treated as title content.
		inTitle = inTitle || (n.Type == html.ElementNode && n.Data == "title")
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			walk(child, inTitle)
		}
	}
	walk(doc, false)
	return reply
}
Exemplo n.º 15
0
// main parses an HTML document from standard input and passes the resulting
// tree to printTextNodes. A parse failure is fatal.
func main() {
	root, parseErr := html.Parse(os.Stdin)
	if parseErr != nil {
		log.Fatal(parseErr)
	}
	printTextNodes(root)
}
Exemplo n.º 16
0
// main downloads a fixed page, prints its Content-Type header and then
// prints, quoted, the data of the first child of every <h2> element.
// Errors are printed to standard output; a non-200 response produces no
// output.
func main() {
	url := "https://candypot.jp/summaries/1050"
	response, err := http.Get(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	// The body must be closed on every path once the request succeeded.
	defer response.Body.Close()

	if response.StatusCode != 200 {
		return
	}
	fmt.Println(response.Header["Content-Type"])

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println(err)
		return
	}

	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		fmt.Println(err)
		return
	}

	var visit func(*html.Node)
	visit = func(n *html.Node) {
		// An empty <h2></h2> has no children; skip it instead of
		// dereferencing a nil FirstChild.
		if n.Type == html.ElementNode && n.Data == "h2" && n.FirstChild != nil {
			fmt.Printf("%q\n", n.FirstChild.Data)
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(doc)
}
Exemplo n.º 17
0
// TestVisit parses small HTML fragments and checks that visit, starting from
// an empty slice, returns the expected list of links for each fragment.
func TestVisit(t *testing.T) {
	tests := []struct {
		s    string
		want []string
	}{
		{
			"<a href='link1'><a href='link2'>",
			[]string{"link1", "link2"},
		}, {
			"<div><a href='link1'><a href='link2'></div><a href='link3'>",
			[]string{"link1", "link2", "link3"},
		},
	}

	for _, test := range tests {
		n, err := html.Parse(strings.NewReader(test.s))
		if err != nil {
			// Abort: calling visit with the nil tree of a failed parse
			// would only obscure the real failure.
			t.Fatalf("parse failure: %v", err)
		}
		got := visit([]string{}, n)
		if !reflect.DeepEqual(got, test.want) {
			t.Errorf("Expected:%v Actual:%v", test.want, got)
		}
	}
}
Exemplo n.º 18
0
// title fetches url and, provided the server's Content-Type header marks the
// response as text/html, parses the body and prints the data of the first
// child of every <title> element that has one. It returns an error when the
// request fails, the document is not HTML, or parsing fails.
func title(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	contentType := resp.Header.Get("Content-Type")
	if !(contentType == "text/html" || strings.HasPrefix(contentType, "text/html;")) {
		return fmt.Errorf("%s has type %s, not text/html", url, contentType)
	}

	doc, parseErr := html.Parse(resp.Body)
	if parseErr != nil {
		return fmt.Errorf("parsing %s as HTML: %v", url, parseErr)
	}

	printTitle := func(n *html.Node) {
		if n.Type != html.ElementNode || n.Data != "title" {
			return
		}
		if first := n.FirstChild; first != nil {
			fmt.Println(first.Data)
		}
	}
	forEachNode(doc, printTitle, nil)
	return nil
}
Exemplo n.º 19
0
// main fetches the URL given as the first argument, parses it as HTML and
// prints the Data of every node returned by ElementsByTagName for the
// remaining arguments. At least one such argument is required. Any failure
// prints a message and exits with status 1.
func main() {
	if len(os.Args) < 3 {
		// The usage text now mentions the tag arguments the length check
		// above actually requires.
		fmt.Println("Usage: ./ex17 http://example.com tag [tag ...]")
		os.Exit(1)
	}
	resp, err := http.Get(os.Args[1])
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		fmt.Println("Get Error")
		os.Exit(1)
	}
	// Close explicitly rather than with defer: os.Exit below would skip
	// deferred calls.
	doc, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	nodes := ElementsByTagName(doc, os.Args[2:]...)
	for i, value := range nodes {
		fmt.Printf("%d: Data = %s \n", i, value.Data)
	}
}
Exemplo n.º 20
0
// Links fetches the web page at u and returns the URLs referenced by its
// "a[href]" elements, each passed through resolveURL with the page URL as
// base. The first error from URL parsing, fetching, HTML parsing, selector
// compilation or URL resolution is returned with a nil slice.
func Links(u string) ([]string, error) {
	s := newSelection("a[href]", u)

	base, err := url.Parse(s.URL)
	if err != nil {
		return nil, err
	}

	resp, err := http.Get(base.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	sel, err := cascadia.Compile(s.Selector)
	if err != nil {
		return nil, err
	}

	var result []string
	for _, node := range sel.MatchAll(doc) {
		resolved, err := resolveURL(hrefString(node), base)
		if err != nil {
			return nil, err
		}
		result = append(result, resolved)
	}
	return result, nil
}
Exemplo n.º 21
0
// NewDocumentFromReader builds a Document from the HTML read from r.
// If the data cannot be parsed as HTML, the parser's error is returned as
// the second value and the Document is nil. The reader is never closed here,
// even if it also implements io.Closer; closing it, when required, is up to
// the caller.
func NewDocumentFromReader(r io.Reader) (*Document, error) {
	root, err := html.Parse(r)
	if err == nil {
		return newDocument(root, nil), nil
	}
	return nil, err
}
Exemplo n.º 22
0
// main fetches the URL given as the first argument, parses it as HTML and
// looks up the element whose id is the second argument via ElementByID.
// When found, the element's opening tag with all its attributes is printed;
// otherwise a not-found message is printed. Errors are printed and the
// program exits with status 1.
func main() {
	// Both arguments are indexed below; without this check a missing
	// argument would panic with an index-out-of-range error.
	if len(os.Args) < 3 {
		fmt.Printf("usage: %s url id\n", os.Args[0])
		os.Exit(1)
	}
	url, id := os.Args[1], os.Args[2]

	resp, err := http.Get(url)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	// Close explicitly: os.Exit on the error path would skip a deferred
	// Close.
	doc, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	node := ElementByID(doc, id)
	if node == nil {
		fmt.Printf("idが'%s'のHTML要素を発見できませんでした\n", id)
		return
	}
	fmt.Printf("idが'%s'のHTML要素を発見しました\n", id)
	fmt.Printf("<%s", node.Data)
	for _, a := range node.Attr {
		fmt.Printf(" %s=\"%s\"", a.Key, a.Val)
	}
	fmt.Printf(">\n")
}
Exemplo n.º 23
0
// main expects exactly two arguments, a URL and an element id. It fetches
// the URL, parses the body as HTML, looks the id up with ElementByID and
// prints the element that was found. Every failure, including a missing
// element, is fatal.
func main() {
	args := os.Args
	if len(args) != 3 {
		log.Fatalf("%s url id\n", args[0])
	}
	url := args[1]
	id := args[2]

	resp, getErr := http.Get(url)
	if getErr != nil {
		log.Fatal(getErr)
	}
	defer resp.Body.Close()

	doc, parseErr := html.Parse(resp.Body)
	if parseErr != nil {
		log.Fatalf("parsing HTML: %s", parseErr)
	}

	found := ElementByID(doc, id)
	if found == nil {
		log.Fatalf("id = %s not found in %s\n", id, url)
	}

	fmt.Println(found)
}
Exemplo n.º 24
0
// scrape finds and serializes the data from Lakehead's
// site. Eventually, it should return an array of
// `Match` responses; for now the collected courses are only
// accumulated locally and then dropped.
//
// res supplies the HTTP response whose body is parsed. A parse failure is
// printed and the method returns without doing anything else.
func (c *Course) scrape(res *HTTPResponse) {
	root, err := html.Parse(res.response.Body)
	if err != nil {
		fmt.Println("Error :", err)
		// The method has no result, so a bare return is required here;
		// returning err did not compile.
		return
	}

	data := scrape.FindAll(root, CourseMatcher)

	// Class of the previously seen course row, used to detect where a new
	// course starts.
	prevClass := ""

	courses := []Course{}

	for _, match := range data {
		currentClass := scrape.Attr(match, "class")

		if isNewCourse(prevClass, currentClass) {
			courses = append(courses, parse(match))
		}

		// Only the two timetable row classes update the marker; any other
		// class leaves it untouched.
		switch currentClass {
		case "timetable-course-two", "timetable-course-one":
			prevClass = currentClass
		}
	}
}
Exemplo n.º 25
0
Arquivo: html.go Projeto: snmsts/ros
// sbclBin opens the HTML file at path and searches it, depth first, for the
// first <a> element whose href ends in "bz2" or "msi" and contains the
// string uname_m() + "-" + uname(). It returns the second "-"-separated
// field of that href, or "" when no link matches.
//
// It panics if the file cannot be opened or parsed.
func sbclBin(path string) string {
	u := uname_m() + "-" + uname()
	condPrintf(1, "open %s\n", path)
	in, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer in.Close()

	doc, err := html.Parse(bufio.NewReaderSize(in, 4096))
	if err != nil {
		panic(err)
	}

	var find func(n *html.Node) string
	find = func(n *html.Node) string {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key != "href" {
					continue
				}
				v := attr.Val
				// HasSuffix is safe for hrefs shorter than three bytes,
				// where slicing v[len(v)-3:] would panic.
				if !strings.HasSuffix(v, "bz2") && !strings.HasSuffix(v, "msi") {
					continue
				}
				if !strings.Contains(v, u) {
					continue
				}
				// v contains u, and u contains "-", so the split always
				// yields at least two fields.
				return strings.Split(v, "-")[1]
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if result := find(c); result != "" {
				return result
			}
		}
		return ""
	}
	return find(doc)
}
Exemplo n.º 26
0
// TestOutline captures the output of outline for a fixed URL, parses that
// output as HTML, prints the parsed tree again through forEachNode with
// startElement/endElement, and expects both renderings to be identical.
// Output is captured by pointing the package-level out at a buffer.
func TestOutline(t *testing.T) {
	url := "http://gopl.io"

	buf := new(bytes.Buffer)
	out = buf
	outline(url)
	// Take the string before parsing: html.Parse drains the buffer.
	outlineHTML := buf.String()

	doc, err := html.Parse(buf)
	if err != nil {
		// Abort: walking the nil tree of a failed parse would only obscure
		// the real failure.
		t.Fatalf("%v\n", err)
	}

	buf = new(bytes.Buffer)
	out = buf
	forEachNode(doc, startElement, endElement)
	parsedHTML := buf.String()

	if outlineHTML != parsedHTML {
		fmt.Println("!!! OUTLINE HTML !!!")
		fmt.Println(outlineHTML)
		fmt.Println("!!! PARSED HTML !!!")
		fmt.Println(parsedHTML)
		t.Errorf("outlineHTML != parsedHTML\n")
	}
}
Exemplo n.º 27
0
// TestCountWordsAndImages feeds fixed documents to countWordsAndImages and
// compares the returned word and image counts with the expected values. On
// a mismatch the actual and expected numbers are reported.
func TestCountWordsAndImages(t *testing.T) {
	type testCase struct {
		args string
		want map[string]int
	}
	cases := []testCase{
		{`<html><head><style src="/path/style"></style></head><body><a href="foo">Foo Bar Hoge</a></body></html>`, map[string]int{"words": 3, "images": 0}},
		{`<html><head><style src="/path/style"></style></head><body><a href="foo">Foo Bar</a></body></html>`, map[string]int{"words": 2, "images": 0}},
		{`<html><head><script src="/path/script"></script></head><body><a href="foo">Foo</a></body></html>`, map[string]int{"words": 1, "images": 0}},
		{`<html><head><img src="/path/img"></img></head><body><a href="foo">Foo</a></body></html>`, map[string]int{"words": 1, "images": 1}},
	}

	for _, tc := range cases {
		doc, err := html.Parse(strings.NewReader(tc.args))
		if err != nil {
			// NOTE(review): log.Fatal terminates the whole test binary;
			// t.Fatal would be the usual choice here.
			log.Fatal(err)
		}

		words, images := countWordsAndImages(doc)
		got := map[string]int{"words": words, "images": images}
		if reflect.DeepEqual(got, tc.want) {
			continue
		}

		descr := fmt.Sprintf("(%q)", tc.args)
		t.Errorf("%s", descr)
		t.Errorf("got-------------------")
		t.Errorf("words = %d\n", words)
		t.Errorf("images = %d\n", images)
		t.Errorf("expect---------------")
		for key, value := range tc.want {
			t.Errorf("%s = %d", key, value)
		}
	}
}
Exemplo n.º 28
0
Arquivo: main.go Projeto: anykao/p
// TorrentList fetches url, parses it as HTML and converts every <tr> that is
// a direct child of a <tbody> inside the element with id "searchResult"
// into a Torrent via ParseRecord.
//
// On a request or parse failure it returns an empty, non-nil slice and the
// error. When the "searchResult" element is absent the result is a nil
// slice and a nil error.
func TorrentList(url string) ([]Torrent, error) {
	// request and parse the front page
	resp, err := http.Get(url)
	if err != nil {
		return make([]Torrent, 0), err
	}
	// Deferred so the body is also released when parsing fails.
	defer resp.Body.Close()

	root, err := html.Parse(resp.Body)
	if err != nil {
		return make([]Torrent, 0), err
	}

	var torrents []Torrent
	content, ok := scrape.Find(root, scrape.ById("searchResult"))
	if !ok {
		return torrents, nil
	}

	// Match table rows sitting directly under a <tbody>; a node without a
	// parent can never match.
	matcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody
	}
	for _, tr := range scrape.FindAll(content, matcher) {
		torrents = append(torrents, ParseRecord(tr))
	}
	return torrents, nil
}
Exemplo n.º 29
0
// main fetches the Hacker News front page, parses it and prints the index,
// text and href of every <a> element whose grandparent has the class
// "athing". Request and parse failures panic.
func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
		panic(err)
	}
	// Release the connection on every path; deferred calls still run when
	// the panic below unwinds.
	defer resp.Body.Close()

	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	// define a matcher
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		}
		return false
	}
	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
	}
}
Exemplo n.º 30
0
// parseHTML parses the HTML file at path, collects a reference for every
// node matched by the selectors in dashing.Selectors (skipping names for
// which ignored reports true), inserts a link built by newA before each
// referenced node, and finally writes the modified tree with writeHTML.
//
// It returns the collected references together with the error from
// writeHTML. If the file cannot be opened or parsed, the empty reference
// list and that error are returned and nothing is written.
func parseHTML(path, dest string, dashing Dashing) ([]*reference, error) {
	refs := []*reference{}

	r, err := os.Open(path)
	if err != nil {
		return refs, err
	}
	defer r.Close()

	top, err := html.Parse(r)
	if err != nil {
		// Stop here: matching selectors against the nil tree of a failed
		// parse would fail further down with a less useful error.
		return refs, err
	}

	for pattern, etype := range dashing.Selectors {
		m := css.MustCompile(pattern)
		found := m.MatchAll(top)
		for _, n := range found {
			name := text(n)

			// Skip things explicitly ignored.
			if ignored(name) {
				fmt.Printf("Skipping entry for %s (Ignored by dashing JSON)\n", name)
				continue
			}
			// References we want to track.
			refs = append(refs, &reference{name, etype, path + "#" + anchor(n)})
			// We need to modify the DOM with a special link to support TOC.
			n.Parent.InsertBefore(newA(name, etype), n)
		}
	}
	return refs, writeHTML(path, dest, top)
}