// fetchMoreHistory requests the "load more" endpoint at moreHref and returns
// the parsed content and load-more widget fragments from the JSON response.
func (y *Youtube) fetchMoreHistory(moreHref string) (more, content *html.Node, err error) {
	moreURL := "https://www.youtube.com" + moreHref
	moreReq, err := http.NewRequest("GET", moreURL, nil)
	if err != nil {
		return nil, nil, err
	}
	resp, err := y.s.Do(moreReq)
	if err != nil {
		return nil, nil, err
	}
	defer resp.Body.Close()
	var jsonDoc struct {
		Content string `json:"content_html"`
		More    string `json:"load_more_widget_html"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&jsonDoc); err != nil {
		return nil, nil, err
	}
	content, err = html.Parse(bytes.NewBufferString(jsonDoc.Content))
	if err != nil {
		return nil, nil, err
	}
	more, _ = html.Parse(bytes.NewBufferString(jsonDoc.More))
	return
}
func scrape(cliaoke_dir string) error {
	base_uri := "http://www.albinoblacksheep.com/audio/midi/"
	response, err := http.Get(base_uri)
	if err != nil {
		return errors.New("Couldn't fetch base content: " + err.Error())
	}
	defer response.Body.Close()
	doc, err := html.Parse(response.Body)
	if err != nil {
		return errors.New("Couldn't understand document body: " + err.Error())
	}
	for _, option := range get_options(doc) {
		slug := get_value(option)
		if slug == "" {
			continue
		}
		var embed *html.Node
		page := base_uri + slug // base_uri already ends in "/"
		// Fetch and parse each page in a closure so the body is
		// closed at the end of every iteration.
		err := func() error {
			response, err := http.Get(page)
			if err != nil {
				return errors.New("Error fetching page " + page + ": " + err.Error())
			}
			defer response.Body.Close()
			slugdoc, err := html.Parse(response.Body)
			if err != nil {
				return errors.New("Error parsing page " + page + ": " + err.Error())
			}
			embed = get_embed(slugdoc)
			return nil
		}()
		if err != nil {
			return err
		}
		if embed == nil {
			return errors.New("No embed element found in page " + page)
		}
		var file_url string
		for _, r := range embed.Attr {
			if r.Key == "src" {
				file_url = r.Val
				break
			}
		}
		if file_url == "" {
			return errors.New("No src attribute found in embed in page " + page)
		}
		if err = download_file(cliaoke_dir, file_url); err != nil {
			return err
		}
	}
	return nil
}
// getNodeWithCharset parses source once to discover its declared charset;
// if the document is not UTF-8, it converts the bytes and re-parses.
func getNodeWithCharset(source []byte) (*html.Node, error) {
	n, err := html.Parse(bytes.NewReader(source))
	if err != nil {
		return nil, err
	}
	charset := getCharset(n)
	if charset == CS_UTF8 {
		return n, nil
	}
	data := convertString(string(source), charset, CS_UTF8)
	n, err = html.Parse(bytes.NewReader([]byte(data)))
	return n, err
}
// Parse a tree from r.
func Parse(r io.Reader) (*Node, error) {
	dom, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	return Convert(dom), nil
}
func urlToNode(url string, client *http.Client, redirect int) (*html.Node, error) {
	if redirect >= MAX_REDIRECT {
		return nil, fmt.Errorf("too many canonical redirects at %s", url)
	}
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		if resp.StatusCode < 500 {
			return nil, nil
		}
		return nil, fmt.Errorf("server returned status %d (url: %s)", resp.StatusCode, url)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}
	// Follow <link rel="canonical"> indirections, bounded by MAX_REDIRECT.
	if s := getCanonicalUrl(root); s != "" && s != url {
		return urlToNode(s, client, redirect+1)
	}
	return root, nil
}
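// urlToNode assumes a getCanonicalUrl helper that pulls the href out of the
// page's <link rel="canonical"> element. A hypothetical sketch only; the
// original implementation is not shown here and may differ:
func getCanonicalUrl(root *html.Node) string {
	var walk func(n *html.Node) string
	walk = func(n *html.Node) string {
		if n.Type == html.ElementNode && n.Data == "link" {
			var rel, href string
			for _, a := range n.Attr {
				switch a.Key {
				case "rel":
					rel = a.Val
				case "href":
					href = a.Val
				}
			}
			if rel == "canonical" {
				return href
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if s := walk(c); s != "" {
				return s
			}
		}
		return ""
	}
	return walk(root)
}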
// Extract makes an HTTP GET request to the specified URL, parses the
// response as HTML, and returns the links in the HTML document.
func Extract(url string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
	}
	var links []string
	visitNode := func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key != "href" {
					continue
				}
				// Resolve relative links against the request URL.
				link, err := resp.Request.URL.Parse(a.Val)
				if err != nil {
					continue // ignore bad URLs
				}
				links = append(links, link.String())
			}
		}
	}
	forEachNode(doc, visitNode, nil)
	return links, nil
}
func title(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	// Check Content-Type is HTML (e.g., "text/html; charset=utf-8").
	ct := resp.Header.Get("Content-Type")
	if ct != "text/html" && !strings.HasPrefix(ct, "text/html;") {
		resp.Body.Close()
		return fmt.Errorf("%s has type %s, not text/html", url, ct)
	}
	doc, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return fmt.Errorf("parsing %s as HTML: %v", url, err)
	}
	title, err := soleTitle(doc)
	if err != nil {
		return err
	}
	fmt.Println(title)
	return nil
}
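// The title function above relies on a soleTitle helper. The gopl.io book's
// version uses panic/recover to bail out of the traversal once a second
// title element is seen; a sketch along those lines (the snippet's actual
// helper may differ):
func soleTitle(doc *html.Node) (title string, err error) {
	type bailout struct{}
	defer func() {
		switch p := recover(); p {
		case nil: // no panic
		case bailout{}: // "expected" panic: more than one title
			err = fmt.Errorf("multiple title elements")
		default:
			panic(p) // unexpected panic; carry on panicking
		}
	}()
	forEachNode(doc, func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			if title != "" {
				panic(bailout{}) // multiple title elements
			}
			title = n.FirstChild.Data
		}
	}, nil)
	if title == "" {
		return "", fmt.Errorf("no title element")
	}
	return title, nil
}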
func TestWhitelist(t *testing.T) {
	originalHTML := `<html><head></head><body>0<1<p id="A" foo="abc"def">` +
		`2<b empty="">3</b><i backslash="\">&4</i></p>` +
		`5<blockquote></blockquote><br/>6</body></html>`
	originalHTMLAsBuffer := bytes.NewBufferString(originalHTML)
	rootNode, err := html.Parse(originalHTMLAsBuffer)
	if err != nil {
		t.Fatal(err)
	}
	dummyAttributeArray := []htmlrender.MinimalHtmlNode{
		{
			Data: "blockquote",
		},
	}
	w := new(bytes.Buffer)
	want := `<blockquote></blockquote>`
	if err := Whitelist(w, rootNode, dummyAttributeArray); err != nil {
		t.Fatal(err)
	}
	if got := w.String(); got != want {
		t.Errorf("got vs want:\n%s\n%s\n", got, want)
	}
}
func outline(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}
	var depth int
	forEachNode(doc, func(n *html.Node) {
		if n.Type == html.ElementNode {
			fmt.Printf("%*s<%s>\n", depth*2, "", n.Data)
			depth++
		}
	}, func(n *html.Node) {
		if n.Type == html.ElementNode {
			depth--
			fmt.Printf("%*s</%s>\n", depth*2, "", n.Data)
		}
	})
	return nil
}
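// Several snippets here (Extract, title, outline, and tests further down)
// assume a forEachNode helper. A minimal sketch in the gopl.io style:
// pre is applied to a node before its children are visited, post after,
// and both functions are optional.
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		forEachNode(c, pre, post)
	}
	if post != nil {
		post(n)
	}
}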
func Parse(r io.Reader) (*Node, error) {
	n, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	return NewNode(n), nil
}
func getRootNode(urlToOpen string) *xmlpath.Node {
	req, err := http.NewRequest("GET", urlToOpen, nil)
	if err != nil {
		log("request error: " + err.Error())
		return nil
	}
	response, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		log("round-trip error: " + err.Error())
		return nil
	}
	defer response.Body.Close()
	content, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log("read error: " + err.Error())
		return nil
	}
	// Run the document through html.Parse/html.Render first so that
	// malformed markup is normalized before xmlpath sees it.
	root, err := html.Parse(bytes.NewReader(content))
	if err != nil {
		log("parse error: " + err.Error())
		return nil
	}
	var b bytes.Buffer
	html.Render(&b, root)
	rootNode, err := xmlpath.ParseHTML(strings.NewReader(b.String()))
	if err != nil {
		log("xmlpath error: " + err.Error())
		return nil
	}
	return rootNode
}
func TestHtmlTagCount(t *testing.T) {
	var tests = []struct {
		args string
		want map[string]int
	}{
		{`<html><head></head><body><a href="foo">Foo</a></body></html>`,
			map[string]int{"html": 1, "head": 1, "body": 1, "a": 1}},
		{`<html><head></head><body><ul><li><a href="/foo">Foo</a></li><li><a href="/bar">Bar</a></li></ul></body></html>`,
			map[string]int{"html": 1, "head": 1, "body": 1, "a": 2, "ul": 1, "li": 2}},
		{`<html><head></head><body><ul><li><a href="/foo">Foo</a></li><li><a href="/bar">Bar</a></li></ul><ul><li><a href="/hoge">Hoge</a></li><li><a href="/piyo">Piyo</a></li></ul></body></html>`,
			map[string]int{"html": 1, "head": 1, "body": 1, "a": 4, "ul": 2, "li": 4}},
	}
	for _, test := range tests {
		descr := fmt.Sprintf("htmlTagCount(%q)", test.args)
		doc, err := html.Parse(strings.NewReader(test.args))
		if err != nil {
			t.Fatal(err)
		}
		counts := map[string]int{}
		htmlTagCount(counts, doc)
		if !reflect.DeepEqual(counts, test.want) {
			t.Errorf("%s", descr)
			t.Errorf("got")
			for tagName, tagCount := range counts {
				t.Errorf("tagName = %s, tagCount = %d", tagName, tagCount)
			}
			t.Errorf("expect")
			for tagName, tagCount := range test.want {
				t.Errorf("tagName = %s, tagCount = %d", tagName, tagCount)
			}
		}
	}
}
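// For reference, a minimal htmlTagCount that satisfies the test above:
// every element node bumps the counter for its tag name. This is a
// reconstruction from the expected outputs, not the tested code itself.
func htmlTagCount(counts map[string]int, n *html.Node) {
	if n.Type == html.ElementNode {
		counts[n.Data]++
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		htmlTagCount(counts, c)
	}
}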
func GetParameters(client *http.Client, site string) error {
	u, err := url.ParseRequestURI(site)
	if err != nil {
		return err
	}
	u.Path = "/parameters/profile/all"
	respBody, err := DoRequest(client, u, "GET", nil, nil)
	if err != nil {
		return err
	}
	defer respBody.Close()
	doc, err := html.Parse(respBody)
	if err != nil {
		return err
	}
	if verbose {
		fmt.Println("HTML doc parsed ok", "type:", doc.Type, "data:", doc.Data)
	}
	return CheckHtml(doc, PARAMETERS_PAGE_TITLE)
}
func (u *URLMod) ParseMessage(msg *gochat.Message, c *gochat.Channel) string {
	url := u.Re.FindString(msg.Text)
	response, err := http.Get(url)
	if err != nil {
		return "Error, could not get URL!"
	}
	defer response.Body.Close()
	doc, err := html.Parse(response.Body)
	if err != nil {
		return "Error, could not parse page!"
	}
	// Walk the tree; once inside a <title> element, capture its text.
	var f func(*html.Node, bool)
	var re string
	f = func(n *html.Node, pt bool) {
		if pt && n.Type == html.TextNode {
			re = "Link Title: " + n.Data
			return
		}
		pt = pt || (n.Type == html.ElementNode && n.Data == "title")
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c, pt)
		}
	}
	f(doc, false)
	return re
}
func main() {
	doc, err := html.Parse(os.Stdin)
	if err != nil {
		log.Fatal(err)
	}
	printTextNodes(doc)
}
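// printTextNodes is not shown with this snippet. A plausible sketch that
// prints every text node, skipping <script> and <style> contents (the skip
// is an assumption; the original may not do it):
func printTextNodes(n *html.Node) {
	if n.Type == html.TextNode {
		fmt.Println(n.Data)
	}
	if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
		return
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		printTextNodes(c)
	}
}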
func main() { url := "https://candypot.jp/summaries/1050" response, err := http.Get(url) if err != nil { fmt.Println(err) } else { if response.StatusCode == 200 { fmt.Println(response.Header["Content-Type"]) body, _ := ioutil.ReadAll(response.Body) doc, err := html.Parse(strings.NewReader(string(body))) if err != nil { } else { var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "h2" { fmt.Printf("%q\n", n.FirstChild.Data) } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) } } } }
func TestVisit(t *testing.T) {
	tests := []struct {
		s    string
		want []string
	}{
		{
			"<a href='link1'><a href='link2'>",
			[]string{"link1", "link2"},
		},
		{
			"<div><a href='link1'><a href='link2'></div><a href='link3'>",
			[]string{"link1", "link2", "link3"},
		},
	}
	for _, test := range tests {
		n, err := html.Parse(strings.NewReader(test.s))
		if err != nil {
			t.Errorf("parse failure: %v", err)
			continue // don't visit a nil tree
		}
		got := visit([]string{}, n)
		if !reflect.DeepEqual(got, test.want) {
			t.Errorf("Expected:%v Actual:%v", test.want, got)
		}
	}
}
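// The visit function under test, as in the gopl.io chapter 5 example:
// it appends each href found in the tree rooted at n to links. A sketch
// consistent with the expected outputs above:
func visit(links []string, n *html.Node) []string {
	if n.Type == html.ElementNode && n.Data == "a" {
		for _, a := range n.Attr {
			if a.Key == "href" {
				links = append(links, a.Val)
			}
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		links = visit(links, c)
	}
	return links
}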
// title inspects the Content-Type header of the server's response
// and returns an error if the document is not HTML.
func title(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	ct := resp.Header.Get("Content-Type")
	if ct != "text/html" && !strings.HasPrefix(ct, "text/html;") {
		return fmt.Errorf("%s has type %s, not text/html", url, ct)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return fmt.Errorf("parsing %s as HTML: %v", url, err)
	}
	visitNode := func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			fmt.Println(n.FirstChild.Data)
		}
	}
	forEachNode(doc, visitNode, nil)
	return nil
}
func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage: ./ex17 http://example.com tag...")
		os.Exit(1)
	}
	resp, err := http.Get(os.Args[1])
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		fmt.Println("Get Error")
		os.Exit(1)
	}
	doc, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	nodes := ElementsByTagName(doc, os.Args[2:]...)
	for i, value := range nodes {
		fmt.Printf("%d: Data = %s \n", i, value.Data)
	}
}
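// ElementsByTagName (a gopl.io exercise) collects every element whose tag
// is one of names. A sketch consistent with the usage above; the exercise's
// actual solution may differ:
func ElementsByTagName(doc *html.Node, names ...string) []*html.Node {
	keep := make(map[string]bool, len(names))
	for _, name := range names {
		keep[name] = true
	}
	var nodes []*html.Node
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && keep[n.Data] {
			nodes = append(nodes, n)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return nodes
}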
// Links returns the absolute URLs of all references from an URL of a webpage.
func Links(u string) ([]string, error) {
	s := newSelection("a[href]", u)
	link, err := url.Parse(s.URL)
	if err != nil {
		return nil, err
	}
	r, err := http.Get(link.String())
	if err != nil {
		return nil, err
	}
	defer r.Body.Close()
	doc, err := html.Parse(r.Body)
	if err != nil {
		return nil, err
	}
	sel, err := cascadia.Compile(s.Selector)
	if err != nil {
		return nil, err
	}
	var result []string
	for _, m := range sel.MatchAll(doc) {
		abs, err := resolveURL(hrefString(m), link)
		if err != nil {
			return nil, err
		}
		result = append(result, abs)
	}
	return result, nil
}
// NewDocumentFromReader returns a Document from a generic reader.
// It returns an error as second value if the reader's data cannot be parsed
// as HTML. It does *not* check if the reader is also an io.Closer, so the
// provided reader is never closed by this call; it is the responsibility
// of the caller to close it if required.
func NewDocumentFromReader(r io.Reader) (*Document, error) {
	root, e := html.Parse(r)
	if e != nil {
		return nil, e
	}
	return newDocument(root, nil), nil
}
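// Usage sketch: because NewDocumentFromReader never closes its reader, a
// caller that opens a file must close it itself. ("page.html" and
// loadDocument are illustrative names, not part of the API above.)
func loadDocument() (*Document, error) {
	f, err := os.Open("page.html")
	if err != nil {
		return nil, err
	}
	defer f.Close() // the caller's responsibility, as documented above
	return NewDocumentFromReader(f)
}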
func main() {
	if len(os.Args) != 3 {
		fmt.Println("usage:", os.Args[0], "URL ID")
		os.Exit(1)
	}
	resp, err := http.Get(os.Args[1])
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	node := ElementByID(doc, os.Args[2])
	if node != nil {
		fmt.Printf("found an HTML element with id '%s'\n", os.Args[2])
		fmt.Printf("<%s", node.Data)
		for _, a := range node.Attr {
			fmt.Printf(" %s=\"%s\"", a.Key, a.Val)
		}
		fmt.Printf(">\n")
	} else {
		fmt.Printf("no HTML element with id '%s' was found\n", os.Args[2])
	}
}
func main() {
	if len(os.Args) != 3 {
		log.Fatalf("%s url id\n", os.Args[0])
	}
	url, id := os.Args[1], os.Args[2]
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatalf("parsing HTML: %s", err)
	}
	elem := ElementByID(doc, id)
	if elem == nil {
		log.Fatalf("id = %s not found in %s\n", id, url)
	}
	fmt.Println(elem)
}
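// Both main functions above depend on an ElementByID helper (a gopl.io
// exercise). A sketch that returns the first element whose id attribute
// matches, or nil if there is none:
func ElementByID(doc *html.Node, id string) *html.Node {
	if doc.Type == html.ElementNode {
		for _, a := range doc.Attr {
			if a.Key == "id" && a.Val == id {
				return doc
			}
		}
	}
	for c := doc.FirstChild; c != nil; c = c.NextSibling {
		if n := ElementByID(c, id); n != nil {
			return n
		}
	}
	return nil
}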
// Scrape finds and serializes the data from Lakehead's
// site. Eventually, it should return an array of
// `Match` responses.
func (c *Course) scrape(res *HTTPResponse) error {
	root, err := html.Parse(res.response.Body)
	if err != nil {
		fmt.Println("Error :", err)
		return err
	}
	data := scrape.FindAll(root, CourseMatcher)
	// Previous course flag for looping.
	prevClass := ""
	courses := []Course{}
	for _, match := range data {
		currentClass := scrape.Attr(match, "class")
		if isNewCourse(prevClass, currentClass) {
			courses = append(courses, parse(match))
		}
		switch currentClass {
		case "timetable-course-two", "timetable-course-one":
			prevClass = currentClass
		}
	}
	// courses is collected but not yet returned; see the comment above.
	return nil
}
func sbclBin(path string) string {
	u := uname_m() + "-" + uname()
	condPrintf(1, "open %s\n", path)
	in, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer in.Close()
	r := bufio.NewReaderSize(in, 4096)
	doc, err := html.Parse(r)
	if err != nil {
		panic(err)
	}
	var f func(n *html.Node) string
	f = func(n *html.Node) string {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				k, v := attr.Key, attr.Val
				// Guard the slice so short hrefs can't panic.
				if k == "href" && len(v) >= 3 &&
					(v[len(v)-3:] == "bz2" || v[len(v)-3:] == "msi") &&
					strings.Contains(v, u) {
					return strings.Split(v, "-")[1]
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if result := f(c); result != "" {
				return result
			}
		}
		return ""
	}
	return f(doc)
}
func TestOutline(t *testing.T) {
	url := "http://gopl.io"
	buf := new(bytes.Buffer)
	out = buf
	outline(url)
	// String() does not drain the buffer, so the same bytes can be
	// re-parsed below.
	outlineHTML := buf.String()
	doc, err := html.Parse(buf)
	if err != nil {
		t.Errorf("%v\n", err)
	}
	buf = new(bytes.Buffer)
	out = buf
	forEachNode(doc, startElement, endElement)
	parsedHTML := buf.String()
	if outlineHTML != parsedHTML {
		fmt.Println("!!! OUTLINE HTML !!!")
		fmt.Println(outlineHTML)
		fmt.Println("!!! PARSED HTML !!!")
		fmt.Println(parsedHTML)
		t.Errorf("outlineHTML != parsedHTML\n")
	}
}
func TestCountWordsAndImages(t *testing.T) {
	var tests = []struct {
		args string
		want map[string]int
	}{
		{`<html><head><style src="/path/style"></style></head><body><a href="foo">Foo Bar Hoge</a></body></html>`,
			map[string]int{"words": 3, "images": 0}},
		{`<html><head><style src="/path/style"></style></head><body><a href="foo">Foo Bar</a></body></html>`,
			map[string]int{"words": 2, "images": 0}},
		{`<html><head><script src="/path/script"></script></head><body><a href="foo">Foo</a></body></html>`,
			map[string]int{"words": 1, "images": 0}},
		{`<html><head><img src="/path/img"></img></head><body><a href="foo">Foo</a></body></html>`,
			map[string]int{"words": 1, "images": 1}},
	}
	for _, test := range tests {
		descr := fmt.Sprintf("(%q)", test.args)
		doc, err := html.Parse(strings.NewReader(test.args))
		if err != nil {
			t.Fatal(err)
		}
		words, images := countWordsAndImages(doc)
		if !reflect.DeepEqual(map[string]int{"words": words, "images": images}, test.want) {
			t.Errorf("%s", descr)
			t.Errorf("got-------------------")
			t.Errorf("words = %d\n", words)
			t.Errorf("images = %d\n", images)
			t.Errorf("expect---------------")
			for key, value := range test.want {
				t.Errorf("%s = %d", key, value)
			}
		}
	}
}
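// A countWordsAndImages consistent with the expectations above: words are
// whitespace-separated tokens in text nodes, images are <img> elements.
// This is a reconstruction from the test cases, not the tested code itself.
func countWordsAndImages(n *html.Node) (words, images int) {
	if n.Type == html.TextNode {
		words += len(strings.Fields(n.Data))
	} else if n.Type == html.ElementNode && n.Data == "img" {
		images++
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		w, i := countWordsAndImages(c)
		words += w
		images += i
	}
	return
}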
func TorrentList(url string) ([]Torrent, error) {
	// Request and parse the front page.
	resp, err := http.Get(url)
	if err != nil {
		return make([]Torrent, 0), err
	}
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		return make([]Torrent, 0), err
	}
	var torrents []Torrent
	if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok {
		// Define a matcher for result rows; must check for nil values.
		matcher := func(n *html.Node) bool {
			return n.DataAtom == atom.Tr && n.Parent != nil &&
				n.Parent.DataAtom == atom.Tbody
		}
		// Grab all rows and parse them into records.
		for _, tr := range scrape.FindAll(content, matcher) {
			torrents = append(torrents, ParseRecord(tr))
		}
	}
	return torrents, nil
}
func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	// define a matcher
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		}
		return false
	}
	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
	}
}
func parseHTML(path, dest string, dashing Dashing) ([]*reference, error) {
	refs := []*reference{}
	r, err := os.Open(path)
	if err != nil {
		return refs, err
	}
	defer r.Close()
	top, err := html.Parse(r)
	if err != nil {
		return refs, err
	}
	for pattern, etype := range dashing.Selectors {
		m := css.MustCompile(pattern)
		found := m.MatchAll(top)
		for _, n := range found {
			name := text(n)
			// Skip things explicitly ignored.
			if ignored(name) {
				fmt.Printf("Skipping entry for %s (Ignored by dashing JSON)\n", name)
				continue
			}
			// References we want to track.
			refs = append(refs, &reference{name, etype, path + "#" + anchor(n)})
			// We need to modify the DOM with a special link to support TOC.
			n.Parent.InsertBefore(newA(name, etype), n)
		}
	}
	return refs, writeHTML(path, dest, top)
}