func FetchFullDescription(link string) string { res, err := http.Get(link) if err != nil { log.Fatal(err) } body, err := ioutil.ReadAll(res.Body) res.Body.Close() if err != nil { log.Fatal(err) } doc, err := html.Parse(strings.NewReader(string(body))) content := "" var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "section" { for _, a := range n.Attr { if a.Key == "class" && a.Val == "entry-content cf" { var buf bytes.Buffer html.Render(&buf, n) content = buf.String() break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return content }
func GetEntries(root string, useSummary bool) (entries []*Entry, err error) { filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if strings.ToLower(filepath.Ext(path)) != ".txt" { return nil } entry, _ := GetEntry(path) if entry == nil { return nil } entries = append(entries, entry) if useSummary { doc, err := html.Parse(strings.NewReader(entry.Body)) if err == nil { if text, err := toText(doc); err == nil { if len(text) > 500 { text = text[0:500] + "..." } entry.Body = text } } } entry.Id = entry.Filename[len(root):len(entry.Filename)-3] + "html" return nil }) return }
func GetFeedUrl(u string) (string, error) { resp, err := http.Get(u) if err != nil { return "", err } if strings.Contains(resp.Header.Get("Content-Type"), "xml") { return u, nil } tree, err := html.Parse(resp.Body) if err != nil { return "", err } sel := cascadia.MustCompile("link[rel=alternate][type*=xml]") alt := sel.MatchFirst(tree) if alt == nil { return "", errors.New("no feed link found") } altUrl, found := FindAttr("href", alt.Attr) if !found { return "", errors.New("missing link in alternate") } return ToAbsolute(resp.Request.URL, altUrl.Val), nil }
// FixHtml parses bytes as HTML and returns well-formed HTML if the parse
// was successful, or escaped HTML, if not.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
	// Escape-and-return is the uniform fallback for every failure mode
	// in this function.
	n, err := html.Parse(bytes.NewReader(wild))
	if err != nil {
		return []byte(html.EscapeString(string(wild)))
	}
	// Rewrite image references relative to linkUrl (helper defined
	// elsewhere in this package).
	fixImgs(linkUrl, n)
	// html.Render below writes into a bytes.Buffer, which panics with
	// bytes.ErrTooLarge if it cannot grow; convert exactly that panic
	// into the escaped fallback (via the named return) and re-raise
	// anything else. A nil recover() falls through both branches.
	// NOTE: registered after the parse above, so parse failures are
	// handled by the plain return, not by this recover.
	defer func() {
		if err := recover(); err == bytes.ErrTooLarge {
			well = []byte(html.EscapeString(string(wild)))
		} else if err != nil {
			panic(err)
		}
	}()
	// Pre-size to twice the input as a heuristic for render overhead.
	buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
	if err := html.Render(buf, n); err != nil {
		return []byte(html.EscapeString(string(wild)))
	}
	well = buf.Bytes()
	// html.Parse wraps fragments in <html><head><body>; strip everything
	// outside the rendered <body>...</body> envelope.
	openBody := []byte("<body>")
	i := bytes.Index(well, openBody)
	if i < 0 {
		return []byte(html.EscapeString(string(wild)))
	}
	well = well[i+len(openBody):]
	closeBody := []byte("</body>")
	i = bytes.Index(well, closeBody)
	if i < 0 {
		return []byte(html.EscapeString(string(wild)))
	}
	return well[:i]
}
func TestSelectors(t *testing.T) { for _, test := range selectorTests { s, err := Compile(test.selector) if err != nil { t.Errorf("error compiling %q: %s", test.selector, err) continue } doc, err := html.Parse(strings.NewReader(test.HTML)) if err != nil { t.Errorf("error parsing %q: %s", test.HTML, err) continue } matches := s.MatchAll(doc) if len(matches) != len(test.results) { t.Errorf("wanted %d elements, got %d instead", len(test.results), len(matches)) continue } for i, m := range matches { got := nodeString(m) if got != test.results[i] { t.Errorf("wanted %s, got %s instead", test.results[i], got) } } } }
func ExampleParse() { s := `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>` doc, err := html.Parse(strings.NewReader(s)) if err != nil { log.Fatal(err) } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, a := range n.Attr { if a.Key == "href" { fmt.Println(a.Val) break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) // Output: // foo // /bar/baz }
func parseStub(stub string) (r redditStub, err error) { var extract func(*html.Node) var doc *html.Node doc, err = html.Parse(strings.NewReader(stub)) if err != nil { return } extract = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { switch { case n.FirstChild.Data == "[link]": r.Link = n.Attr[0].Val case strings.HasSuffix(n.FirstChild.Data, " comments]"): r.Comments = n.Attr[0].Val case strings.HasPrefix(n.Attr[0].Val, "http://www.reddit.com/user/"): r.User = strings.TrimSpace(n.FirstChild.Data) } } for c := n.FirstChild; c != nil; c = c.NextSibling { extract(c) } } extract(doc) return }
func TestFind(t *testing.T) { s := `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a class="goo" href="/bar/baz">BarBaz</a></ul>` doc, _ := html.Parse(strings.NewReader(s)) _, found := Find(doc, "#foo") if found { t.Errorf("There is no node with id 'foo'") } p, found := Find(doc, "p") if !found || p.Data != "p" { t.Errorf("Couldn't find p") } a, found := Find(doc, "ul a") if !found || a.Data != "a" || Flatten(a) != "Foo" { t.Errorf("Couldn't find a") } goo, found := Find(doc, "ul .goo") if !found || goo.Data != "a" || Flatten(goo) != "BarBaz" { t.Errorf("Couldn't find a with class goo") } }
func TestFlatten(t *testing.T) { s := `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>` doc, _ := html.Parse(strings.NewReader(s)) if Flatten(doc) != "Links:FooBarBaz" { t.Fatalf("%s was wrong", Flatten(doc)) } }
func GenerateDocument(rawData []byte) *goquery.Document { utf8String := toUtf8(rawData) utf8byteArray := []byte(utf8String) node, err := html.Parse(bytes.NewReader(utf8byteArray)) helper.HandleFatalError("document generation failed:", err) return goquery.NewDocumentFromNode(node) }
// Make a GET request to the given URL and start parsing // its HTML. func ExtractData(entity *Entity, url string) { // Parsing completion channel. done := make(chan bool, 1) res, err := http.Get(url) if err != nil { log.Panicln("Error requesting URL data: ", err) } defer res.Body.Close() doc, err := html.Parse(res.Body) if err != nil { log.Println("Error parsing URL body: ", err) } go ParseHTML(doc, entity, done) for { select { case <-done: go finalizeEntity(entity, doc, EntityDir) default: } } }
func lookupTitle(url string) (title string) { r, err := http.Get(url) if err != nil { return "<Couldn't connect.>" } defer r.Body.Close() /*b, err := ioutil.ReadAll(r.Body) CheckError(err) if len(b) > 30 { b = b[:30] } return string(b)*/ title = "<Untitled page.>" doc, err := html.Parse(r.Body) if err != nil { return "<Failed to parse HTML.>" } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.DataAtom == atom.Title { title = extract(n) return } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return }
func GetStatus() (Status, error) { resp, err := http.Get(STATUS_URL) if err != nil { log.Println(err) return Status{}, errors.New("Could not access OIT status page") } defer resp.Body.Close() doc, err := html.Parse(resp.Body) statusNode, err := FindStatusBlock(doc) if err != nil { log.Println(err) return Status{}, err } status, err := ExtractStatus(statusNode) if err != nil { log.Println(err) return Status{}, err } reason, err := ExtractReason(statusNode) if err != nil { log.Println(err) return Status{}, err } return Status{status, reason}, nil }
// embedRedditSelf builds embed info for a reddit self-post URL: it
// fetches the page, replaces image links inside the post body with
// rendered image embeds, and captures the resulting body HTML.
// Returns strategyWhiffError when url does not look like a reddit post.
func (e *Embedder) embedRedditSelf(url string) (rv EmbedInfo, err error) {
	matched, err := regexp.MatchString("reddit.com/r/", url)
	if err != nil {
		return
	}
	if !matched {
		// Not this strategy's URL shape; signal the caller to try another.
		err = strategyWhiffError
		return
	}
	rv.URL = url
	doc, err := goquery.NewDocument(url)
	if err != nil {
		return
	}
	doc.Find(".expando .usertext-body").Each(func(i int, s *goquery.Selection) {
		s.Find("a").Each(func(i int, s *goquery.Selection) {
			if href, ok := s.Attr("href"); ok {
				// NOTE(review): err here shadows the outer named return,
				// so embed/parse failures on individual links are
				// silently skipped — presumably intentional best-effort;
				// confirm before changing.
				embedInfo, err := e.embedImage(href)
				if err != nil {
					return
				}
				node, err := html.Parse(strings.NewReader(embedInfo.Html))
				if err != nil {
					return
				}
				// Swap the anchor for the parsed embed markup in place.
				parent := s.Parent().Get(0)
				parent.RemoveChild(s.Get(0))
				parent.AppendChild(node)
			}
		})
		// This assigns the outer named returns; with multiple matched
		// bodies, the last one wins.
		rv.Html, err = s.Html()
		return
	})
	return
}
func GetParameters(client *http.Client, site string) error { url, err := url.ParseRequestURI(site) if err != nil { return err } url.Path = "/parameters/profile/all" respBody, err := DoRequest(client, url, "GET", nil, nil) if err != nil { return err } defer respBody.Close() doc, err := html.Parse(respBody) if err != nil { return err } if verbose { fmt.Println("HTML doc parsed ok", "type:", doc.Type, "data:", doc.Data) } err = CheckHtml(doc, PARAMETERS_PAGE_TITLE) if err != nil { return err } return nil }
func getQuote(symbol string) (*fquery.Quote, error) { resp, err := http.Get("http://www.bloomberg.com/quote/" + symbol) if err != nil { return nil, err } defer resp.Body.Close() doc, err := html.Parse(resp.Body) if err != nil { return nil, err } /* TODO: detect if fund or plain stock, different layouts... */ quote := &bloomQuote{} walk(doc, quote) return &fquery.Quote{ Name: quote.Name, Symbol: symbol, Updated: time.Now(), Volume: quote.Volume, Open: quote.Open, PreviousClose: quote.PrevClose, DayLow: quote.DayLow, DayHigh: quote.DayHigh, YearLow: quote.YearLow, YearHigh: quote.YearHigh, LastTradePrice: quote.LastTradePrice, DividendYield: quote.DividendYield, EarningsPerShare: quote.EarningsPerShare, DividendExDate: quote.DividendExDate, }, nil }
//Google 画像検索(未使用) // http://godoc.org/code.google.com/p/go.net/html // にのっているサンプルにParse部分を追加 func ParseGoogleImageSearch(w http.ResponseWriter, r io.Reader) { doc, err := html.Parse(r) if err != nil { log.Fatal(err) } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, a := range n.Attr { if a.Key == "href" { str := a.Val if strings.Contains(str, "imgurl") { strs := strings.Split(str, "&") imageurl := strings.Split(strs[0], "=") img := imageurl[1] fmt.Fprintf(w, "<html><body><ul><li><a href=%v><img src=%v></a></li></ul></body></html>", img, img) } break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) }
func main() { fd, err := os.Open(os.Args[1]) if err != nil { log.Fatal(err) } err = json.NewDecoder(fd).Decode(&trans) if err != nil { log.Fatal(err) } fd.Close() fd, err = os.Open(os.Args[2]) if err != nil { log.Fatal(err) } doc, err := html.Parse(fd) if err != nil { log.Fatal(err) } fd.Close() generalNode(doc) bs, err := json.MarshalIndent(trans, "", " ") if err != nil { log.Fatal(err) } os.Stdout.Write(bs) os.Stdout.WriteString("\n") }
func ParseAndPrint() map[string]string { //TODO : Take this url as parameter res, err := http.Get("http://sfbay.craigslist.org/search/apa/pen?query=&zoomToPosting=&srchType=A&minAsk=&maxAsk=2500&bedrooms=2&housing_type=&nh=77&nh=79&nh=81&nh=83&nh=84&nh=87") if err != nil { log.Fatal(err) } body, err := ioutil.ReadAll(res.Body) res.Body.Close() doc, err := html.Parse(strings.NewReader(string(body))) if err != nil { log.Fatal(err) } returnUrl := make(map[string]string) var checkForListings func(*html.Node) checkForListings = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, a := range n.Attr { if a.Key == "href" && strings.HasPrefix(a.Val, "/pen/apa") { if n.FirstChild != nil { returnUrl["http://sfbay.craigslist.org"+a.Val] = n.FirstChild.Data } } } } for c := n.FirstChild; c != nil; c = c.NextSibling { checkForListings(c) } } checkForListings(doc) return returnUrl }
func crawl() { for seed := range urlQue { defer func() { if r := recover(); r != nil { log.Println("Recovered in crawl", r, len(urlQue), len(result), seed) } }() // log.Println(seed,seed.Scheme,seed.Host,seed.Path) resp, err := http.Get(seed.String()) defer resp.Body.Close() if err != nil { log.Printf("some error occured %s\n", err) } if resp.StatusCode == 200 { // body, _ := ioutil.ReadAll(resp.Body) // log.Printf("Respones %s\n", body); z, err := html.Parse(resp.Body) if err != nil { log.Fatal(err) } if len(result) < maxResult-2 { result <- &CrawledResult{seed, z} } else { log.Println("result queue almost at max") } } else { log.Printf("Respones %s\n", resp) } } }
func favicon_try_from_url(uri string) string { c := curl.NewCurl("") cache, err := c.Get(uri) fmt.Println(cache) if err != nil { return "" } // text/html, text/xml, image m := strings.Split(cache.Mime, "/") switch m[0] { case "text": f, err := os.Open(cache.Local) if err == nil { defer f.Close() n, err := html.Parse(f) if err == nil { if u, ok := icon_from_link_rel(n); ok { return u } } } case "image": return uri } return "" }
func main() { flag.Parse() if len(os.Args) < 2 { fmt.Printf("Usage %s [-u] NAME\n", os.Args[0]) os.Exit(0) } var in io.Reader if *url { resp, err := http.Get(os.Args[2]) if err != nil { panic(err) } defer resp.Body.Close() in = resp.Body } else { fi, err := os.Open(os.Args[1]) if err != nil { panic(err) } defer fi.Close() in = bufio.NewReader(fi) } n, err := html.Parse(in) if err != nil { panic(err) } fmt.Println(stringValue(htmlwalk(n, untagText, untagElement))) fmt.Println(extractPromoted(n)) }
func getImageUrls(url string) []string { resp, err := http.Get(url) if err != nil { return nil } defer resp.Body.Close() rootNode, err := html.Parse(resp.Body) if err != nil { return nil } rootNode = rootNode.FirstChild.NextSibling for childNode := rootNode.FirstChild; childNode != nil; childNode = childNode.NextSibling { if strings.ToLower(childNode.Data) == "body" { scriptNode := getNodeByTag(childNode, "script") scriptSrc := scriptNode.FirstChild.Data startIndex := strings.Index(scriptSrc, "Array('http") startIndex += len("Array('") endIndex := strings.Index(scriptSrc, "jpg');") endIndex += len("jpg") urlsString := Substr(scriptSrc, startIndex, endIndex-startIndex) return strings.Split(urlsString, "','") } } return nil }
func parse_links(s string) ([]string, string) { var links []string var title string doc, err := html.Parse(strings.NewReader(s)) if err != nil { log.Fatal(err) } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "title" { title = n.FirstChild.Data } if n.Type == html.ElementNode && n.Data == "a" { for _, a := range n.Attr { if a.Key == "href" { links = append(links, a.Val) break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return links, title }
func ParseEntry(r io.Reader) (*AmebloEntry, error) { root, err := html.Parse(r) if err != nil { return nil, err } s, _ := selector.Selector(".articleText") nodes := s.Find(root) if len(nodes) == 0 { return nil, nil } content := h5.RenderNodesToString(nodes) s, _ = selector.Selector("title") nodes = s.Find(root) if len(nodes) == 0 { return nil, nil } title := extractText(nodes[0].FirstChild) entry := &AmebloEntry{ Title: strings.Split(title, "|")[0], Content: content, } return entry, nil }
// NewDocumentFromReader() returns a Document from a generic reader. // It returns an error as second value if the reader's data cannot be parsed // as html. It does *not* check if the reader is also an io.Closer, so the // provided reader is never closed by this call, it is the responsibility // of the caller to close it if required. func NewDocumentFromReader(r io.Reader) (d *Document, e error) { root, e := html.Parse(r) if e != nil { return nil, e } return newDocument(root, nil), nil }
func ParseItem(r io.Reader) []Result { results := []Result{} doc, err := html.Parse(r) if err != nil { fmt.Println(err) } var result Result var f func(*html.Node) f = func(n *html.Node) { // n.Typeでノードの型をチェックできる、ElementNodeでHTMLタグのNode。 // n.Dataでノートの値をチェックする、aタグをチェックしている if n.Type == html.ElementNode && n.Data == "a" { // n.Attrで属性を一覧する for _, a := range n.Attr { if a.Key == "href" { result.Url = a.Val results = append(results, result) } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return results }
func icon_from_link_rel(local string) (curl.Cache, error) { f, err := os.Open(local) if err != nil { return curl.Cache{}, err } defer f.Close() doc, err := html.Parse(f) if err != nil { return curl.Cache{}, err } de := node_query_select(doc, "html") head := node_query_select(de, "head") links := node_query_selects(head, "link") var href string for _, link := range links { rel := node_get_attribute(link, "rel") if rel == "icon" || rel == "shortcut icon" || rel == "apple-touch-icon" { href = node_get_attribute(link, "href") break } } if href != "" { c := curl.NewCurlerDetail(backend_config().ImageFolder, 0, 0, nil, backend_context.ruler) return c.Get(href) } return curl.Cache{}, new_backenderror(-1, "icon cannot resolved in html") }
func Redact(r io.Reader) (string, error) { doc, err := html.Parse(r) if err != nil { return "", err } var f func(*html.Node) f = func(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode { switch c.Data { case "style", "script", "head", "meta": n.RemoveChild(c) return case "img": for i, attr := range c.Attr { if attr.Key == "src" { c.Attr[i].Key = "data-redacted-src" } } } } else if c.Type == html.CommentNode { n.RemoveChild(c) return } f(c) } } f(doc) buf := bytes.NewBufferString("") err = html.Render(buf, doc) return buf.String(), err }
func main() { flag.Parse() if *uri == "" { flag.PrintDefaults() return } c := curl.NewCurl("e:/") cache, err := c.GetUtf8(*uri) if err != nil { panic(err) } f, err := os.Open(cache.LocalUtf8) if err != nil { panic(err) } defer f.Close() doc, err := html.Parse(f) if err != nil { panic(err) } ex := cleaner.NewExtractor("e:/") article, _, err := ex.MakeHtmlReadable(doc, *uri) if err != nil { panic(err) } print_html_doc(article) }