func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage:", os.Args[0], "filename iterations")
		os.Exit(1)
	}
	filename := os.Args[1]
	n, err := strconv.Atoi(os.Args[2])
	if err != nil {
		fmt.Println("iterations must be an integer:", err)
		os.Exit(1)
	}
	file, err := ioutil.ReadFile(filename)
	if err != nil {
		panic(err)
	}
	html := string(file)
	start := time.Now()
	for i := 0; i < n; i++ {
		doc, err := exp_html.Parse(strings.NewReader(html))
		if err != nil {
			panic(err)
		}
		if doc.FirstChild != nil {
			// Touch the result so the parse is not discarded.
		}
	}
	end := time.Now()
	fmt.Printf("%f s\n", end.Sub(start).Seconds())
}
func ExampleParse() {
	s := `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	doc, err := html.Parse(strings.NewReader(s))
	if err != nil {
		log.Fatal(err)
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					fmt.Println(a.Val)
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	// Output:
	// foo
	// /bar/baz
}
// post reads a URL from the request body, fetches it with the App Engine
// urlfetch client, parses the response as HTML and writes the resulting
// tag tree as JSON.
func post(c *goweb.Context) {
	var ctx = appengine.NewContext(c.Request)
	var client = urlfetch.Client(ctx)
	url, err := ioutil.ReadAll(c.Request.Body)
	if err != nil {
		handleError(c, ctx, err)
		return
	}
	resp, err := client.Get(string(url))
	if err != nil {
		handleError(c, ctx, err)
		return
	}
	defer resp.Body.Close()
	node, err := html.Parse(resp.Body)
	if err != nil {
		handleError(c, ctx, err)
		return
	}
	var enc = json.NewEncoder(c.ResponseWriter)
	if err := enc.Encode(newTag(node)); err != nil {
		handleError(c, ctx, err)
		return
	}
}
// showComments prints the comment list for the given issue.
func showComments(auth string, id string) {
	req, err := http.NewRequest("GET", "https://code.google.com/feeds/issues/p/"+project+"/issues/"+id+"/comments/full", nil)
	if err != nil {
		log.Fatal("failed to get comments:", err)
	}
	req.Header.Set("Authorization", "GoogleLogin "+auth)
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal("failed to get comments:", err)
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Fatal("failed to authenticate:", res.Status)
	}
	var feed Feed
	err = xml.NewDecoder(res.Body).Decode(&feed)
	if err != nil {
		log.Fatal("failed to get comments:", err)
	}
	for _, entry := range feed.Entry {
		doc, err := html.Parse(strings.NewReader(entry.Content))
		if err != nil {
			log.Fatal("failed to parse comment html:", err)
		}
		text, err := dump(doc)
		if err != nil {
			log.Fatal("failed to dump comment:", err)
		}
		fmt.Println(entry.Title, "\n", text)
	}
}
// Process the response for a URL.
func (this *worker) visitUrl(res *http.Response) []*url.URL {
	var doc *goquery.Document
	var harvested []*url.URL
	var doLinks bool

	// Load a goquery document and call the visitor function
	if node, e := html.Parse(res.Body); e != nil {
		this.logFunc(LogError, "ERROR parsing %s: %s\n", res.Request.URL.String(), e.Error())
	} else {
		doc = goquery.NewDocumentFromNode(node)
		doc.Url = res.Request.URL
	}

	// Visit the document (with nil goquery doc if failed to load)
	if this.visitor != nil {
		if harvested, doLinks = this.visitor(res, doc); doLinks && doc != nil {
			// Links were not processed by the visitor, so process links
			harvested = this.processLinks(doc)
		}
	} else {
		this.logFunc(LogInfo, "missing visitor function: %s\n", res.Request.URL.String())
	}
	return harvested
}
// ExtractTagi extracts the paragraphs of the single-page article from the parsed HTML.
func ExtractTagi(reader io.Reader) (io.Reader, error) {
	root, err := html.Parse(reader)
	if err != nil {
		return nil, err
	}
	var r = toNode(root)
	defer r.Dispose()
	var sp = r.descendant(Id("singlePage"))
	if sp == nil {
		return nil, errors.New("singlePage not found \n" + r.String())
	}
	var p = sp.descendants(Tag("p"))
	if len(p) == 0 {
		return nil, errors.New("no <p> elements found \n" + r.String())
	}
	var buffer = new(bytes.Buffer)
	for _, node := range p {
		html.Render(buffer, node.toNode())
		buffer.WriteByte('\n')
	}
	return buffer, nil
}
// parseFile opens the named file and parses its contents as HTML.
func parseFile(filename string) (*html.Node, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return html.Parse(f)
}
// GetDoc issues a GET request, parses it and returns a *html.Node.
func GetDoc(rawUrl string) (doc *html.Node, err error) {
	buf, err := GetRaw(rawUrl)
	if err != nil {
		return nil, err
	}
	doc, err = html.Parse(bytes.NewReader(buf))
	if err != nil {
		return nil, err
	}
	return doc, nil
}
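GetDoc above depends on a GetRaw helper that is not shown. A minimal sketch of such a helper, assuming it simply issues the GET request and returns the response body as bytes (the name and signature are inferred from the call site):

// GetRaw is a hypothetical sketch of the helper GetDoc relies on: it issues
// a GET request for rawUrl and returns the raw response body.
// Assumes net/http and io/ioutil are imported.
func GetRaw(rawUrl string) ([]byte, error) {
	res, err := http.Get(rawUrl)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	return ioutil.ReadAll(res.Body)
}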
func TestNewDocument(t *testing.T) {
	if f, e := os.Open("./testdata/page.html"); e != nil {
		t.Error(e.Error())
	} else {
		defer f.Close()
		if node, e := html.Parse(f); e != nil {
			t.Error(e.Error())
		} else {
			doc = NewDocumentFromNode(node)
		}
	}
}
////
// GetValue returns the nodes matching a tag in the HTML.
//
// tag    : the name of the tag to search for
// fields : name/value pairs that must be present as attributes on the tag
//
// return : the matching html.Node values
////
func (p *HTMLParser) GetValue(tag string, fields ...HTMLParameter) []html.Node {
	aNode, err := html.Parse(p.Data)
	if err != nil {
		log.Println("ERROR: error while parsing web page")
		return nil
	}
	// recursive search
	p.GetNodes(aNode, tag, fields)
	return p.HTMLNodes
}
func LoadDoc(page string) *Document {
	if f, e := os.Open(fmt.Sprintf("./testdata/%s", page)); e != nil {
		panic(e.Error())
	} else {
		defer f.Close()
		if node, e := html.Parse(f); e != nil {
			panic(e.Error())
		} else {
			return NewDocumentFromNode(node)
		}
	}
	return nil
}
// NewDocument() is a Document constructor that takes a string URL as argument.
// It loads the specified document, parses it, and stores the root Document
// node, ready to be manipulated.
func NewDocument(url string) (d *Document, e error) {
	// Load the URL
	res, e := http.Get(url)
	if e != nil {
		return
	}
	defer res.Body.Close()

	// Parse the HTML into nodes
	root, e := html.Parse(res.Body)
	if e != nil {
		return
	}

	// Create and fill the document
	d = newDocument(root, res.Request.URL)
	return
}
// NewDocument fetches the given URL, parses the response as HTML and
// returns it wrapped in a goquery.Document.
func NewDocument(url string) (d *goquery.Document, e error) {
	client := newHttpClient()
	res, e := client.Get(url)
	if e != nil {
		return
	}
	defer res.Body.Close()

	// Parse the HTML into nodes
	root, e := html.Parse(res.Body)
	if e != nil {
		return
	}

	// Create and fill the document
	d = goquery.NewDocumentFromNode(root)
	return
}
func ExtractBlickOld(reader io.Reader) (io.Reader, error) {
	root, err := html.Parse(reader)
	if err != nil {
		return nil, err
	}
	var r = toNode(root)
	defer r.Dispose()
	var art = r.descendant(Class("article"))
	if art == nil {
		return nil, errors.New("article not found \n" + r.String())
	}
	var buffer = new(bytes.Buffer)
	html.Render(buffer, art.toNode())
	return buffer, nil
}
func debug(w http.ResponseWriter, r *http.Request) {
	rootNode, err := html.Parse(strings.NewReader(manageChecklistsHtml))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	checkListIds := make(map[string]bool)
	for node, depth := rootNode, 0; node != nil; node, depth = nextNode(node, depth) {
		for i := 0; i < depth; i++ {
			fmt.Fprintf(w, " ")
		}
		debugNode(w, node)
		checkListId := findCheckListId(node)
		if checkListId != "" {
			checkListIds[checkListId] = true
			fmt.Fprintf(w, "found "+checkListId+"\n")
		}
	}
}
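The debug handler above iterates with a nextNode helper that is not shown. A plausible sketch, assuming nextNode performs a pre-order walk of the parsed tree while tracking the current depth (the signature is inferred from the call site):

// nextNode is a hypothetical sketch of the traversal helper assumed by debug:
// it returns the next node in pre-order together with its depth, or nil when
// the walk is finished. Assumes golang.org/x/net/html is imported.
func nextNode(n *html.Node, depth int) (*html.Node, int) {
	// Descend into the first child when there is one.
	if n.FirstChild != nil {
		return n.FirstChild, depth + 1
	}
	// Otherwise move to the next sibling, climbing back up as needed.
	for n != nil {
		if n.NextSibling != nil {
			return n.NextSibling, depth
		}
		n = n.Parent
		depth--
	}
	return nil, 0
}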
// Get fetches the recent favstar page for user_id and extracts each tweet's
// text along with its fav and RT entries.
func Get(user_id string) (f Favstar, err error) {
	res, err := http.Get("http://favstar.fm/users/" + user_id + "/recent")
	if err != nil {
		return
	}
	defer res.Body.Close()
	b, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}
	doc, err := html.Parse(strings.NewReader(string(b)))
	if err != nil {
		return
	}
	tweetWithStats := walk(doc, "div", cond{"class": "fs-tweet"})
	for _, tweetWithStat := range tweetWithStats {
		t := walk(tweetWithStat, "p", cond{"class": "fs-tweet-text"})
		if t == nil {
			continue
		}
		var e Entry
		e.Text = t[0].FirstChild.Data
		favs := walk(tweetWithStat, "div", cond{"data-type": "favs"})
		if favs != nil {
			for _, aa := range walk(favs[0], "a", nil) {
				e.Fav = append(e.Fav, attr(aa, "title"))
			}
		}
		rts := walk(tweetWithStat, "div", cond{"data-type": "rts"})
		if rts != nil {
			for _, aa := range walk(rts[0], "a", nil) {
				e.RT = append(e.RT, attr(aa, "title"))
			}
		}
		f.Entry = append(f.Entry, e)
	}
	return
}
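Get above relies on walk, cond, and attr helpers that are not shown. The attr helper is small enough to sketch; assuming it simply looks up an attribute value by key on a node (name and behaviour inferred from the call sites):

// attr is a hypothetical sketch of the helper used by Get: it returns the
// value of the named attribute on n, or "" if the attribute is absent.
// Assumes golang.org/x/net/html is imported.
func attr(n *html.Node, key string) string {
	for _, a := range n.Attr {
		if a.Key == key {
			return a.Val
		}
	}
	return ""
}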
func parseFlickr(r io.Reader) (rv string, err error) {
	doc, err := html.Parse(r)
	if err != nil {
		return "", err
	}
	var f func(n *html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "meta" {
			content := ""
			isImage := false
			for _, a := range n.Attr {
				if a.Key == "property" && a.Val == "og:image" {
					isImage = true
				} else if a.Key == "content" {
					content = a.Val
				}
			}
			if isImage {
				rv = content
				return
			}
		}
		child := n.FirstChild
		for child != nil {
			if rv == "" {
				f(child)
			}
			child = child.NextSibling
		}
	}
	f(doc)
	if rv == "" {
		err = noFlickrUrl
	}
	return rv, err
}
// Process the response for a URL.
func (this *worker) visitUrl(res *http.Response) []*url.URL {
	var doc *goquery.Document
	var harvested []*url.URL
	var doLinks bool

	// Load a goquery document and call the visitor function
	if bd, e := ioutil.ReadAll(res.Body); e != nil {
		this.extender.Error(newCrawlError(e, CekReadBody, res.Request.URL))
		this.logFunc(LogError, "ERROR reading body %s: %s", res.Request.URL.String(), e.Error())
	} else {
		if node, e := html.Parse(bytes.NewBuffer(bd)); e != nil {
			this.extender.Error(newCrawlError(e, CekParseBody, res.Request.URL))
			this.logFunc(LogError, "ERROR parsing %s: %s", res.Request.URL.String(), e.Error())
		} else {
			doc = goquery.NewDocumentFromNode(node)
			doc.Url = res.Request.URL
		}
		// Re-assign the body so it can be consumed by the visitor function
		res.Body = ioutil.NopCloser(bytes.NewBuffer(bd))
	}

	// Visit the document (with nil goquery doc if failed to load)
	if harvested, doLinks = this.extender.Visit(res, doc); doLinks {
		// Links were not processed by the visitor, so process links
		if doc != nil {
			harvested = this.processLinks(doc)
		} else {
			this.extender.Error(newCrawlErrorMessage("No goquery document to process links.", CekProcessLinks, res.Request.URL))
			this.logFunc(LogError, "ERROR processing links %s", res.Request.URL.String())
		}
	}

	// Notify that this URL has been visited
	this.extender.Visited(res.Request.URL, harvested)
	return harvested
}
// feed serves a stored feed as JSON (GET) or adds a new feed (POST).
func feed(w http.ResponseWriter, r *http.Request) {
	if r.Method == "GET" {
		if strings.HasPrefix(r.Header.Get("Accept"), "application/json") {
			feedId := r.URL.Path[len("/feed/"):]
			feed := &dbFeed{}
			// u := "http://loc-blog.de/rss.php?blog_id=5"
			session := db.session.Copy()
			c := session.DB("test").C("feeds")
			numResults, err := c.FindId(bson.ObjectIdHex(feedId)).Count()
			if err != nil {
				panic(err)
			}
			if numResults == 0 {
				// feed = insertFeed(u)
			} else {
				err = c.FindId(bson.ObjectIdHex(feedId)).One(&feed)
				if err != nil {
					panic(err)
				}
			}
			u, _ := url.Parse(feed.Url)
			for i := range feed.Feed.Items {
				doc, err := html.Parse(strings.NewReader(feed.Feed.Items[i].Content))
				if err != nil {
					log.Fatal(err)
				}
				// Rewrite the item content: make image URLs absolute and
				// force links to open in a new tab.
				var f func(*html.Node, *url.URL)
				f = func(n *html.Node, u *url.URL) {
					if n.Type == html.ElementNode && n.Data == "img" {
						for i := range n.Attr {
							if n.Attr[i].Key == "src" {
								u2, _ := url.Parse(n.Attr[i].Val)
								if !u2.IsAbs() {
									u2.Scheme = u.Scheme
									u2.Host = u.Host
								}
								if !strings.HasPrefix(u2.Path, "/") {
									u2.Path = "/" + u2.Path
								}
								n.Attr[i].Val = u2.String()
								break
							}
						}
					}
					if n.Type == html.ElementNode && n.Data == "a" {
						found := false
						for i := range n.Attr {
							if n.Attr[i].Key == "target" {
								n.Attr[i].Val = "_blank"
								found = true
								break
							}
						}
						if !found {
							n.Attr = append(n.Attr, html.Attribute{Key: "target", Val: "_blank"})
						}
					}
					for c := n.FirstChild; c != nil; c = c.NextSibling {
						f(c, u)
					}
				}
				f(doc, u)
				var wr bytes.Buffer
				html.Render(&wr, doc)
				feed.Feed.Items[i].Content = wr.String()
			}
			respJSON, _ := json.Marshal(feed)
			fmt.Fprint(w, string(respJSON))
		} else {
			indexHandler(w, r)
		}
	} else if r.Method == "POST" {
		body, _ := ioutil.ReadAll(r.Body)
		feed := new(dbFeed)
		_ = json.Unmarshal(body, feed)
		log.Printf("Add Feed: %v", feed.Url)
		respJSON, _ := json.Marshal(insertFeed(feed.Url))
		fmt.Fprint(w, string(respJSON))
	}
}
func generate() {
	type data struct {
		site, page string
		parsed     *PageIndex
	}
	start := time.Now()
	index := new(Index)
	done := make(chan data, len(DocSites)*len(DocPages))
	index.Pages = map[string]map[string]*PageIndex{}
	for site, host := range DocSites {
		index.Pages[site] = map[string]*PageIndex{}
		for page, path := range DocPages {
			d := data{
				site: site,
				page: page,
			}
			uri := url.URL{
				Scheme: "http",
				Host:   host,
				Path:   path,
			}
			go func() {
				defer func() {
					done <- d
				}()
				resp, err := http.Get(uri.String())
				if err != nil {
					log.Printf("[GoDoc] %s:%s (%s) failed: %s", d.site, d.page, uri, err)
					return
				}
				defer resp.Body.Close()
				node, err := html.Parse(resp.Body)
				if err != nil {
					log.Printf("[GoDoc] %s:%s (%s) failed to parse: %s", d.site, d.page, uri, err)
					return
				}
				pageIndex := new(PageIndex)
				if err := pageIndex.ParseFrom(uri, node); err != nil {
					log.Printf("[GoDoc] %s:%s (%s) failed to index: %s", d.site, d.page, uri, err)
					return
				}
				if d.page == "pkg" || d.page == "cmd" {
					uris := make([]string, 0, len(pageIndex.SectionURLs))
					pkgs := make([]string, 0, len(pageIndex.SectionURLs))
					need := "/" + d.page + "/"
					for pkg := range pageIndex.SectionURLs {
						for _, uri := range pageIndex.SectionURLs[pkg] {
							if !strings.Contains(uri, need) {
								continue
							}
							uris = append(uris, uri)
							pkgs = append(pkgs, pkg)
						}
					}
					for i, uri := range uris {
						pkg := pkgs[i]
						log.Printf("[GoDoc] Pulling package %q at %q", pkg, uri)
						u, err := url.Parse(uri)
						if err != nil {
							log.Printf("[GoDoc] %s:%s:%s failed to parse URL %q: %s", d.site, d.page, pkg, uri, err)
							continue
						}
						resp, err := http.Get(uri)
						if err != nil {
							log.Printf("[GoDoc] bad package URL %q", uri)
							continue
						}
						defer resp.Body.Close()
						node, err := html.Parse(resp.Body)
						if err != nil {
							log.Printf("[GoDoc] %s:%s:%s (%s) failed to parse package: %s", d.site, d.page, pkg, uri, err)
							continue
						}
						if err := pageIndex.ParseFrom(*u, node); err != nil {
							log.Printf("[GoDoc] %s:%s:%s (%s) failed to index: %s", d.site, d.page, pkg, uri, err)
							continue
						}
					}
				}
				d.parsed = pageIndex
			}()
		}
	}
	for i := 0; i < cap(done); i++ {
		d := <-done
		log.Printf("[GoDoc] %s:%s complete", d.site, d.page)
		index.Pages[d.site][d.page] = d.parsed
	}
	godocIndex = index
	log.Printf("Generate took %s", time.Since(start))
}