Ejemplo n.º 1
0
// main is a small HTML parsing benchmark. It reads the file named by the
// first command-line argument, parses it as HTML the number of times given by
// the second argument, and prints the elapsed wall-clock time in seconds.
//
// The program exits with status 1 when arguments are missing or the iteration
// count is not an integer, and panics if the file cannot be read or parsed.
func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage:", os.Args[0], "filename iterations")
		os.Exit(1)
	}

	filename := os.Args[1]
	// A malformed count used to be silently treated as zero iterations,
	// producing a meaningless "0.000000 s" result.
	n, err := strconv.Atoi(os.Args[2])
	if err != nil {
		fmt.Println("invalid iteration count:", err)
		os.Exit(1)
	}

	file, err := ioutil.ReadFile(filename)
	if err != nil {
		panic(err)
	}
	html := string(file)

	start := time.Now()
	for i := 0; i < n; i++ {
		doc, err := exp_html.Parse(strings.NewReader(html))
		if err != nil {
			panic(err)
		}
		// Touch the result so the parsed tree counts as used.
		_ = doc.FirstChild
	}

	fmt.Printf("%f s\n", time.Since(start).Seconds())
}
Ejemplo n.º 2
0
// ExampleParse parses a small HTML fragment and prints the href value of
// every <a> element, in document order.
func ExampleParse() {
	const page = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`

	root, err := html.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	// visit walks the tree depth-first, printing the first href of each anchor.
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "a" {
			for i := range node.Attr {
				if attr := node.Attr[i]; attr.Key == "href" {
					fmt.Println(attr.Val)
					break
				}
			}
		}

		child := node.FirstChild
		for child != nil {
			visit(child)
			child = child.NextSibling
		}
	}
	visit(root)
	// Output:
	// foo
	// /bar/baz
}
Ejemplo n.º 3
0
// post reads a URL from the request body, fetches it through the App Engine
// urlfetch client, parses the response as HTML and writes newTag(root) to the
// response as JSON. Every failure is passed to handleError.
func post(c *goweb.Context) {
	ctx := appengine.NewContext(c.Request)
	client := urlfetch.Client(ctx)

	target, err := ioutil.ReadAll(c.Request.Body)
	if err != nil {
		handleError(c, ctx, err)
		return
	}

	resp, err := client.Get(string(target))
	if err != nil {
		handleError(c, ctx, err)
		return
	}
	defer resp.Body.Close()

	root, err := html.Parse(resp.Body)
	if err != nil {
		handleError(c, ctx, err)
		return
	}

	err = json.NewEncoder(c.ResponseWriter).Encode(newTag(root))
	if err != nil {
		handleError(c, ctx, err)
	}
}
Ejemplo n.º 4
0
// showComments fetches the comment feed for issue id of the configured
// project and prints each entry's title followed by the text that dump
// produces from the entry content.
//
// auth is sent as a "GoogleLogin" Authorization header. Any failure ends the
// program through log.Fatal.
func showComments(auth string, id string) {
	endpoint := "https://code.google.com/feeds/issues/p/" + project + "/issues/" + id + "/comments/full"

	req, err := http.NewRequest("GET", endpoint, nil)
	if err != nil {
		log.Fatal("failed to get comments:", err)
	}
	req.Header.Set("Authorization", "GoogleLogin "+auth)

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal("failed to get comments:", err)
	}
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Fatal("failed to authenticate:", res.Status)
	}

	var feed Feed
	if err = xml.NewDecoder(res.Body).Decode(&feed); err != nil {
		log.Fatal("failed to get comments:", err)
	}

	for _, entry := range feed.Entry {
		// Entry content is HTML; parse it, then flatten it with dump.
		root, parseErr := html.Parse(strings.NewReader(entry.Content))
		if parseErr != nil {
			log.Fatal("failed to parse xml:", parseErr)
		}
		text, dumpErr := dump(root)
		if dumpErr != nil {
			log.Fatal("failed to parse xml:", dumpErr)
		}
		fmt.Println(entry.Title, "\n", text)
	}
}
Ejemplo n.º 5
0
// visitUrl processes the response for a URL: it parses the body into a
// goquery document, hands the response and document to the visitor function,
// and returns the URLs harvested from the page.
//
// When parsing fails the visitor still runs, with a nil document. When no
// visitor is configured, an informational message is logged and nil is
// returned.
func (w *worker) visitUrl(res *http.Response) []*url.URL {
	var doc *goquery.Document

	// Build the goquery document from the response body.
	node, parseErr := html.Parse(res.Body)
	if parseErr == nil {
		doc = goquery.NewDocumentFromNode(node)
		doc.Url = res.Request.URL
	} else {
		w.logFunc(LogError, "ERROR parsing %s: %s\n", res.Request.URL.String(), parseErr.Error())
	}

	if w.visitor == nil {
		w.logFunc(LogInfo, "missing visitor function: %s\n", res.Request.URL.String())
		return nil
	}

	links, followLinks := w.visitor(res, doc)
	if followLinks && doc != nil {
		// The visitor asked us to extract the links ourselves.
		links = w.processLinks(doc)
	}
	return links
}
Ejemplo n.º 6
0
// ExtractTagi parses reader as HTML, locates the node selected by
// Id("singlePage"), and returns a reader over the rendered markup of every
// descendant selected by Tag("p"), each followed by a newline.
//
// An error is returned when the input cannot be parsed, when either selection
// comes back nil (the message includes the document dump from r.String()), or
// when a selected node cannot be rendered.
func ExtractTagi(reader io.Reader) (io.Reader, error) {
	root, err := html.Parse(reader)
	if err != nil {
		return nil, err
	}

	var r = toNode(root)
	defer r.Dispose()

	var sp = r.descendant(Id("singlePage"))
	if sp == nil {
		return nil, errors.New("singlePage not found \n" + r.String())
	}

	var p = sp.descendants(Tag("p"))
	if p == nil {
		return nil, errors.New("p's not found \n" + r.String())
	}

	var buffer = new(bytes.Buffer)
	for _, node := range p {
		// Render can fail on malformed trees; previously the error was
		// dropped and a truncated result was returned as success.
		if err := html.Render(buffer, node.toNode()); err != nil {
			return nil, err
		}
		buffer.WriteByte('\n')
	}

	return buffer, nil
}
Ejemplo n.º 7
0
// parseFile opens filename and parses its contents as HTML, returning the
// root node. The file is closed before returning.
func parseFile(filename string) (*html.Node, error) {
	src, openErr := os.Open(filename)
	if openErr != nil {
		return nil, openErr
	}
	defer src.Close()

	root, parseErr := html.Parse(src)
	return root, parseErr
}
Ejemplo n.º 8
0
// GetDoc retrieves the raw bytes for rawUrl through GetRaw and parses them as
// HTML. It returns the root *html.Node, or a nil node and the error when
// either the retrieval or the parse fails.
func GetDoc(rawUrl string) (doc *html.Node, err error) {
	raw, fetchErr := GetRaw(rawUrl)
	if fetchErr != nil {
		return nil, fetchErr
	}

	root, parseErr := html.Parse(bytes.NewReader(raw))
	if parseErr != nil {
		return nil, parseErr
	}
	return root, nil
}
Ejemplo n.º 9
0
// TestNewDocument parses ./testdata/page.html and stores the resulting
// document in the doc variable declared outside this function. Open and parse
// failures are reported through t.Error.
func TestNewDocument(t *testing.T) {
	file, openErr := os.Open("./testdata/page.html")
	if openErr != nil {
		t.Error(openErr.Error())
		return
	}
	defer file.Close()

	root, parseErr := html.Parse(file)
	if parseErr != nil {
		t.Error(parseErr.Error())
		return
	}
	doc = NewDocumentFromNode(root)
}
Ejemplo n.º 10
0
// GetValue parses p.Data as HTML and runs p.GetNodes over the parsed tree
// with the given tag and parameter filters, then returns p.HTMLNodes.
//
//	tag    : name of the tag to search for
//	feilds : name/value pairs passed on to GetNodes as filters
//
// If the document cannot be parsed, an error line is logged and nil is
// returned.
func (p *HTMLParser) GetValue(tag string, feilds ...HTMLParameter) []html.Node {
	root, parseErr := html.Parse(p.Data)
	if parseErr != nil {
		log.Println("ERROR : error while parsing web page")
		return nil
	}

	// Recursive search; presumably GetNodes collects matches into
	// p.HTMLNodes (confirm against its definition).
	p.GetNodes(root, tag, feilds)

	return p.HTMLNodes
}
Ejemplo n.º 11
0
// LoadDoc opens ./testdata/<page>, parses it as HTML and wraps the root node
// in a Document. It panics with the error text if the file cannot be opened
// or parsed.
func LoadDoc(page string) *Document {
	file, openErr := os.Open(fmt.Sprintf("./testdata/%s", page))
	if openErr != nil {
		panic(openErr.Error())
	}
	defer file.Close()

	root, parseErr := html.Parse(file)
	if parseErr != nil {
		panic(parseErr.Error())
	}
	return NewDocumentFromNode(root)
}
Ejemplo n.º 12
0
// NewDocument is a Document constructor that takes a string URL. It fetches
// the URL with http.Get, parses the response body as HTML, and builds the
// Document from the root node and the URL of the request that produced the
// response.
//
// A nil Document and the error are returned if the fetch or the parse fails.
func NewDocument(url string) (d *Document, e error) {
	resp, getErr := http.Get(url)
	if getErr != nil {
		return nil, getErr
	}
	defer resp.Body.Close()

	rootNode, parseErr := html.Parse(resp.Body)
	if parseErr != nil {
		return nil, parseErr
	}

	return newDocument(rootNode, resp.Request.URL), nil
}
Ejemplo n.º 13
0
// NewDocument fetches url with the client returned by newHttpClient, parses
// the response body as HTML and returns a goquery document built from the
// root node.
//
// A nil document and the error are returned if the fetch or the parse fails.
func NewDocument(url string) (d *goquery.Document, e error) {
	resp, getErr := newHttpClient().Get(url)
	if getErr != nil {
		return nil, getErr
	}
	defer resp.Body.Close()

	rootNode, parseErr := html.Parse(resp.Body)
	if parseErr != nil {
		return nil, parseErr
	}

	return goquery.NewDocumentFromNode(rootNode), nil
}
Ejemplo n.º 14
0
// ExtractBlickOld parses reader as HTML and returns a reader over the
// rendered markup of the first descendant selected by Class("article").
//
// An error is returned when the input cannot be parsed, when no such node is
// found (the message includes the document dump from r.String()), or when the
// node cannot be rendered.
func ExtractBlickOld(reader io.Reader) (io.Reader, error) {
	root, err := html.Parse(reader)
	if err != nil {
		return nil, err
	}

	var r = toNode(root)
	defer r.Dispose()

	var art = r.descendant(Class("article"))
	if art == nil {
		return nil, errors.New("article not found \n" + r.String())
	}

	var buffer = new(bytes.Buffer)
	// Render can fail on malformed trees; previously the error was dropped
	// and a possibly truncated result was returned as success.
	if err := html.Render(buffer, art.toNode()); err != nil {
		return nil, err
	}

	return buffer, nil
}
Ejemplo n.º 15
0
// debug is an HTTP handler that parses manageChecklistsHtml and writes a
// dump of every node reached through nextNode to w. Each node is indented by
// two spaces per depth level and printed with debugNode; when
// findCheckListId returns a non-empty id for a node, a "found <id>" line
// follows.
//
// A parse failure is reported as a 500 response.
func debug(w http.ResponseWriter, r *http.Request) {
	rootNode, err := html.Parse(strings.NewReader(manageChecklistsHtml))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	for node, depth := rootNode, 0; node != nil; node, depth = nextNode(node, depth) {
		for i := 0; i < depth; i++ {
			fmt.Fprint(w, "  ")
		}
		debugNode(w, node)
		if checkListId := findCheckListId(node); checkListId != "" {
			// The id is data, not a format string: pass it as an argument
			// so a '%' inside it cannot corrupt the output.
			fmt.Fprintf(w, "found %s\n", checkListId)
		}
	}
}
Ejemplo n.º 16
0
// Get downloads the "recent" page of the given favstar.fm user and extracts
// one Entry per <div class="fs-tweet"> that contains a
// <p class="fs-tweet-text">.
//
// For each entry, Text is the Data of the first child node of that paragraph
// (left empty when the paragraph has no children), and Fav / RT hold the
// "title" attribute of every <a> inside the first div whose data-type is
// "favs" / "rts" respectively.
//
// The error is non-nil when the page cannot be fetched, read or parsed.
func Get(user_id string) (f Favstar, err error) {
	res, err := http.Get("http://favstar.fm/users/" + user_id + "/recent")
	if err != nil {
		return f, err
	}
	defer res.Body.Close()

	b, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return f, err
	}

	doc, err := html.Parse(strings.NewReader(string(b)))
	if err != nil {
		return f, err
	}

	for _, tweet := range walk(doc, "div", cond{"class": "fs-tweet"}) {
		texts := walk(tweet, "p", cond{"class": "fs-tweet-text"})
		// len also covers an empty non-nil result, which a nil comparison
		// would let through to the [0] index below.
		if len(texts) == 0 {
			continue
		}

		var e Entry
		// An empty <p> has no child node; guard against the nil dereference.
		if first := texts[0].FirstChild; first != nil {
			e.Text = first.Data
		}

		if favs := walk(tweet, "div", cond{"data-type": "favs"}); len(favs) > 0 {
			for _, a := range walk(favs[0], "a", nil) {
				e.Fav = append(e.Fav, attr(a, "title"))
			}
		}
		if rts := walk(tweet, "div", cond{"data-type": "rts"}); len(rts) > 0 {
			for _, a := range walk(rts[0], "a", nil) {
				e.RT = append(e.RT, attr(a, "title"))
			}
		}
		f.Entry = append(f.Entry, e)
	}
	return f, nil
}
Ejemplo n.º 17
0
// parseFlickr parses r as HTML and returns the content attribute of the first
// <meta property="og:image"> element, in document order, whose content is
// non-empty. If none is found it returns noFlickrUrl as the error.
func parseFlickr(r io.Reader) (rv string, err error) {
	root, parseErr := html.Parse(r)
	if parseErr != nil {
		return "", parseErr
	}

	// search walks the tree depth-first and stores the first hit in rv.
	var search func(node *html.Node)
	search = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "meta" {
			var (
				content string
				isImage bool
			)
			for _, attr := range node.Attr {
				switch {
				case attr.Key == "property" && attr.Val == "og:image":
					isImage = true
				case attr.Key == "content":
					content = attr.Val
				}
			}
			if isImage {
				rv = content
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			// Once a result is recorded no further subtree is visited.
			if rv != "" {
				return
			}
			search(child)
		}
	}
	search(root)

	if rv == "" {
		return "", noFlickrUrl
	}
	return rv, nil
}
Ejemplo n.º 18
0
// visitUrl processes the response for a URL. It reads the whole body, parses
// it into a goquery document, restores the body so the extender can read it
// again, and calls the extender's Visit. If Visit asks for link processing,
// links are harvested from the document. The extender's Visited is then
// notified, and the harvested URLs are returned.
//
// Read, parse and missing-document failures are reported through the
// extender's Error and logged; Visit is still called, with a nil document
// when none could be built.
func (w *worker) visitUrl(res *http.Response) []*url.URL {
	var doc *goquery.Document

	body, readErr := ioutil.ReadAll(res.Body)
	if readErr != nil {
		w.extender.Error(newCrawlError(readErr, CekReadBody, res.Request.URL))
		w.logFunc(LogError, "ERROR reading body %s: %s", res.Request.URL.String(), readErr.Error())
	} else {
		node, parseErr := html.Parse(bytes.NewBuffer(body))
		if parseErr != nil {
			w.extender.Error(newCrawlError(parseErr, CekParseBody, res.Request.URL))
			w.logFunc(LogError, "ERROR parsing %s: %s", res.Request.URL.String(), parseErr.Error())
		} else {
			doc = goquery.NewDocumentFromNode(node)
			doc.Url = res.Request.URL
		}
		// Re-assign the body so it can be consumed by the visitor function.
		res.Body = ioutil.NopCloser(bytes.NewBuffer(body))
	}

	// Visit the document (with a nil goquery doc if loading failed).
	links, followLinks := w.extender.Visit(res, doc)
	if followLinks {
		// Links were not processed by the visitor, so process them here.
		if doc == nil {
			w.extender.Error(newCrawlErrorMessage("No goquery document to process links.", CekProcessLinks, res.Request.URL))
			w.logFunc(LogError, "ERROR processing links %s", res.Request.URL.String())
		} else {
			links = w.processLinks(doc)
		}
	}

	// Notify that this URL has been visited.
	w.extender.Visited(res.Request.URL, links)

	return links
}
Ejemplo n.º 19
0
func feed(w http.ResponseWriter, r *http.Request) {
	if r.Method == "GET" {
		if strings.HasPrefix(r.Header.Get("Accept"), "application/json") {
			feedId := r.URL.Path[len("/feed/"):]
			feed := &dbFeed{}

			// u := "http://loc-blog.de/rss.php?blog_id=5"

			session := db.session.Copy()
			c := session.DB("test").C("feeds")

			numResults, err := c.FindId(bson.ObjectIdHex(feedId)).Count()
			if err != nil {
				panic(err)
			}

			if numResults == 0 {
				// feed = insertFeed(u)
			} else {
				err = c.FindId(bson.ObjectIdHex(feedId)).One(&feed)
				if err != nil {
					panic(err)
				}
			}

			u, _ := url.Parse(feed.Url)

			for i, _ := range feed.Feed.Items {
				doc, err := html.Parse(strings.NewReader(feed.Feed.Items[i].Content))
				if err != nil {
					log.Fatal(err)
				}
				var f func(*html.Node, *url.URL)
				f = func(n *html.Node, u *url.URL) {
					if n.Type == html.ElementNode && n.Data == "img" {
						for i, _ := range n.Attr {
							if n.Attr[i].Key == "src" {
								u2, _ := url.Parse(n.Attr[i].Val)
								if !u2.IsAbs() {
									u2.Scheme = u.Scheme
									u2.Host = u.Host
								}
								if !strings.HasPrefix(u2.Path, "/") {
									u2.Path = "/" + u2.Path
								}
								n.Attr[i].Val = u2.String()
								break
							}
						}
					}
					if n.Type == html.ElementNode && n.Data == "a" {
						found := false
						for i, _ := range n.Attr {
							if n.Attr[i].Key == "target" {
								n.Attr[i].Val = "_blank"
								found = true
								break
							}
						}
						if !found {
							attr := new(html.Attribute)
							attr.Key = "target"
							attr.Val = "_blank"
							n.Attr = append(n.Attr, *attr)
						}
					}
					for c := n.FirstChild; c != nil; c = c.NextSibling {
						f(c, u)
					}
				}
				f(doc, u)
				var wr bytes.Buffer
				html.Render(&wr, doc)
				feed.Feed.Items[i].Content = wr.String()
			}

			respJSON, _ := json.Marshal(feed)
			fmt.Fprint(w, string(respJSON))
		} else {
			indexHandler(w, r)
		}
	} else if r.Method == "POST" {
		body, _ := ioutil.ReadAll(r.Body)
		feed := new(dbFeed)
		_ = json.Unmarshal(body, feed)
		log.Printf("Add Feed: %v", feed.Url)
		respJSON, _ := json.Marshal(insertFeed(feed.Url))
		fmt.Fprint(w, string(respJSON))
	}
}
Ejemplo n.º 20
0
// generate rebuilds the godoc index and stores it in godocIndex.
//
// For every (site, page) combination of DocSites and DocPages a goroutine
// downloads http://<host><path>, parses it and indexes it into a fresh
// PageIndex. For the "pkg" and "cmd" pages it then downloads every section
// URL containing "/pkg/" or "/cmd/" respectively and indexes those into the
// same PageIndex. Failures are logged; a page whose top-level fetch, parse or
// index fails is stored with a nil PageIndex. The total duration is logged
// at the end.
func generate() {
	type data struct {
		site, page string
		parsed     *PageIndex
	}

	start := time.Now()
	index := new(Index)
	// Buffered to the number of goroutines so that no sender ever blocks.
	done := make(chan data, len(DocSites)*len(DocPages))

	index.Pages = map[string]map[string]*PageIndex{}
	for site, host := range DocSites {
		index.Pages[site] = map[string]*PageIndex{}
		for page, path := range DocPages {
			d := data{
				site: site,
				page: page,
			}
			uri := url.URL{
				Scheme: "http",
				Host:   host,
				Path:   path,
			}
			go func() {
				// Always report back, even on failure, so the collector
				// below receives exactly one message per page.
				defer func() {
					done <- d
				}()

				// String is defined on *url.URL, so a URL value passed to
				// %s would be printed as a struct; format it once here.
				target := uri.String()

				resp, err := http.Get(target)
				if err != nil {
					log.Printf("[GoDoc] %s:%s (%s) failed: %s", d.site, d.page, target, err)
					return
				}
				defer resp.Body.Close()

				node, err := html.Parse(resp.Body)
				if err != nil {
					log.Printf("[GoDoc] %s:%s (%s) failed to parse: %s", d.site, d.page, target, err)
					return
				}

				pageIndex := new(PageIndex)
				if err := pageIndex.ParseFrom(uri, node); err != nil {
					log.Printf("[GoDoc] %s:%s (%s) failed to index: %s", d.site, d.page, target, err)
					return
				}

				if d.page == "pkg" || d.page == "cmd" {
					// Snapshot the URLs first: ParseFrom is called on
					// pageIndex again below, presumably adding to
					// SectionURLs while we would be ranging over it.
					uris := make([]string, 0, len(pageIndex.SectionURLs))
					pkgs := make([]string, 0, len(pageIndex.SectionURLs))
					need := "/" + d.page + "/"

					for pkg := range pageIndex.SectionURLs {
						for _, pkgURI := range pageIndex.SectionURLs[pkg] {
							if !strings.Contains(pkgURI, need) {
								continue
							}
							uris = append(uris, pkgURI)
							pkgs = append(pkgs, pkg)
						}
					}

					for i, pkgURI := range uris {
						pkg := pkgs[i]
						log.Printf("[GoDoc] Pulling package %q at %q", pkg, pkgURI)

						u, err := url.Parse(pkgURI)
						if err != nil {
							log.Printf("[GoDoc] %s:%s:%s failed to parse URL %q: %s", d.site, d.page, pkg, pkgURI, err)
							continue
						}

						resp, err := http.Get(pkgURI)
						if err != nil {
							log.Printf("[GoDoc] bad package URL %q", pkgURI)
							continue
						}

						node, err := html.Parse(resp.Body)
						// Close right away: a defer inside this loop would
						// keep every response open until the goroutine ends.
						resp.Body.Close()
						if err != nil {
							log.Printf("[GoDoc] %s:%s:%s (%s) failed to parse package: %s", d.site, d.page, pkg, pkgURI, err)
							continue
						}

						if err := pageIndex.ParseFrom(*u, node); err != nil {
							log.Printf("[GoDoc] %s:%s:%s (%s) failed to index: %s", d.site, d.page, pkg, pkgURI, err)
							continue
						}
					}
				}

				d.parsed = pageIndex
			}()
		}
	}

	// Collect one result per started goroutine.
	for i := 0; i < cap(done); i++ {
		d := <-done
		log.Printf("[GoDoc] %s:%s complete", d.site, d.page)
		index.Pages[d.site][d.page] = d.parsed
	}
	godocIndex = index
	log.Printf("Generate took %s", time.Since(start))
}