Exemple #1
0
// FetchFullDescription downloads the page at link and returns the rendered
// HTML of its <section class="entry-content cf"> element.
//
// It returns the empty string when no such section exists. The whole tree is
// walked, so when several sections match, the last one in document order wins.
// Any request, read, parse, or render failure terminates the process through
// log.Fatal.
func FetchFullDescription(link string) string {
	res, err := http.Get(link)
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		// A failed parse yields no usable tree; walking it would dereference nil.
		log.Fatal(err)
	}
	content := ""
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "section" {
			for _, a := range n.Attr {
				if a.Key == "class" && a.Val == "entry-content cf" {
					var buf bytes.Buffer
					if err := html.Render(&buf, n); err != nil {
						log.Fatal(err)
					}
					content = buf.String()
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return content
}
Exemple #2
0
// GetEntries walks the file tree rooted at root and loads one Entry, via
// GetEntry, for every path whose extension is ".txt" (case-insensitive).
// Paths for which GetEntry yields no entry are skipped.
//
// When useSummary is true, each entry's Body is parsed as HTML, converted with
// toText, and truncated to at most 500 bytes followed by "..."; bodies that
// fail to parse or convert are left unchanged. Each entry's Id is its Filename
// with the leading len(root) bytes removed and the trailing "txt" replaced by
// "html".
//
// The walk is best-effort: a path that filepath.Walk cannot access is skipped,
// and the first such error is returned alongside the entries that were
// collected.
func GetEntries(root string, useSummary bool) (entries []*Entry, err error) {
	walkErr := filepath.Walk(root, func(path string, info os.FileInfo, pathErr error) error {
		if pathErr != nil {
			// Remember the first failure but keep walking so one unreadable
			// path does not hide every other entry.
			if err == nil {
				err = pathErr
			}
			return nil
		}
		if strings.ToLower(filepath.Ext(path)) != ".txt" {
			return nil
		}
		// The GetEntry error is intentionally dropped: a file that cannot be
		// loaded is simply not listed.
		entry, _ := GetEntry(path)
		if entry == nil {
			return nil
		}
		entries = append(entries, entry)
		if useSummary {
			doc, parseErr := html.Parse(strings.NewReader(entry.Body))
			if parseErr == nil {
				if text, textErr := toText(doc); textErr == nil {
					if len(text) > 500 {
						// Cut at the last character boundary at or before byte
						// 500 so a multi-byte UTF-8 sequence is never split.
						cut := 0
						for i := range text {
							if i > 500 {
								break
							}
							cut = i
						}
						text = text[:cut] + "..."
					}
					entry.Body = text
				}
			}
		}
		entry.Id = entry.Filename[len(root):len(entry.Filename)-3] + "html"
		return nil
	})
	if err == nil {
		err = walkErr
	}
	return entries, err
}
Exemple #3
0
// GetFeedUrl resolves u to the URL of a feed.
//
// If the response's Content-Type header contains "xml", u is returned as is.
// Otherwise the body is parsed as HTML and the href of the first
// <link rel=alternate> element whose type attribute contains "xml" is passed
// through ToAbsolute together with the final request URL.
//
// An error is returned when the request or parse fails, when no such link
// exists, or when the link has no href attribute.
func GetFeedUrl(u string) (string, error) {
	resp, err := http.Get(u)
	if err != nil {
		return "", err
	}
	// Release the connection on every return path, including the early
	// "already a feed" return below.
	defer resp.Body.Close()

	if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
		return u, nil
	}

	tree, err := html.Parse(resp.Body)
	if err != nil {
		return "", err
	}

	sel := cascadia.MustCompile("link[rel=alternate][type*=xml]")
	alt := sel.MatchFirst(tree)
	if alt == nil {
		return "", errors.New("no feed link found")
	}

	altUrl, found := FindAttr("href", alt.Attr)
	if !found {
		return "", errors.New("missing link in alternate")
	}

	return ToAbsolute(resp.Request.URL, altUrl.Val), nil
}
Exemple #4
0
// fixHtml parses wild as HTML, runs fixImgs over the tree with linkUrl, and
// returns the re-rendered markup found between the first "<body>" and the
// following "</body>".
//
// The HTML-escaped form of wild is returned instead when parsing or rendering
// fails, when rendering panics with bytes.ErrTooLarge, or when either body tag
// cannot be located in the rendered output. Any other panic raised during
// rendering is re-raised.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
	// escaped builds the fallback result shared by every failure path.
	escaped := func() []byte {
		return []byte(html.EscapeString(string(wild)))
	}

	root, parseErr := html.Parse(bytes.NewReader(wild))
	if parseErr != nil {
		return escaped()
	}

	fixImgs(linkUrl, root)

	defer func() {
		switch r := recover(); {
		case r == bytes.ErrTooLarge:
			well = escaped()
		case r != nil:
			panic(r)
		}
	}()

	rendered := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
	if renderErr := html.Render(rendered, root); renderErr != nil {
		return escaped()
	}

	const openTag, closeTag = "<body>", "</body>"

	out := rendered.Bytes()
	start := bytes.Index(out, []byte(openTag))
	if start < 0 {
		return escaped()
	}
	out = out[start+len(openTag):]

	end := bytes.Index(out, []byte(closeTag))
	if end < 0 {
		return escaped()
	}
	return out[:end]
}
// TestSelectors compiles each selector in selectorTests, applies it to the
// test's HTML, and compares the string form of every match with the expected
// results, in order.
func TestSelectors(t *testing.T) {
	for _, tc := range selectorTests {
		sel, err := Compile(tc.selector)
		if err != nil {
			t.Errorf("error compiling %q: %s", tc.selector, err)
			continue
		}

		root, err := html.Parse(strings.NewReader(tc.HTML))
		if err != nil {
			t.Errorf("error parsing %q: %s", tc.HTML, err)
			continue
		}

		found := sel.MatchAll(root)
		if want, got := len(tc.results), len(found); got != want {
			t.Errorf("wanted %d elements, got %d instead", want, got)
			continue
		}

		for idx := range found {
			if got, want := nodeString(found[idx]), tc.results[idx]; got != want {
				t.Errorf("wanted %s, got %s instead", want, got)
			}
		}
	}
}
Exemple #6
0
// ExampleParse parses a small fragment and prints, in document order, the
// first href attribute of every <a> element.
func ExampleParse() {
	const src = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	root, err := html.Parse(strings.NewReader(src))
	if err != nil {
		log.Fatal(err)
	}
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		isAnchor := node.Type == html.ElementNode && node.Data == "a"
		for i := 0; isAnchor && i < len(node.Attr); i++ {
			if node.Attr[i].Key == "href" {
				fmt.Println(node.Attr[i].Val)
				break
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(root)
	// Output:
	// foo
	// /bar/baz
}
Exemple #7
0
// parseStub parses stub as HTML and fills a redditStub from its <a> elements.
// For each anchor, the first child's Data is treated as the label and the
// first attribute's value as the target:
//
//   - label "[link]" sets Link to the target;
//   - a label ending in " comments]" sets Comments to the target;
//   - a target starting with "http://www.reddit.com/user/" sets User to the
//     trimmed label.
//
// Anchors with no child or no attributes are ignored. An error is returned
// only when the HTML cannot be parsed.
func parseStub(stub string) (r redditStub, err error) {
	var extract func(*html.Node)
	var doc *html.Node
	doc, err = html.Parse(strings.NewReader(stub))
	if err != nil {
		return
	}
	extract = func(n *html.Node) {
		// Guard against <a></a> and attribute-less anchors, which would
		// otherwise dereference a nil child or index an empty slice.
		if n.Type == html.ElementNode && n.Data == "a" && n.FirstChild != nil && len(n.Attr) > 0 {
			// NOTE(review): the first attribute is assumed to be href — confirm
			// against the stubs this is fed.
			label, target := n.FirstChild.Data, n.Attr[0].Val
			switch {
			case label == "[link]":
				r.Link = target
			case strings.HasSuffix(label, " comments]"):
				r.Comments = target
			case strings.HasPrefix(target, "http://www.reddit.com/user/"):
				r.User = strings.TrimSpace(label)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			extract(c)
		}
	}
	extract(doc)
	return
}
// TestFind runs Find against a small document: an id selector that matches
// nothing, a tag selector, a descendant selector, and a descendant class
// selector.
func TestFind(t *testing.T) {
	const markup = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a class="goo" href="/bar/baz">BarBaz</a></ul>`
	root, _ := html.Parse(strings.NewReader(markup))

	if _, ok := Find(root, "#foo"); ok {
		t.Errorf("There is no node with id 'foo'")
	}

	if para, ok := Find(root, "p"); !ok || para.Data != "p" {
		t.Errorf("Couldn't find p")
	}

	if link, ok := Find(root, "ul a"); !ok || link.Data != "a" || Flatten(link) != "Foo" {
		t.Errorf("Couldn't find a")
	}

	if link, ok := Find(root, "ul .goo"); !ok || link.Data != "a" || Flatten(link) != "BarBaz" {
		t.Errorf("Couldn't find a with class goo")
	}
}
// TestFlatten checks that Flatten on the whole document yields the
// concatenated text "Links:FooBarBaz".
func TestFlatten(t *testing.T) {
	const markup = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	root, _ := html.Parse(strings.NewReader(markup))
	if got := Flatten(root); got != "Links:FooBarBaz" {
		t.Fatalf("%s was wrong", got)
	}
}
Exemple #10
0
// GenerateDocument passes rawData through toUtf8, parses the result as HTML,
// and wraps the root node in a goquery document. The parse error, nil or not,
// is handed to helper.HandleFatalError before the document is built.
func GenerateDocument(rawData []byte) *goquery.Document {
	reader := bytes.NewReader([]byte(toUtf8(rawData)))
	root, parseErr := html.Parse(reader)
	helper.HandleFatalError("document generation failed:", parseErr)
	return goquery.NewDocumentFromNode(root)
}
Exemple #11
0
// ExtractData makes a GET request to url, parses the response body as HTML,
// and starts ParseHTML on the tree in its own goroutine. It then blocks until
// a value arrives on the done channel handed to ParseHTML (presumably sent
// when parsing completes — confirm in ParseHTML) and calls finalizeEntity
// with the entity, the tree, and EntityDir.
//
// A failed request panics through log.Panicln. A failed parse is logged and
// the function returns without touching entity.
func ExtractData(entity *Entity, url string) {
	res, err := http.Get(url)
	if err != nil {
		log.Panicln("Error requesting URL data: ", err)
	}

	defer res.Body.Close()

	doc, err := html.Parse(res.Body)
	if err != nil {
		log.Println("Error parsing URL body: ", err)
		// There is no tree to work on; continuing would hand nil to ParseHTML.
		return
	}

	// Parsing completion channel.
	done := make(chan bool, 1)
	go ParseHTML(doc, entity, done)

	// Block until completion is signalled. Polling with a select/default loop
	// would spin a CPU core and never return.
	<-done
	finalizeEntity(entity, doc, EntityDir)
}
Exemple #12
0
// lookupTitle fetches url and returns the text of the first <title> element in
// document order, as produced by extract.
//
// Placeholder strings are returned on failure: "<Couldn't connect.>" when the
// request fails, "<Failed to parse HTML.>" when the body cannot be parsed, and
// "<Untitled page.>" when the document has no <title>.
func lookupTitle(url string) (title string) {
	r, err := http.Get(url)
	if err != nil {
		return "<Couldn't connect.>"
	}
	defer r.Body.Close()

	doc, err := html.Parse(r.Body)
	if err != nil {
		return "<Failed to parse HTML.>"
	}

	title = "<Untitled page.>"

	// find reports whether a title was found at or below n, which lets the
	// walk stop at the first match so that a later <title> element in the
	// document cannot overwrite the page title.
	var find func(*html.Node) bool
	find = func(n *html.Node) bool {
		if n.Type == html.ElementNode && n.DataAtom == atom.Title {
			title = extract(n)
			return true
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if find(c) {
				return true
			}
		}
		return false
	}
	find(doc)

	return title
}
Exemple #13
0
// GetStatus fetches STATUS_URL, parses the page as HTML, locates the status
// block with FindStatusBlock, and builds a Status from the values returned by
// ExtractStatus and ExtractReason.
//
// Every failure is logged. A failed request is reported with a fixed error
// message; parse and extraction failures return the underlying error. The
// returned Status is the zero value whenever the error is non-nil.
func GetStatus() (Status, error) {
	resp, err := http.Get(STATUS_URL)
	if err != nil {
		log.Println(err)

		return Status{}, errors.New("Could not access OIT status page")
	}

	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		// Without this check the error was overwritten by the next call and a
		// nil tree was handed to FindStatusBlock.
		log.Println(err)

		return Status{}, err
	}

	statusNode, err := FindStatusBlock(doc)
	if err != nil {
		log.Println(err)

		return Status{}, err
	}

	status, err := ExtractStatus(statusNode)
	if err != nil {
		log.Println(err)

		return Status{}, err
	}

	reason, err := ExtractReason(statusNode)
	if err != nil {
		log.Println(err)

		return Status{}, err
	}

	return Status{status, reason}, nil
}
Exemple #14
0
// embedRedditSelf handles URLs containing "reddit.com/r/". Any other URL
// yields strategyWhiffError.
//
// The page is loaded with goquery. Within every ".expando .usertext-body"
// element, each <a> whose href e.embedImage accepts is removed and the parsed
// embed HTML is appended to that link's parent. Links that embedImage rejects,
// or whose embed HTML fails to parse, are left in place. rv.Html and err are
// set from the Html() call on each matching element, so the last match wins.
func (e *Embedder) embedRedditSelf(url string) (rv EmbedInfo, err error) {
	// The dot is escaped so only a literal "reddit.com/r/" matches; an
	// unescaped "." accepts any character in that position.
	matched, err := regexp.MatchString(`reddit\.com/r/`, url)
	if err != nil {
		return
	}
	if !matched {
		err = strategyWhiffError
		return
	}
	rv.URL = url
	doc, err := goquery.NewDocument(url)
	if err != nil {
		return
	}
	doc.Find(".expando .usertext-body").Each(func(i int, s *goquery.Selection) {
		s.Find("a").Each(func(i int, s *goquery.Selection) {
			if href, ok := s.Attr("href"); ok {
				embedInfo, err := e.embedImage(href)
				if err != nil {
					return
				}
				// NOTE(review): html.Parse returns a whole document node, so a
				// full document (not a fragment) is appended here, and at the
				// end of the parent rather than at the link's position —
				// confirm this is intended.
				node, err := html.Parse(strings.NewReader(embedInfo.Html))
				if err != nil {
					return
				}
				parent := s.Parent().Get(0)
				parent.RemoveChild(s.Get(0))
				parent.AppendChild(node)
			}
		})
		rv.Html, err = s.Html()
		return
	})
	return
}
Exemple #15
0
// GetParameters requests "/parameters/profile/all" on the host given by site
// using DoRequest, parses the response as HTML, and validates it with
// CheckHtml against PARAMETERS_PAGE_TITLE. When verbose is set, the root
// node's type and data are printed. The first error encountered is returned.
func GetParameters(client *http.Client, site string) error {
	target, err := url.ParseRequestURI(site)
	if err != nil {
		return err
	}
	target.Path = "/parameters/profile/all"

	body, err := DoRequest(client, target, "GET", nil, nil)
	if err != nil {
		return err
	}
	defer body.Close()

	root, err := html.Parse(body)
	if err != nil {
		return err
	}

	if verbose {
		fmt.Println("HTML doc parsed ok", "type:", root.Type, "data:", root.Data)
	}

	return CheckHtml(root, PARAMETERS_PAGE_TITLE)
}
Exemple #16
0
// getQuote fetches "http://www.bloomberg.com/quote/<symbol>", parses the page
// as HTML, lets walk populate a bloomQuote from the tree, and copies its
// fields into an fquery.Quote stamped with the current time.
func getQuote(symbol string) (*fquery.Quote, error) {
	res, err := http.Get("http://www.bloomberg.com/quote/" + symbol)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	root, err := html.Parse(res.Body)
	if err != nil {
		return nil, err
	}

	/* TODO: detect if fund or plain stock, different layouts... */
	var scraped bloomQuote
	walk(root, &scraped)

	out := new(fquery.Quote)
	out.Name = scraped.Name
	out.Symbol = symbol
	out.Updated = time.Now()
	out.Volume = scraped.Volume
	out.Open = scraped.Open
	out.PreviousClose = scraped.PrevClose
	out.DayLow = scraped.DayLow
	out.DayHigh = scraped.DayHigh
	out.YearLow = scraped.YearLow
	out.YearHigh = scraped.YearHigh
	out.LastTradePrice = scraped.LastTradePrice
	out.DividendYield = scraped.DividendYield
	out.EarningsPerShare = scraped.EarningsPerShare
	out.DividendExDate = scraped.DividendExDate
	return out, nil
}
Exemple #17
0
// ParseGoogleImageSearch handles Google image search results (currently
// unused). It extends the sample published at
// http://godoc.org/code.google.com/p/go.net/html with result extraction.
//
// r is parsed as HTML. For every <a> whose first href contains "imgurl", the
// text between the first and second "=" of the href's first "&"-separated
// segment is taken as the image URL, and a small HTML page linking to and
// showing that image is written to w. Links without such a value are skipped.
//
// A parse failure is logged and reported to the client as a 500 response.
func ParseGoogleImageSearch(w http.ResponseWriter, r io.Reader) {
	doc, err := html.Parse(r)
	if err != nil {
		// Do not take the whole process down because one page failed to parse.
		log.Println(err)
		http.Error(w, "failed to parse search results", http.StatusInternalServerError)
		return
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					str := a.Val
					if strings.Contains(str, "imgurl") {
						strs := strings.Split(str, "&")
						imageurl := strings.Split(strs[0], "=")
						// "imgurl" may appear without "=" in the first
						// segment; skip rather than index out of range.
						if len(imageurl) > 1 {
							// The value comes from a remote page: escape it
							// and quote the attributes so it cannot inject
							// markup into the response.
							img := html.EscapeString(imageurl[1])
							fmt.Fprintf(w, "<html><body><ul><li><a href=\"%s\"><img src=\"%s\"></a></li></ul></body></html>", img, img)
						}
					}
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
}
Exemple #18
0
// main takes two file arguments. The first is decoded as JSON into trans; the
// second is parsed as HTML and handed to generalNode. Afterwards trans is
// written to standard output as indented JSON followed by a newline. Any
// failure, including missing arguments, terminates the program through the
// log package.
func main() {
	// Both positional arguments are required; indexing os.Args blindly would
	// panic with an unhelpful message.
	if len(os.Args) < 3 {
		log.Fatalf("usage: %s JSON-FILE HTML-FILE", os.Args[0])
	}

	fd, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	err = json.NewDecoder(fd).Decode(&trans)
	if err != nil {
		log.Fatal(err)
	}
	fd.Close()

	fd, err = os.Open(os.Args[2])
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(fd)
	if err != nil {
		log.Fatal(err)
	}
	fd.Close()

	generalNode(doc)
	bs, err := json.MarshalIndent(trans, "", "   ")
	if err != nil {
		log.Fatal(err)
	}
	os.Stdout.Write(bs)
	os.Stdout.WriteString("\n")
}
Exemple #19
0
// ParseAndPrint downloads a fixed Craigslist apartment search page and returns
// a map from absolute listing URL to the Data of the link's first child node.
// Only <a> elements whose href starts with "/pen/apa" and that have at least
// one child are included. Request, read, and parse failures terminate the
// process through log.Fatal.
func ParseAndPrint() map[string]string {
	//TODO : Take this url as parameter
	res, err := http.Get("http://sfbay.craigslist.org/search/apa/pen?query=&zoomToPosting=&srchType=A&minAsk=&maxAsk=2500&bedrooms=2&housing_type=&nh=77&nh=79&nh=81&nh=83&nh=84&nh=87")
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		// A truncated body would otherwise be parsed as if it were complete.
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		log.Fatal(err)
	}
	returnUrl := make(map[string]string)
	var checkForListings func(*html.Node)
	checkForListings = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" && strings.HasPrefix(a.Val, "/pen/apa") {
					if n.FirstChild != nil {
						returnUrl["http://sfbay.craigslist.org"+a.Val] = n.FirstChild.Data
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			checkForListings(c)
		}
	}
	checkForListings(doc)
	return returnUrl
}
Exemple #20
0
// crawl consumes seeds from urlQue until the channel is closed. Each seed is
// fetched; a 200 response is parsed as HTML and sent on result as a
// CrawledResult, unless result already holds maxResult-2 or more items, in
// which case the page is dropped with a log message.
//
// Each seed is processed in its own function call so that the response body
// is closed and panics are recovered per seed; a failure on one seed is logged
// and the loop moves on to the next.
func crawl() {
	for seed := range urlQue {
		func() {
			defer func() {
				if r := recover(); r != nil {
					log.Println("Recovered in crawl", r, len(urlQue), len(result), seed)
				}
			}()
			resp, err := http.Get(seed.String())
			if err != nil {
				// resp is nil on error, so it must not be touched.
				log.Printf("some error occured  %s\n", err)
				return
			}
			defer resp.Body.Close()

			if resp.StatusCode != 200 {
				log.Printf("Response %s\n", resp.Status)
				return
			}

			z, err := html.Parse(resp.Body)
			if err != nil {
				// One unreadable body should not stop the whole crawler.
				log.Printf("some error occured  %s\n", err)
				return
			}
			if len(result) < maxResult-2 {
				result <- &CrawledResult{seed, z}
			} else {
				log.Println("result queue almost at max")
			}
		}()
	}
}
Exemple #21
0
// favicon_try_from_url fetches uri through curl and decides by the major MIME
// type of the cached result: an "image" yields uri itself; a "text" document
// is opened from the local cache, parsed as HTML, and passed to
// icon_from_link_rel, whose value is returned when it reports success. Every
// other outcome, including any error, yields the empty string. The cache value
// is printed to standard output before the fetch error is checked.
func favicon_try_from_url(uri string) string {
	client := curl.NewCurl("")

	cache, err := client.Get(uri)
	fmt.Println(cache)
	if err != nil {
		return ""
	}

	// text/html, text/xml, image
	major := strings.Split(cache.Mime, "/")[0]
	if major == "image" {
		return uri
	}
	if major != "text" {
		return ""
	}

	file, err := os.Open(cache.Local)
	if err != nil {
		return ""
	}
	defer file.Close()

	root, err := html.Parse(file)
	if err != nil {
		return ""
	}
	if icon, ok := icon_from_link_rel(root); ok {
		return icon
	}
	return ""
}
Exemple #22
0
// main reads an HTML document named by the first positional argument — a URL
// when the -u flag (the url variable) is set, a file path otherwise — parses
// it, and prints the string value of the htmlwalk result followed by the
// result of extractPromoted. Usage is printed when NAME is missing; every
// other failure panics.
func main() {
	flag.Parse()

	// Use the arguments left after flag parsing. Indexing os.Args directly
	// made "prog -u" with no NAME pass the length check and then panic.
	if flag.NArg() < 1 {
		fmt.Printf("Usage %s [-u] NAME\n", os.Args[0])
		os.Exit(0)
	}
	name := flag.Arg(0)

	var in io.Reader
	if *url {
		resp, err := http.Get(name)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()
		in = resp.Body
	} else {
		fi, err := os.Open(name)
		if err != nil {
			panic(err)
		}
		defer fi.Close()
		in = bufio.NewReader(fi)
	}

	n, err := html.Parse(in)
	if err != nil {
		panic(err)
	}

	fmt.Println(stringValue(htmlwalk(n, untagText, untagElement)))
	fmt.Println(extractPromoted(n))
}
Exemple #23
0
// getImageUrls fetches url, parses it as HTML, and extracts image URLs from
// the first <script> that getNodeByTag finds under <body>. The script text is
// expected to contain a list of the form Array('http…jpg'); the text from
// just after "Array('" up to and including the "jpg" of the first "jpg');"
// is cut out with Substr and split on "','".
//
// nil is returned when the request or parse fails or when any expected piece
// (html element, body, script, script text, list markers) is missing.
func getImageUrls(url string) []string {
	resp, err := http.Get(url)
	if err != nil {
		return nil
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil
	}

	// Locate the <html> element by name. Taking "the second child of the
	// document" only works when a doctype precedes it and is nil otherwise.
	var rootNode *html.Node
	for c := doc.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode && strings.ToLower(c.Data) == "html" {
			rootNode = c
			break
		}
	}
	if rootNode == nil {
		return nil
	}

	for childNode := rootNode.FirstChild; childNode != nil; childNode = childNode.NextSibling {
		if childNode.Type != html.ElementNode || strings.ToLower(childNode.Data) != "body" {
			continue
		}
		scriptNode := getNodeByTag(childNode, "script")
		if scriptNode == nil || scriptNode.FirstChild == nil {
			return nil
		}
		scriptSrc := scriptNode.FirstChild.Data

		startIndex := strings.Index(scriptSrc, "Array('http")
		endIndex := strings.Index(scriptSrc, "jpg');")
		// Check for "not found" before the offsets are adjusted; otherwise -1
		// silently turns into a bogus positive index.
		if startIndex < 0 || endIndex < 0 {
			return nil
		}
		startIndex += len("Array('")
		endIndex += len("jpg")
		if endIndex <= startIndex {
			return nil
		}

		urlsString := Substr(scriptSrc, startIndex, endIndex-startIndex)
		return strings.Split(urlsString, "','")
	}
	return nil
}
Exemple #24
0
// parse_links parses s as HTML and returns the first href attribute of every
// <a> element, in document order, together with the document title. The title
// is the Data of the first child of the last <title> element that has a child;
// it is empty when there is none. A parse failure terminates the process
// through log.Fatal.
func parse_links(s string) ([]string, string) {
	var links []string
	var title string

	doc, err := html.Parse(strings.NewReader(s))
	if err != nil {
		log.Fatal(err)
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		// An empty <title></title> has no child node; skip it instead of
		// dereferencing nil.
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = n.FirstChild.Data
		}
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					links = append(links, a.Val)
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return links, title
}
Exemple #25
0
// ParseEntry parses r as HTML and builds an AmebloEntry from it. Content is
// the rendering of all nodes matching ".articleText"; Title is the text of the
// first <title> element up to the first "|".
//
// It returns (nil, nil) when the document has no ".articleText" node or no
// <title> element, and a non-nil error when the HTML cannot be parsed or a
// selector cannot be built.
func ParseEntry(r io.Reader) (*AmebloEntry, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	// Selector errors are surfaced rather than discarded, so a failed
	// selector is never used.
	articleSel, err := selector.Selector(".articleText")
	if err != nil {
		return nil, err
	}
	nodes := articleSel.Find(root)
	if len(nodes) == 0 {
		return nil, nil
	}
	content := h5.RenderNodesToString(nodes)

	titleSel, err := selector.Selector("title")
	if err != nil {
		return nil, err
	}
	nodes = titleSel.Find(root)
	if len(nodes) == 0 {
		return nil, nil
	}
	title := extractText(nodes[0].FirstChild)

	entry := &AmebloEntry{
		Title:   strings.Split(title, "|")[0],
		Content: content,
	}
	return entry, nil
}
Exemple #26
0
// NewDocumentFromReader returns a Document built from the HTML read from r.
// The second return value is non-nil when the data cannot be parsed as HTML.
// The reader is never closed by this call, even if it is also an io.Closer;
// closing it, when required, is the caller's responsibility.
func NewDocumentFromReader(r io.Reader) (d *Document, e error) {
	var root *html.Node
	if root, e = html.Parse(r); e != nil {
		return nil, e
	}
	d = newDocument(root, nil)
	return d, nil
}
Exemple #27
0
// ParseItem parses r as HTML and returns one Result, with Url set, for every
// href attribute found on an <a> element, in document order. A parse failure
// is printed to standard output and an empty, non-nil slice is returned.
func ParseItem(r io.Reader) []Result {
	results := []Result{}
	doc, err := html.Parse(r)
	if err != nil {
		fmt.Println(err)
		// No tree was produced; walking it would dereference nil.
		return results
	}

	var result Result
	var f func(*html.Node)
	f = func(n *html.Node) {
		// n.Type gives the kind of node; ElementNode is an HTML tag node.
		// n.Data holds the node's value; here we look for <a> tags.
		if n.Type == html.ElementNode && n.Data == "a" {
			// n.Attr lists the element's attributes.
			for _, a := range n.Attr {
				if a.Key == "href" {
					result.Url = a.Val
					results = append(results, result)
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return results
}
Exemple #28
0
// icon_from_link_rel opens the HTML file at local, looks under html > head for
// the first <link> whose rel attribute is exactly "icon", "shortcut icon", or
// "apple-touch-icon", and fetches that link's href with a curl client
// configured from the backend settings. A backend error is returned when no
// such link has a non-empty href.
func icon_from_link_rel(local string) (curl.Cache, error) {
	file, err := os.Open(local)
	if err != nil {
		return curl.Cache{}, err
	}
	defer file.Close()

	root, err := html.Parse(file)
	if err != nil {
		return curl.Cache{}, err
	}

	htmlNode := node_query_select(root, "html")
	headNode := node_query_select(htmlNode, "head")

	href := ""
	for _, candidate := range node_query_selects(headNode, "link") {
		rel := node_get_attribute(candidate, "rel")
		isIcon := rel == "icon" || rel == "shortcut icon" || rel == "apple-touch-icon"
		if !isIcon {
			continue
		}
		// Only the first icon link is considered, even if its href is empty.
		href = node_get_attribute(candidate, "href")
		break
	}

	if href == "" {
		return curl.Cache{}, new_backenderror(-1, "icon cannot resolved in html")
	}
	fetcher := curl.NewCurlerDetail(backend_config().ImageFolder, 0, 0, nil, backend_context.ruler)
	return fetcher.Get(href)
}
Exemple #29
0
// Redact parses r as HTML and returns the document re-rendered with:
//
//   - every <style>, <script>, <head>, and <meta> element removed;
//   - every comment removed;
//   - every "src" attribute on <img> renamed to "data-redacted-src".
//
// An error is returned when the input cannot be parsed or the result cannot be
// rendered; in the latter case the partially rendered output is returned with
// it.
func Redact(r io.Reader) (string, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return "", err
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		var next *html.Node
		for c := n.FirstChild; c != nil; c = next {
			// RemoveChild clears the removed node's sibling links, so remember
			// the successor first. Returning after a removal, as was done
			// before, left every later sibling unredacted.
			next = c.NextSibling
			switch c.Type {
			case html.CommentNode:
				n.RemoveChild(c)
				continue
			case html.ElementNode:
				switch c.Data {
				case "style", "script", "head", "meta":
					n.RemoveChild(c)
					continue
				case "img":
					for i := range c.Attr {
						if c.Attr[i].Key == "src" {
							c.Attr[i].Key = "data-redacted-src"
						}
					}
				}
			}
			f(c)
		}
	}
	f(doc)
	var buf bytes.Buffer
	err = html.Render(&buf, doc)
	return buf.String(), err
}
Exemple #30
0
// main downloads the page named by the uri flag through curl's UTF-8 cache,
// parses the cached file as HTML, runs the cleaner extractor's
// MakeHtmlReadable on it, and prints the resulting article with
// print_html_doc. With an empty uri the flag defaults are printed instead.
// Every error panics.
func main() {
	flag.Parse()
	if *uri == "" {
		flag.PrintDefaults()
		return
	}

	// must panics with err when it is non-nil.
	must := func(err error) {
		if err != nil {
			panic(err)
		}
	}

	cache, err := curl.NewCurl("e:/").GetUtf8(*uri)
	must(err)

	file, err := os.Open(cache.LocalUtf8)
	must(err)
	defer file.Close()

	root, err := html.Parse(file)
	must(err)

	article, _, err := cleaner.NewExtractor("e:/").MakeHtmlReadable(root, *uri)
	must(err)

	print_html_doc(article)
}