func NewFromReader(rdr io.Reader) (*Transformer, error) {
	tree, err := h5.New(rdr)
	if err != nil {
		return nil, err
	}
	return New(tree), nil
}
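// A minimal usage sketch for NewFromReader; the exampleNewFromReader name and
// the in-memory page are placeholders for illustration, not part of the
// original package.
func exampleNewFromReader() (*Transformer, error) {
	page := `<html><body><div id="content">hello</div></body></html>`
	return NewFromReader(strings.NewReader(page))
}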
func discover(identifier string) (io.ReadCloser, error) {
	req, err := http.NewRequest("GET", identifier, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("Accept", xrds_mime)
	resp, err := new(http.Client).Do(req)
	if err != nil {
		return nil, err
	}
	// If we've got an XRDS document, we're okay, good.
	if contentType := resp.Header.Get("Content-Type"); strings.HasPrefix(contentType, "application/xrds+xml") {
		return resp.Body, nil
	} else if h := resp.Header.Get("X-Xrds-Location"); h != "" {
		// Well, it might be in the header; close this body before recursing.
		resp.Body.Close()
		return discover(h)
	} else if strings.HasPrefix(contentType, "text/html") {
		// If it's HTML we need to search the meta tags.
		p, e := h5.New(resp.Body)
		if e != nil {
			resp.Body.Close()
			return nil, e
		}
		if str, ok := discoverFromHTMLNode(p); ok {
			resp.Body.Close()
			return discover(str)
		}
	}
	return resp.Body, errors.New("could not locate Yadis document")
}
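// discoverFromHTMLNode is referenced above but not shown. A plausible sketch,
// assuming the Yadis convention of a
// <meta http-equiv="X-XRDS-Location" content="..."> tag; the real helper may
// locate the document differently.
func discoverFromHTMLNode(tree *h5.Tree) (string, bool) {
	var loc string
	tree.Walk(func(n *html.Node) {
		if n.Data != "meta" {
			return
		}
		var httpEquiv, content string
		for _, a := range n.Attr {
			switch strings.ToLower(a.Key) {
			case "http-equiv":
				httpEquiv = a.Val
			case "content":
				content = a.Val
			}
		}
		if strings.EqualFold(httpEquiv, "X-XRDS-Location") && content != "" {
			loc = content
		}
	})
	return loc, loc != ""
}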
func fetchCategory(url string) Category {
	var output = Category{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)
	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()
	if resp.StatusCode == 200 {
		tree, err := h5.New(resp.Body)
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}
		pathFragments := strings.Split(url, "/")
		output.Name = pathFragments[len(pathFragments)-1]
		log.Println("Processing", output.Name)
		if !categorySet[output.Name] { // prevent cycles; this is wonky, but will do for now
			t := transform.New(tree)
			getUrls := func(n *html.Node) {
				urls := strings.Split(n.FirstChild.Data, "\n")
				for _, item := range urls {
					item = strings.TrimSpace(item)
					// If we encounter a subcategory, recurse.
					if blekkoSubCat.MatchString(item) {
						subCatUrl := fmt.Sprintf("https://blekko.com/ws/+/view+%s", item)
						subCat := fetchCategory(subCatUrl)
						output.Urls = append(output.Urls, subCat.Urls...)
					} else if item != "" {
						output.Urls = append(output.Urls, item)
					}
				}
			}
			t.Apply(getUrls, "#urls-text")
			categorySet[output.Name] = true
		}
	}
	return output
}
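// timeoutDialler is referenced above but not shown. A minimal sketch, assuming
// it only bounds the connect time; the name and shape are inferred from the
// http.Transport.Dial call site.
func timeoutDialler(timeout time.Duration) func(network, addr string) (net.Conn, error) {
	return func(network, addr string) (net.Conn, error) {
		return net.DialTimeout(network, addr, timeout)
	}
}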
func extractMatchingHtmlNodes(response *http.Response, cssSelector string) []*html.Node {
	tree, err := h5.New(response.Body)
	if err != nil {
		log.Fatalf("Error parsing body into tree: %v\n", err)
	}
	selectorChain, err := selector.Selector(cssSelector)
	if err != nil {
		log.Fatalf("Error parsing cssSelector %v: %v\n", cssSelector, err)
	}
	return selectorChain.Find(tree.Top())
}
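// A minimal usage sketch for extractMatchingHtmlNodes; the URL and selector
// are placeholders for illustration.
func exampleExtractNodes() []*html.Node {
	resp, err := http.Get("https://example.com/")
	if err != nil {
		log.Fatalf("Error fetching page: %v\n", err)
	}
	defer resp.Body.Close()
	return extractMatchingHtmlNodes(resp, "div.content a")
}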
// ExtractReader acts like Extract but first parses the HTML body from reader.
func (m SelectorsMap) ExtractReader(buffer io.Reader) (res interface{}, err error) {
	// If there is no buffer, there is nothing to parse.
	if buffer == nil {
		err = ErrNoBody
		return
	}
	// Parse the body into a tree.
	var node *h5.Tree
	node, err = h5.New(buffer)
	if err != nil {
		return
	}
	// Extract from the top node.
	res = m.Extract(node.Top())
	return
}
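// A minimal usage sketch, assuming m is an already-populated SelectorsMap;
// its construction is not shown in this snippet.
func exampleExtractReader(m SelectorsMap, page string) (interface{}, error) {
	return m.ExtractReader(strings.NewReader(page))
}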
func fetchTagUrls(url string) []string {
	var output = []string{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)
	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()
	if resp.StatusCode == 200 {
		tree, err := h5.New(resp.Body)
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}
		// Collect the href of every anchor in the tags directory.
		getUrls := func(n *html.Node) {
			for _, a := range n.Attr {
				if a.Key == "href" {
					output = append(output, a.Val)
					break
				}
			}
		}
		t := transform.New(tree)
		t.Apply(getUrls, "#tags-directory ul li a")
	}
	return output
}
// Build will construct a Java Docset for Dash, using the Javadoc contained in
// the javadocPath provided.
func Build(javadocPath string, docsetRoot, docsetName string) error {
	// Check the error before the existence flag: on error the flag is meaningless.
	if exists, err := pathExists(javadocPath); err != nil {
		return err
	} else if !exists {
		return errors.New("javadoc path does not exist")
	}
	if exists, err := pathExists(docsetRoot); err != nil {
		return err
	} else if !exists {
		return errors.New("docset root path does not exist")
	}
	docsetPath := filepath.Join(docsetRoot, docsetName+".docset")
	contentsPath := filepath.Join(docsetPath, "Contents")
	resourcesDir := filepath.Join(contentsPath, "Resources")
	documentsDir := filepath.Join(resourcesDir, "Documents")
	if err := os.MkdirAll(documentsDir, 0755); err != nil {
		return err
	}
	if err := copyPath(javadocPath, documentsDir); err != nil {
		return err
	}
	plistPath := filepath.Join(contentsPath, "Info.plist")
	if err := writePlist(plistPath, docsetName); err != nil {
		return err
	}
	indexFile, err := os.Open(filepath.Join(javadocPath, "index-all.html"))
	if err != nil {
		return err
	}
	defer indexFile.Close()
	db, err := initDb(filepath.Join(resourcesDir, "docSet.dsidx"))
	if err != nil {
		return err
	}
	defer db.Close()
	tree, err := h5.New(indexFile)
	if err != nil {
		return err
	}
	itemSelector, err := selector.Selector("dl dt")
	if err != nil {
		return err
	}
	anchorSelector, err := selector.Selector("a")
	if err != nil {
		return err
	}
	for _, node := range itemSelector.Find(tree.Top()) {
		text := nodeText(node, false)
		anchor := anchorSelector.Find(node)[0]
		// Map the index entry's description to a Dash entry type.
		itemType := ""
		switch {
		case strings.Contains(text, "Class in"):
			itemType = "Class"
		case strings.Contains(text, "Static method in"):
			itemType = "Method"
		case strings.Contains(text, "Static variable in"):
			itemType = "Field"
		case strings.Contains(text, "Constructor"):
			itemType = "Constructor"
		case strings.Contains(text, "Method in"):
			itemType = "Method"
		case strings.Contains(text, "Variable in"):
			itemType = "Field"
		case strings.Contains(text, "Interface in"):
			itemType = "Interface"
		case strings.Contains(text, "Exception in"):
			itemType = "Exception"
		case strings.Contains(text, "Error in"):
			itemType = "Error"
		case strings.Contains(text, "Enum in"):
			itemType = "Enum"
		case strings.Contains(text, "package"):
			itemType = "Package"
		case strings.Contains(text, "Annotation Type"):
			itemType = "Annotation"
		}
		if itemType == "" {
			continue
		}
		tx, err := db.Begin()
		if err != nil {
			return err
		}
		statement, err := tx.Prepare("insert into searchIndex(name, type, path) VALUES(?, ?, ?)")
		if err != nil {
			return err
		}
		itemName := nodeText(anchor, true)
		if _, err := statement.Exec(itemName, itemType, nodeAttr(anchor, "href")); err != nil {
			statement.Close()
			tx.Rollback()
			return err
		}
		// Close the statement here rather than deferring: a defer inside the
		// loop would hold every statement open until Build returns.
		statement.Close()
		if err := tx.Commit(); err != nil {
			return err
		}
	}
	return nil
}
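// nodeText and nodeAttr are referenced above but not shown. Plausible
// sketches, inferred from the call sites: nodeText gathers text content
// (recursing into child elements when deep is true) and nodeAttr returns a
// named attribute's value. The real helpers may differ.
func nodeText(n *html.Node, deep bool) string {
	var buf bytes.Buffer
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		switch c.Type {
		case html.TextNode:
			buf.WriteString(c.Data)
		case html.ElementNode:
			if deep {
				buf.WriteString(nodeText(c, deep))
			}
		}
	}
	return buf.String()
}

func nodeAttr(n *html.Node, key string) string {
	for _, a := range n.Attr {
		if a.Key == key {
			return a.Val
		}
	}
	return ""
}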
// newScraper parses the given url and scrapes the site, returning
// a scraper object with all the site info and meta tags.
func newScraper(u *url.URL, timeout int) (*scraper, error) {
	var title string
	var language string
	var author string
	var description string
	var generator string
	var feed string
	charset := "utf-8"
	links := make([]string, 0)
	images := make([]string, 0)
	keywords := make([]string, 0)
	compatibility := make(map[string]string)
	scrpr := func(n *html.Node) {
		switch n.Data {
		case "html":
			language = findAttribute(n, "lang")
		case "title":
			// Guard against an empty <title> element, whose node has no child.
			if n.FirstChild != nil {
				title = n.FirstChild.Data
			}
		case "a":
			links = addElement(links, u, n, "href")
		case "img":
			images = addElement(images, u, n, "src")
		case "link":
			typ := findAttribute(n, "type")
			switch typ {
			case "application/rss+xml":
				feed = findAttribute(n, "href")
			}
		case "meta":
			name := findAttribute(n, "name")
			switch name {
			case "author":
				author = findAttribute(n, "content")
			case "keywords":
				keywords = strings.Split(findAttribute(n, "content"), ", ")
			case "description":
				description = findAttribute(n, "content")
			case "generator":
				generator = findAttribute(n, "content")
			}
			httpEquiv := findAttribute(n, "http-equiv")
			switch httpEquiv {
			case "Content-Type":
				charset = findCharset(findAttribute(n, "content"))
			case "X-UA-Compatible":
				compatibility = mapifyStr(findAttribute(n, "content"))
			}
		}
	}
	cl := http.Client{
		Transport: &http.Transport{
			Dial: timeoutDialer(timeout),
		},
	}
	resp, err := cl.Get(u.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	tree, err := h5.New(resp.Body)
	if err != nil {
		return nil, err
	}
	tree.Walk(scrpr)
	return &scraper{title, language, author, description, generator, feed,
		charset, links, images, keywords, compatibility}, nil
}
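// A minimal usage sketch for newScraper; the URL and timeout are placeholders
// for illustration.
func exampleScrape() (*scraper, error) {
	u, err := url.Parse("https://example.com/")
	if err != nil {
		return nil, err
	}
	return newScraper(u, 10)
}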
func parseExecutors(rdr io.Reader) ([]Build, error) {
	tree, err := h5.New(rdr)
	if err != nil {
		return nil, err
	}
	body, err := parseGetChild(tree.Top(), atom.Body, 1)
	if err != nil {
		return nil, err
	}
	table, err := parseGetChild(body, atom.Table, 1)
	if err != nil {
		return nil, err
	}
	tbody, err := parseGetChild(table, atom.Tbody, 2)
	if err != nil {
		return nil, err
	}
	tr := tbody.FirstChild
	var builds []Build
	// Guard against an empty tbody, which would leave tr nil.
	for tr != nil {
		th, err := parseGetChild(tr, atom.Th, 1)
		if err == nil {
			nameLink, err := parseGetChild(th, atom.A, 1)
			if err != nil {
				//fmt.Println("link not found in " + h5.NewTree(th).String())
				if tr == tbody.LastChild {
					break
				}
				tr = tr.NextSibling
				continue
			}
			if tr.NextSibling == nil {
				// Header row with no following row: record the name alone.
				builds = append(builds, Build{nameLink.FirstChild.Data, ""})
				break
			}
			tr = tr.NextSibling
			if _, err = parseGetChild(tr, atom.Th, 1); err == nil {
				// The next row is another header row, so there is no data row.
				builds = append(builds, Build{nameLink.FirstChild.Data, ""})
				continue
			}
			if tr.FirstChild == nil || tr.FirstChild.NextSibling == nil {
				return nil, errors.New("build without div")
			}
			buildTd := tr.FirstChild.NextSibling
			if buildTd.DataAtom != atom.Td {
				return nil, errors.New("expected td but got " + h5.NewTree(buildTd).String())
			}
			buildDiv, err := parseGetChild(buildTd, atom.Div, 1)
			if err != nil {
				// Empty data row.
				builds = append(builds, Build{nameLink.FirstChild.Data, ""})
			} else {
				build, err := parseGetChild(buildDiv, atom.A, 1)
				if err != nil {
					return nil, err
				}
				builds = append(builds, Build{nameLink.FirstChild.Data, build.FirstChild.Data})
			}
		}
		if tr == tbody.LastChild {
			break
		}
		tr = tr.NextSibling
	}
	return builds, nil
}
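// parseGetChild is referenced throughout but not shown. A plausible sketch of
// its contract, inferred from the call sites: return the nth direct child of
// n whose DataAtom matches a, or an error if there is no such child. The real
// helper may differ.
func parseGetChild(n *html.Node, a atom.Atom, nth int) (*html.Node, error) {
	count := 0
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.DataAtom == a {
			count++
			if count == nth {
				return c, nil
			}
		}
	}
	return nil, fmt.Errorf("child %v #%d not found", a, nth)
}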