Пример #1
0
// NewFromReader parses the HTML read from rdr and wraps the resulting
// parse tree in a Transformer. The parse error, if any, is returned as-is.
func NewFromReader(rdr io.Reader) (*Transformer, error) {
	tree, err := h5.New(rdr)
	if err != nil {
		return nil, err
	}
	return New(tree), nil
}
Пример #2
0
// discover fetches the Yadis/XRDS document for identifier. It follows an
// X-Xrds-Location response header or an HTML meta hint recursively until an
// XRDS document is found. On success the caller owns the returned
// ReadCloser and must close it.
func discover(identifier string) (io.ReadCloser, error) {
	req, err := http.NewRequest("GET", identifier, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("Accept", xrds_mime)
	resp, err := new(http.Client).Do(req)
	if err != nil {
		return nil, err
	}

	contentType := resp.Header.Get("Content-Type")

	// If we've got an XRDS document, we're okay, good.
	if strings.HasPrefix(contentType, "application/xrds+xml") {
		return resp.Body, nil
	}

	// Well, it might be in the header...
	if h := resp.Header.Get("X-Xrds-Location"); h != "" {
		// This body is never read or returned; close it so the
		// underlying connection is not leaked while we recurse.
		resp.Body.Close()
		return discover(h)
	}

	// If it's HTML we need to search the meta tags ;.;
	if strings.HasPrefix(contentType, "text/html") {
		p, e := h5.New(resp.Body)
		if e != nil {
			resp.Body.Close()
			return nil, e
		}
		if str, ok := discoverFromHTMLNode(p); ok {
			resp.Body.Close()
			return discover(str)
		}
	}

	return resp.Body, errors.New("Could not locate Yadis document!")
}
Пример #3
0
// fetchCategory downloads the category page at url and returns a Category
// holding the page name (last path segment of url) and every URL listed on
// it. Subcategory links are fetched recursively; the categorySet map marks
// visited categories to prevent cycles. Errors are logged and yield an
// empty Category rather than being returned.
func fetchCategory(url string) Category {
	var output = Category{}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// Previously unchecked: a malformed url left req nil and the
		// Header.Set below panicked.
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(time.Duration(10) * time.Second),
			DisableKeepAlives: true,
		},
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()

	if resp.StatusCode == 200 {

		tree, err := h5.New(resp.Body)
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}

		pathFragments := strings.Split(url, "/")
		output.Name = pathFragments[len(pathFragments)-1]
		log.Println("Processing", output.Name)

		if !categorySet[output.Name] {
			// prevent cycles. this is wonky, but will do for now
			t := transform.New(tree)
			var getUrls = func(n *html.Node) {
				// The selected node's text content is one URL (or
				// subcategory reference) per line.
				urls := strings.Split(n.FirstChild.Data, "\n")
				for _, item := range urls {
					item = strings.TrimSpace(item)
					// if we encounter a subcategory, recurse
					if blekkoSubCat.MatchString(item) {
						subCatUrl := fmt.Sprintf("https://blekko.com/ws/+/view+%s", item)
						subCat := fetchCategory(subCatUrl)
						output.Urls = append(output.Urls, subCat.Urls...)
					} else if item != "" {
						output.Urls = append(output.Urls, item)
					}
				}
			}
			t.Apply(getUrls, "#urls-text")

			categorySet[output.Name] = true
		}
	}
	return output
}
Пример #4
0
// extractMatchingHtmlNodes parses the response body as HTML and returns
// every node matched by cssSelector. A parse or selector error aborts the
// program via log.Fatalf.
func extractMatchingHtmlNodes(response *http.Response, cssSelector string) []*html.Node {
	parsed, parseErr := h5.New(response.Body)
	if parseErr != nil {
		log.Fatalf("Error parsing body into tree: %v\n", parseErr)
	}

	chain, selErr := selector.Selector(cssSelector)
	if selErr != nil {
		log.Fatalf("Error parsing cssSelector %v: %v\n", cssSelector, selErr)
	}

	return chain.Find(parsed.Top())
}
Пример #5
0
// ExtractReader - Acts like Extract but first parses html body from reader.
func (m SelectorsMap) ExtractReader(buffer io.Reader) (res interface{}, err error) {
	// If no buffer return error
	if buffer == nil {
		err = ErrNoBody
		return
	}

	// Parse body
	var node *h5.Tree
	node, err = h5.New(buffer)
	if err != nil {
		return
	}

	// Extract from top node
	res = m.Extract(node.Top())
	return
}
Пример #6
0
// fetchTagUrls downloads the tag directory page at url and returns the
// href attribute of every anchor matched by "#tags-directory ul li a".
// Errors are logged and yield an empty slice rather than being returned.
func fetchTagUrls(url string) []string {
	var output = []string{}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// Previously unchecked: a malformed url left req nil and the
		// Header.Set below panicked.
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			// 10*time.Second is already a time.Duration; the extra
			// conversion in the original was redundant.
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()

	if resp.StatusCode == 200 {
		tree, err := h5.New(resp.Body)
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}

		// Collect the first href attribute of each matched anchor.
		var GetUrls = func(n *html.Node) {
			for _, a := range n.Attr {
				if a.Key == "href" {
					output = append(output, a.Val)
					break
				}
			}
		}
		t := transform.New(tree)
		t.Apply(GetUrls, "#tags-directory ul li a")
	}
	return output
}
Пример #7
0
// Build will construct a Java Docset for Dash, using the Javadoc contained in
// the javadocPath provided.
func Build(javadocPath string, docsetRoot, docsetName string) error {
	if exists, err := pathExists(javadocPath); !exists {
		return errors.New("javadoc path does not exist")
	} else if err != nil {
		return err
	}

	if exists, err := pathExists(docsetRoot); !exists {
		return errors.New("docset root path does not exist")
	} else if err != nil {
		return err
	}

	docsetPath := filepath.Join(docsetRoot, docsetName+".docset")
	contentsPath := filepath.Join(docsetPath, "Contents")
	resourcesDir := filepath.Join(contentsPath, "Resources")
	documentsDir := filepath.Join(resourcesDir, "Documents")
	if err := os.MkdirAll(documentsDir, 0755); err != nil {
		return err
	}

	if err := copyPath(javadocPath, documentsDir); err != nil {
		return err
	}

	plistPath := filepath.Join(contentsPath, "Info.plist")
	if err := writePlist(plistPath, docsetName); err != nil {
		return err
	}

	indexFile, err := os.Open(filepath.Join(javadocPath, "index-all.html"))
	if err != nil {
		return err
	}

	defer indexFile.Close()

	db, err := initDb(filepath.Join(resourcesDir, "docSet.dsidx"))
	if err != nil {
		return err
	}

	defer db.Close()

	tree, err := h5.New(indexFile)
	if err != nil {
		return err
	}

	itemSelector, err := selector.Selector("dl dt")
	if err != nil {
		return err
	}

	anchorSelector, err := selector.Selector("a")
	if err != nil {
		return err
	}

	for _, node := range itemSelector.Find(tree.Top()) {
		text := nodeText(node, false)
		anchor := anchorSelector.Find(node)[0]
		itemType := ""

		switch {
		case strings.Contains(text, "Class in"):
			itemType = "Class"
		case strings.Contains(text, "Static method in"):
			itemType = "Method"
		case strings.Contains(text, "Static variable in"):
			itemType = "Field"
		case strings.Contains(text, "Constructor"):
			itemType = "Constructor"
		case strings.Contains(text, "Method in"):
			itemType = "Method"
		case strings.Contains(text, "Variable in"):
			itemType = "Field"
		case strings.Contains(text, "Interface in"):
			itemType = "Interface"
		case strings.Contains(text, "Exception in"):
			itemType = "Exception"
		case strings.Contains(text, "Error in"):
			itemType = "Error"
		case strings.Contains(text, "Enum in"):
			itemType = "Enum"
		case strings.Contains(text, "package"):
			itemType = "Package"
		case strings.Contains(text, "Annotation Type"):
			itemType = "Notation"
		}

		tx, err := db.Begin()
		if err != nil {
			return err
		}

		statement, err := tx.Prepare("insert into searchIndex(name, type, path) VALUES(?, ?, ?)")
		if err != nil {
			return err
		}
		defer statement.Close()

		if itemType != "" {
			itemName := nodeText(anchor, true)
			_, err := statement.Exec(itemName, itemType, nodeAttr(anchor, "href"))
			if err != nil {
				return err
			}
		}

		tx.Commit()
	}

	return nil
}
Пример #8
0
// newScraper parses the given url and scrapes the site, returning
// a scraper object with all the site info and meta tags
func newScraper(u *url.URL, timeout int) (*scraper, error) {
	var title string
	var language string
	var author string
	var description string
	var generator string
	var feed string
	charset := "utf-8"
	links := make([]string, 0)
	images := make([]string, 0)
	keywords := make([]string, 0)
	compatibility := make(map[string]string)

	scrpr := func(n *html.Node) {
		switch n.Data {
		case "html":
			language = findAttribute(n, "lang")
		case "title":
			title = n.FirstChild.Data
		case "a":
			links = addElement(links, u, n, "href")
		case "img":
			images = addElement(images, u, n, "src")
		case "link":
			typ := findAttribute(n, "type")
			switch typ {
			case "application/rss+xml":
				feed = findAttribute(n, "href")
			}
		case "meta":
			name := findAttribute(n, "name")
			switch name {
			case "author":
				author = findAttribute(n, "content")
			case "keywords":
				keywords = strings.Split(findAttribute(n, "content"), ", ")
			case "description":
				description = findAttribute(n, "content")
			case "generator":
				generator = findAttribute(n, "content")
			}

			httpEquiv := findAttribute(n, "http-equiv")
			switch httpEquiv {
			case "Content-Type":
				charset = findCharset(findAttribute(n, "content"))
			case "X-UA-Compatible":
				compatibility = mapifyStr(findAttribute(n, "content"))
			}
		}
	}

	cl := http.Client{
		Transport: &http.Transport{
			Dial: timeoutDialer(timeout),
		},
	}

	resp, err := cl.Get(u.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	tree, err := h5.New(resp.Body)
	if err != nil {
		return nil, err
	}
	tree.Walk(scrpr)

	return &scraper{title,
		language,
		author,
		description,
		generator,
		feed,
		charset,
		links,
		images,
		keywords,
		compatibility}, nil
}
Пример #9
0
// parseExecutors parses a Jenkins-style executors page read from rdr and
// returns one Build per executor row: the executor name (from the row's
// <th> link) and the build it is running ("" when idle).
// It walks body > table > tbody (the second tbody) and iterates sibling
// <tr> rows directly, so the traversal depends on the exact document shape.
func parseExecutors(rdr io.Reader) ([]Build, error) {
	tree, err := h5.New(rdr)
	if err != nil {
		return nil, err
	}
	// Descend to the rows: body, its first table, the table's second tbody.
	// NOTE(review): parseGetChild(parent, atom, n) presumably returns the
	// n-th child with that atom — confirm against its definition.
	body, err := parseGetChild(tree.Top(), atom.Body, 1)
	if err != nil {
		return nil, err
	}
	table, err := parseGetChild(body, atom.Table, 1)
	if err != nil {
		return nil, err
	}
	tbody, err := parseGetChild(table, atom.Tbody, 2)
	if err != nil {
		return nil, err
	}
	// Walk the rows via sibling pointers; the loop body may advance tr
	// itself when an executor's data lives in the following row.
	// NOTE(review): if tbody has no children, tr is nil here — presumably
	// parseGetChild tolerates a nil parent; verify.
	tr := tbody.FirstChild
	var builds []Build
	for {
		// A row with a <th> starts a new executor entry.
		th, err := parseGetChild(tr, atom.Th, 1)
		if err == nil {
			nameLink, err := parseGetChild(th, atom.A, 1)
			if err != nil {
				//fmt.Println("link not found in " + h5.NewTree(th).String())
				// Header row without a link: skip it.
				if tr == tbody.LastChild {
					break
				}
				tr = tr.NextSibling
				continue
			}
			if tr.NextSibling == nil {
				// Executor row is the last row: no data row follows, idle.
				builds = append(builds, Build{nameLink.FirstChild.Data, ""})
				break
			}
			// The executor's status is expected in the next row.
			tr = tr.NextSibling
			_, err = parseGetChild(tr, atom.Th, 1)
			if err == nil {
				// no data row
				builds = append(builds, Build{nameLink.FirstChild.Data, ""})
				continue
			}
			if tr.FirstChild == nil || tr.FirstChild.NextSibling == nil {
				return nil, errors.New("Build without div")
			}
			// The build description sits in the row's second cell.
			buildTd := tr.FirstChild.NextSibling
			if buildTd.DataAtom != atom.Td {
				return nil, errors.New("Expected td but got " + h5.NewTree(buildTd).String())
			}
			buildDiv, err := parseGetChild(buildTd, atom.Div, 1)
			if err != nil {
				// empty data row
				builds = append(builds, Build{nameLink.FirstChild.Data, ""})
			} else {
				// The div's link text names the running build.
				build, err := parseGetChild(buildDiv, atom.A, 1)
				if err != nil {
					return nil, err
				}
				builds = append(builds, Build{nameLink.FirstChild.Data, build.FirstChild.Data})
			}
		}
		if tr == tbody.LastChild {
			break
		}
		tr = tr.NextSibling
	}

	return builds, nil
}