func runDatCase(c []byte) int {
	var counter int
	defer func() {
		if e := recover(); e != nil {
			fmt.Println("ERROR while running test case:", e)
			counter++
		}
	}()
	parts := bytes.Split(c, []byte("#"))
	if len(parts) != 4 {
		// A malformed case cannot be parsed meaningfully; count it and bail out.
		counter++
		if *verbose {
			fmt.Printf("Malformed test case: %d, %q\n", len(parts), string(c))
		}
		return counter
	}
	fmt.Println("Running test case:", string(c))
	testData := make(map[string]string)
	for _, p := range parts[1:] {
		t := bytes.Split(p, []byte("\n"))
		testData[string(t[0])] = string(t[1])
	}
	p := h5.NewParserFromString(string(testData["data"]))
	err := p.Parse()
	if err != nil {
		fmt.Println("Test case:", string(c))
		fmt.Println("ERROR parsing:", err)
		counter++
	} else if *verbose {
		fmt.Println("SUCCESS!!!")
	}
	return counter
}
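// A minimal sketch of the harness runDatCase assumes: a package-level
// -verbose flag and a driver that splits an html5lib-style .dat file into
// individual cases. The flag name, file layout, and the blank-line case
// delimiter are assumptions, not taken from the original source.
var verbose = flag.Bool("verbose", false, "print per-case progress")

func runDatFile(filename string) (failures int) {
	data, err := ioutil.ReadFile(filename)
	if err != nil {
		log.Fatal(err)
	}
	// Assume cases are separated by blank lines; each case then carries
	// "#data", "#errors", and "#document" sections, giving the four parts
	// runDatCase expects after splitting on "#".
	for _, c := range bytes.Split(data, []byte("\n\n")) {
		failures += runDatCase(c)
	}
	return failures
}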
func eztvUrl(id string) (string, error) {
	id = cleanID(id)
	res, err := http.Get("http://eztv.it/showlist/")
	if err != nil {
		return id, err
	}
	defer res.Body.Close()
	b, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return id, err
	}
	p := h5.NewParserFromString(strings.Replace(string(b), "Pending</b>", "Pending", -1))
	if err = p.Parse(); err != nil {
		return id, err
	}
	rows := sel(p.Tree(), "tr[name=hover]")
	for _, row := range rows {
		a := sel(sel(row, "td.forum_thread_post")[0], "a")[0]
		if cleanID(a.Children[0].Data()) == id {
			for _, attr := range a.Attr {
				if attr.Name == "href" {
					return "http://eztv.it" + attr.Value, nil
				}
			}
			return id, errors.New("URI not found")
		}
	}
	return id, errors.New("Show «" + id + "» not found")
}
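// cleanID is called above but not defined in this file. A plausible sketch,
// assuming it normalizes a show name for comparison by lowercasing and
// stripping everything except letters and digits (uses the standard strings
// and unicode packages); the real implementation may differ.
func cleanID(s string) string {
	var b []rune
	for _, r := range strings.ToLower(s) {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			b = append(b, r)
		}
	}
	return string(b)
}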
func eztvParser(uri string) (e [][]string, err error) {
	res, err := http.Get(uri)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}
	// Repair (remove) the page's first unclosed <center> tag before parsing.
	h := strings.Replace(string(body), "<center>", "", 1)
	p := h5.NewParserFromString(h)
	if err = p.Parse(); err != nil {
		return nil, err
	}
	rows := sel(
		sel(p.Tree(), "table.forum_header_noborder")[0],
		"tr.forum_header_border")
	for _, row := range rows {
		e = append(e, eztvSnipp(row))
	}
	return
}
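// sel is used by both functions above but not defined in this file. A
// minimal sketch, assuming it collects nodes matching a single simple
// selector of the form "tag", "tag.class", or "tag[attr=val]", built only
// on the h5.Node API already visible here (Walk, Data, Attr). The original
// may well wrap a real CSS selector engine instead.
func sel(n *h5.Node, selector string) (matches []*h5.Node) {
	tag, attrName, attrVal := selector, "", ""
	if i := strings.IndexAny(selector, ".["); i >= 0 {
		tag = selector[:i]
		rest := selector[i:]
		if rest[0] == '.' {
			// "tag.class" is shorthand for a class attribute match.
			attrName, attrVal = "class", rest[1:]
		} else {
			// "tag[attr=val]" names the attribute explicitly.
			kv := strings.SplitN(strings.Trim(rest, "[]"), "=", 2)
			attrName, attrVal = kv[0], kv[1]
		}
	}
	n.Walk(func(node *h5.Node) {
		if node.Data() != tag {
			return
		}
		if attrName == "" {
			matches = append(matches, node)
			return
		}
		for _, a := range node.Attr {
			if a.Name == attrName && a.Value == attrVal {
				matches = append(matches, node)
				return
			}
		}
	})
	return matches
}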
// GetHtmlNodeFromUrl loads an HTML page from url with the core http library
// and returns it in h5.Node form, which is convenient for later parsing.
// Here the whole document is one node, but any later selections will also
// be nodes.
func GetHtmlNodeFromUrl(url string, filename string) (node *h5.Node) {
	fmt.Printf("Getting data from url\n")
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("Error getting valid response from url %s: %s\n", url, err)
	}
	defer res.Body.Close()
	buffer, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Printf("Could not read from reader\n")
		return
	}
	p := h5.NewParserFromString(string(buffer))
	err = p.Parse()
	if err != nil {
		log.Fatalf("Error parsing body as html: %s", err)
	}
	node = p.Tree()
	SaveHtmlNodeToFile(buffer, filename)
	return
}
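// SaveHtmlNodeToFile is called above but not defined in this file. A
// minimal sketch, assuming it simply caches the raw response bytes on
// disk; the 0644 mode is an assumption.
func SaveHtmlNodeToFile(buffer []byte, filename string) {
	if err := ioutil.WriteFile(filename, buffer, 0644); err != nil {
		log.Printf("Could not save %s: %s\n", filename, err)
	}
}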
func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage:", os.Args[0], "filename iterations")
		os.Exit(1)
	}
	filename := os.Args[1]
	n, err := strconv.Atoi(os.Args[2])
	if err != nil {
		fmt.Println("iterations must be an integer:", os.Args[2])
		os.Exit(1)
	}
	file, err := ioutil.ReadFile(filename)
	if err != nil {
		panic(err)
	}
	html := string(file)
	start := time.Now()
	for i := 0; i < n; i++ {
		p := h5.NewParserFromString(html)
		if err := p.Parse(); err != nil {
			panic(err)
		}
		p.Tree()
	}
	end := time.Now()
	fmt.Printf("%f s\n", end.Sub(start).Seconds())
}
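// Example invocation of the benchmark above; the source file name, page,
// and iteration count are illustrative only:
//
//	go run parse_bench.go page.html 100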
/*
 * utility functions
 */
func parseString(s string) *h5.Node {
	p := h5.NewParserFromString(s)
	err := p.Parse()
	if err != nil {
		log.Fatal(err.Error())
	}
	return p.Tree()
}
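// A small usage sketch for parseString; the markup and the node count it
// prints are illustrative, and only API already used elsewhere in this
// file (h5.Node.Walk) is assumed.
func exampleParse() {
	tree := parseString("<html><body><p>hello</p></body></html>")
	n := 0
	tree.Walk(func(*h5.Node) { n++ })
	fmt.Println("node count:", n)
}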
// GetHtmlNodeFromFile reads an HTML file from disk and parses it into an
// h5.Node tree, returning nil if the file cannot be read or parsed.
func GetHtmlNodeFromFile(filename string) (node *h5.Node) {
	content, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil
	}
	p := h5.NewParserFromString(string(content))
	err = p.Parse()
	if err != nil {
		log.Printf("Error: %s\n", err)
		return nil
	}
	node = p.Tree()
	return
}
func GetURLToLatestBukkit(channel string) string {
	// The Bukkit DL API is lacking, so we have to HTML-scrape.
	var resp *http.Response
	var err error
	if channel == BETA_BUILD {
		resp, err = http.Get(BETA_ENDPOINT)
	} else if channel == RECOMMENDED_BUILD {
		resp, err = http.Get(RECOMMENDED_ENDPOINT)
	}
	// Check the error (and an unrecognized channel) before touching resp.
	if servr.LogIfFatal(err) || resp == nil {
		Puts("I failed to install the server. :(")
		return "FAIL"
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if servr.LogIfFatal(err) {
		Puts("I failed to install the server. :(")
		return "FAIL"
	}
	doc := h5.NewParserFromString(string(body))
	err = doc.Parse()
	if servr.LogIfFatal(err) {
		Puts("I failed to install the server. :(")
		return "FAIL"
	}
	tree := doc.Tree()
	currentArtifactId := 1
	skipBuild := true
	builds := make(map[int]string)
	buildInfo := make(map[int]string)
	tree.Walk(func(node *h5.Node) {
		for _, attr := range node.Attr {
			attrValueChars := strings.Split(attr.Value, "")
			if len(attrValueChars) <= 27 {
				continue
			}
			// TODO: replace the following with an actual regexp.
			if attrValueChars[26] != "/" || attrValueChars[1] != "d" {
				continue
			}
			builds[currentArtifactId] = "http://dl.bukkit.org" + attr.Value
			if skipBuild {
				// The first matching link is a header row; skip it
				// (its slot is overwritten by the next match).
				skipBuild = false
				continue
			}
			version := func() string {
				for _, a := range node.Attr {
					if a.Name == "title" {
						return a.Value
					}
				}
				return "UNKNOWN"
			}
			buildInfo[currentArtifactId] = version()
			currentArtifactId++
		}
	})
	Puts("Here are all of the different bukkit server releases for the " + channel + " channel.")
	orderedBuilds := []string{}
	for id := 1; id <= len(builds); id++ {
		if _, ok := builds[id]; !ok {
			continue
		}
		if info, ok := buildInfo[id]; ok {
			orderedBuilds = append(orderedBuilds, " ["+strconv.Itoa(id)+"]: "+info)
		}
	}
	for _, b := range orderedBuilds {
		Puts(b)
	}
	maxOptions := len(orderedBuilds)
	var buildOption string
	fmt.Printf("\nWhat build would you like to install? [1-%d] ", maxOptions)
	fmt.Scan(&buildOption)
	buildOptionInt, err := strconv.Atoi(buildOption)
	if err != nil || buildOptionInt < 1 || buildOptionInt > maxOptions {
		Puts("Invalid option, please try again.")
		return GetURLToLatestBukkit(channel)
	}
	Puts("Selected " + buildInfo[buildOptionInt] + ".")
	return builds[buildOptionInt]
}
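// The channel constants and endpoints referenced above are defined
// elsewhere in the package. A plausible sketch; the tag strings and the
// dl.bukkit.org listing URLs are assumptions, not taken from the source.
const (
	BETA_BUILD        = "beta"
	RECOMMENDED_BUILD = "recommended"

	BETA_ENDPOINT        = "http://dl.bukkit.org/downloads/craftbukkit/list/beta/"
	RECOMMENDED_ENDPOINT = "http://dl.bukkit.org/downloads/craftbukkit/list/rb/"
)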
func fetchCategoryGQ(url string) Category {
	var output = Category{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)
	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(time.Duration(10 * time.Second)),
			DisableKeepAlives: true,
		},
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()
	if resp.StatusCode == 200 {
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}
		parser := h5.NewParserFromString(string(body))
		err = parser.Parse()
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}
		doc := parser.Tree()
		pathFragments := strings.Split(url, "/")
		output.Name = pathFragments[len(pathFragments)-1]
		log.Println("Processing", output.Name)
		if !categorySet[output.Name] { // prevent cycles; this is wonky, but will do for now
			t := transform.NewTransform(doc)
			getUrls := func(n *h5.Node) {
				urls := strings.Split(n.Children[0].Data(), "\n")
				for _, item := range urls {
					item = strings.TrimSpace(item)
					if blekkoSubCat.MatchString(item) {
						// Rather than recursing into subcategories, flatten
						// n-level categories to first level and fetch them
						// concurrently.
						subCatUrl := fmt.Sprintf("https://blekko.com/ws/+/view+%s", item)
						go downloadUrls(subCatUrl)
					} else if item != "" {
						output.Urls = append(output.Urls, item)
					}
				}
			}
			t.Apply(getUrls, "#urls-text")
			categorySet[output.Name] = true
		}
	}
	return output
}
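// timeoutDialler is used above but not defined in this file. A common
// implementation of this pattern, assuming it returns a Dial function
// that bounds connection setup with net.DialTimeout; the original may
// additionally set read deadlines on the returned connection.
func timeoutDialler(timeout time.Duration) func(netw, addr string) (net.Conn, error) {
	return func(netw, addr string) (net.Conn, error) {
		return net.DialTimeout(netw, addr, timeout)
	}
}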