Example #1
// counter is a named return so the deferred recover below can still
// count a panicking test case as a failure.
func runDatCase(c []byte) (counter int) {
	defer func() {
		if e := recover(); e != nil {
			fmt.Println("ERROR while running test case:", e)
			counter++
		}
	}()
	parts := bytes.Split(c, []byte("#"))
	if len(parts) != 4 {
		// A well-formed case has exactly four "#"-separated sections.
		counter++
		if *verbose {
			fmt.Printf("Malformed test case: %d, %q\n", len(parts), string(c))
		}
		return counter
	}
	fmt.Println("Running test case:", string(c))
	testData := make(map[string]string)
	for _, p := range parts[1:] {
		t := bytes.Split(p, []byte("\n"))
		testData[string(t[0])] = string(t[1])
	}
	p := h5.NewParserFromString(string(testData["data"]))
	err := p.Parse()
	if err != nil {
		fmt.Println("Test case:", string(c))
		fmt.Println("ERROR parsing: ", err)
		counter++
	} else {
		if *verbose {
			fmt.Println("SUCCESS!!!")
		}
	}
	return counter
}
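For context, a minimal driver for runDatCase might look like the sketch below. The runDatFile name and the blank-line case separator are assumptions; the real harness is not shown.

func runDatFile(filename string) int {
	data, err := ioutil.ReadFile(filename)
	if err != nil {
		fmt.Println("ERROR reading test file:", err)
		return 1
	}
	failures := 0
	// Assume the .dat file separates test cases with blank lines.
	for _, c := range bytes.Split(data, []byte("\n\n")) {
		failures += runDatCase(c)
	}
	return failures
}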
Example #2
func eztvUrl(id string) (string, error) {
	id = cleanID(id)
	res, err := http.Get("http://eztv.it/showlist/")
	if err != nil {
		return id, err
	}
	defer res.Body.Close()
	b, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return id, err
	}
	p := h5.NewParserFromString(strings.Replace(string(b), "Pending</b>", "Pending", -1))
	if err = p.Parse(); err != nil {
		return id, err
	}
	rows := sel(p.Tree(), "tr[name=hover]")
	for _, row := range rows {
		a := sel(sel(row, "td.forum_thread_post")[0], "a")[0]
		if cleanID(a.Children[0].Data()) == id {
			for _, attr := range a.Attr {
				if attr.Name == "href" {
					return "http://eztv.it" + attr.Value, nil
				}
			}
			return id, errors.New("URI not found")
		}
	}
	return id, errors.New("Show «" + id + "» not found")
}
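The cleanID helper is defined elsewhere. Judging by its use on both the caller's input and the scraped link text, it presumably normalizes show names for comparison; a plausible sketch:

// cleanID normalizes a show name (trim, collapse whitespace, lowercase)
// so user input and page text compare equal. Hypothetical implementation.
func cleanID(s string) string {
	return strings.ToLower(strings.Join(strings.Fields(s), " "))
}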
Example #3
func eztvParser(uri string) (e [][]string, err error) {
	res, err := http.Get(uri)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}

	// p := h5.NewParser(body)
	h := strings.Replace(string(body), "<center>", "", 1) // repair (remove) first unclosed <center> tag
	p := h5.NewParserFromString(h)

	if err = p.Parse(); err != nil {
		return nil, err
	}
	rows := sel(
		sel(p.Tree(), "table.forum_header_noborder")[0],
		"tr.forum_header_border")
	for _, row := range rows {
		e = append(e, eztvSnipp(row))
	}
	return
}
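Both EZTV examples also depend on a sel helper that is not shown. A minimal hand-rolled sketch built on h5's Walk, covering only the selector shapes used above ("tag", "tag.class", "tag[attr=val]"), might look like this; the real helper likely uses the transform package's selector support instead.

// sel returns all nodes under n matching a single-step selector.
// Simplified: class comparison is an exact match, and the attribute
// form must contain "=".
func sel(n *h5.Node, selector string) []*h5.Node {
	tag, attrName, attrVal := selector, "", ""
	if i := strings.IndexAny(selector, ".["); i >= 0 {
		tag = selector[:i]
		rest := selector[i:]
		if strings.HasPrefix(rest, ".") {
			attrName, attrVal = "class", rest[1:]
		} else { // "[name=value]"
			kv := strings.SplitN(strings.Trim(rest, "[]"), "=", 2)
			attrName, attrVal = kv[0], kv[1]
		}
	}
	var out []*h5.Node
	n.Walk(func(node *h5.Node) {
		if node.Data() != tag {
			return
		}
		if attrName == "" {
			out = append(out, node)
			return
		}
		for _, a := range node.Attr {
			if a.Name == attrName && a.Value == attrVal {
				out = append(out, node)
				return
			}
		}
	})
	return out
}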
Example #4
// GetHtmlNodeFromUrl loads an HTML page from a URL with the core http library
// and returns it as an *h5.Node, which is useful for later parsing.
// Here the whole document is a single node, but any later selections will also be nodes.
func GetHtmlNodeFromUrl(url string, filename string) (node *h5.Node) {
	fmt.Printf("Getting data from url\n")
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("Error getting valid response from url: %s\n", url)
	}
	defer res.Body.Close()

	buffer, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Printf("Could not read from reader\n")
		return
	}

	p := h5.NewParserFromString(string(buffer))

	err = p.Parse()
	if err != nil {
		log.Fatalf("Error parsing body as html: %s", err)
	}

	node = p.Tree()

	SaveHtmlNodeToFile(buffer, filename)

	return
}
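SaveHtmlNodeToFile is not shown either; since it receives the raw response bytes, a minimal sketch could simply persist them (the 0644 mode is an assumption):

func SaveHtmlNodeToFile(buffer []byte, filename string) {
	if err := ioutil.WriteFile(filename, buffer, 0644); err != nil {
		log.Printf("Could not write %s: %s\n", filename, err)
	}
}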
Example #5
func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage:", os.Args[0], "filename iterations")
		os.Exit(1)
	}

	filename := os.Args[1]
	n, err := strconv.Atoi(os.Args[2])
	if err != nil {
		panic(err)
	}

	file, err := ioutil.ReadFile(filename)
	if err != nil {
		panic(err)
	}
	html := string(file)

	start := time.Now()
	for i := 0; i < n; i++ {
		p := h5.NewParserFromString(html)
		err := p.Parse()
		if err != nil {
			panic(err)
		}
		p.Tree()
	}
	end := time.Now()

	fmt.Printf("%f s\n", end.Sub(start).Seconds())
}
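Assuming the file is saved as bench.go, a typical invocation of this micro-benchmark would be:

go run bench.go page.html 1000

which prints the total time spent parsing page.html 1000 times.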
Example #6
/*
 * utility functions
 */
func parseString(s string) *h5.Node {
	p := h5.NewParserFromString(s)

	err := p.Parse()
	if err != nil {
		log.Fatal(err.Error())
	}
	return p.Tree()
}
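A tiny usage sketch (illustrative only; printTags is a hypothetical caller):

func printTags() {
	tree := parseString("<html><body><p>hi</p></body></html>")
	// Walk visits every node in the parsed tree; Data() yields the
	// tag name for elements and the text for text nodes.
	tree.Walk(func(n *h5.Node) {
		fmt.Println(n.Data())
	})
}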
Example #7
// GetHtmlNodeFromFile reads an HTML file from disk and parses it into an *h5.Node.
func GetHtmlNodeFromFile(filename string) (node *h5.Node) {
	content, err := ioutil.ReadFile(filename)
	if err != nil {
		log.Printf("Error reading %s: %s\n", filename, err)
		return nil
	}

	p := h5.NewParserFromString(string(content))

	err = p.Parse()
	if err != nil {
		log.Printf("Error: %s\n", err)
		return nil
	}

	node = p.Tree()

	return
}
Example #8
func GetURLToLatestBukkit(channel string) string {
	/* Bukkit DL API sucks so we have to HTML scrape */
	var resp *http.Response
	var err error

	if channel == BETA_BUILD {
		resp, err = http.Get(BETA_ENDPOINT)
	} else if channel == RECOMMENDED_BUILD {
		resp, err = http.Get(RECOMMENDED_ENDPOINT)
	} else {
		Puts("Unknown build channel: " + channel)
		return "FAIL"
	}

	if servr.LogIfFatal(err) {
		Puts("I failed to install the server. :(")
		return "FAIL"
	}

	// Close the body only once we know the request succeeded.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if servr.LogIfFatal(err) {
		Puts("I failed to install the server. :(")
		return "FAIL"
	}

	bodyStr := string(body)
	doc := h5.NewParserFromString(bodyStr)
	err = doc.Parse()
	if servr.LogIfFatal(err) {
		Puts("I failed to install the server. :(")
		return "FAIL"
	}

	tree := doc.Tree()

	currentArtifactId := 1
	skipBuild := true
	builds := make(map[int]string)
	buildInfo := make(map[int]string)

	tree.Walk(func(node *h5.Node) {
		attrSlice := node.Attr

		for _, attr := range attrSlice {
			attrValueChars := strings.Split(attr.Value, "")

			if len(attrValueChars) > 27 {
				// TODO: replace the following with actual regex

				if attrValueChars[26] == "/" && attrValueChars[1] == "d" {
					builds[currentArtifactId] = "http://dl.bukkit.org" + attr.Value
				} else {
					continue
				}

				if skipBuild {
					skipBuild = false
					continue
				} else {
					// Use the node's title attribute as the build's
					// version string, or "UNKNOWN" when it is absent.
					version := func() string {
						for _, attr := range node.Attr {
							if attr.Name == "title" {
								return attr.Value
							}
						}
						return "UNKNOWN"
					}

					buildInfo[currentArtifactId] = version()
				}

				currentArtifactId++
			} else {
				continue
			}
		}
	})

	Puts("Here is all of the different bukkit server releases for the " + channel + " channel.")

	// Build IDs are assigned sequentially from 1, so walking the range
	// in order yields the builds oldest-first without nested scans.
	orderedBuilds := []string{}
	for id := 1; id <= len(builds); id++ {
		if info, ok := buildInfo[id]; ok {
			orderedBuilds = append(orderedBuilds, "  ["+strconv.Itoa(id)+"]: "+info)
		}
	}

	for i := 0; i < len(orderedBuilds); i++ {
		Puts(orderedBuilds[i])
	}

	maxOptions := len(orderedBuilds)
	var buildOption string
	var buildOptionInt int

	fmt.Printf("\nWhat build would you like to install? [1-" + strconv.Itoa(maxOptions) + "] ")
	fmt.Scan(&buildOption)

	buildOptionInt, err = strconv.Atoi(buildOption)
	if err != nil || buildOptionInt < 1 || buildOptionInt > maxOptions {
		Puts("Invalid option, please try again.")
		return GetURLToLatestBukkit(channel)
	}

	Puts("Selected " + buildInfo[buildOptionInt] + ".")

	return builds[buildOptionInt]
}
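servr.LogIfFatal is external to this snippet. From its use it evidently logs the error and reports whether one occurred; despite the name, the caller continues afterwards, so it cannot actually call log.Fatal. A hypothetical equivalent:

// LogIfFatal logs err and returns true when err is non-nil.
func LogIfFatal(err error) bool {
	if err != nil {
		log.Println("error:", err)
		return true
	}
	return false
}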
Example #9
func fetchCategoryGQ(url string) Category {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return Category{}
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}

	var output = Category{}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("HTTP_ERROR url:'%s' error:'%s'\n", url, err)
		return output
	}
	defer resp.Body.Close()

	if resp.StatusCode == 200 {

		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}

		/*
		   doc, err := goquery.Parse(string(body))
		   if err != nil {
		       log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
		       return output
		   }
		*/
		parser := h5.NewParserFromString(string(body))
		err = parser.Parse()
		if err != nil {
			log.Printf("HTML_ERROR url:'%s' error:'%s'\n", url, err)
			return output
		}
		doc := parser.Tree()

		pathFragments := strings.Split(url, "/")
		output.Name = pathFragments[len(pathFragments)-1]
		log.Println("Processing", output.Name)

		if !categorySet[output.Name] {
			// prevent cycles. this is wonky, but will do for now

			/*
			   nodes := doc.Find("#urls-text")
			   if len(nodes) == 1 {
			   }
			*/

			t := transform.NewTransform(doc)
			var getUrls = func(n *h5.Node) {
				urls := strings.Split(n.Children[0].Data(), "\n")
				for _, item := range urls {
					item = strings.TrimSpace(item)
					if blekkoSubCat.MatchString(item) {
						/*
						   // if we encounter a subcategory, recurse
						   subCat := fetchCategory(subCatUrl)
						   for _, subUrl := range subCat.Urls {
						       output.Urls = append(output.Urls, subUrl)
						   }
						*/
						// make n-level categories 1st level
						subCatUrl := fmt.Sprintf("https://blekko.com/ws/+/view+%s", item)
						go downloadUrls(subCatUrl)
					} else if item != "" {
						output.Urls = append(output.Urls, item)
					}
				}
			}
			t.Apply(getUrls, "#urls-text")

			categorySet[output.Name] = true
		}
	}
	return output
}
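timeoutDialler is not included in this excerpt. The conventional pattern behind the name returns a Dial function bound to a connect timeout, suitable for http.Transport's Dial field; a sketch, not necessarily the author's version:

func timeoutDialler(timeout time.Duration) func(network, addr string) (net.Conn, error) {
	return func(network, addr string) (net.Conn, error) {
		// Abort the TCP connect if it takes longer than timeout.
		return net.DialTimeout(network, addr, timeout)
	}
}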