Esempio n. 1
0
// main is the command-line entry point. It fetches the URL given as the first
// argument, parses the fetched content as HTML, runs sandblast.ExtractEx on the
// parsed tree (with sandblast.KeepLinks) and prints the resulting title and
// text.
//
// An optional second argument "debug" additionally prints the debug strings of
// the simplified, flattened and cleaned values returned by ExtractEx. Any other
// second argument calls usage(). Every failure is reported through log.Fatal /
// log.Fatalf.
func main() {
	if len(os.Args) < 2 {
		usage()
		// usage presumably terminates the process (TODO confirm); returning
		// here keeps os.Args[1] below from panicking if it ever does not.
		return
	}

	url := os.Args[1]
	isDebug := false
	if len(os.Args) >= 3 {
		if os.Args[2] == "debug" {
			isDebug = true
		} else {
			usage()
		}
	}

	rawhtml, _, _, err := sandblast.FetchURL(url)
	if err != nil {
		// Include the underlying error: the URL alone does not tell the user
		// why the fetch failed.
		log.Fatalf("Could not fetch url %s: %v", url, err)
	}

	node, err := html.Parse(bytes.NewReader([]byte(rawhtml)))
	if err != nil {
		log.Fatal("Parsing error: ", err)
	}
	title, text, simplified, flattened, cleaned, err := sandblast.ExtractEx(node, sandblast.KeepLinks)
	if err != nil {
		log.Fatal("Extraction error: ", err)
	}

	fmt.Printf("TITLE: %s\n", title)
	if isDebug {
		// Intermediate results are only dumped on request, between the title
		// and the final text.
		fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString())
		fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString())
		fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString())
	}
	fmt.Printf("TEXT:\n%s\n", text)
}
Esempio n. 2
0
// extractTest runs the extraction pipeline on a single test case.
//
// It reads everything from test.input, asks charset.DetermineEncoding for the
// encoding of those bytes, decodes them through that encoding, parses the
// decoded stream as HTML and calls sandblast.ExtractEx on the tree with flags 0.
// Every error is handed to must.
//
// When writeextract is true, the debug strings of the simplified, flattened and
// cleaned values returned by ExtractEx are printed to standard output.
//
// It returns the raw (undecoded) input bytes and the second result of
// ExtractEx (presumably the extracted text; confirm against sandblast).
func extractTest(test test, writeextract bool) ([]byte, string) {
	src, err := test.input.Open()
	must(err)
	defer src.Close()

	raw, err := ioutil.ReadAll(src)
	must(err)

	// Only the detected encoding is used; the other two results are ignored.
	enc, _, _ := charset.DetermineEncoding(raw, "UTF-8")
	decoded := transform.NewReader(bytes.NewReader(raw), enc.NewDecoder())

	root, err := html.Parse(decoded)
	must(err)

	_, extracted, simplified, flattened, cleaned, err := sandblast.ExtractEx(root, 0)
	must(err)

	if !writeextract {
		return raw, extracted
	}

	// Dump each intermediate result before returning.
	fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString())
	fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString())
	fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString())
	return raw, extracted
}