func main() { if len(os.Args) < 2 { usage() } url := os.Args[1] isDebug := false if len(os.Args) >= 3 { if os.Args[2] == "debug" { isDebug = true } else { usage() } } rawhtml, _, _, err := sandblast.FetchURL(url) if err != nil { log.Fatalf("Could not fetch url: %s\n", url) } node, err := html.Parse(bytes.NewReader([]byte(rawhtml))) if err != nil { log.Fatal("Parsing error: ", err) } title, text, simplified, flattened, cleaned, err := sandblast.ExtractEx(node, sandblast.KeepLinks) if err != nil { log.Fatal("Extraction error: ", err) } fmt.Printf("TITLE: %s\n", title) if isDebug { fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString()) fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString()) fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString()) } fmt.Printf("TEXT:\n%s\n", text) }
func extractTest(test test, writeextract bool) ([]byte, string) { in, err := test.input.Open() must(err) defer in.Close() body, err := ioutil.ReadAll(in) must(err) e, _, _ := charset.DetermineEncoding(body, "UTF-8") r := transform.NewReader(bytes.NewReader(body), e.NewDecoder()) node, err := html.Parse(r) must(err) _, output, simplified, flattened, cleaned, err := sandblast.ExtractEx(node, 0) must(err) if writeextract { fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString()) fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString()) fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString()) } return body, output }