// html_extractor: command-line entry point. Extracts the readable HTML of the
// page given by --uri and logs the path of the extracted file.
func main() {
	flag.Parse()
	if len(*uri) == 0 {
		log.Println("html_extractor --uri http://xxxx.com")
		return
	}
	tf, _, _ := d.ExtractHtml(*uri)
	log.Println(tf)
}
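// A minimal sketch of the package-level flag this main() relies on. The flag
// name "uri" is taken from the usage string above; the default value and help
// text are assumptions, not the project's actual declaration.
var uri = flag.String("uri", "", "URL of the page to extract")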
// extract_html: HTTP handler. Extracts the readable HTML of the page named by
// the "url" form value and serves the extracted file back to the client.
func extract_html(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	url := r.FormValue("url")
	if len(url) == 0 {
		// No url supplied: report the service status instead and stop here.
		write_json(w, _status.config())
		return
	}
	tf, _, _ := htmldoc.ExtractHtml(url)
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	http.ServeFile(w, r, tf)
}
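// A minimal sketch of how extract_html could be wired into an HTTP server.
// The route path "/extract_html" and the listen address are assumptions for
// illustration, not the project's actual configuration.
func serve_extractor() {
	http.HandleFunc("/extract_html", extract_html)
	log.Fatal(http.ListenAndServe(":8080", nil))
}

// Under those assumptions the handler could be exercised with, for example:
//   curl 'http://localhost:8080/extract_html?url=http://example.com/'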
// touch_entry: worker loop. Drains entries from echan, extracts the readable
// HTML for each entry's link, and stores the extracted file, word count and
// images for that entry. Signals completion on done when echan yields nil.
func touch_entry(echan <-chan *meta.Entry, done chan<- int) {
	for e := <-echan; e != nil; e = <-echan {
		tf, sc, _ := d.ExtractHtml(e.Link)
		// cf := feeds.NewContentFile(tf, sc.Words, sc.Imgs)
		feeds.NewEntryOperator().SetContent(e.Link, tf, sc.WordCount, sc.Images)
		// feeds.EntryUpdateContent(tf, e.Link)
		// feeds.FetchEntryImagesExternal(e)
	}
	done <- 0
}
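// A minimal sketch of how touch_entry workers might be driven; the worker
// count and the entries slice are assumptions. Because the loop above exits
// on a nil entry, closing echan (which then yields nil pointers to receivers)
// terminates every worker.
func touch_entries(entries []*meta.Entry, workers int) {
	echan := make(chan *meta.Entry)
	done := make(chan int, workers)
	for i := 0; i < workers; i++ {
		go touch_entry(echan, done)
	}
	for _, e := range entries {
		echan <- e
	}
	close(echan) // receivers see nil and return
	for i := 0; i < workers; i++ {
		<-done
	}
}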
// extract_simple_json: HTTP handler. Extracts the page named by the "url"
// form value and responds with a small JSON document mapping the source url
// to the path of the extracted file.
func extract_simple_json(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	uri := r.FormValue("url")
	if len(uri) == 0 {
		// No url supplied: report the service status instead and stop here.
		write_json(w, _status.config())
		return
	}
	tf, _, _ := htmldoc.ExtractHtml(uri)
	val := struct {
		Url    string `json:"url,omitempty"`
		Target string `json:"target,omitempty"`
	}{Url: uri, Target: tf}
	write_json(w, val)
}
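// write_json is called by both handlers above but not shown here. A plausible
// minimal implementation (an assumption, not the project's actual helper)
// would simply JSON-encode the value onto the response:
func write_json(w http.ResponseWriter, v interface{}) {
	w.Header().Set("Content-Type", "application/json; charset=utf-8")
	if err := json.NewEncoder(w).Encode(v); err != nil {
		log.Println(err)
	}
}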
// image_crawler: command-line entry point. Extracts the page given by --uri,
// then downloads its images concurrently by splitting the image list into
// roughly `cocurrents` batches and waiting for every batch to finish.
func main() {
	flag.Parse()
	if len(*uri) == 0 {
		fmt.Println(`image_crawler --uri http://www.baidu.com/`)
		return
	}
	fmt.Println(*uri)
	_, sc, _ := d.ExtractHtml(*uri)
	fmt.Println(sc.WordCount, sc.LinkCount, len(sc.Images))
	// curl := d.NewCurl(target_folder)

	// capl is the batch size per worker; cnt is the number of batches spawned.
	capl := (len(sc.Images) + cocurrents - 1) / cocurrents
	if capl == 0 {
		return
	}
	cnt := (len(sc.Images) + capl - 1) / capl
	done := make(chan int, cnt)
	split_task(sc.Images, capl, done)
	for cnt > 0 {
		<-done
		cnt--
	}
	// for _, img := range sc.Images {
	// 	go download_image(img, done)
	// 	imgf, _, _, err := curl.Download(img)
	// 	fmt.Println(imgf, err)
	/*
		if err == nil {
			fn := path.Base(imgf)
			ext := extension(mt)
			t := path.Join(target_folder, fn+ext)
			err = os.Rename(imgf, t)
			fmt.Println(t, err)
		}
	*/
	// }
}
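// split_task is called above but not shown. A plausible sketch, inferred from
// the batch arithmetic in main (one done signal per batch of up to capl
// images); the []string element type and the download_image(img) helper
// signature are assumptions for illustration only.
func split_task(images []string, capl int, done chan<- int) {
	for start := 0; start < len(images); start += capl {
		end := start + capl
		if end > len(images) {
			end = len(images)
		}
		// Each batch is downloaded in its own goroutine.
		go func(batch []string) {
			for _, img := range batch {
				download_image(img) // hypothetical per-image download helper
			}
			done <- 0
		}(images[start:end])
	}
}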