Example #1
0
// main is the CLI entry point for the html extractor: it parses the
// command-line flags, validates the --uri argument, runs the
// extraction, and logs the path of the produced file.
func main() {
	flag.Parse()
	if *uri == "" {
		// No URI given: print a usage hint and exit.
		log.Println("html_extractor --uri http://xxxx.com")
		return
	}
	// NOTE(review): the extraction stats and error are deliberately
	// discarded here; only the target file path is reported.
	tf, _, _ := d.ExtractHtml(*uri)
	log.Println(tf)
}
Example #2
0
// extract_html is an HTTP handler that reads a "url" form value,
// extracts the main HTML content of that page via htmldoc.ExtractHtml,
// and serves the resulting local file back to the client.
func extract_html(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	url := r.FormValue("url")
	if len(url) == 0 {
		// Missing parameter: report status/usage and stop. The
		// return fixes a fall-through bug where the handler kept
		// going and called ExtractHtml with an empty URL after
		// already writing a JSON response.
		write_json(w, _status.config())
		return
	}
	// NOTE(review): the second and third return values (stats and a
	// presumed error) are discarded — TODO confirm and handle the error.
	tf, _, _ := htmldoc.ExtractHtml(url)
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	http.ServeFile(w, r, tf)
}
Example #3
0
// touch_entry drains entries from echan, extracting and storing the
// HTML content for each one. It stops when a nil entry is received —
// which also covers a closed channel, since receiving from a closed
// channel yields the zero value (nil) — then signals completion on done.
func touch_entry(echan <-chan *meta.Entry, done chan<- int) {
	for {
		entry := <-echan
		if entry == nil {
			break
		}
		// Extract the page content and persist it for this entry;
		// the extraction error is discarded, matching caller expectations.
		target, stats, _ := d.ExtractHtml(entry.Link)
		feeds.NewEntryOperator().SetContent(entry.Link, target, stats.WordCount, stats.Images)
	}
	done <- 0
}
Example #4
0
// extract_simple_json is an HTTP handler that reads a "url" form value,
// extracts the page's HTML content, and responds with a small JSON
// object containing the source URL and the extracted target file path.
func extract_simple_json(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	uri := r.FormValue("url")
	if len(uri) == 0 {
		// Missing parameter: report status/usage and stop. The
		// return fixes a fall-through bug where the handler kept
		// going, called ExtractHtml with an empty URL, and wrote a
		// second JSON body onto the same response.
		write_json(w, _status.config())
		return
	}
	// NOTE(review): the second and third return values (stats and a
	// presumed error) are discarded — TODO confirm and handle the error.
	tf, _, _ := htmldoc.ExtractHtml(uri)
	val := struct {
		Url    string `json:"url,omitempty"`
		Target string `json:"target,omitempty"`
	}{Url: uri, Target: tf}

	write_json(w, val)
}
Example #5
0
// main is the CLI entry point for the image crawler: it parses flags,
// extracts the page at --uri, then downloads the page's images with a
// bounded number of concurrent workers.
func main() {
	flag.Parse()
	if len(*uri) == 0 {
		// No URI given: print usage and exit. The return fixes a bug
		// where execution continued with an empty URI — ExtractHtml
		// would then run on "" and, with its error discarded, a nil
		// sc would panic on sc.WordCount below.
		fmt.Println(`image_crawler --uri http://www.baidu.com/`)
		return
	}
	fmt.Println(*uri)
	// NOTE(review): the target-file and error returns are discarded;
	// only the page stats are used here — TODO confirm error handling.
	_, sc, _ := d.ExtractHtml(*uri)
	fmt.Println(sc.WordCount, sc.LinkCount, len(sc.Images))

	// Chunk size per worker: ceiling of images/cocurrents. Zero means
	// there are no images to download at all.
	capl := (len(sc.Images) + cocurrents - 1) / cocurrents
	if capl == 0 {
		return
	}
	// Number of chunks (and thus of completion signals to wait for):
	// ceiling of images/capl.
	cnt := (len(sc.Images) + capl - 1) / capl
	done := make(chan int, cnt)
	split_task(sc.Images, capl, done)
	// Wait for every worker to signal completion.
	for cnt > 0 {
		<-done
		cnt--
	}
}