func checkPanUseableFromAjax(idx int, schedule_fenxi_id int) (res bool) { odd_html := myinit.GetOddItemFromAjax(idx, schedule_fenxi_id) table_string := "<table>" + odd_html + "</table>" html_obj, _ := goquery.ParseString(table_string) odds_tr := html_obj.Find("table tbody tr") for i := 0; i < odds_tr.Length(); i++ { tr_item := odds_tr.Eq(i) td_of_company := tr_item.Find("td").Eq(1) if td_of_company.Find("p a").Attr("title") == "" { continue } is_useable := checkPanTr(tr_item, schedule_fenxi_id) if is_useable == false { return false } } if odds_tr.Length() >= 30 { ajax_res := checkPanUseableFromAjax(idx+30, schedule_fenxi_id) if ajax_res == false { return false } } return true }
func main() { x, _ := goquery.ParseString(example) x.Find("a div").Val("lol") fmt.Println(x.Html()) fmt.Println(x.Find("div").HasClass("yo")) fmt.Println(x.Find("").Attrs("href")) }
func main() { var url = "http://toutiao.io" p, err := goquery.ParseUrl(url) if err != nil { panic(err) } // toutiao.io title := p.Find("title").Text() fmt.Println(title) t := p.Find(".title a") for i := 0; i < t.Length(); i++ { d := t.Eq(i).Text() l := t.Eq(i).Attr("href") c := p.Find(".summary a").Eq(i).Text() fmt.Println(l, d, "||", c) } // geek.csdn.net p, err = goquery.ParseUrl("http://geek.csdn.net/hotest") if err != nil { panic(err) } title = p.Find("title").Text() t = p.Find("a.title") fmt.Println(title) for i := 0; i < t.Length(); i++ { l := t.Eq(i).Attr("href") c := t.Eq(i).Text() fmt.Println(l, c) } // ituring.com p, err = goquery.ParseUrl("http://www.ituring.com.cn/") if err != nil { panic(err) } title = p.Find("title").Text() t = p.Find(".arc-list").Eq(0) x, _ := goquery.ParseString(t.Html()) t = x.Find("dt a") fmt.Println(title) for i := 0; i < t.Length(); i++ { c := t.Eq(i).Text() l := t.Eq(i).Attr("href") fmt.Printf("http://www.ituring.com.cn/%s %s\n", l, c) } }
func Extract(webEntity *model.WebEntity) { if webEntity == nil || webEntity.Body == nil { return } html := *webEntity.Body nodes, err := goquery.ParseString(html) if err != nil { fmt.Println("goquery ParseString error : ", err) panic(err) } else { title := nodes.Find("title").Text() fmt.Println("title : ", title) } }
func ParsePanUrlFromAjax(idx int, schedule_fenxi_id int, pan_html_string_info map[string]string) (res bool) { pan_int_info := make(map[string]int) pan_float_info := make(map[string]float32) pan_string_info := make(map[string]string) odd_html := myinit.GetOddItemFromAjax(idx, schedule_fenxi_id) table_string := "<table>" + odd_html + "</table>" html_obj, _ := goquery.ParseString(table_string) odds_tr := html_obj.Find("table tbody tr") pan_int_info["schedule_fenxi_id"] = schedule_fenxi_id pan_string_info["schedule_date"] = pan_html_string_info["schedule_date"] pan_string_info["schedule_no"] = pan_html_string_info["schedule_no"] pan_string_info["schedule_result_no"] = pan_html_string_info["schedule_result_no"] pan_string_info["schedule_league"] = pan_html_string_info["schedule_league"] pan_string_info["schedule_home"] = pan_html_string_info["schedule_home"] pan_string_info["schedule_guest"] = pan_html_string_info["schedule_guest"] pan_string_info["schedule_game_desc"] = pan_html_string_info["schedule_game_desc"] pan_string_info["schedule_date_desc"] = pan_html_string_info["schedule_date_desc"] for i := 0; i < odds_tr.Length(); i++ { tr_item := odds_tr.Eq(i) td_of_company := tr_item.Find("td").Eq(1) if td_of_company.Find("p a").Attr("title") == "" { continue } parse_res := doParsePanTr(tr_item, schedule_fenxi_id, pan_int_info, pan_float_info, pan_string_info, true) if parse_res == false { return false } } if odds_tr.Length() >= 30 { ajax_res := ParsePanUrlFromAjax(idx+30, schedule_fenxi_id, pan_html_string_info) if ajax_res == false { return false } } return true }
func spider(word string) { for { url := <-urls html, err := openUrl(url) if err != nil { errChan <- fmt.Sprintf("%s [Open]\t[%s]\t%s\r\n", time.Now().Format("2006/01/02 03:04:05"), url, err.Error()) continue } nodes, err := goquery.ParseString(html) if err != nil { errChan <- fmt.Sprintf("%s [Parsing]:\t[%s]\t%s\r\n", time.Now().Format("2006/01/02 03:04:05"), url, err.Error()) continue } // put found url into channel nodes.Find("a").Each(func(idx int, el *goquery.Node) { go func() { newurl := "" for _, attr := range el.Attr { if attr.Key == "href" { if strings.HasPrefix(attr.Val, "http") { newurl = attr.Val } else { // handle relative path // TODO this needs care newurl = strings.Trim(url, "/") + "/" + strings.Trim(attr.Val, "/") } addNewUrl(newurl) } } }() }) // match word if -1 != strings.Index(html, word) { log.Println("*Found*\t", url) foundChan <- fmt.Sprintf("%s\t%s\r\n", time.Now().Format("2006/01/02 03:04:05"), url) } } }
func (m MHtml) Htmlstring() { var example = ` <html> <head> <title> </title> </head> <body> <div class=hey cust-a="wow"><h2>Title here</h2></div> <span><h2>Yoyoyo</h2></span> <div id="box"> <span> content<a href=""><div><li></li></div></a> </span> </div> <div class="yo hey"> <a href="xyz"><div class="cow sheep bunny"><h8>content</h8></div></a> </div> </body> </html> ` tags := []string{"div#box", "div.cow", "[cust-a=wow]"} x, err := goquery.ParseString(example) if err != nil { log.Fatal(err) } for _, tag := range tags { log.Printf("--%s----------------------------\n", tag) n := x.Find(tag) log.Printf("len:%d \n", len(n)) if len(n) <= 0 { log.Println("not find\n") continue } n.Print() } }
func GetTwitterHTML(url string) (string, error) { // var err error //向服务端发送get请求 request, err := http.NewRequest("GET", url, nil) response, err := client.Do(request) defer response.Body.Close() if response.StatusCode == 200 { str, err := ioutil.ReadAll(response.Body) bodystr := string(str) //f := bytes.NewReader(str) node, er := goquery.ParseString(bodystr) if er == nil { ns := node.Find(".cards-media-container div") if ns != nil && ns.Length() > 0 { for i := 0; i < ns.Length(); i++ { no := ns.Eq(i) img := no.Find("").Attrs("data-url") fmt.Println(img[0]) if img[0] != "" { la := strings.LastIndex(img[0], "/") na := img[0][(la + 1):] la = strings.LastIndex(na, ":") na = na[:la] if na != "" { stt, _ := GetHTMLContentWithURL(img[0]) go UploadString(stt, na) } } } } } return bodystr, err } return "", err }
func ParsePanChangeUrl(schedule_fenxi_id int, company_id string, pan_html_int_info map[string]int, pan_html_float_info map[string]float32, pan_html_string_info map[string]string) (res bool) { pan_int_info := make(map[string]int) pan_float_info := make(map[string]float32) pan_string_info := make(map[string]string) for k_i, v_i := range pan_html_int_info { pan_int_info[k_i] = v_i } for k_f, v_f := range pan_html_float_info { pan_float_info[k_f] = v_f } for k_s, v_s := range pan_html_string_info { pan_string_info[k_s] = v_s } pan_string_info["predict1_result"] = "" pan_string_info["predict1_cmt"] = "" pan_string_info["predict2_result"] = "" pan_string_info["predict2_cmt"] = "" asiapanlog.ClearOldPanLog(schedule_fenxi_id, company_id) body_content := myinit.GetOddsFromAjax(schedule_fenxi_id, company_id) body := []byte(body_content) body_json, err := simplejson.NewJson(body) if err != nil { panic(err.Error()) } tr_items, _ := body_json.Array() for _, tr_string := range tr_items { table_string := "<table>" + tr_string.(string) + "</table>" html_obj, _ := goquery.ParseString(table_string) pan_log_item := html_obj.Find("table tbody tr td") home_td := pan_log_item.Eq(0) pan_td := pan_log_item.Eq(1) guest_td := pan_log_item.Eq(2) time_td := pan_log_item.Eq(3) pan_string_info["real_pan_desc"] = pan_td.Text() home_real_water_string := home_td.Text() home_real_water_str := strings.Replace(home_real_water_string, "↑", "", -1) home_real_water_str = strings.Replace(home_real_water_str, "↓", "", -1) guest_real_water_string := guest_td.Text() guest_real_water_str := strings.Replace(guest_real_water_string, "↑", "", -1) guest_real_water_str = strings.Replace(guest_real_water_str, "↓", "", -1) home_real_water_32, _ := strconv.ParseFloat(home_real_water_str, 32) guest_real_water_32, _ := strconv.ParseFloat(guest_real_water_str, 32) pan_float_info["real_home_water"] = float32(home_real_water_32) pan_float_info["real_guest_water"] = float32(guest_real_water_32) pan_string_info["pan_change_time"] = time_td.Text() home_pan_change_type := pan_td.Find("font").Text() home_pan_change_type = strings.TrimSpace(home_pan_change_type) pan_int_info["home_pan_change_type"] = 0 if home_pan_change_type == "升" { pan_int_info["home_pan_change_type"] = 1 pan_string_info["home_pan_change_type_desc"] = home_pan_change_type } if home_pan_change_type == "降" { pan_int_info["home_pan_change_type"] = -1 pan_string_info["home_pan_change_type_desc"] = home_pan_change_type } home_water_up_down_flag := home_td.Attr("class") pan_int_info["home_water_change_type"] = 0 if home_water_up_down_flag == "tips_down" { pan_int_info["home_water_change_type"] = -1 // down pan_string_info["home_water_change_type_desc"] = "水位降" // down } if home_water_up_down_flag == "tips_up" { pan_int_info["home_water_change_type"] = 1 // up pan_string_info["home_water_change_type_desc"] = "水位升" // up } real_pan_string := strings.Replace(pan_string_info["real_pan_desc"], pan_string_info["home_pan_change_type_desc"], "", -1) real_pan_desc := strings.TrimSpace(real_pan_string) has_panmap, real_pan_value := panmap.GetPanValueByPanDesc(real_pan_desc) if has_panmap == false { fmt.Println(pan_string_info["schedule_home"]) fmt.Println(pan_string_info["company_id"]) fmt.Println(pan_string_info["company_name"]) fmt.Println(real_pan_desc, "no exist") return false } pan_float_info["real_pan"] = real_pan_value asiapanlog.Add(pan_int_info, pan_float_info, pan_string_info) } return true }
func FetchTrByAjax(idx int, schedule_fenxi_id int, date string, home string) { odd_html := myinit.GetOddItemFromAjax(idx, schedule_fenxi_id) // fmt.Println(odd_html) // fmt.Println(pan_html_obj.HtmlAll()) table_string := "<table>" + odd_html + "</table>" html_obj, _ := goquery.ParseString(table_string) // fmt.Println("=====") odds_tr := html_obj.Find("table tbody tr") pan_int_info := make(map[string]int) pan_float_info := make(map[string]float32) pan_string_info := make(map[string]string) pan_string_info["schedule_date"] = date pan_string_info["schedule_home"] = home for i := 0; i < odds_tr.Length(); i++ { tr_item := odds_tr.Eq(i) td_of_company := tr_item.Find("td").Eq(1) if td_of_company.Find("p a").Attr("title") == "" { continue } pan_string_info["company_name"] = td_of_company.Find("p a").Attr("title") table_of_pan_detail := tr_item.Find("td .pl_table_data") table_of_opentime_pan := table_of_pan_detail.Eq(1) tds_of_opentime_pan_table := table_of_opentime_pan.Find("tbody tr td") open_pan_32, _ := strconv.ParseFloat(tds_of_opentime_pan_table.Eq(1).Attr("ref"), 32) pan_float_info["open_pan"] = float32(open_pan_32) pan_string_info["open_pan_desc"] = tds_of_opentime_pan_table.Eq(1).Text() table_of_realtime_pan := table_of_pan_detail.Eq(0) tds_of_realtime_pan_table := table_of_realtime_pan.Find("tbody tr td") real_pan_32, _ := strconv.ParseFloat(tds_of_realtime_pan_table.Eq(1).Attr("ref"), 32) pan_float_info["real_pan"] = float32(real_pan_32) pan_string_info["real_pan_desc"] = tds_of_realtime_pan_table.Eq(1).Text() td_item_of_real_pan := tds_of_realtime_pan_table.Eq(1) home_pan_change_type := td_item_of_real_pan.Find("font").Text() home_pan_change_type = strings.TrimSpace(home_pan_change_type) pan_int_info["home_pan_change_type"] = 0 if home_pan_change_type == "升" { pan_int_info["home_pan_change_type"] = 1 pan_string_info["home_pan_change_type_desc"] = home_pan_change_type } if home_pan_change_type == "降" { pan_int_info["home_pan_change_type"] = -1 pan_string_info["home_pan_change_type_desc"] = home_pan_change_type } real_pan_string := strings.Replace(pan_string_info["real_pan_desc"], pan_string_info["home_pan_change_type_desc"], "", -1) real_pan_desc := strings.TrimSpace(real_pan_string) fmt.Println("====here====") fmt.Println("date:", pan_string_info["schedule_date"], pan_string_info["schedule_home"], pan_string_info["company_name"]) fmt.Println(idx + i) // if(real_pan_desc==""){ // fmt.Println("trtime:") // fmt.Println(tr_item.HtmlAll()) // }else{ // fmt.Println("oktrtime:") // fmt.Println(tr_item.HtmlAll()) // } // fmt.Println("open:", pan_string_info["open_pan_desc"], pan_float_info["open_pan"]) // fmt.Println("real:", real_pan_desc, pan_float_info["real_pan"]) // fmt.Println("real desc:", pan_string_info["real_pan_desc"], pan_string_info["home_pan_change_type_desc"]) fmt.Println("++end here+++") panmap.Add(pan_string_info["open_pan_desc"], pan_float_info["open_pan"]) panmap.Add(real_pan_desc, pan_float_info["real_pan"]) } if odds_tr.Length() >= 30 { FetchTrByAjax(idx+30, schedule_fenxi_id, pan_string_info["schedule_date"], pan_string_info["schedule_home"]) } }
func ParsePanUrlFromAjax(idx int, schedule_fenxi_id int, pan_html_string_info map[string]string) (res bool) { pan_int_info := make(map[string]int) pan_float_info := make(map[string]float32) pan_string_info := make(map[string]string) odd_html := myinit.GetOddItemFromAjax(idx, schedule_fenxi_id) table_string := "<table>" + odd_html + "</table>" html_obj, _ := goquery.ParseString(table_string) odds_tr := html_obj.Find("table tbody tr") pan_int_info["schedule_fenxi_id"] = schedule_fenxi_id pan_string_info["schedule_date"] = pan_html_string_info["schedule_date"] pan_string_info["schedule_no"] = pan_html_string_info["schedule_no"] pan_string_info["schedule_result_no"] = pan_html_string_info["schedule_result_no"] pan_string_info["schedule_league"] = pan_html_string_info["schedule_league"] pan_string_info["schedule_home"] = pan_html_string_info["schedule_home"] pan_string_info["schedule_guest"] = pan_html_string_info["schedule_guest"] pan_string_info["schedule_game_desc"] = pan_html_string_info["schedule_game_desc"] pan_string_info["schedule_date_desc"] = pan_html_string_info["schedule_date_desc"] for i := 0; i < odds_tr.Length(); i++ { tr_item := odds_tr.Eq(i) td_of_company := tr_item.Find("td").Eq(1) if td_of_company.Find("p a").Attr("title") == "" { continue } company_id := tr_item.Attr("id") pan_string_info["company_id"] = company_id pan_string_info["company_name"] = td_of_company.Find("p a").Attr("title") var is_big_company = 0 if td_of_company.Find("p img").Attr("src") == "" { is_big_company = 0 } else { is_big_company = 1 fmt.Println("src:" + td_of_company.Find("p img").Attr("src")) } pan_int_info["is_big_company"] = is_big_company table_of_pan_detail := tr_item.Find("td .pl_table_data") table_of_opentime_pan := table_of_pan_detail.Eq(1) tds_of_opentime_pan_table := table_of_opentime_pan.Find("tbody tr td") open_pan_32, _ := strconv.ParseFloat(tds_of_opentime_pan_table.Eq(1).Attr("ref"), 32) pan_float_info["open_pan"] = float32(open_pan_32) pan_string_info["open_pan_desc"] = tds_of_opentime_pan_table.Eq(1).Text() open_home_water_32, _ := strconv.ParseFloat(tds_of_opentime_pan_table.Eq(0).Text(), 32) open_guest_water_32, _ := strconv.ParseFloat(tds_of_opentime_pan_table.Eq(2).Text(), 32) pan_float_info["open_home_water"] = float32(open_home_water_32) pan_float_info["open_guest_water"] = float32(open_guest_water_32) td_of_pan_time := tr_item.Find("td time") pan_string_info["open_pan_time"] = td_of_pan_time.Eq(1).Text() table_of_realtime_pan := table_of_pan_detail.Eq(0) tds_of_realtime_pan_table := table_of_realtime_pan.Find("tbody tr td") real_pan_32, _ := strconv.ParseFloat(tds_of_realtime_pan_table.Eq(1).Attr("ref"), 32) pan_float_info["real_pan"] = float32(real_pan_32) pan_string_info["real_pan_desc"] = tds_of_realtime_pan_table.Eq(1).Text() home_real_water_string := tds_of_realtime_pan_table.Eq(0).Text() home_real_water_str := strings.Replace(home_real_water_string, "↑", "", -1) home_real_water_str = strings.Replace(home_real_water_str, "↓", "", -1) guest_real_water_string := tds_of_realtime_pan_table.Eq(2).Text() guest_real_water_str := strings.Replace(guest_real_water_string, "↑", "", -1) guest_real_water_str = strings.Replace(guest_real_water_str, "↓", "", -1) home_real_water_32, _ := strconv.ParseFloat(home_real_water_str, 32) guest_real_water_32, _ := strconv.ParseFloat(guest_real_water_str, 32) pan_float_info["real_home_water"] = float32(home_real_water_32) pan_float_info["real_guest_water"] = float32(guest_real_water_32) pan_string_info["pan_change_time"] = td_of_pan_time.Eq(0).Text() td_item_of_real_pan := tds_of_realtime_pan_table.Eq(1) home_pan_change_type := td_item_of_real_pan.Find("font").Text() home_pan_change_type = strings.TrimSpace(home_pan_change_type) pan_int_info["home_pan_change_type"] = 0 if home_pan_change_type == "升" { pan_int_info["home_pan_change_type"] = 1 pan_string_info["home_pan_change_type_desc"] = home_pan_change_type } if home_pan_change_type == "降" { pan_int_info["home_pan_change_type"] = -1 pan_string_info["home_pan_change_type_desc"] = home_pan_change_type } home_water_up_down_flag := tds_of_realtime_pan_table.Eq(0).Attr("class") pan_int_info["home_water_change_type"] = 0 if home_water_up_down_flag == "ping" { pan_int_info["home_water_change_type"] = -1 // down pan_string_info["home_water_change_type_desc"] = "水位降" // down } if home_water_up_down_flag == "ying" { pan_int_info["home_water_change_type"] = 1 // up pan_string_info["home_water_change_type_desc"] = "水位升" // up } real_pan_string := strings.Replace(pan_string_info["real_pan_desc"], pan_string_info["home_pan_change_type_desc"], "", -1) real_pan_desc := strings.TrimSpace(real_pan_string) // panmap.Add(pan_string_info["open_pan_desc"], pan_float_info["open_pan"]) // panmap.Add(real_pan_desc, pan_float_info["real_pan"]) fmt.Println("date:", pan_string_info["schedule_date"], pan_string_info["schedule_home"]) fmt.Println("open:", pan_string_info["open_pan_desc"], pan_float_info["open_pan"]) fmt.Println("real:", real_pan_desc, pan_float_info["real_pan"]) parse_change_data := ParsePanChangeUrl(schedule_fenxi_id, company_id, pan_int_info, pan_float_info, pan_string_info) if parse_change_data == false { continue } if pan_float_info["open_pan"] > 0 || pan_float_info["real_pan"] > 0 { delete_asiapan := new(myinit.AsiaPan) del_result, _ := myinit.Engine.Where("schedule_fenxi_id=? ", schedule_fenxi_id).Delete(delete_asiapan) delete_schedule := new(myinit.Schedule) del_schedule_result, _ := myinit.Engine.Where("schedule_fenxi_id=? ", schedule_fenxi_id).Delete(delete_schedule) fmt.Println(del_result, del_schedule_result) fmt.Println("开盘>0 或者即时盘 >0") return false } predict1_result, predict1_cmt := analyse.AnalysePanResult1(pan_int_info, pan_float_info, pan_string_info) pan_string_info["predict1_result"] = predict1_result pan_string_info["predict1_cmt"] = predict1_cmt predict2_result, predict2_cmt := analyse.AnalysePanResult2(pan_int_info, pan_float_info, pan_string_info) pan_string_info["predict2_result"] = predict2_result pan_string_info["predict2_cmt"] = predict2_cmt fmt.Println("float_open_pan") fmt.Println(pan_float_info["open_home_water"]) fmt.Println("=====") // fmt.Println("company:" + company) // fmt.Println("home_pan_change_type:" + home_pan_change_type) // fmt.Println("is big company:" + is_big_company) // fmt.Println("change_time:" + change_time) // fmt.Println("open_time:" + open_time) // fmt.Println("flag:" + home_water_change_type + " " + home_water_up_down_flag) fmt.Println("home_real_water:", pan_float_info["real_home_water"]) fmt.Println("home_real_water water sting:" + home_real_water_string) // fmt.Println("guest_real_water:", guest_real_water) // fmt.Println("guest_real_water water sting:" + guest_real_water_string) // fmt.Println("pan:", real_pan, " ", real_pan_desc) // fmt.Println("open_home_water water:", open_home_water) // fmt.Println("open_guest_water water:", open_guest_water) // fmt.Println("open pan:", open_pan, " ", open_pan_desc) exist_asiapan := new(myinit.AsiaPan) has, _ := myinit.Engine.Where("schedule_fenxi_id=? AND company_id=? ", schedule_fenxi_id, company_id).Get(exist_asiapan) if has { fmt.Println(pan_string_info["company_name"] + "pan已存在!") if exist_asiapan.PanChangeTime != pan_string_info["pan_change_time"] { fmt.Println(pan_string_info["company_name"] + "pan有变化!") update_affected, update_err := asiapan.UpdateAsiaPanInfo(pan_int_info, pan_float_info, pan_string_info) fmt.Println(update_affected) fmt.Println(update_err) } } else { asiapan.Add(pan_int_info, pan_float_info, pan_string_info) } // count_open_water := open_home_water +open_guest_water count_real_water := pan_float_info["real_home_water"] + pan_float_info["real_guest_water"] if count_real_water < 1.75 || count_real_water > 2 { fmt.Println("+++++++") fmt.Println(pan_float_info["real_home_water"]) fmt.Println(pan_float_info["real_guest_water"]) fmt.Println(schedule_fenxi_id) fmt.Println("+++++++eeeeeeeeeeee") // delete_asiapan2 := new(myinit.AsiaPan) // delete2, _ := myinit.Engine.Where("schedule_fenxi_id=? AND company_id=? ", schedule_fenxi_id, company_id).Delete(delete_asiapan2) // fmt.Println(delete2) } } if odds_tr.Length() >= 30 { ParsePanUrlFromAjax(idx+30, schedule_fenxi_id, pan_html_string_info) } return true }
func main() { x, _ := goquery.ParseString(example) x.Find("a div").Print() fmt.Println("---") x.Find("a div.cow").Print() }