Exemple #1
0
// MakeDeadLoopSpider builds the spider named "deal_loop_spider".
//
// The spider starts from a single Baidu search URL. Its base parse callback
// loads the response body as HTML, takes the last element matched by
// "#page .n" (presumably the "next page" link; confirm against the page
// markup) and returns one GET request for that element's href, so every
// parsed page schedules at most one follow-up page.
//
// The callback returns (nil, nil) when the page has no "#page .n" element or
// when the selected element carries no href, and returns the underlying
// error when the HTML cannot be parsed or the request cannot be built.
func MakeDeadLoopSpider() *spiders.Spider {
	spider := spiders.Spider{}
	spider.Name = "deal_loop_spider"
	spider.StartUrls = []string{"http://www.baidu.com/s?wd=1"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			return nil, err
		}
		nodes := doc.Find("#page .n").Nodes
		if len(nodes) == 0 {
			// Nothing to follow on this page.
			return nil, nil
		}
		var nextPageLink string
		for _, attr := range nodes[len(nodes)-1].Attr {
			if attr.Key == "href" {
				nextPageLink = attr.Val
			}
		}
		if nextPageLink == "" {
			// Without an href the only URL we could build is the bare site
			// root, which is not a follow-up results page.
			return nil, nil
		}
		request, err := http.NewRequest("GET", "http://www.baidu.com"+nextPageLink, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
		if err != nil {
			return nil, err
		}
		return []*http.Request{request}, nil
	}
	return &spider
}
// MakeMuiltiplySpiders builds the spider named "muiltiply_spider".
//
// The spider starts from a single Baidu search URL. Its base parse callback
// loads the response body as HTML and returns one GET request per
// "#page a" element that has an href attribute, each targeting
// "http://www.baidu.com" + href and routed back to the same callback.
//
// Anchors without an href are skipped. A request that cannot be constructed
// is logged and skipped rather than being handed to the caller. The callback
// only returns an error when the HTML itself cannot be parsed.
func MakeMuiltiplySpiders() *spiders.Spider {
	spider := spiders.Spider{}
	spider.Name = "muiltiply_spider"
	spider.StartUrls = []string{"http://www.baidu.com/s?wd=1"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			return nil, err
		}
		requestList := make([]*http.Request, 0, 10)
		doc.Find("#page a").Each(func(index int, hrefNode *goquery.Selection) {
			href, isExist := hrefNode.Attr("href")
			if !isExist {
				return
			}
			request, err := http.NewRequest("GET", "http://www.baidu.com"+href, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
			if err != nil {
				// Do not queue a request that failed to build; the value
				// returned alongside a non-nil error is not usable.
				log.Println(err)
				return
			}
			requestList = append(requestList, request)
		})
		return requestList, nil
	}
	return &spider
}
Exemple #3
0
// MakeStockSpider builds the spider named "stock_spider".
//
// BeforeMethod opens the MySQL connection registered under the spider's name
// and AfterMethod closes it. The spider starts from page "0" of a Sina
// finance JSONP endpoint whose payload is wrapped in
// "FDC_DC.theTableData(...)".
//
// The base parse callback:
//   - strips the JSONP wrapper and decodes the payload into []Stock;
//   - for every item of the first decoded element, inserts (code, name) into
//     db_stock unless a row with that code already exists;
//   - only for the response whose query string contains ",0,0,40]]" (the
//     start URL), returns one request per page 1..ceil(Count/pageSize).
//
// A response without the JSONP wrapper or with an empty result is logged and
// ignored. A payload that is not valid JSON is returned as an error.
// Database and request-construction failures are logged and the affected
// item or page is skipped.
func MakeStockSpider() *spiders.Spider {
	spider := spiders.Spider{}
	spider.Name = "stock_spider"
	spider.BeforeMethod = func() {
		db.DefaultMysqlConnectMap.InitConnection(spider.Name, "root:@/wxstock?charset=utf8")
	}
	spider.AfterMethod = func() {
		db.DefaultMysqlConnectMap.CloseContection(spider.Name)
	}
	const callbackOpen = "FDC_DC.theTableData("
	urlPrefix := "http://money.finance.sina.com.cn/d/api/openapi_proxy.php/?__s=[[%22hq%22,%22hs_a%22,%22%22,0,"
	urlSuffix := ",40]]&callback=FDC_DC.theTableData"
	pageSize := 40
	spider.StartUrls = []string{urlPrefix + "0" + urlSuffix}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		body := response.Body
		startIndex := strings.Index(body, callbackOpen)
		if startIndex < 0 {
			log.Println("stock_spider: JSONP callback not found in response body")
			return nil, nil
		}
		// The payload sits between the opening "(" of the callback and the
		// last two bytes of the body, which are dropped as the wrapper's tail.
		payloadStart := startIndex + len(callbackOpen)
		payloadEnd := len(body) - 2
		if payloadEnd < payloadStart {
			log.Println("stock_spider: JSONP payload is truncated")
			return nil, nil
		}
		var jsonResult []Stock
		if err := json.Unmarshal([]byte(body[payloadStart:payloadEnd]), &jsonResult); err != nil {
			return nil, err
		}
		if len(jsonResult) == 0 {
			log.Println("stock_spider: JSONP payload contains no result")
			return nil, nil
		}
		for _, item := range jsonResult[0].Items {
			if len(item) < 3 {
				// code and name live at positions 1 and 2.
				continue
			}
			code := item[1]
			name := item[2]
			rows, err := db.DefaultMysqlConnectMap.Query(spider.Name, "select id from db_stock where code= ? ", code)
			if err != nil {
				log.Println(err)
				continue
			}
			exists := rows.Next()
			rows.Close()
			if exists {
				continue
			}
			if _, err := db.DefaultMysqlConnectMap.Exec(spider.Name, "insert into db_stock(gmt_create,code,name)values(now(), ?, ? )", code, name); err != nil {
				log.Println(err)
			}
		}
		// Only the start page fans out to the remaining pages; every other
		// page just stores its items.
		if !strings.Contains(response.GoResponse.Request.URL.RawQuery, ",0,0,40]]") {
			return nil, nil
		}
		totalPage := int(math.Ceil(float64(jsonResult[0].Count) / float64(pageSize)))
		if totalPage < 0 {
			totalPage = 0
		}
		requestList := make([]*http.Request, 0, totalPage)
		for pageNo := 1; pageNo <= totalPage; pageNo++ {
			request, err := http.NewRequest("GET", urlPrefix+strconv.Itoa(pageNo)+urlSuffix, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
			if err != nil {
				// Skip the page instead of returning an unusable request.
				log.Println(err)
				continue
			}
			requestList = append(requestList, request)
		}
		return requestList, nil
	}
	return &spider
}
// MakeStockPriceSpider builds the spider named "stock_price_spider".
//
// BeforeMethod opens the MySQL connection registered under the spider's name
// and fills three lookup tables stored in spider.ExtData:
//   - "codeIdMap"  (map[string]int):  stock code -> db_stock.id
//   - "existMap"   (map[string]bool): "<stock id>:<date>" for every stored price
//   - "maxDataMap" (map[int]string):  stock id -> max(date) in db_stock_price
//
// All three maps are always created, so a failed query leaves the
// corresponding table empty instead of missing. AfterMethod closes the
// connection.
//
// InitStartUrls replaces spider.StartUrls with one history URL per stock and
// quarter ("jidu"), chosen from the stock's latest stored date.
//
// The base parse callback reads the stock code from the request path, walks
// the rows of "#FundHoldSharesTable" (skipping the first row) and inserts a
// db_stock_price row for every date not present in existMap. The price is
// stored as an integer: the parsed value times 1000, rounded to nearest.
// The callback never returns follow-up requests.
func MakeStockPriceSpider() *spiders.Spider {
	spider := spiders.Spider{}
	spider.Name = "stock_price_spider"
	urlPrefix := "http://money.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/"
	const stockPathPrefix = "/corp/go.php/vMS_MarketHistory/stockid/"
	spider.BeforeMethod = func() {
		db.DefaultMysqlConnectMap.InitConnection(spider.Name, "root:@/wxstock?charset=utf8")
		// Create every table up front: a failed query must not leave ExtData
		// nil (writing to it would panic) or leave a key missing for readers.
		codeIdMap := make(map[string]int)
		existMap := make(map[string]bool)
		maxDataMap := make(map[int]string)
		spider.ExtData = make(map[string]interface{})
		spider.ExtData["codeIdMap"] = codeIdMap
		spider.ExtData["existMap"] = existMap
		spider.ExtData["maxDataMap"] = maxDataMap

		rows, err := db.DefaultMysqlConnectMap.Query(spider.Name, "select id,code from db_stock")
		if err != nil {
			log.Println(err)
		} else {
			defer rows.Close()
			for rows.Next() {
				var id int
				var code string
				if err := rows.Scan(&id, &code); err != nil {
					log.Println(err)
					continue
				}
				codeIdMap[code] = id
			}
		}
		// The key must use stock_id, not the price row's own id: the parse
		// callback looks entries up as "<stock id>:<date>".
		priceRow, priceErr := db.DefaultMysqlConnectMap.Query(spider.Name, "select stock_id,date from db_stock_price")
		if priceErr != nil {
			log.Println(priceErr)
		} else {
			defer priceRow.Close()
			for priceRow.Next() {
				var stockId int
				var date string
				if err := priceRow.Scan(&stockId, &date); err != nil {
					log.Println(err)
					continue
				}
				existMap[strconv.Itoa(stockId)+":"+date] = true
			}
		}
		maxDateRow, maxErr := db.DefaultMysqlConnectMap.Query(spider.Name, "select max(date),stock_id from db_stock_price group by stock_id")
		if maxErr != nil {
			log.Println(maxErr)
		} else {
			defer maxDateRow.Close()
			for maxDateRow.Next() {
				var maxDate string
				var stockId int
				if err := maxDateRow.Scan(&maxDate, &stockId); err != nil {
					log.Println(err)
					continue
				}
				maxDataMap[stockId] = maxDate
			}
		}
	}
	spider.AfterMethod = func() {
		db.DefaultMysqlConnectMap.CloseContection(spider.Name)
	}
	spider.StartUrls = []string{"http://money.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/600570.phtml?year=2015&jidu=2"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			log.Println(err)
			return nil, err
		}
		// The path looks like "<stockPathPrefix><code>.phtml"; trimming both
		// ends avoids slicing past the end of an unexpected path.
		path := response.Request.GoRequest.URL.Path
		code := strings.TrimSuffix(strings.TrimPrefix(path, stockPathPrefix), ".phtml")
		codeIdMap, ok := spider.ExtData["codeIdMap"].(map[string]int)
		if !ok {
			log.Println("stock_price_spider: codeIdMap is not initialised")
			return nil, nil
		}
		id, known := codeIdMap[code]
		if !known {
			// Inserting with the zero id would create orphan price rows.
			log.Println("stock_price_spider: unknown stock code", code)
			return nil, nil
		}
		// A missing existMap reads as an empty (nil) map, i.e. nothing stored yet.
		existMap, _ := spider.ExtData["existMap"].(map[string]bool)
		doc.Find("#FundHoldSharesTable tr").Each(func(index int, selection *goquery.Selection) {
			if index == 0 {
				return
			}
			date := strings.TrimSpace(selection.First().Find("div a").Text())
			if len(date) == 0 {
				return
			}
			if existMap[strconv.Itoa(id)+":"+date] {
				return
			}
			priceString := strings.TrimSpace(selection.Find("td:nth-child(3) div").Text())
			priceFloat, err := strconv.ParseFloat(priceString, 64)
			if err != nil {
				log.Println(err)
				return
			}
			// Round instead of truncating: binary floats sit slightly below
			// many decimal prices (0.29*1000 would otherwise become 289).
			price := int(math.Round(priceFloat * 1000))
			if _, err := db.DefaultMysqlConnectMap.Exec(spider.Name, "insert into db_stock_price(gmt_create,creator,stock_id,date,price)values(now(),'go', ? , ? , ? )", id, date, price); err != nil {
				log.Println(err)
			}
		})
		return nil, nil
	}
	spider.InitStartUrls = func() {
		spider.StartUrls = make([]string, 0)
		today := time.Now()
		year := today.Year()
		jidu := (int(today.Month()) + 2) / 3 // quarter 1..4
		// Missing tables read as nil maps: ranging over or indexing a nil map
		// is safe and simply yields no entries.
		codeIdMap, _ := spider.ExtData["codeIdMap"].(map[string]int)
		maxDataMap, _ := spider.ExtData["maxDataMap"].(map[int]string)
		addURL := func(code string, y, q int) {
			spider.StartUrls = append(spider.StartUrls, urlPrefix+code+".phtml?year="+strconv.Itoa(y)+"&jidu="+strconv.Itoa(q))
		}
		for code, id := range codeIdMap {
			// Stocks without stored prices start at the current quarter.
			maxYear, maxJidu := year, jidu
			if stringData, ok := maxDataMap[id]; ok {
				maxDate, err := time.Parse("2006-01-02", stringData)
				if err != nil {
					// An unparsable date leaves maxDate at the zero time
					// (year 1), which selects the widest window below.
					log.Println(err)
				}
				maxYear = maxDate.Year()
				maxJidu = (int(maxDate.Month()) + 2) / 3
			}
			if maxYear >= year {
				// Latest data is from this year: resume at its quarter.
				for i := maxJidu; i <= jidu; i++ {
					addURL(code, year, i)
				}
				continue
			}
			// Latest data is older: take every quarter of this year so far...
			for i := 1; i <= jidu; i++ {
				addURL(code, year, i)
			}
			// ...then last year's quarters, starting at the later of the
			// quarter after the current one and the quarter of the latest data.
			// NOTE(review): data older than last year's window is not
			// back-filled; confirm this one-year window is intended.
			i := jidu + 1
			if i < maxJidu {
				i = maxJidu
			}
			for ; i < 5; i++ {
				addURL(code, year-1, i)
			}
		}
	}
	return &spider
}