Example #1
0
// getConfigData lazily loads the deploy status from MySQL into this.audit.
// It is a no-op once audit has been populated (anything other than the -1
// sentinel set by NewPageOutSql). Errors are logged and leave audit untouched
// so a later call retries.
func (this *PageOutSql) getConfigData() {
	if this.audit != -1 {
		return
	}
	// sql.Open does not dial; Ping is the first real connection attempt.
	if err := this.db.Ping(); err != nil {
		logs.GetFirstLogger().Error("ping mysql error :" + err.Error())
		return
	}

	rows, err := this.db.Query("select status from deploy")
	if err != nil {
		logs.GetFirstLogger().Error("select status from deploy error :" + err.Error())
		return
	}
	defer rows.Close()
	var status int
	if rows.Next() {
		if err := rows.Scan(&status); err != nil {
			logs.GetFirstLogger().Error("fetch status value error :" + err.Error())
			return
		}
		// BUG FIX: the fetched value was previously discarded, so the guard
		// above never fired and every output() call re-queried the table.
		this.audit = status
	}
}
Example #2
0
// downloadFile fetches the request URL (GET, or POST with form-encoded body)
// and returns the page plus the raw response body as a string.
// On any failure the page status is marked failed and an empty body returned.
func (this *HttpDownLoader) downloadFile(p *page.Page, req *page.Request) (*page.Page, string) {
	urlStr := req.GetUrl()
	if len(urlStr) == 0 {
		logs.GetFirstLogger().Error("url is empty")
		p.SetStatus(true, "url is empty")
		return p, ""
	}

	var httpResp *http.Response
	var err error
	if req.GetMethod() == "POST" {
		httpResp, err = http.Post(req.GetUrl(), "application/x-www-form-urlencoded", strings.NewReader(req.GetPostData()))
	} else {
		httpResp, err = http.Get(req.GetUrl())
	}

	if err != nil {
		logs.GetFirstLogger().Error("http visit error :" + err.Error())
		p.SetStatus(true, err.Error())
		// BUG FIX: previously fell through and dereferenced the nil response.
		return p, ""
	}
	// Close the body as soon as we know the request succeeded.
	defer httpResp.Body.Close()

	p.SetHeader(httpResp.Header)
	p.SetCookies(httpResp.Cookies())
	body, err := ioutil.ReadAll(httpResp.Body)
	if err != nil {
		// Previously ignored; a truncated read now marks the page failed.
		logs.GetFirstLogger().Error("read http body error :" + err.Error())
		p.SetStatus(true, err.Error())
		return p, ""
	}
	return p, string(body)
}
Example #3
0
// AddRequest queues a request on the scheduler. Nil requests and requests
// with an empty URL are rejected with an error log. Returns the spider so
// calls can be chained.
func (this *Spider) AddRequest(req *page.Request) *Spider {
	if req == nil {
		logs.GetFirstLogger().Error("request is nil")
		return this
	}
	if req.GetUrl() == "" {
		logs.GetFirstLogger().Error("request is empty")
		// BUG FIX: an empty-URL request was logged as an error but still
		// pushed onto the scheduler; reject it instead.
		return this
	}

	this.m_scheduler.Push(req)
	return this
}
Example #4
0
// Entry point: crawl the www.youxiduo.com game-news list and store the
// results through a MySQL page-out sink.
func main() {
	logs.GetFirstLogger().SetLevel("warn")
	logs.GetFirstLogger().Info("crawl begin www.youxiduo.com")

	sp := spider.NewSpider(NewYouxiduoProcesser(""), "youxiduo")
	out := page.NewPageOutSql()
	sp.AddUrl("http://www.youxiduo.com/zixun/game/", "html", "list").AddPageOut(out).Run()
	//sp.AddUrl("http://www.youxiduo.com/zixun/game/108444.shtml", "html", "content").Run()
	out.Release()

	logs.GetFirstLogger().Info("crawl end www.youxiduo.com")
}
Example #5
0
// Process routes a downloaded page by its URL tag: "list" pages are scanned
// for article briefs and further pagination links; anything else is treated
// as an article content page.
//
// Page classification by markup:
//   <div class="infroList"><ul><li>...</div> ===> LIST
//   <div class="pagebreak">...</div>         ===> LIST
//   <div class="article"                     ===> CONTENT
func (this *YouxiduoProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.GetErrMsg())
		return
	}
	var body string = p.GetBody()
	var urlTag string = p.GetRequest().GetUrlTag()
	p.SetUrlTag(urlTag)

	if urlTag != "list" {
		// CONTENT page
		this.parseNewsDetail(body, p)
		return
	}

	// 1. Locate the news-brief list block.
	regList, err := regexp.Compile(`<div class=\"infroList\">(\s|.)*<\/ul>(\s|.)*<div class=\"pagebreak\">`)
	if err != nil {
		logs.GetFirstLogger().Error("分析页面出错,正则表达式错误了,url = " + p.GetRequest().GetUrl())
		// BUG FIX: previously fell through and dereferenced the nil regexp.
		return
	}
	var infroList []string = regList.FindAllString(body, -1)
	if len(infroList) > 0 {
		this.parseNewsBreifInfo(infroList[0], p)
	} else {
		logs.GetFirstLogger().Info("No more list items")
	}

	// 2. Unless the date cut-off was hit, look for extra list (pagination) pages.
	if !p.IsBreak() {
		regPageBreak, err := regexp.Compile(`<div class=\"pagebreak\">(\s|.)+<li class=\"lastPage\">`)
		if err != nil {
			logs.GetFirstLogger().Error("分析页面出错,翻页正则表达式错误,url = " + p.GetRequest().GetUrl())
			// BUG FIX: same nil-regexp dereference as above.
			return
		}
		var pageBreakList []string = regPageBreak.FindAllString(body, -1)
		if len(pageBreakList) > 0 {
			this.parseNewsLinkListInfo(pageBreakList[0], p)
		} else {
			logs.GetFirstLogger().Info("No more links")
		}
	}
}
Example #6
0
// NewPageOutSql builds a PageOutSql that writes crawl results to MySQL.
// audit starts at the -1 sentinel, meaning "deploy config not loaded yet".
// NOTE(review): credentials and host are hard-coded — move to configuration.
// Also note sql.Open only validates its arguments and does not dial, so a
// connection problem surfaces later at Ping/Query time even when err is nil.
func NewPageOutSql() *PageOutSql {
	db, err := sql.Open("mysql", "colefan:123456@tcp(192.168.13.21:3306)/news_tation")
	if err != nil {
		logs.GetFirstLogger().Error("mysql open error :" + err.Error())
	}

	// Result cache keyed by item key (the article's detail URL).
	list := make(map[string]*PageItems)
	return &PageOutSql{pageResult: list, db: db, audit: -1}
}
Example #7
0
// DownLoad builds a page for req and dispatches to the downloader matching
// the requested response type. Unknown types are logged and yield nil.
func (this *HttpDownLoader) DownLoad(req *page.Request) *page.Page {
	pageInst := page.NewPage(req)
	respType := req.GetRespType()

	if respType == "html" {
		return this.downloadHtml(pageInst, req)
	}

	logs.GetFirstLogger().Error("error request type : " + respType)
	return nil
}
Example #8
0
// NewSpider constructs a Spider for the given page processor and task name,
// installing the default queue scheduler and HTTP downloader when none have
// been injected, and initializing the output list and handling map.
func NewSpider(processerInst page.PageProcesser, taskName string) *Spider {
	inst := &Spider{m_taskName: taskName, m_pageProcesser: processerInst}
	inst.m_exitWhenDone = true

	// Fall back to the stock components when nothing was injected.
	if inst.m_scheduler == nil {
		inst.SetScheduler(scheduler.NewQueueScheduler(false))
	}
	if inst.m_downLoader == nil {
		inst.SetDownLoader(downloader.NewHttpDownLoader())
	}

	logs.GetFirstLogger().Info("*** start spider ***")

	inst.m_outputs = make([]page.PageOut, 0)
	inst.m_handlingMap = make(map[string]string)

	return inst
}
Example #9
0
// parseNewsBreifInfo walks the <li>...</li> items of a list page, extracting
// time/title/pic/info/detailurl from each one, queueing each detail URL for a
// follow-up crawl, and appending the collected PageItems to p.
// Once an item's date is older than this.exitDate the page is marked broken
// (SetBreak) and remaining items are skipped. Returns p for chaining.
func (this *YouxiduoProcesser) parseNewsBreifInfo(content string, p *page.Page) *page.Page {
	logs.GetFirstLogger().Trace("B TEST LIST ITEMS")
	// pos1/pos2 bracket the current <li>...</li> fragment.
	var pos1 int = strings.Index(content, "<li>")
	var pos2 int = strings.Index(content, "</li>")
	var count int = 1

	for pos1 >= 0 && pos2 >= 0 && (pos2 > pos1) {
		item := page.NewPageItems("")
		// Cut out the current item (the +5 keeps the closing "</li>")
		// and advance content past it.
		tmpStr := string(content[pos1 : pos2+5])
		content = string(content[pos2+5 : len(content)])

		pos1 = strings.Index(content, "<li>")
		pos2 = strings.Index(content, "</li>")
		logs.GetFirstLogger().Trace("B================>")
		// Publish date: first narrow to the <span> holding it, then to the
		// bare yyyy-mm-dd token.
		reg, _ := regexp.Compile(`<span>(.)*[\d]{4}-[\d]{2}-[\d]{2}`)
		timeStr := reg.FindString(tmpStr)
		reg, _ = regexp.Compile(`[\d]{4}-[\d]{2}-[\d]{2}`)
		timeStr = reg.FindString(timeStr)
		// Lexicographic compare is valid because dates are zero-padded yyyy-mm-dd.
		if this.exitDate > timeStr {
			p.SetBreak(true)
			continue
		}
		item.AddItem("time", timeStr)

		// Title: the text between the quotes of title="...".
		// NOTE(review): the slicing below panics if the regex found no match
		// (strings.Index returns -1) — assumes every item has a title attr;
		// confirm against the site markup.
		reg, _ = regexp.Compile("title=\"(.)*\"")
		title := reg.FindString(tmpStr)
		title = string(title[strings.Index(title, "\"")+1 : len(title)])
		title = string(title[0:strings.Index(title, "\"")])
		logs.GetFirstLogger().Trace("title = " + title)
		//p.AddResultItem("title", title)
		item.AddItem("title", title)
		// Thumbnail: the src value of the item's <img> tag (same panic-on-miss
		// caveat as title above).
		reg, _ = regexp.Compile("<img src=(.)*alt")
		pic := reg.FindString(tmpStr)
		pic = string(pic[strings.Index(pic, "\"")+1 : len(pic)])
		pic = string(pic[0:strings.Index(pic, "\"")])

		// Resolve site-relative image paths against the page URL.
		if util.IsRelativePath(pic) {
			pic = util.GetRealUrl(p.GetRequest().GetUrl(), pic)
		}
		logs.GetFirstLogger().Trace("pic = " + pic)
		//p.AddResultItem("pic", pic)
		item.AddItem("pic", pic)

		// Brief text: the <p>...</p> snippet; single quotes and &#39; are
		// normalized to double quotes (presumably to keep later SQL happy).
		reg, _ = regexp.Compile("<p>(.)*</p>")
		info := reg.FindString(tmpStr)
		logs.GetFirstLogger().Trace("info = " + info)
		//p.AddResultItem("info", info)
		info = strings.Replace(info, "'", "\"", -1)
		info = strings.Replace(info, "&#39;", "\"", -1)

		item.AddItem("info", info)

		// Detail URL: the href value inside the <span>...<a>...</span> block.
		reg, _ = regexp.Compile("<span(.)*<a(.)*</span>")
		detailurl := reg.FindString(tmpStr)
		reg, _ = regexp.Compile("href(.)*\">")
		detailurl = reg.FindString(detailurl)
		detailurl = detailurl[strings.Index(detailurl, "\"")+1 : len(detailurl)]
		detailurl = detailurl[0:strings.Index(detailurl, "\"")]
		logs.GetFirstLogger().Trace("detailurl = " + detailurl)
		//p.AddResultItem("detailurl", detailurl)
		item.AddItem("detailurl", detailurl)
		//p.AddResultItem("key", detailurl)
		item.SetKey(detailurl)
		// Queue the article page for a follow-up "content" crawl.
		p.AddNewUrl(detailurl, "content")

		logs.GetFirstLogger().Trace("E================>")
		logs.GetFirstLogger().Tracef("count = %d", count)
		count = count + 1
		logs.GetFirstLogger().Warn(title)

		// NOTE(review): pos1/pos2 were already recomputed at the top of this
		// iteration; these assignments are redundant but harmless.
		pos1 = strings.Index(content, "<li>")
		pos2 = strings.Index(content, "</li>")
		p.AddPageItems(item)
	}

	return p
}
Example #10
0
// parseNewsDetail extracts the source and body of a single article page,
// rewrites relative image URLs, scales oversized images down to 360px wide,
// and — unless the article embeds a flash video — stores the resulting item
// on p. Returns p for chaining.
func (this *YouxiduoProcesser) parseNewsDetail(content string, p *page.Page) *page.Page {
	logs.GetFirstLogger().Trace("B TEST ARTICLE")
	//println(content)
	// title: already captured on the list page, not extracted again here
	item := page.NewPageItems(p.GetRequest().GetUrl())
	reg, _ := regexp.Compile(`<div><span><em(.)*<\/span></div>`)
	newssrc := reg.FindString(content)

	// news_src: the article's source (text of the inner <a> tag)
	reg, _ = regexp.Compile(`<a(.)*<\/a>`)
	newssrc = reg.FindString(newssrc)
	newssrc = newssrc[strings.Index(newssrc, ">")+1 : len(newssrc)]
	if strings.Index(newssrc, "<") >= 0 {
		newssrc = newssrc[0:strings.Index(newssrc, "<")]
	}

	logs.GetFirstLogger().Trace("newssrc = " + newssrc)
	//p.AddResultItem("news_src", newssrc)
	item.AddItem("news_src", newssrc)
	// news_content: the article body inside <div class="artCon">
	reg, _ = regexp.Compile(`<div class=\"artCon\">(.|\s)*<\/div>(\s)*<div class=\"pagebreak\"`)
	news := reg.FindString(content)
	if len(news) > 0 {
		// Trim the trailing pagination block the greedy regex dragged in.
		pbIndex := strings.Index(news, "<div class=\"pagebreak\"")
		if pbIndex > 0 {
			news = news[0:pbIndex]
		}

	}
	newsIndex1 := strings.Index(news, ">")
	newsIndex2 := strings.Index(news, "</div>")
	if newsIndex1 >= 0 && newsIndex2 >= 0 {
		news = news[newsIndex1+1 : newsIndex2]
	}

	//p.AddResultItem("news_content", news)
	// Normalize quotes (presumably to keep downstream SQL/HTML happy).
	news = strings.Replace(news, "'", "\"", -1)
	news = strings.Replace(news, "&#39;", "\"", -1)
	//	imgSrcIndex := strings.Index(news, "<img src=\"/")
	//	if imgSrcIndex >= 0 {
	//		news = strings.Replace(news, "<img src=\"/", "<img src=\""+util.GetUrlDomain(p.GetRequest().GetUrl())+"/", -1)
	//	}
	////////////////////
	imgSrcIndex := strings.Index(news, "<img ")
	if imgSrcIndex >= 0 {
		// Absolutize image paths that start at the site root.
		news = strings.Replace(news, "<img src=\"/", "<img src=\""+util.GetUrlDomain(p.GetRequest().GetUrl())+"/", -1)
		// BUG FIX: this pattern contains regex metacharacters but was passed
		// to strings.Replace, which matches literally — it could never fire.
		// Apply it as the regular expression it was meant to be.
		regAlt, _ := regexp.Compile(`<img alt="[^"]*" src="/`)
		news = regAlt.ReplaceAllString(news, "<img src=\""+util.GetUrlDomain(p.GetRequest().GetUrl())+"/")
		//println(news_content)

		//	println("===============")
		reg, _ = regexp.Compile(`<img[^>]*>`)
		imgList := reg.FindAllString(news, -1)
		for _, img := range imgList {
			//strings.Replace(news_content, img)
			//println("old img ==>" + img)
			newImg := img
			styleIndex := strings.Index(newImg, "style=\"")
			if styleIndex >= 0 {
				// Has an inline style: rescale the dimensions inside it.
				styleStr := newImg[styleIndex+len("style=\""):]
				endIndex := strings.Index(styleStr, "\"")
				if endIndex > 0 {
					styleStr = styleStr[0:endIndex]
				}
				newstyleStr := changeImgSize(styleStr)
				newImg = strings.Replace(img, styleStr, newstyleStr, -1)

			} else {
				// No style attribute: look for explicit width/height attrs.
				reg2, _ := regexp.Compile(`width=\"[0-9]+\"`)
				tmpWidthStr := reg2.FindString(img)

				reg2, _ = regexp.Compile(`height=\"[0-9]+\"`)
				tmpHeightStr := reg2.FindString(img)
				//println("tmp height str = " + tmpHeightStr)
				var f float32 = 1.0
				if len(tmpWidthStr) > 0 {
					tmpStr1 := tmpWidthStr[strings.Index(tmpWidthStr, "\"")+1:]
					tmpStr1 = tmpStr1[0:strings.Index(tmpStr1, "\"")]

					tmpWidth, _ := strconv.Atoi(tmpStr1)
					if tmpWidth > 360 {
						// Clamp width to 360 and scale height proportionally.
						f = float32(tmpWidth) / 360.0
						if len(tmpHeightStr) > 0 {
							tmpStr2 := tmpHeightStr[strings.Index(tmpHeightStr, "\"")+1:]
							tmpStr2 = tmpStr2[0:strings.Index(tmpStr2, "\"")]
							tmpHeight, _ := strconv.Atoi(tmpStr2)

							newImg = strings.Replace(img, tmpWidthStr, "width=\"360\"", -1)
							tmpHeight = int(float32(tmpHeight) / f)
							newImg = strings.Replace(newImg, tmpHeightStr, "height=\""+strconv.Itoa(tmpHeight)+"\"", -1)

						} else {
							newImg = strings.Replace(img, tmpWidthStr, "width=\"360\"", -1)
						}

					}
				}

			}

			// Splice the resized tag back into the body if it changed.
			//println("new img ==>" + newImg)
			if img != newImg {
				news = strings.Replace(news, img, newImg, -1)
			}
		}
	}
	//////
	// BUG FIX: this "official site" link-stripping pattern was also passed to
	// strings.Replace as a literal and never matched; use a real regex.
	reg, _ = regexp.Compile(`<a[^>]*>官方网站</a>`)
	news = reg.ReplaceAllString(news, "")

	logs.GetFirstLogger().Trace("news = " + news)
	// Skip storing articles that embed a flash video.
	reg, _ = regexp.Compile(`<[^>]*shockwave-flash[^>]*>`)
	tmpN := reg.FindString(news)

	item.AddItem("news_content", news)
	//p.AddResultItem("key", p.GetRequest().GetUrl())
	if len(tmpN) <= 0 {
		p.AddPageItems(item)
	}

	logs.GetFirstLogger().Trace("E TEST ARTICLE")

	return p
}
Example #11
0
// output logs a collected news item and inserts it into the news table,
// skipping items whose title already exists.
// SECURITY FIX: item fields originate from scraped web pages (untrusted
// input); the previous string-concatenated SQL was injectable. Both queries
// now use "?" placeholders so the driver escapes values.
func (this *PageOutSql) output(item *PageItems) {
	logs.GetFirstLogger().Info("NEWS BEGIN==============================")
	logs.GetFirstLogger().Info("url\t= " + item.GetKey())
	logs.GetFirstLogger().Info("url2\t= " + item.GetItem("detailurl"))
	logs.GetFirstLogger().Info("title\t= " + item.GetItem("title"))
	logs.GetFirstLogger().Info("pic\t= " + item.GetItem("pic"))
	logs.GetFirstLogger().Info("breif\t= " + item.GetItem("info"))
	logs.GetFirstLogger().Info("time\t= " + item.GetItem("time"))
	logs.GetFirstLogger().Info("src\t= " + item.GetItem("news_src"))
	logs.GetFirstLogger().Info("news\t= " + item.GetItem("news_content"))
	logs.GetFirstLogger().Info("NEWS END  ==============================")

	// Lazily load the deploy/audit config on first use.
	if this.audit == -1 {
		this.getConfigData()
	}

	// Duplicate check: skip items whose title is already stored.
	rows, err := this.db.Query("select count(*) from news where title=?", item.GetItem("title"))
	if err != nil {
		logs.GetFirstLogger().Error("select count(*) from news error : " + err.Error())
		return
	}
	defer rows.Close()
	var sameCount int = 0
	if rows.Next() {
		if err := rows.Scan(&sameCount); err != nil {
			logs.GetFirstLogger().Error("fetch status value error :" + err.Error())
			return
		}
	}
	if sameCount > 0 {
		return
	}

	_, err2 := this.db.Exec(
		"insert into news(`origin_url`,`title`,`icon_url`,`brief`,`get_time`,`origin`,`content`,`type`,`status`) values (?,?,?,?,?,?,?,0,0)",
		item.GetKey(),
		item.GetItem("title"),
		item.GetItem("pic"),
		item.GetItem("info"),
		item.GetItem("time"),
		item.GetItem("news_src"),
		item.GetItem("news_content"),
	)
	if err2 != nil {
		logs.GetFirstLogger().Error("insert error : " + err2.Error())
	}
}