示例#1
0
func SearchNoMatch(f *os.File, targetArray []string, checkfile string) {
	sourceBuf := bufio.NewReader(f)

	index := 0
	lineNo := 0

	resultArray := make([]string, len(targetArray))
	for {
		line, err := sourceBuf.ReadString('\n')
		if err != nil || io.EOF == err {
			break
		}
		lineNo = lineNo + 1

		line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line)
		if strings.Contains(line, targetArray[index]) {
			resultArray[index] = line
			index = index + 1

		} else if index < len(targetArray) && index > 0 {
			for i := 0; i < index; i++ {
				logs.Debug(resultArray[i])
			}
			logs.Debug(line)
			index = 0
			continue
		}

		if index >= len(targetArray) {
			index = 0
		}

	}
}
示例#2
0
// HandleArg parses the command-line arguments into (targetFileName,
// checkFileName, searchType). With fewer than four arguments (but more than
// the bare program name) it logs a usage hint and returns an error; with no
// extra arguments the built-in defaults are used. A non-numeric searchType
// argument falls back to SEARCH_NO_MATCH_UP and keeps the default file names.
func HandleArg() (string, string, int, error) {
	targetFile := "target.txt"
	checkFile := "source.txt"
	searchType := SEARCH_NO_MATCH_UP

	argc := len(os.Args)
	if argc > 1 && argc < 4 {
		logs.Debug("usage searchTarget error, please use help")
		return "", "", 0, errors.New("usage error")
	}
	if argc >= 4 {
		parsed, err := strconv.Atoi(os.Args[3])
		if err != nil {
			// Unparsable type: keep defaults for everything.
			searchType = SEARCH_NO_MATCH_UP
		} else {
			searchType = parsed
			targetFile = os.Args[1]
			checkFile = os.Args[2]
		}
	}

	typeStr := "SEARCH_NO_MATCH"
	switch searchType {
	case SEARCH_MATCH:
		typeStr = "SEARCH_MATCH"
	case SEARCH_NO_MATCH_UP:
		typeStr = "SEARCH_NO_MATCH_UP"
	}

	logs.Debug("targetFileName:%s, checkFileName:%s, searchType:%s\n", targetFile, checkFile, typeStr)
	return targetFile, checkFile, searchType, nil
}
示例#3
0
// Help prints usage information when the program was invoked as
// "searchTarget help" (case-insensitive). It reports whether help was shown,
// so the caller can exit early.
func Help() bool {
	wantsHelp := len(os.Args) == 2 && strings.EqualFold(os.Args[1], "help")
	if !wantsHelp {
		return false
	}
	logs.Debug("Usage: searchTarget [targetFileName checkFileName searchType][line num]")
	logs.Debug("searchType have two values 1: SEARCH_NO_MATCH 2:SEARCH_MATCH 3:SEARCH_NO_MATCH_UP 4:SEARCH_MATCH_GREP_LINE")
	logs.Debug("when use SEARCH_MATCH_GREP_LINE searchType, need line num parameter")
	return true
}
示例#4
0
// GetDetailUrl fetches the page at HostUrl, adds every entry-title link it
// finds to the global PageUrlList, and returns the URL of the next page when
// the last pagination link is titled "下页" (next page); otherwise "".
func GetDetailUrl(HostUrl string) string {
	doc, err := goquery.NewDocument(HostUrl)
	if err != nil {
		logs.Debug("err: %s", err.Error())
		return ""
	}

	// Collect every article link on this listing page.
	doc.Find("#primary div.primary-site h2.entry-title").Each(func(_ int, title *goquery.Selection) {
		if href, _ := title.Find("a[href]").Attr("href"); len(href) > 0 {
			PageUrlList.Add(href)
		}
	})

	pager := doc.Find("#pagenavi a.page-numbers")
	if pager.Length() <= 0 {
		return ""
	}

	// The last pagination anchor is the "next page" link when so titled.
	last := pager.Eq(pager.Length() - 1)
	if title, _ := last.Attr("title"); title == "下页" {
		next, _ := last.Attr("href")
		return next
	}
	return ""
}
示例#5
0
func SearchMatch(f *os.File, targetArray []string, checkfile string) {
	sourceBuf := bufio.NewReader(f)

	index := 0
	lineNo := 0

	resultArray := make([]string, len(targetArray))
	for {
		line, err := sourceBuf.ReadString('\n')
		if err != nil || io.EOF == err {
			break
		}
		lineNo = lineNo + 1

		if strings.Contains(line, targetArray[index]) {
			line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line)
			resultArray[index] = line
			index = index + 1

		} else if index < len(targetArray) {
			index = 0
			continue
		}

		if index == len(targetArray) {
			for _, data := range resultArray {
				logs.Debug(data)
			}
			index = 0
		}
	}
}
示例#6
0
文件: dams1.go 项目: huih/webspider
// getDams1ItemData finds every element matching objectStr in the parsed page
// and queues each contained detail-page link for the "详细页面" rule, skipping
// links that contain any of the spider's filter substrings.
func getDams1ItemData(objectStr string, query *goquery.Document, ctx *Context) {
	items := query.Find(objectStr)
	if items.Length() <= 0 {
		logs.Debug("finished %s", ctx.GetUrl())
		return
	}

	// Queue each item's detail-page URL while the spider still wants items.
	for i := 0; i < items.Length(); i++ {
		if !ctx.Spider.ContinueGetItem() {
			break
		}
		href, _ := items.Eq(i).Find("div.thumbnail-style h4 a[href]").Attr("href")

		// Skip URLs matching any configured filter substring.
		skip := false
		for _, filter := range ctx.Spider.GetFilterUrl() {
			if strings.Contains(href, filter) {
				skip = true
				break
			}
		}
		if skip {
			continue
		}

		ctx.AddQueue(&context.Request{
			Url:  href,
			Rule: "详细页面",
		})
	}
}
示例#7
0
// TestImageCanShow exercises ImageCanShow against a known-bad image URL and
// logs the verdict (swap in the commented-out URL to test the good case).
func TestImageCanShow(t *testing.T) {
	//image ok
	//url := "http://zhiliaoyuan-zhiliao.stor.sinaapp.com/uploads/2016/01/20160119154259_54748.png"

	//image bad
	url := "http://zhiliaoyuan-zhiliao.stor.sinaapp.com/uploads/2016/01/20160117180444_86670.gif"

	canShow, err := ImageCanShow(url)
	if err != nil {
		logs.Debug("err: %s", err.Error())
		return
	}
	if !canShow {
		logs.Debug("image is not ok")
		return
	}
	logs.Debug("image is ok")
}
示例#8
0
// PrintStatistics periodically logs the crawl/check counters until image
// processing has stopped AND the image URL queue has drained.
//
// NOTE(review): interval is typed time.Duration but the visible caller passes
// a bare integer (PrintStatistics(5)), so `interval * time.Second` happens to
// yield 5s only because Duration(5) * time.Second == 5e9ns. The parameter is
// effectively "seconds"; consider taking an int or having callers pass
// 5*time.Second and sleeping interval directly — confirm all call sites first.
func PrintStatistics(interval time.Duration) {
	for {
		// Stop once the pipeline is done and no image URLs remain queued.
		if PageImageStop && ImageUrlList.Length() <= 0 {
			break
		}

		logs.Debug("checkPages: %d, checkImages: %d, checkFail: %d, imageOk: %d, imageBad: %d, pageListsize: %d, imageListSize: %d",
			checkState.CheckPage, checkState.CheckImage, checkState.CheckFail, checkState.ImageOk, checkState.ImageBad, PageUrlList.Length(), ImageUrlList.Length())
		time.Sleep(interval * time.Second)
	}
}
示例#9
0
// main drives the search tool: it parses the arguments, loads the target
// pattern lines, opens the file to be checked, and dispatches to the search
// routine selected by searchType.
func main() {
	logs.SetUsePrefix(false)
	if Help() {
		return
	}

	targetName, checkName, searchType, err := HandleArg()
	if err != nil {
		return
	}

	// The target file supplies the ordered list of pattern lines.
	targets, err := file.ReadFile(targetName)
	if err != nil {
		logs.Debug("open file(%s) error:%s", targetName, err.Error())
		return
	}

	src, err := os.Open(checkName)
	if err != nil {
		logs.Debug("open file(%s) error: %s", checkName, err.Error())
		return
	}
	defer src.Close()

	switch searchType {
	case SEARCH_NO_MATCH:
		SearchNoMatch(src, targets, checkName)
	case SEARCH_MATCH:
		SearchMatch(src, targets, checkName)
	case SEARCH_NO_MATCH_UP:
		SearchNoMatchUp(src, targets, checkName)
	case SEARCH_MATCH_GREP_LINE:
		SearchMatchGrepLine(src, targets, checkName)
	default:
		logs.Debug("no match search type")
	}
}
示例#10
0
// SearchNoMatchUp scans the contents of f (named checkfile, used for log
// prefixes) line by line. It tracks the most recent line matching NEITHER
// targetArray[0] nor targetArray[1]; when a line containing targetArray[1]
// then appears, that remembered line and the matching line are logged
// together. Requires at least two target strings.
//
// Fix vs. the original: bufio.Reader.ReadString can return a final
// unterminated line together with io.EOF; the old `err != nil` break dropped
// that line. We now break only on an empty read (EOF or a persistent read
// error with no data). The matching state machine is unchanged.
func SearchNoMatchUp(f *os.File, targetArray []string, checkfile string) {
	sourceBuf := bufio.NewReader(f)

	index := 0  // 0: looking for a non-matching line; 1: have one remembered
	lineNo := 0 // 1-based line number in checkfile

	if len(targetArray) < 2 {
		logs.Debug("the target string must exceed two lines or two lines.")
		return
	}

	resultArray := make([]string, len(targetArray))
	for {
		// Ignore the error: data read before EOF/error is still returned,
		// and the next call yields an empty string, which ends the loop.
		line, _ := sourceBuf.ReadString('\n')
		if len(line) == 0 {
			break
		}
		lineNo++

		line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line)
		if index == 0 && strings.Contains(line, targetArray[index]) == false {
			// Remember the first line that doesn't match target[0].
			resultArray[index] = line
			index = index + 1
		} else if index == 1 && strings.Contains(line, targetArray[index]) == false && strings.Contains(line, targetArray[0]) == false {
			// Still no match for either target: remember this newer line instead.
			index = 0
			resultArray[index] = line
			index = index + 1
		} else if index == 1 && strings.Contains(line, targetArray[index]) {
			// target[1] appeared after a remembered non-match: report both.
			for i := 0; i < index; i++ {
				logs.Debug(resultArray[i])
			}
			logs.Debug(line)
			index = 0
		} else {
			index = 0
		}
	}
}
示例#11
0
// SaveDataToLocalDB inserts a record for the scraped paper into the
// zhiliaoyuan table, skipping the insert when source_address already exists
// in the database.
//
// SECURITY(review): the INSERT statement is built by string concatenation
// from paperTitle/source_title/source_address. A single quote in any of them
// breaks the statement, and attacker-controlled page content makes this
// SQL-injectable. Switch to parameterized queries if db.ExecuteSql (or the
// underlying driver) supports placeholders — confirm its API.
//
// NOTE(review): paperContent is accepted but never used; the insert stores
// only the title and source fields. Confirm whether content should be saved.
func SaveDataToLocalDB(paperTitle string, paperContent string, source_title string, source_address string) {

	// Deduplicate by source URL; on a lookup error we log and fall through
	// to the insert anyway (best-effort behavior).
	isExist, err := DataInDB(source_address)
	if err != nil {
		logs.Error("judge url in db or not error")
	} else if isExist == true {
		logs.Debug("current url is exsist in database")
		return
	}

	// See SECURITY note above: values are spliced directly into the SQL text.
	sql := "insert into zhiliaoyuan(id, paper_title, source_title, source_url, add_time) values("
	sql += "nextval('zhiliaoyuan_id_seq'),'" + paperTitle + "','"
	sql += source_title + "','" + source_address + "',now());"

	err = db.ExecuteSql(sql)
	if err != nil {
		logs.Error(err.Error())
	}
}
示例#12
0
// main wires up the image-checking pipeline: it seeds page URLs from a file,
// then runs URL extraction, image checking, and periodic statistics printing
// concurrently, blocking until the pipeline signals completion.
func main() {
	// Shared state must exist before any goroutine below is started:
	// pageRepeat deduplicates visited pages; finished signals completion.
	//create remove repeat map
	pageRepeat = make(map[string]int)
	finished = make(chan bool)

	//start parse pages
	//go GetPages("http://www.zhiliaoyuan.com")
	GetPagesFromFile("page_input.txt")

	//start get image url
	go GetPageImageUrl()

	//start check image
	go CheckImage()

	// NOTE(review): PrintStatistics takes a time.Duration but is passed a
	// bare 5; it only yields a 5s period because that function multiplies by
	// time.Second internally — see PrintStatistics.
	//start statistics
	go PrintStatistics(5)

	// Block until the pipeline reports it is done.
	<-finished

	logs.Debug("finished check image ...")
}
示例#13
0
文件: cnblog.go 项目: huih/webspider
			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://www.cnblogs.com",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					itemBody := query.Find("div.post_item_body")

					if itemBody.Length() <= 0 {
						logs.Debug("finished %s", ctx.GetUrl())
						return
					}

					//get detail page url
					for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ {
						div := itemBody.Eq(index)
						hrefa := div.Find("a[class]")
						href, _ := hrefa.Attr("href")

						var tc = false
						for _, u := range ctx.Spider.GetFilterUrl() {
							tc = strings.Contains(href, u)
							if tc {
								break
							}
示例#14
0
文件: ibm.go 项目: huih/webspider
			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://www.ibm.com/developerworks/cn/views/linux/libraryview.jsp",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					itemBody := query.Find("#ibm-content-body #ibm-content-main div.ibm-container table tbody tr a[href]")

					if itemBody.Length() <= 0 {
						logs.Debug("finished %s", ctx.GetUrl())
						return
					}

					//get detail page url
					for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ {
						aobj := itemBody.Eq(index)
						href, _ := aobj.Attr("href")

						var tc = false
						for _, u := range ctx.Spider.GetFilterUrl() {
							tc = strings.Contains(href, u)
							if tc {
								break
							}
						}
示例#15
0
文件: freebuf.go 项目: huih/webspider
			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://www.freebuf.com/",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					itemBody := query.Find("div.row div.main-mid #timeline div.news_inner")

					if itemBody.Length() <= 0 {
						logs.Debug("finished %s", ctx.GetUrl())
						return
					}

					//get detail page url
					for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ {
						div := itemBody.Eq(index)
						hrefa := div.Find("div.news-info dl dt a[href]")
						href, _ := hrefa.Attr("href")

						var tc = false
						for _, u := range ctx.Spider.GetFilterUrl() {
							tc = strings.Contains(href, u)
							if tc {
								break
							}
示例#16
0
文件: infoq.go 项目: huih/webspider
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://www.infoq.com/cn/articles",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					getInfoqItemData("div.news_type1", query, ctx)
					getInfoqItemData("div.news_type2", query, ctx)

					//get next page url
					if ctx.Spider.GetStopSpider() {
						logs.Debug("stop spider: %s", ctx.Spider.GetHostUrl())
						return
					}

					nextPage := query.Find("div.load_more_articles a.blue")
					if nextPage.Length() <= 0 {
						return
					}

					text, _ := nextPage.Attr("href")

					if text != "" {
						isLast := strings.Contains(ctx.GetUrl(), text)
						if (isLast == false) && ctx.Spider.ContinueGetItem() {

							nextPageUrl := ctx.Spider.GetHostUrl() + text
示例#17
0
文件: csdn.go 项目: huih/webspider
			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://blog.csdn.net/",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					itemBody := query.Find("div.main_center div.blog_list")

					if itemBody.Length() <= 0 {
						logs.Debug("finished %s", ctx.GetUrl())
						return
					}

					//get detail page url
					for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ {
						div := itemBody.Eq(index)
						hrefa := div.Find("h1 a[name]")
						href, _ := hrefa.Attr("href")

						var tc = false
						for _, u := range ctx.Spider.GetFilterUrl() {
							tc = strings.Contains(href, u)
							if tc {
								break
							}
示例#18
0
// SearchMatchGrepLine scans the contents of f (named checkfile, used for log
// prefixes) for consecutive lines matching targetArray in order. Each time
// the full sequence matches, it prints a rolling window of recent lines plus
// up to lineNum following context lines (grep -A style). lineNum comes from
// os.Args[4] when present, defaulting to 1; the trailing context is fed back
// into the rolling window so overlapping matches keep their context.
//
// Fixes vs. the original:
//   - bufio.Reader.ReadString can return a final unterminated line together
//     with io.EOF; the old `err != nil` breaks dropped that line in both read
//     loops. We now break only on an empty read.
//   - Off-by-one: the context loop started its counter at 1 and incremented
//     after each read, so it collected only lineNum-1 lines for lineNum >= 2
//     despite the stated intent to "read lineNum line". It now reads lineNum.
//   - An empty targetArray made targetArray[index] panic; now a no-op.
func SearchMatchGrepLine(f *os.File, targetArray []string, checkfile string) {
	if len(targetArray) == 0 {
		return
	}

	//read the number of line from args
	printNum := 1 // 1-based counter of full matches printed so far
	lineNum := 1
	if len(os.Args) >= 5 {
		lineNum, _ = strconv.Atoi(os.Args[4])
	}
	if lineNum <= 0 {
		lineNum = 1
	}

	sourceBuf := bufio.NewReader(f)

	index := 0  // how many consecutive target lines have matched so far
	lineNo := 0 // 1-based line number in checkfile

	// fixedList holds the rolling window printed on a match; bakFixedList
	// holds the trailing context lines read after a match.
	fixedList := fixedlist.New(len(targetArray) + lineNum)
	bakFixedList := fixedlist.New(lineNum)

	for {
		// Ignore the error: data read before EOF/error is still returned,
		// and the next call yields an empty string, which ends the loop.
		line, _ := sourceBuf.ReadString('\n')
		if len(line) == 0 {
			break
		}
		lineNo = lineNo + 1

		line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line)
		if strings.Contains(line, targetArray[index]) {
			index = index + 1
		} else {
			index = 0
		}
		fixedList.Add(line)

		if index < len(targetArray) {
			continue
		}

		// Full sequence matched: read up to lineNum trailing context lines.
		for read := 0; read < lineNum; read++ {
			ctxLine, _ := sourceBuf.ReadString('\n')
			if len(ctxLine) == 0 { //read finished
				break
			}
			lineNo = lineNo + 1
			bakFixedList.Add(fmt.Sprintf("%s:%d %s", checkfile, lineNo, ctxLine))
		}

		// Print the window, then the trailing context; the context is also
		// pushed back into the window for potential overlapping matches.
		logs.Debug("xxxxxxxxxxxxxx%d timesxxxxxxxxxxxx\n", printNum)
		printNum = printNum + 1
		for {
			v := fixedList.PopFront()
			if v == nil {
				break
			}
			logs.Debug("%s", v)
		}
		for {
			v := bakFixedList.PopFront()
			if v == nil {
				break
			}
			logs.Debug("%s", v)
			fixedList.Add(v)
		}
		index = 0
	}
}
示例#19
0
文件: freebuf.go 项目: huih/webspider
			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://www.freebuf.com/",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					itemBody := query.Find("div.row div.main-mid #timeline div.news_inner")

					if itemBody.Length() <= 0 {
						logs.Debug("finished %s", ctx.GetUrl())
						return
					}

					//get detail page url
					for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ {
						div := itemBody.Eq(index)
						hrefa := div.Find("div.news-info dl dt a[href]")
						href, _ := hrefa.Attr("href")

						var tc = false
						for _, u := range ctx.Spider.GetFilterUrl() {
							tc = strings.Contains(href, u)
							if tc {
								break
							}
示例#20
0
文件: dams1.go 项目: huih/webspider
					//find content
					bodyContent := query.Find("div.container div.row-fluid div.span8 div.blog")

					//filter bad paper
					for _, dom := range ctx.Spider.GetFilterDom() {
						tmpDom := bodyContent.Find(dom)
						if tmpDom.Length() > 1 {
							return
						}
					}

					bodyContent = formatDams1Content(ctx.Spider.GetRemoveDom(), bodyContent)

					if bodyContent == nil {
						logs.Debug("bodyContent is null")
						return
					}
					bodyHtml, err := bodyContent.Html()
					if err != nil {
						logs.Debug("get body content html err: %s", err.Error())
						return
					}

					//remove bad content
					for _, item := range dams1DeleteContent {
						reg := regexp.MustCompile(item)
						bodyHtml = reg.ReplaceAllLiteralString(bodyHtml, "")
					}

					paperTitle = strings.TrimSpace(paperTitle)
示例#21
0
			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:  "http://www.linuxnews.com",
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					tmpquery, _ := query.Html()
					logs.Debug(tmpquery)

					itemBody := query.Find("ul.article-list")

					if itemBody.Length() <= 0 {
						logs.Debug("finished %s", ctx.GetUrl())
						return
					}

					//get detail page url
					for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ {
						div := itemBody.Eq(index)
						title := div.Find("span.title a.new")
						href, _ := title.Attr("href")

						var tc = false
示例#22
0
文件: dams.go 项目: huih/webspider
					//find content
					bodyContent := query.Find("div.container div.row-fluid div.span8 div.blog")

					//filter bad paper
					for _, dom := range ctx.Spider.GetFilterDom() {
						tmpDom := bodyContent.Find(dom)
						if tmpDom.Length() > 1 {
							return
						}
					}

					bodyContent = formatDamsContent(ctx.Spider.GetRemoveDom(), bodyContent)

					if bodyContent == nil {
						logs.Debug("bodyContent is null")
						return
					}
					bodyHtml, err := bodyContent.Html()
					if err != nil {
						logs.Debug("get body content html err: %s", err.Error())
						return
					}

					//remove bad content
					for _, item := range damsDeleteContent {
						reg := regexp.MustCompile(item)
						bodyHtml = reg.ReplaceAllLiteralString(bodyHtml, "")
					}

					paperTitle = strings.TrimSpace(paperTitle)