func SearchNoMatch(f *os.File, targetArray []string, checkfile string) { sourceBuf := bufio.NewReader(f) index := 0 lineNo := 0 resultArray := make([]string, len(targetArray)) for { line, err := sourceBuf.ReadString('\n') if err != nil || io.EOF == err { break } lineNo = lineNo + 1 line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line) if strings.Contains(line, targetArray[index]) { resultArray[index] = line index = index + 1 } else if index < len(targetArray) && index > 0 { for i := 0; i < index; i++ { logs.Debug(resultArray[i]) } logs.Debug(line) index = 0 continue } if index >= len(targetArray) { index = 0 } } }
func HandleArg() (string, string, int, error) { targetFile := "target.txt" checkFile := "source.txt" searchType := SEARCH_NO_MATCH_UP var err error if len(os.Args) > 1 && len(os.Args) < 4 { logs.Debug("usage searchTarget error, please use help") return "", "", 0, errors.New("usage error") } if len(os.Args) >= 4 { searchType, err = strconv.Atoi(os.Args[3]) if err == nil { targetFile = os.Args[1] checkFile = os.Args[2] } else { searchType = SEARCH_NO_MATCH_UP } } typeStr := "SEARCH_NO_MATCH" if searchType == SEARCH_MATCH { typeStr = "SEARCH_MATCH" } else if searchType == SEARCH_NO_MATCH_UP { typeStr = "SEARCH_NO_MATCH_UP" } logs.Debug("targetFileName:%s, checkFileName:%s, searchType:%s\n", targetFile, checkFile, typeStr) return targetFile, checkFile, searchType, nil }
func Help() bool { if len(os.Args) == 2 && strings.EqualFold(os.Args[1], "help") { logs.Debug("Usage: searchTarget [targetFileName checkFileName searchType][line num]") logs.Debug("searchType have two values 1: SEARCH_NO_MATCH 2:SEARCH_MATCH 3:SEARCH_NO_MATCH_UP 4:SEARCH_MATCH_GREP_LINE") logs.Debug("when use SEARCH_MATCH_GREP_LINE searchType, need line num parameter") return true } return false }
func GetDetailUrl(HostUrl string) string { query, err := goquery.NewDocument(HostUrl) if err != nil { logs.Debug("err: %s", err.Error()) return "" } titles := query.Find("#primary div.primary-site h2.entry-title") for index := 0; index < titles.Length(); index++ { t := titles.Eq(index) ta := t.Find("a[href]") tsrc, _ := ta.Attr("href") if len(tsrc) > 0 { PageUrlList.Add(tsrc) } } naviDom := query.Find("#pagenavi a.page-numbers") if naviDom.Length() <= 0 { return "" } nextPage := naviDom.Eq(naviDom.Length() - 1) title, _ := nextPage.Attr("title") if title == "下页" { href, _ := nextPage.Attr("href") return href } return "" }
func SearchMatch(f *os.File, targetArray []string, checkfile string) { sourceBuf := bufio.NewReader(f) index := 0 lineNo := 0 resultArray := make([]string, len(targetArray)) for { line, err := sourceBuf.ReadString('\n') if err != nil || io.EOF == err { break } lineNo = lineNo + 1 if strings.Contains(line, targetArray[index]) { line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line) resultArray[index] = line index = index + 1 } else if index < len(targetArray) { index = 0 continue } if index == len(targetArray) { for _, data := range resultArray { logs.Debug(data) } index = 0 } } }
func getDams1ItemData(objectStr string, query *goquery.Document, ctx *Context) { itemBody := query.Find(objectStr) if itemBody.Length() <= 0 { logs.Debug("finished %s", ctx.GetUrl()) return } //get detail page url for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ { div := itemBody.Eq(index) hrefa := div.Find("div.thumbnail-style h4 a[href]") href, _ := hrefa.Attr("href") var tc = false for _, u := range ctx.Spider.GetFilterUrl() { tc = strings.Contains(href, u) if tc { break } } if tc { continue } ctx.AddQueue(&context.Request{ Url: href, Rule: "详细页面", }) } }
func TestImageCanShow(t *testing.T) { //image ok //url := "http://zhiliaoyuan-zhiliao.stor.sinaapp.com/uploads/2016/01/20160119154259_54748.png" //image bad url := "http://zhiliaoyuan-zhiliao.stor.sinaapp.com/uploads/2016/01/20160117180444_86670.gif" ret, err := ImageCanShow(url) if err != nil { logs.Debug("err: %s", err.Error()) return } if ret { logs.Debug("image is ok") } else { logs.Debug("image is not ok") } }
func PrintStatistics(interval time.Duration) { for { if PageImageStop && ImageUrlList.Length() <= 0 { break } logs.Debug("checkPages: %d, checkImages: %d, checkFail: %d, imageOk: %d, imageBad: %d, pageListsize: %d, imageListSize: %d", checkState.CheckPage, checkState.CheckImage, checkState.CheckFail, checkState.ImageOk, checkState.ImageBad, PageUrlList.Length(), ImageUrlList.Length()) time.Sleep(interval * time.Second) } }
func main() { logs.SetUsePrefix(false) if Help() { return } fileName, sourceName, searchType, err := HandleArg() if err != nil { return } targetArray, err := file.ReadFile(fileName) if err != nil { logs.Debug("open file(%s) error:%s", fileName, err.Error()) return } sourceFile, err := os.Open(sourceName) if err != nil { logs.Debug("open file(%s) error: %s", sourceName, err.Error()) return } defer sourceFile.Close() switch searchType { case SEARCH_NO_MATCH: SearchNoMatch(sourceFile, targetArray, sourceName) case SEARCH_MATCH: SearchMatch(sourceFile, targetArray, sourceName) case SEARCH_NO_MATCH_UP: SearchNoMatchUp(sourceFile, targetArray, sourceName) case SEARCH_MATCH_GREP_LINE: SearchMatchGrepLine(sourceFile, targetArray, sourceName) default: logs.Debug("no match search type") } }
func SearchNoMatchUp(f *os.File, targetArray []string, checkfile string) { sourceBuf := bufio.NewReader(f) index := 0 lineNo := 0 if len(targetArray) < 2 { logs.Debug("the target string must exceed two lines or two lines.") return } resultArray := make([]string, len(targetArray)) for { line, err := sourceBuf.ReadString('\n') if err != nil || io.EOF == err { break } lineNo = lineNo + 1 line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line) if index == 0 && strings.Contains(line, targetArray[index]) == false { resultArray[index] = line index = index + 1 } else if index == 1 && strings.Contains(line, targetArray[index]) == false && strings.Contains(line, targetArray[0]) == false { index = 0 resultArray[index] = line index = index + 1 } else if index == 1 && strings.Contains(line, targetArray[index]) { for i := 0; i < index; i++ { logs.Debug(resultArray[i]) } logs.Debug(line) index = 0 } else { index = 0 } } }
func SaveDataToLocalDB(paperTitle string, paperContent string, source_title string, source_address string) { isExist, err := DataInDB(source_address) if err != nil { logs.Error("judge url in db or not error") } else if isExist == true { logs.Debug("current url is exsist in database") return } sql := "insert into zhiliaoyuan(id, paper_title, source_title, source_url, add_time) values(" sql += "nextval('zhiliaoyuan_id_seq'),'" + paperTitle + "','" sql += source_title + "','" + source_address + "',now());" err = db.ExecuteSql(sql) if err != nil { logs.Error(err.Error()) } }
// main wires up the image-checking pipeline:
//  1. seed page URLs (from a file here; the live crawl is commented out),
//  2. extract image URLs from the collected pages,
//  3. check each image,
//  4. log running statistics until the pipeline signals completion on
//     the `finished` channel.
//
// The three stages run as goroutines and communicate through the
// package-level queues; `pageRepeat` de-duplicates page URLs.
func main() {
	//create remove repeat map
	pageRepeat = make(map[string]int)
	finished = make(chan bool)
	//start parse pages
	//go GetPages("http://www.zhiliaoyuan.com")
	GetPagesFromFile("page_input.txt")
	//start get image url
	go GetPageImageUrl()
	//start check image
	go CheckImage()
	//start statistics
	// NOTE(review): PrintStatistics multiplies its argument by time.Second,
	// so the bare 5 here means a 5-second interval.
	go PrintStatistics(5)
	<-finished
	logs.Debug("finished check image ...")
}
"生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&context.Request{ Url: "http://www.cnblogs.com", Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() itemBody := query.Find("div.post_item_body") if itemBody.Length() <= 0 { logs.Debug("finished %s", ctx.GetUrl()) return } //get detail page url for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ { div := itemBody.Eq(index) hrefa := div.Find("a[class]") href, _ := hrefa.Attr("href") var tc = false for _, u := range ctx.Spider.GetFilterUrl() { tc = strings.Contains(href, u) if tc { break }
"生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&context.Request{ Url: "http://www.ibm.com/developerworks/cn/views/linux/libraryview.jsp", Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() itemBody := query.Find("#ibm-content-body #ibm-content-main div.ibm-container table tbody tr a[href]") if itemBody.Length() <= 0 { logs.Debug("finished %s", ctx.GetUrl()) return } //get detail page url for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ { aobj := itemBody.Eq(index) href, _ := aobj.Attr("href") var tc = false for _, u := range ctx.Spider.GetFilterUrl() { tc = strings.Contains(href, u) if tc { break } }
"生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&context.Request{ Url: "http://www.freebuf.com/", Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() itemBody := query.Find("div.row div.main-mid #timeline div.news_inner") if itemBody.Length() <= 0 { logs.Debug("finished %s", ctx.GetUrl()) return } //get detail page url for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ { div := itemBody.Eq(index) hrefa := div.Find("div.news-info dl dt a[href]") href, _ := hrefa.Attr("href") var tc = false for _, u := range ctx.Spider.GetFilterUrl() { tc = strings.Contains(href, u) if tc { break }
for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&context.Request{ Url: "http://www.infoq.com/cn/articles", Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() getInfoqItemData("div.news_type1", query, ctx) getInfoqItemData("div.news_type2", query, ctx) //get next page url if ctx.Spider.GetStopSpider() { logs.Debug("stop spider: %s", ctx.Spider.GetHostUrl()) return } nextPage := query.Find("div.load_more_articles a.blue") if nextPage.Length() <= 0 { return } text, _ := nextPage.Attr("href") if text != "" { isLast := strings.Contains(ctx.GetUrl(), text) if (isLast == false) && ctx.Spider.ContinueGetItem() { nextPageUrl := ctx.Spider.GetHostUrl() + text
"生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&context.Request{ Url: "http://blog.csdn.net/", Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() itemBody := query.Find("div.main_center div.blog_list") if itemBody.Length() <= 0 { logs.Debug("finished %s", ctx.GetUrl()) return } //get detail page url for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ { div := itemBody.Eq(index) hrefa := div.Find("h1 a[name]") href, _ := hrefa.Attr("href") var tc = false for _, u := range ctx.Spider.GetFilterUrl() { tc = strings.Contains(href, u) if tc { break }
// SearchMatchGrepLine scans f for len(targetArray) consecutive lines where
// line i contains targetArray[i] (like SearchMatch) and, on each full
// match, prints the match with surrounding context — grep-style: the
// fixed-size ring buffer fixedList carries up to lineNum lines of leading
// context plus the matched lines, and up to lineNum following lines are
// read ahead into bakFixedList and printed as trailing context. The
// trailing-context lines are fed back into fixedList so they can serve as
// leading context for a subsequent match. lineNum comes from os.Args[4]
// (default 1; non-positive or unparsable values fall back to 1 —
// NOTE(review): the Atoi error is deliberately ignored here).
func SearchMatchGrepLine(f *os.File, targetArray []string, checkfile string) {
	//read the number of line from args
	printNum := 1 // 1-based counter of matches printed so far
	lineNum := 1  // lines of context before/after each match
	if len(os.Args) >= 5 {
		lineNum, _ = strconv.Atoi(os.Args[4])
	}
	if lineNum <= 0 {
		lineNum = 1
	}
	sourceBuf := bufio.NewReader(f)
	index := 0  // number of consecutive target patterns matched so far
	lineNo := 0 // 1-based line counter for the "file:line" prefix
	// Ring buffers (project fixedlist): capacity = context + match length.
	fixedList := fixedlist.New(len(targetArray) + lineNum)
	bakFixedList := fixedlist.New(lineNum)
	for {
		line, err := sourceBuf.ReadString('\n')
		// NOTE(review): a final line without a trailing newline is dropped
		// here (returned with io.EOF); "io.EOF == err" is redundant.
		if err != nil || io.EOF == err {
			break
		}
		lineNo = lineNo + 1
		line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line)
		// Every line enters the ring buffer; matches also advance index.
		if strings.Contains(line, targetArray[index]) {
			index = index + 1
			fixedList.Add(line)
		} else {
			fixedList.Add(line)
			index = 0
		}
		if index < len(targetArray) {
			continue
		}
		if index >= len(targetArray) {
			// Full match: read up to lineNum trailing-context lines ahead.
			tmpLineNum := 1
			for {
				//read lineNum line to bakFixedList
				line, err := sourceBuf.ReadString('\n')
				if err != nil || io.EOF == err {
					//read finished
					break
				}
				lineNo = lineNo + 1
				line = fmt.Sprintf("%s:%d %s", checkfile, lineNo, line)
				bakFixedList.Add(line)
				tmpLineNum = tmpLineNum + 1
				if tmpLineNum >= lineNum {
					break
				}
			}
		}
		if index >= len(targetArray) {
			//print fixedList comment
			logs.Debug("xxxxxxxxxxxxxx%d timesxxxxxxxxxxxx\n", printNum)
			printNum = printNum + 1
			// Leading context + matched lines.
			for {
				v := fixedList.PopFront()
				if v == nil {
					break
				}
				logs.Debug("%s", v)
			}
			//print bakFixedList comment
			// Trailing context; re-add to fixedList so these lines can be
			// leading context for the next match.
			for {
				v := bakFixedList.PopFront()
				if v == nil {
					break
				}
				logs.Debug("%s", v)
				fixedList.Add(v)
			}
		}
		index = 0
	}
}
//find content bodyContent := query.Find("div.container div.row-fluid div.span8 div.blog") //filter bad paper for _, dom := range ctx.Spider.GetFilterDom() { tmpDom := bodyContent.Find(dom) if tmpDom.Length() > 1 { return } } bodyContent = formatDams1Content(ctx.Spider.GetRemoveDom(), bodyContent) if bodyContent == nil { logs.Debug("bodyContent is null") return } bodyHtml, err := bodyContent.Html() if err != nil { logs.Debug("get body content html err: %s", err.Error()) return } //remove bad content for _, item := range dams1DeleteContent { reg := regexp.MustCompile(item) bodyHtml = reg.ReplaceAllLiteralString(bodyHtml, "") } paperTitle = strings.TrimSpace(paperTitle)
"生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&context.Request{ Url: "http://www.linuxnews.com", Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() tmpquery, _ := query.Html() logs.Debug(tmpquery) itemBody := query.Find("ul.article-list") if itemBody.Length() <= 0 { logs.Debug("finished %s", ctx.GetUrl()) return } //get detail page url for index := 0; index < itemBody.Length() && ctx.Spider.ContinueGetItem(); index++ { div := itemBody.Eq(index) title := div.Find("span.title a.new") href, _ := title.Attr("href") var tc = false
//find content bodyContent := query.Find("div.container div.row-fluid div.span8 div.blog") //filter bad paper for _, dom := range ctx.Spider.GetFilterDom() { tmpDom := bodyContent.Find(dom) if tmpDom.Length() > 1 { return } } bodyContent = formatDamsContent(ctx.Spider.GetRemoveDom(), bodyContent) if bodyContent == nil { logs.Debug("bodyContent is null") return } bodyHtml, err := bodyContent.Html() if err != nil { logs.Debug("get body content html err: %s", err.Error()) return } //remove bad content for _, item := range damsDeleteContent { reg := regexp.MustCompile(item) bodyHtml = reg.ReplaceAllLiteralString(bodyHtml, "") } paperTitle = strings.TrimSpace(paperTitle)