// SaveRule creates a new crawl rule or updates an existing one from the
// submitted form values.
func SaveRule(form url.Values, opUser string) (errMsg string, err error) {
    rule := model.NewCrawlRule()
    err = util.ConvertAssign(rule, form)
    if err != nil {
        logger.Errorln("rule ConvertAssign error", err)
        errMsg = err.Error()
        return
    }

    rule.OpUser = opUser

    if rule.Id != 0 {
        err = rule.Persist(rule)
    } else {
        _, err = rule.Insert()
    }

    if err != nil {
        errMsg = "内部服务器错误" // "internal server error"
        logger.Errorln("rule save:", errMsg, ":", err)
        return
    }

    return
}
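// Usage sketch (hypothetical caller, not part of this file): an admin HTTP
// handler could bind the posted form and the operator name directly:
//
//	func adminSaveRule(rw http.ResponseWriter, req *http.Request) {
//	    req.ParseForm()
//	    errMsg, err := SaveRule(req.PostForm, "admin") // "admin" is a placeholder op user
//	    if err != nil {
//	        http.Error(rw, errMsg, http.StatusInternalServerError)
//	        return
//	    }
//	    rw.Write([]byte(`{"ok":1}`))
//	}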
// FindRuleByPage returns one page of crawl rules matching the given
// conditions, along with the total rule count.
func FindRuleByPage(conds map[string]string, curPage, limit int) ([]*model.CrawlRule, int) {
    conditions := make([]string, 0, len(conds))
    for k, v := range conds {
        conditions = append(conditions, k+"="+v)
    }

    rule := model.NewCrawlRule()

    limitStr := strconv.Itoa((curPage-1)*limit) + "," + strconv.Itoa(limit)
    ruleList, err := rule.Where(strings.Join(conditions, " AND ")).Order("id DESC").Limit(limitStr).
        FindAll()
    if err != nil {
        logger.Errorln("rule service FindRuleByPage Error:", err)
        return nil, 0
    }

    total, err := rule.Count()
    if err != nil {
        logger.Errorln("rule service FindRuleByPage COUNT Error:", err)
        return nil, 0
    }

    return ruleList, total
}
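// Usage sketch (hypothetical values): fetch page 2 of the rules for one
// domain, 20 rules per page. Note that the map values are spliced into the
// SQL condition as-is, so string values must carry their own quotes:
//
//	conds := map[string]string{"domain": "'blog.csdn.net'"}
//	rules, total := FindRuleByPage(conds, 2, 20)
//	logger.Infoln("got", len(rules), "of", total, "rules")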
// ParseArticle fetches the article at the given url and parses it according
// to the crawl rule registered for its domain.
func ParseArticle(articleUrl string, auto bool) (*model.Article, error) {
    articleUrl = strings.TrimSpace(articleUrl)
    if !strings.HasPrefix(articleUrl, "http") {
        articleUrl = "http://" + articleUrl
    }

    tmpArticle := model.NewArticle()
    err := tmpArticle.Where("url='" + articleUrl + "'").Find("id")
    if err != nil || tmpArticle.Id != 0 {
        logger.Errorln(articleUrl, "already exists:", err)
        return nil, errors.New("article already exists")
    }

    // urlPaths looks like ["http:", "", "host", "path", "rest"].
    urlPaths := strings.SplitN(articleUrl, "/", 5)
    domain := urlPaths[2]

    for k, v := range domainPatch {
        if strings.Contains(domain, k) && !strings.Contains(domain, "www."+k) {
            domain = v
            break
        }
    }

    rule := model.NewCrawlRule()
    err = rule.Where("domain='" + domain + "'").Find()
    if err != nil {
        logger.Errorln("find rule by domain error:", err)
        return nil, err
    }
    if rule.Id == 0 {
        logger.Errorln("domain:", domain, "not exists!")
        return nil, errors.New("domain not exists")
    }

    var doc *goquery.Document
    if doc, err = goquery.NewDocument(articleUrl); err != nil {
        logger.Errorln("goquery newdocument error:", err)
        return nil, err
    }

    author, authorTxt := "", ""
    if rule.InUrl {
        // The author is encoded in the url path; rule.Author holds its index.
        index, err := strconv.Atoi(rule.Author)
        if err != nil {
            logger.Errorln("author rule is illegal:", rule.Author, "error:", err)
            return nil, err
        }
        if index >= len(urlPaths) {
            logger.Errorln("author index out of range:", rule.Author)
            return nil, errors.New("author rule is illegal")
        }
        author = urlPaths[index]
        authorTxt = author
    } else {
        if strings.HasPrefix(rule.Author, ".") || strings.HasPrefix(rule.Author, "#") {
            // rule.Author is a CSS selector (class or id).
            authorSelection := doc.Find(rule.Author)
            author, err = authorSelection.Html()
            if err != nil {
                logger.Errorln("goquery parse author error:", err)
                return nil, err
            }

            author = strings.TrimSpace(author)
            authorTxt = strings.TrimSpace(authorSelection.Text())
        } else {
            // Some personal blogs carry no author information on the page,
            // so the rule's author field is the author name itself.
            author = rule.Author
            authorTxt = rule.Author
        }
    }

    title := ""
    doc.Find(rule.Title).Each(func(i int, selection *goquery.Selection) {
        if title != "" {
            return
        }

        // Strip the 原/荐/转/顶 badge prefixes some sites prepend to titles.
        tmpTitle := strings.TrimSpace(strings.TrimPrefix(selection.Text(), "原"))
        tmpTitle = strings.TrimSpace(strings.TrimPrefix(tmpTitle, "荐"))
        tmpTitle = strings.TrimSpace(strings.TrimPrefix(tmpTitle, "转"))
        tmpTitle = strings.TrimSpace(strings.TrimPrefix(tmpTitle, "顶"))
        if tmpTitle != "" {
            title = tmpTitle
        }
    })

    if title == "" {
        logger.Errorln("url:", articleUrl, "parse title error")
        return nil, errors.New("parse title error")
    }

    replacer := strings.NewReplacer("[置顶]", "", "[原]", "", "[转]", "")
    title = strings.TrimSpace(replacer.Replace(title))

    contentSelection := doc.Find(rule.Content)

    // relative url -> abs url
    contentSelection.Find("img").Each(func(i int, s *goquery.Selection) {
        if v, ok := s.Attr("src"); ok {
            if !strings.HasPrefix(v, "http") {
                s.SetAttr("src", "http://"+domain+v)
            }
        }
    })

    content, err := contentSelection.Html()
    if err != nil {
        logger.Errorln("goquery parse content error:", err)
        return nil, err
    }
    content = strings.TrimSpace(content)

    txt := strings.TrimSpace(contentSelection.Text())
    txt = articleRe.ReplaceAllLiteralString(txt, " ")
    txt = articleSpaceRe.ReplaceAllLiteralString(txt, " ")

    // When crawling automatically, skip articles whose text is shorter than
    // 300 bytes (len counts bytes, not characters).
    if auto && len(txt) < 300 {
        logger.Infoln(articleUrl, "content is short")
        return nil, errors.New("content is short")
    }

    pubDate := util.TimeNow()
    if rule.PubDate != "" {
        pubDate = strings.TrimSpace(doc.Find(rule.PubDate).First().Text())

        // sochina patch
        re := regexp.MustCompile("[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}")
        submatches := re.FindStringSubmatch(pubDate)
        if len(submatches) > 0 {
            pubDate = submatches[0]
        }
    }
    if pubDate == "" {
        pubDate = util.TimeNow()
    }

    article := model.NewArticle()
    article.Domain = domain
    article.Name = rule.Name
    article.Author = author
    article.AuthorTxt = authorTxt
    article.Title = title
    article.Content = content
    article.Txt = txt
    article.PubDate = pubDate
    article.Url = articleUrl
    article.Lang = rule.Lang
    article.Ctime = util.TimeNow()

    _, err = article.Insert()
    if err != nil {
        logger.Errorln("insert article error:", err)
        return nil, err
    }

    return article, nil
}
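// Usage sketch (hypothetical url): manually submit one article, bypassing the
// 300-byte minimum that applies only to automatic crawling. The scheme may be
// omitted; "http://" is prepended automatically:
//
//	article, err := ParseArticle("blog.studygolang.com/some-post", false)
//	if err != nil {
//	    logger.Errorln("parse article failed:", err)
//	} else {
//	    logger.Infoln("crawled article:", article.Title)
//	}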