예제 #1
0
func SaveRule(form url.Values, opUser string) (errMsg string, err error) {
	rule := model.NewCrawlRule()
	err = util.ConvertAssign(rule, form)
	if err != nil {
		logger.Errorln("rule ConvertAssign error", err)
		errMsg = err.Error()
		return
	}

	rule.OpUser = opUser

	if rule.Id != 0 {
		err = rule.Persist(rule)
	} else {
		_, err = rule.Insert()
	}

	if err != nil {
		errMsg = "内部服务器错误"
		logger.Errorln("rule save:", errMsg, ":", err)
		return
	}

	return
}
예제 #2
0
// 获取抓取规则列表(分页)
func FindRuleByPage(conds map[string]string, curPage, limit int) ([]*model.CrawlRule, int) {
	conditions := make([]string, 0, len(conds))
	for k, v := range conds {
		conditions = append(conditions, k+"="+v)
	}

	rule := model.NewCrawlRule()

	limitStr := strconv.Itoa((curPage-1)*limit) + "," + strconv.Itoa(limit)
	ruleList, err := rule.Where(strings.Join(conditions, " AND ")).Order("id DESC").Limit(limitStr).
		FindAll()
	if err != nil {
		logger.Errorln("rule service FindArticleByPage Error:", err)
		return nil, 0
	}

	total, err := rule.Count()
	if err != nil {
		logger.Errorln("rule service FindArticleByPage COUNT Error:", err)
		return nil, 0
	}

	return ruleList, total
}
예제 #3
0
// 获取url对应的文章并根据规则进行解析
func ParseArticle(articleUrl string, auto bool) (*model.Article, error) {
	articleUrl = strings.TrimSpace(articleUrl)
	if !strings.HasPrefix(articleUrl, "http") {
		articleUrl = "http://" + articleUrl
	}

	tmpArticle := model.NewArticle()
	err := tmpArticle.Where("url=" + articleUrl).Find("id")
	if err != nil || tmpArticle.Id != 0 {
		logger.Errorln(articleUrl, "has exists:", err)
		return nil, errors.New("has exists!")
	}

	urlPaths := strings.SplitN(articleUrl, "/", 5)
	domain := urlPaths[2]

	for k, v := range domainPatch {
		if strings.Contains(domain, k) && !strings.Contains(domain, "www."+k) {
			domain = v
			break
		}
	}

	rule := model.NewCrawlRule()
	err = rule.Where("domain=" + domain).Find()
	if err != nil {
		logger.Errorln("find rule by domain error:", err)
		return nil, err
	}

	if rule.Id == 0 {
		logger.Errorln("domain:", domain, "not exists!")
		return nil, errors.New("domain not exists")
	}

	var doc *goquery.Document
	if doc, err = goquery.NewDocument(articleUrl); err != nil {
		logger.Errorln("goquery newdocument error:", err)
		return nil, err
	}

	author, authorTxt := "", ""
	if rule.InUrl {
		index, err := strconv.Atoi(rule.Author)
		if err != nil {
			logger.Errorln("author rule is illegal:", rule.Author, "error:", err)
			return nil, err
		}
		author = urlPaths[index]
		authorTxt = author
	} else {
		if strings.HasPrefix(rule.Author, ".") || strings.HasPrefix(rule.Author, "#") {
			authorSelection := doc.Find(rule.Author)
			author, err = authorSelection.Html()
			if err != nil {
				logger.Errorln("goquery parse author error:", err)
				return nil, err
			}

			author = strings.TrimSpace(author)
			authorTxt = strings.TrimSpace(authorSelection.Text())
		} else {
			// 某些个人博客,页面中没有作者的信息,因此,规则中 author 即为 作者
			author = rule.Author
			authorTxt = rule.Author
		}
	}

	title := ""
	doc.Find(rule.Title).Each(func(i int, selection *goquery.Selection) {
		if title != "" {
			return
		}

		tmpTitle := strings.TrimSpace(strings.TrimPrefix(selection.Text(), "原"))
		tmpTitle = strings.TrimSpace(strings.TrimPrefix(tmpTitle, "荐"))
		tmpTitle = strings.TrimSpace(strings.TrimPrefix(tmpTitle, "转"))
		tmpTitle = strings.TrimSpace(strings.TrimPrefix(tmpTitle, "顶"))
		if tmpTitle != "" {
			title = tmpTitle
		}
	})

	if title == "" {
		logger.Errorln("url:", articleUrl, "parse title error:", err)
		return nil, err
	}

	replacer := strings.NewReplacer("[置顶]", "", "[原]", "", "[转]", "")
	title = strings.TrimSpace(replacer.Replace(title))

	contentSelection := doc.Find(rule.Content)

	// relative url -> abs url
	contentSelection.Find("img").Each(func(i int, s *goquery.Selection) {
		if v, ok := s.Attr("src"); ok {
			if !strings.HasPrefix(v, "http") {
				s.SetAttr("src", domain+v)
			}
		}
	})

	content, err := contentSelection.Html()
	if err != nil {
		logger.Errorln("goquery parse content error:", err)
		return nil, err
	}
	content = strings.TrimSpace(content)
	txt := strings.TrimSpace(contentSelection.Text())
	txt = articleRe.ReplaceAllLiteralString(txt, " ")
	txt = articleSpaceRe.ReplaceAllLiteralString(txt, " ")

	// 自动抓取,内容长度不能少于 300 字
	if auto && len(txt) < 300 {
		logger.Infoln(articleUrl, "content is short")
		return nil, errors.New("content is short")
	}

	pubDate := util.TimeNow()
	if rule.PubDate != "" {
		pubDate = strings.TrimSpace(doc.Find(rule.PubDate).First().Text())

		// sochina patch
		re := regexp.MustCompile("[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}")
		submatches := re.FindStringSubmatch(pubDate)
		if len(submatches) > 0 {
			pubDate = submatches[0]
		}
	}

	if pubDate == "" {
		pubDate = util.TimeNow()
	}

	article := model.NewArticle()
	article.Domain = domain
	article.Name = rule.Name
	article.Author = author
	article.AuthorTxt = authorTxt
	article.Title = title
	article.Content = content
	article.Txt = txt
	article.PubDate = pubDate
	article.Url = articleUrl
	article.Lang = rule.Lang
	article.Ctime = util.TimeNow()

	_, err = article.Insert()
	if err != nil {
		logger.Errorln("insert article error:", err)
		return nil, err
	}

	return article, nil
}