Пример #1
0
// ParseOneProject scrapes a single project page and stores it as a
// model.OpenProject row.
//
// projectUrl may be absolute or relative; when it does not start with "http"
// it is prefixed with OsChinaDomain. The last path segment of the resulting
// URL is used as the project's uri. If a row with that uri already exists the
// page is skipped and nil is returned.
//
// An error is returned when the page cannot be fetched, when both the category
// and the name are empty, when the existence lookup fails, or when the insert
// fails. Logo upload and creation-date parse failures are only logged.
func (ProjectLogic) ParseOneProject(projectUrl string) error {
	if !strings.HasPrefix(projectUrl, "http") {
		projectUrl = OsChinaDomain + projectUrl
	}

	// Append ?fromerr=xfwefs; without it the page performs a JS redirect.
	doc, err := goquery.NewDocument(projectUrl + "?fromerr=xfwefs")
	if err != nil {
		return errors.New("goquery fetch " + projectUrl + " error:" + err.Error())
	}

	// Title: the ".name" node's text contains the category followed by the
	// project name, while the nested <u> holds the name alone.
	category := strings.TrimSpace(doc.Find(".Project .name").Text())
	name := strings.TrimSpace(doc.Find(".Project .name u").Text())
	if category == "" && name == "" {
		return errors.New("projectUrl:" + projectUrl + " category and name are empty")
	}

	// Cut the trailing name off so that only the category text remains.
	if nameIdx := strings.LastIndex(category, name); nameIdx != -1 {
		category = category[:nameIdx]
	}

	// uri is the last path segment of the project URL.
	uri := projectUrl[strings.LastIndex(projectUrl, "/")+1:]

	project := &model.OpenProject{}

	// A failed lookup must not be mistaken for "not found": continuing would
	// scrape and insert the project even though it may already be stored.
	if _, err = MasterDB.Where("uri=?", uri).Get(project); err != nil {
		return errors.New("query open project uri:" + uri + " error:" + err.Error())
	}
	// Already exists.
	if project.Id != 0 {
		logger.Infoln("url", projectUrl, "has exists!")
		return nil
	}

	// The logo is only taken when the <img> carries a non-empty title.
	// NOTE(review): presumably the placeholder image has no title - confirm.
	logoSelection := doc.Find(".Project .PN img")
	if logoSelection.AttrOr("title", "") != "" {
		project.Logo = logoSelection.AttrOr("src", "")

		if !strings.HasPrefix(project.Logo, "http") {
			project.Logo = OsChinaDomain + project.Logo
		}

		// Best effort: a failed transfer is logged and the project is still
		// saved with whatever value TransferUrl returned.
		project.Logo, err = DefaultUploader.TransferUrl(nil, project.Logo, ProjectLogoPrefix)
		if err != nil {
			logger.Errorln("project logo upload error:", err)
		}
	}

	// Collect the project's related links (home page, documentation, download).
	doc.Find("#Body .urls li").Each(func(i int, liSelection *goquery.Selection) {
		aSelection := liSelection.Find("a")
		link := util.FetchRealUrl(OsChinaDomain + aSelection.AttrOr("href", ""))
		switch aSelection.Text() {
		case "软件首页":
			project.Home = link
		case "软件文档":
			project.Doc = link
		case "软件下载":
			project.Download = link
		}
	})

	// Attributes are positional: licence, language, OS, then the date used as
	// the creation time. The creation time falls back to now when the date is
	// missing or cannot be parsed.
	ctime := time.Now()
	doc.Find("#Body .attrs li").Each(func(i int, liSelection *goquery.Selection) {
		aSelection := liSelection.Find("a")
		txt := aSelection.Text()
		switch i {
		case 0:
			project.Licence = txt
			if txt == "未知" {
				project.Licence = "其他"
			}
		case 1:
			project.Lang = txt
		case 2:
			project.Os = txt
		case 3:
			dtime, err := time.ParseInLocation("2006年01月02日", aSelection.Last().Text(), time.Local)
			if err != nil {
				logger.Errorln("parse ctime error:", err)
			} else {
				ctime = dtime.Local()
			}
		}
	})

	project.Name = name
	project.Category = category
	project.Uri = uri
	project.Repo = strings.TrimSpace(doc.Find("#Body .github-widget").AttrOr("data-repo", ""))
	// NOTE(review): when the page has no github widget Repo is empty and Src
	// becomes the bare "https://github.com/" - confirm whether that is wanted.
	project.Src = "https://github.com/" + project.Repo

	// The author is the part of the repo before the first "/".
	if pos := strings.Index(project.Repo, "/"); pos > -1 {
		project.Author = project.Repo[:pos]
	} else {
		project.Author = "网友"
	}

	if project.Doc == "" {
		// TODO: for now assume the project is always written in Go.
		project.Doc = "https://godoc.org/" + strings.TrimPrefix(project.Src, "https://")
	}

	// Build the description from the detail section: each element is wrapped
	// in a temporary <div id="tmpN"> so that its outer HTML can be read back
	// through the wrapper's inner HTML. <pre> blocks are kept as raw HTML,
	// everything else is converted to Markdown.
	var desc strings.Builder
	doc.Find("#Body .detail").Find("p").NextAll().Each(func(i int, domSelection *goquery.Selection) {
		tmpID := "tmp" + strconv.Itoa(i)
		doc.FindSelection(domSelection).WrapHtml(`<div id="` + tmpID + `"></div>`)
		// The Html error is deliberately ignored: a failing element simply
		// contributes an empty fragment.
		domHtml, _ := doc.Find("#" + tmpID).Html()
		if domSelection.Is("pre") {
			desc.WriteString(domHtml)
		} else {
			desc.WriteString(html2md.Convert(domHtml))
		}
		desc.WriteString("\n\n")
	})

	project.Desc = strings.TrimSpace(desc.String())
	// NOTE(review): the literal 4 assumes PresetUsernames has at least four
	// entries - confirm against its declaration.
	project.Username = PresetUsernames[rand.Intn(4)]
	project.Status = model.ProjectStatusOnline
	project.Ctime = model.OftenTime(ctime)

	if _, err = MasterDB.Insert(project); err != nil {
		return errors.New("insert into open project error:" + err.Error())
	}

	return nil
}
Пример #2
0
// dealRedditOneResource processes one entry of a Reddit listing and stores it
// as a model.Resource.
//
// contentSelection is the listing entry; its title link supplies the title
// and the resource URL. Entries carrying the "self" class are Reddit's own
// posts: their URL is prefixed with this.domain, the post page is fetched,
// and the post body plus its comments become the resource content. All other
// entries are stored as plain links.
//
// A link that is already stored yields an error. A self post that is already
// stored is updated in place so that its comments are refreshed. New
// resources are inserted together with an empty model.ResourceEx row inside a
// single transaction.
func (this *RedditLogic) dealRedditOneResource(contentSelection *goquery.Selection) error {
	aSelection := contentSelection.Find(".title a.title")

	title := aSelection.Text()
	if title == "" {
		return errors.New("title is empty")
	}

	resourceUrl, ok := aSelection.Attr("href")
	if !ok || resourceUrl == "" {
		return errors.New("resource url is empty")
	}

	// Reddit's own content links are relative to the site.
	isReddit := contentSelection.HasClass("self")
	if isReddit {
		resourceUrl = this.domain + resourceUrl
	}

	resource := &model.Resource{}
	_, err := MasterDB.Where("url=?", resourceUrl).Get(resource)
	if err != nil {
		return err
	}
	// Already exists: only Reddit's own posts may continue, so that their
	// comment information can be updated.
	if resource.Id != 0 && !isReddit {
		return errors.New("url" + resourceUrl + "has exists!")
	}

	if isReddit {
		resource.Form = model.ContentForm

		doc, err := goquery.NewDocument(resourceUrl)
		if err != nil {
			return errors.New("goquery reddit.com/r/golang self newdocument error:" + err.Error())
		}

		content, err := doc.Find("#siteTable .usertext .md").Html()
		if err != nil {
			return err
		}

		// Append every comment after the post body, preceded once by a
		// separator and a heading.
		doc.Find(".commentarea .comment .usertext .md").Each(func(i int, contentSel *goquery.Selection) {
			if i == 0 {
				content += `<hr/>**评论:**<br/><br/>`
			}

			// A comment whose HTML cannot be read is skipped.
			comment, err := contentSel.Html()
			if err != nil {
				return
			}

			comment = strings.TrimSpace(comment)
			comment = resourceRe.ReplaceAllLiteralString(comment, "\n")

			author := contentSel.ParentsFiltered(".usertext").Prev().Find(".author").Text()
			content += author + ": <pre>" + comment + "</pre>"
		})

		if strings.TrimSpace(content) == "" {
			return errors.New("goquery reddit.com/r/golang self newdocument(" + resourceUrl + ") error: content is empty")
		}

		resource.Content = content

		// Reddit's own posts are filed under the "other resources" category.
		resource.Catid = 4
	} else {
		resource.Form = model.LinkForm

		// Links to GitHub are treated as open-source projects.
		if contentSelection.Find(".title .domain a").Text() == "github.com" {
			resource.Catid = 2
		} else {
			resource.Catid = 1
		}
	}

	resource.Title = title
	resource.Url = resourceUrl
	// NOTE(review): the literal 4 assumes PresetUids has at least four
	// entries - confirm against its declaration.
	resource.Uid = PresetUids[rand.Intn(4)]

	// Use the entry's own timestamp when present and parseable, otherwise now.
	ctime := time.Now()
	datetime, ok := contentSelection.Find(".tagline time").Attr("datetime")
	if ok {
		dtime, err := time.ParseInLocation(time.RFC3339, datetime, time.UTC)
		if err != nil {
			logger.Errorln("parse ctime error:", err)
		} else {
			ctime = dtime.Local()
		}
	}
	resource.Ctime = model.OftenTime(ctime)

	// Existing resource: refresh it in place.
	if resource.Id != 0 {
		if _, err = MasterDB.Id(resource.Id).Update(resource); err != nil {
			return errors.New("update resource:" + strconv.Itoa(resource.Id) + " error:" + err.Error())
		}
		return nil
	}

	// New resource: insert it and its extension row atomically.
	session := MasterDB.NewSession()
	defer session.Close()
	if err = session.Begin(); err != nil {
		return errors.New("begin Resource transaction error:" + err.Error())
	}

	if _, err = session.Insert(resource); err != nil {
		session.Rollback()
		return errors.New("insert into Resource error:" + err.Error())
	}

	// Store the extension information.
	resourceEx := &model.ResourceEx{}
	resourceEx.Id = resource.Id
	if _, err = session.Insert(resourceEx); err != nil {
		session.Rollback()
		return errors.New("insert into ResourceEx error:" + err.Error())
	}

	// A failed commit means nothing was stored, so it must reach the caller.
	if err = session.Commit(); err != nil {
		return errors.New("commit Resource transaction error:" + err.Error())
	}

	return nil
}