// ParseOneProject 处理单个 project func (ProjectLogic) ParseOneProject(projectUrl string) error { if !strings.HasPrefix(projectUrl, "http") { projectUrl = OsChinaDomain + projectUrl } var ( doc *goquery.Document err error ) // 加上 ?fromerr=xfwefs,否则页面有 js 重定向 if doc, err = goquery.NewDocument(projectUrl + "?fromerr=xfwefs"); err != nil { return errors.New("goquery fetch " + projectUrl + " error:" + err.Error()) } // 标题 category := strings.TrimSpace(doc.Find(".Project .name").Text()) name := strings.TrimSpace(doc.Find(".Project .name u").Text()) if category == "" && name == "" { return errors.New("projectUrl:" + projectUrl + " category and name are empty") } tmpIndex := strings.LastIndex(category, name) if tmpIndex != -1 { category = category[:tmpIndex] } // uri uri := projectUrl[strings.LastIndex(projectUrl, "/")+1:] project := &model.OpenProject{} _, err = MasterDB.Where("uri=?", uri).Get(project) // 已经存在 if project.Id != 0 { logger.Infoln("url", projectUrl, "has exists!") return nil } logoSelection := doc.Find(".Project .PN img") if logoSelection.AttrOr("title", "") != "" { project.Logo = logoSelection.AttrOr("src", "") if !strings.HasPrefix(project.Logo, "http") { project.Logo = OsChinaDomain + project.Logo } project.Logo, err = DefaultUploader.TransferUrl(nil, project.Logo, ProjectLogoPrefix) if err != nil { logger.Errorln("project logo upload error:", err) } } // 获取项目相关链接 doc.Find("#Body .urls li").Each(func(i int, liSelection *goquery.Selection) { aSelection := liSelection.Find("a") uri := util.FetchRealUrl(OsChinaDomain + aSelection.AttrOr("href", "")) switch aSelection.Text() { case "软件首页": project.Home = uri case "软件文档": project.Doc = uri case "软件下载": project.Download = uri } }) ctime := time.Now() doc.Find("#Body .attrs li").Each(func(i int, liSelection *goquery.Selection) { aSelection := liSelection.Find("a") txt := aSelection.Text() if i == 0 { project.Licence = txt if txt == "未知" { project.Licence = "其他" } } else if i == 1 { project.Lang = txt } else if i == 2 { project.Os = txt } else if i == 3 { dtime, err := time.ParseInLocation("2006年01月02日", aSelection.Last().Text(), time.Local) if err != nil { logger.Errorln("parse ctime error:", err) } else { ctime = dtime.Local() } } }) project.Name = name project.Category = category project.Uri = uri project.Repo = strings.TrimSpace(doc.Find("#Body .github-widget").AttrOr("data-repo", "")) project.Src = "https://github.com/" + project.Repo pos := strings.Index(project.Repo, "/") if pos > -1 { project.Author = project.Repo[:pos] } else { project.Author = "网友" } if project.Doc == "" { // TODO:暂时认为一定是 Go 语言 project.Doc = "https://godoc.org/" + project.Src[8:] } desc := "" doc.Find("#Body .detail").Find("p").NextAll().Each(func(i int, domSelection *goquery.Selection) { doc.FindSelection(domSelection).WrapHtml(`<div id="tmp` + strconv.Itoa(i) + `"></div>`) domHtml, _ := doc.Find("#tmp" + strconv.Itoa(i)).Html() if domSelection.Is("pre") { desc += domHtml + "\n\n" } else { desc += html2md.Convert(domHtml) + "\n\n" } }) project.Desc = strings.TrimSpace(desc) project.Username = PresetUsernames[rand.Intn(4)] project.Status = model.ProjectStatusOnline project.Ctime = model.OftenTime(ctime) _, err = MasterDB.Insert(project) if err != nil { return errors.New("insert into open project error:" + err.Error()) } return nil }
// 处理 Reddit 中的一条资源 func (this *RedditLogic) dealRedditOneResource(contentSelection *goquery.Selection) error { aSelection := contentSelection.Find(".title a.title") title := aSelection.Text() if title == "" { return errors.New("title is empty") } resourceUrl, ok := aSelection.Attr("href") if !ok || resourceUrl == "" { return errors.New("resource url is empty") } isReddit := false resource := &model.Resource{} // Reddit 自身的内容 if contentSelection.HasClass("self") { isReddit = true resourceUrl = this.domain + resourceUrl } _, err := MasterDB.Where("url=?", resourceUrl).Get(resource) if err != nil { return err } // 已经存在 if resource.Id != 0 { // 如果是 reddit 本身的,可以更新评论信息 if !isReddit { return errors.New("url" + resourceUrl + "has exists!") } } if isReddit { resource.Form = model.ContentForm var doc *goquery.Document if doc, err = goquery.NewDocument(resourceUrl); err != nil { return errors.New("goquery reddit.com/r/golang self newdocument error:" + err.Error()) } content, err := doc.Find("#siteTable .usertext .md").Html() if err != nil { return err } doc.Find(".commentarea .comment .usertext .md").Each(func(i int, contentSel *goquery.Selection) { if i == 0 { content += `<hr/>**评论:**<br/><br/>` } comment, err := contentSel.Html() if err != nil { return } comment = strings.TrimSpace(comment) comment = resourceRe.ReplaceAllLiteralString(comment, "\n") author := contentSel.ParentsFiltered(".usertext").Prev().Find(".author").Text() content += author + ": <pre>" + comment + "</pre>" }) if strings.TrimSpace(content) == "" { return errors.New("goquery reddit.com/r/golang self newdocument(" + resourceUrl + ") error: content is empty") } resource.Content = content // reddit 本身的,当做其他资源 resource.Catid = 4 } else { resource.Form = model.LinkForm // Github,是开源项目 if contentSelection.Find(".title .domain a").Text() == "github.com" { resource.Catid = 2 } else { resource.Catid = 1 } } resource.Title = title resource.Url = resourceUrl resource.Uid = PresetUids[rand.Intn(4)] ctime := time.Now() datetime, ok := contentSelection.Find(".tagline time").Attr("datetime") if ok { dtime, err := time.ParseInLocation(time.RFC3339, datetime, time.UTC) if err != nil { logger.Errorln("parse ctime error:", err) } else { ctime = dtime.Local() } } resource.Ctime = model.OftenTime(ctime) if resource.Id == 0 { session := MasterDB.NewSession() defer session.Close() session.Begin() _, err = session.Insert(resource) if err != nil { session.Rollback() return errors.New("insert into Resource error:" + err.Error()) } // 存扩展信息 resourceEx := &model.ResourceEx{} resourceEx.Id = resource.Id if _, err = session.Insert(resourceEx); err != nil { session.Rollback() return errors.New("insert into ResourceEx error:" + err.Error()) } session.Commit() } else { if _, err = MasterDB.Id(resource.Id).Update(resource); err != nil { return errors.New("update resource:" + strconv.Itoa(resource.Id) + " error:" + err.Error()) } } return nil }