// findArtistAndMember resolves both the artist and the member from the request
// route parameters in parallel, and returns nil unless both entities exist.
func findArtistAndMember(req *wcg.Request) *hplink.Member {
	var artist *hplink.Artist
	var member *hplink.Member
	var funcs = []func(){
		func() {
			_, ent := entities.Artist.Get().Key(
				req.Param(paramKeyArtist),
			).MustOne(req)
			if ent != nil {
				artist = ent.(*hplink.Artist)
			}
		},
		func() {
			_, ent := entities.Member.Get().Key(
				fmt.Sprintf("%s.%s", req.Param(paramKeyArtist), req.Param(paramKeyMember)),
			).MustOne(req)
			if ent != nil {
				member = ent.(*hplink.Member)
			}
		},
	}
	iterator.ParallelSlice(funcs, func(i int, v func()) error {
		v()
		return nil
	})
	if artist == nil || member == nil {
		return nil
	}
	member.Artist = artist
	return member
}
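// Usage sketch (hypothetical handler, not in the original file): this is how a
// route handler could use findArtistAndMember, assuming the same response
// helpers used by the API handlers below.
func getMemberHandler(req *wcg.Request) response.Response {
	member := findArtistAndMember(req)
	if member == nil {
		// Either the artist or the member was not found.
		return response.APINotFound
	}
	return response.NewJSONResponse(member)
}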
// _crawlAmebloPost crawls the contents of a single AmebloPost, filling in the
// post metadata and caching its images. Caching failures are not propagated:
// a failed image stays a zero value (and shows up as image_failure_posts in
// the stats endpoint), and the post is still marked as crawled.
func _crawlAmebloPost(req *wcg.Request, post *hplink.AmebloPost, members []hplink.Member) error {
	crawler := crawlers.NewAmebloPostCrawler(cacheutil.NewURLCacheAwareClient(req, entities.URLCache))
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] crawling URL: %s", post.URL)
	crawled, err := crawler.RunOnPostURL(post.URL)
	if err != nil {
		return err
	}
	post.NumLikes = crawled.NumLikes
	post.NumComments = crawled.NumComments
	post.NumReblogs = crawled.NumReblogs
	post.PostAt = crawled.PostAt
	post.Title = crawled.Title
	post.Theme = crawled.Theme
	post.IsContentsCrawled = true
	post.Images = make([]models.Image, len(crawled.ImageURLs))
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		// Cache every image in parallel, retrying each URL up to 5 times with
		// a 3-second wait between attempts.
		iterator.ParallelSlice(crawled.ImageURLs, func(i int, url string) error {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] caching image URL %s (on %s)", url, post.URL)
			const retries = 5
			var err error
			var cache *models.ImageCache
			for j := 0; j < retries; j++ {
				if j != 0 {
					req.Logger.Infof("[Task.Crawlers.AmebloPosts] Retry image URL caching: %s (on %s) (%d)", url, post.URL, j)
				}
				cache, err = cacher.Cache(url)
				if err == nil {
					post.Images[i] = *cache.ToImage()
					return nil
				}
				lib.WaitAndEnsureAfter(lib.Now(), 3*time.Second)
			}
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] Failed to cache image URL %s (on %s) - %v", url, post.URL, err)
			return err
		})
	} else {
		// No image cacher available: keep the raw URLs without caching.
		for j, url := range crawled.ImageURLs {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] skip caching image URL %s (on %s)", url, post.URL)
			post.Images[j] = models.Image{
				URL: url,
			}
		}
	}
	// Update MemberKey only if not set.
	if post.MemberKey == "" {
		if m := _guessMember(req, post, members); m != nil {
			post.MemberKey = m.Key
		}
	}
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] finished crawling")
	return nil
}
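// Sketch (not part of the original code): the inline retry-with-fixed-wait
// loop above could be factored into a small helper. withRetries is a
// hypothetical name; lib.Now and lib.WaitAndEnsureAfter are the same helpers
// the loop already uses.
func withRetries(retries int, wait time.Duration, f func() error) error {
	var err error
	for i := 0; i < retries; i++ {
		if err = f(); err == nil {
			return nil
		}
		// Wait before the next attempt, as the caching loop does.
		lib.WaitAndEnsureAfter(lib.Now(), wait)
	}
	// All attempts failed; return the last error.
	return err
}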
// setupAPICrawlerSettings registers the /crawlersettings/ API endpoints.
func setupAPICrawlerSettings(app *server.App) {
	var API = app.API()
	var urlValidator = server.Handler(func(req *wcg.Request) response.Response {
		_, err := url.Parse(req.Param("url"))
		if err != nil {
			return response.APINotFound
		}
		return nil
	})
	API.GET("/crawlersettings/:type/",
		middleware.EntityAll(CrawlerSettings.Query().Filter("Type=", request.Value(func(req *wcg.Request) interface{} {
			t, _ := hplink.ParseCrawlerSettingsType(req.Param("type"))
			if t == hplink.CrawlerSettingsTypeUnknown {
				return entities.FilterValueSkip
			}
			return t
		}))),
	)
	API.GET("/crawlersettings/ameblo/stats/",
		server.Handler(func(req *wcg.Request) response.Response {
			type post struct {
				URL string    `json:"url"`
				At  time.Time `json:"at"`
			}
			type stats struct {
				URL               string `json:"url"`
				FirstPost         *post  `json:"first_post,omitempty"`
				LastPost          *post  `json:"last_post,omitempty"`
				TotalPosts        int    `json:"total_posts"`
				CrawledPosts      int    `json:"crawled_posts"`
				ImageFailurePosts int    `json:"image_failure_posts"`
			}
			p := CrawlerSettings.Query().Filter("Type=", hplink.CrawlerSettingsTypeAmeblo).MustExecute(req)
			settings := p.Data.([]hplink.CrawlerSettings)
			s := make([]stats, len(settings))
			if err := iterator.ParallelSlice(settings, func(i int, v *hplink.CrawlerSettings) error {
				s[i].URL = v.URL
				s[i].TotalPosts = AmebloPost.Query().Filter("SettingsURL=", v.URL).MustCount(req)
				if s[i].TotalPosts > 0 {
					s[i].CrawledPosts = AmebloPost.Query().Filter("SettingsURL=", v.URL).Filter("IsContentsCrawled=", true).MustCount(req)
					s[i].ImageFailurePosts = AmebloPost.Query().Filter("SettingsURL=", v.URL).Filter("Images.Height=", 0).MustCount(req)
					pf := AmebloPost.Query().Filter("SettingsURL=", v.URL).Order("PostAt").Limit(1).MustExecute(req)
					pl := AmebloPost.Query().Filter("SettingsURL=", v.URL).Order("-PostAt").Limit(1).MustExecute(req)
					first := pf.Head().(*hplink.AmebloPost)
					last := pl.Head().(*hplink.AmebloPost)
					s[i].FirstPost = &post{
						URL: first.URL,
						At:  first.PostAt,
					}
					s[i].LastPost = &post{
						URL: last.URL,
						At:  last.PostAt,
					}
				}
				return nil
			}); err != nil {
				panic(err)
			}
			return response.NewJSONResponse(s)
		}),
	)
	API.GET("/crawlersettings/:url.json",
		middleware.EntityGet(CrawlerSettings.Get(), "url"),
	)
	API.PUT("/crawlersettings/:url.json",
		urlValidator,
		middleware.ParseForm(func(v *validators.FormValidator) {
			v.Field("artist_key").Required()
		}),
		middleware.EntityPutOrCreate(
			CrawlerSettings.Put(),
			"url",
		),
	)
	API.DELETE("/crawlersettings/:url.json",
		urlValidator,
		middleware.EntityDelete(CrawlerSettings.Delete(), "url"),
	)
}
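// Illustrative only (values are made up): GET /crawlersettings/ameblo/stats/
// returns a JSON array shaped by the stats struct above, e.g.
//
//	[{"url": "...",
//	  "total_posts": 120, "crawled_posts": 118, "image_failure_posts": 2,
//	  "first_post": {"url": "...", "at": "2015-01-01T00:00:00Z"},
//	  "last_post":  {"url": "...", "at": "2016-01-01T00:00:00Z"}}]
//
// first_post and last_post are omitted for settings with no posts yet.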
// runTasksCrawlersAmebloEntryLists crawls the ameblo entry-list pages for the
// requested crawler settings (or all ameblo settings when none are given),
// updates each settings' crawl status, and, when fl=true, returns the progress
// carrying the parameters for the next recursive call.
func runTasksCrawlersAmebloEntryLists(req *wcg.Request, task *models.AsyncAPITask) (*models.AsyncAPITaskProgress, error) {
	const FollowLinkKey = "fl"
	const SettingsKey = "s"
	const URLKey = "u"
	var query = req.HTTPRequest().URL.Query()
	var settingsList []*hplink.CrawlerSettings
	var urlList []string
	if settingsKeys, ok := query[SettingsKey]; ok {
		_, _list := entities.CrawlerSettings.GetMulti().Keys(settingsKeys...).MustList(req)
		settingsList = _list.([]*hplink.CrawlerSettings)
	} else {
		q := entities.CrawlerSettings.Query().Filter("Type=", hplink.CrawlerSettingsTypeAmeblo)
		if pagination := q.MustExecute(req); pagination.Length() > 0 {
			list := pagination.Data.([]hplink.CrawlerSettings)
			settingsList = make([]*hplink.CrawlerSettings, len(list))
			for i := range list {
				settingsList[i] = &list[i]
			}
		}
	}
	var numList = len(settingsList)
	urlList = make([]string, numList)
	if urls, ok := query[URLKey]; ok {
		if numList != len(urls) {
			return nil, fmt.Errorf("list mismatch - found %d settings but %d urls are specified", numList, len(urls))
		}
		urlList = urls
	} else {
		for i := range settingsList {
			urlList[i] = (*hplink.AmebloCrawlerSettings)(settingsList[i]).GetEntryListURL()
		}
	}
	startTime := lib.Now()
	nextParamSettingsKeys := make([]string, numList)
	nextParamURLs := make([]string, numList)
	err := iterator.ParallelSlice(settingsList, func(i int, v *hplink.CrawlerSettings) error {
		next, err := _crawlAmebloEntryList(req, v, urlList[i])
		if err != nil {
			settingsList[i].Error = []byte(fmt.Sprintf("%v", err))
			settingsList[i].Status = hplink.CrawlerStatusFailure
			settingsList[i].LastRun = lib.Now()
			return err
		}
		settingsList[i].Error = nil
		settingsList[i].Status = hplink.CrawlerStatusSuccess
		settingsList[i].LastRun = lib.Now()
		if next != "" {
			nextParamSettingsKeys[i] = v.URL
			nextParamURLs[i] = next
		}
		return nil
	})
	// Persist the status updates even when some crawls failed.
	entities.CrawlerSettings.PutMulti().MustUpdate(req, settingsList)
	if err != nil {
		return nil, err
	}
	if req.Query(FollowLinkKey) != "true" {
		return nil, nil
	}
	// fl=true makes a recursive call to follow next links:
	// drop empty URLs from nextParam* and return the rest for the next call.
	var fixedNextParamSettingsKeys []string
	var fixedNextParamURLs []string
	var hasNext = false
	for i := range nextParamURLs {
		if nextParamURLs[i] != "" {
			hasNext = true
			fixedNextParamSettingsKeys = append(fixedNextParamSettingsKeys, nextParamSettingsKeys[i])
			fixedNextParamURLs = append(fixedNextParamURLs, nextParamURLs[i])
		}
	}
	var progress models.AsyncAPITaskProgress
	var lastProgress = task.LastProgress()
	if lastProgress == nil {
		progress.Current = len(urlList)
		progress.Total = 0
	} else {
		progress.Current = lastProgress.Current + len(urlList)
	}
	if hasNext {
		progress.Next = url.Values{
			FollowLinkKey: []string{"true"},
			SettingsKey:   fixedNextParamSettingsKeys,
			URLKey:        fixedNextParamURLs,
		}
		// Throttle before the next recursive call.
		wait := configs.GetIntValue(req, "hplink.ameblo_crawler_url_wait", 2, 0, 10)
		lib.WaitAndEnsureAfter(startTime, time.Duration(wait)*time.Second)
	} else {
		req.Logger.Infof("No more URL needs to be crawled.")
	}
	return &progress, nil
}
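// Sketch (assumption about the async-task framework, not part of the original
// code): progress.Next is presumably re-encoded as the query string of the
// next task invocation. A hypothetical driver would build the follow-up URL
// like this; taskPath is an assumed parameter.
func followNextURL(taskPath string, progress *models.AsyncAPITaskProgress) string {
	if progress == nil || progress.Next == nil {
		return ""
	}
	// Encode renders fl=true with one s/u pair per settings entry that still
	// has an entry-list page left to crawl.
	return taskPath + "?" + progress.Next.Encode()
}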