// _crawlAmebloPost crawls a single ameblo post URL and fills the post entity
// with the crawled contents (likes, comments, reblogs, title, theme, and
// cached images).
func _crawlAmebloPost(req *wcg.Request, post *hplink.AmebloPost, members []hplink.Member) error {
	crawler := crawlers.NewAmebloPostCrawler(cacheutil.NewURLCacheAwareClient(req, entities.URLCache))
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] crawling URL: %s", post.URL)
	crawled, err := crawler.RunOnPostURL(post.URL)
	if err != nil {
		return err
	}
	post.NumLikes = crawled.NumLikes
	post.NumComments = crawled.NumComments
	post.NumReblogs = crawled.NumReblogs
	post.PostAt = crawled.PostAt
	post.Title = crawled.Title
	post.Theme = crawled.Theme
	post.IsContentsCrawled = true
	post.Images = make([]models.Image, len(crawled.ImageURLs))
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		// Cache images in parallel, retrying each URL a few times before
		// giving up. Per-image failures are logged but do not fail the post:
		// the error returned by the callback is intentionally discarded.
		iterator.ParallelSlice(crawled.ImageURLs, func(i int, url string) error {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] caching image URL %s (on %s)", url, post.URL)
			const retries = 5
			var err error
			var cache *models.ImageCache
			for j := 0; j < retries; j++ {
				if j != 0 {
					req.Logger.Infof("[Task.Crawlers.AmebloPosts] Retry image URL caching: %s (on %s) (%d)", url, post.URL, j)
				}
				cache, err = cacher.Cache(url)
				if err == nil {
					post.Images[i] = *cache.ToImage()
					return nil
				}
				lib.WaitAndEnsureAfter(lib.Now(), 3*time.Second)
			}
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] Failed to cache image URL %s (on %s) - %v", url, post.URL, err)
			return err
		})
	} else {
		// No image cacher is available; keep the original image URLs as-is.
		for j, url := range crawled.ImageURLs {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] skip caching image URL %s (on %s)", url, post.URL)
			post.Images[j] = models.Image{
				URL: url,
			}
		}
	}
	// Update MemberKey only if not set.
	if post.MemberKey == "" {
		if m := _guessMember(req, post, members); m != nil {
			post.MemberKey = m.Key
		}
	}
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] finished crawling %s", post.URL)
	return nil
}
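// _crawlAmebloPostBatch is a hypothetical batch driver, not part of the
// original task wiring: a sketch of how _crawlAmebloPost could be fanned out
// over several posts with the same iterator.ParallelSlice helper used above
// for image caching. It assumes ParallelSlice accepts a []*hplink.AmebloPost
// with a matching callback, by analogy with the string-slice usage above.
// Per-post failures are logged and skipped so one bad post does not abort
// the whole batch.
func _crawlAmebloPostBatch(req *wcg.Request, posts []*hplink.AmebloPost, members []hplink.Member) {
	iterator.ParallelSlice(posts, func(i int, post *hplink.AmebloPost) error {
		if err := _crawlAmebloPost(req, post, members); err != nil {
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] failed to crawl %s - %v", post.URL, err)
		}
		return nil
	})
}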
// _crawlAmebloEntryList crawls a single ameblo entry list page and upserts an
// AmebloPost entity per entry. It returns the URL of the next list page, or
// an empty string when there is nothing more to crawl.
func _crawlAmebloEntryList(req *wcg.Request, settings *hplink.CrawlerSettings, url string) (string, error) {
	_, _artist, err := entities.Artist.Get().Key(settings.ArtistKey).One(req)
	if err != nil {
		return "", err
	}
	p, err := entities.Member.Query().Filter("ArtistKey=", settings.ArtistKey).Execute(req)
	if err != nil {
		return "", err
	}
	if _artist == nil {
		req.Logger.Warnf("No artist found by the key %q (CrawlerSettings: %s)", settings.ArtistKey, settings.URL)
		return "", nil
	}
	artist := _artist.(*hplink.Artist)
	members := p.Data.([]hplink.Member)
	crawler := crawlers.NewAmebloPostCrawler(urlfetch.NewHTTPClient(req))
	postList, err := crawler.RunOnListURL(url)
	if err != nil {
		return "", fmt.Errorf("error crawling %q - %v", url, err)
	}
	if len(postList.List) == 0 {
		return "", nil
	}
	// Load the existing entities so that posts whose contents are already
	// crawled are not overwritten with list-page stubs.
	keys := make([]*datastore.Key, len(postList.List))
	for i, post := range postList.List {
		keys[i] = entities.AmebloPost.NewKey(req, post.URL, nil)
	}
	_, _currentPostList, err := entities.AmebloPost.GetMulti().DatastoreKeys(keys...).List(req)
	if err != nil {
		return "", err
	}
	currentPostList := _currentPostList.([]*hplink.AmebloPost)
	var amebloPostList []*hplink.AmebloPost
	var amebloPostKeys []*datastore.Key
	for i, post := range postList.List {
		if currentPostList[i] != nil && currentPostList[i].IsContentsCrawled {
			// we don't update entities if they are already crawled by _processAmebloPosts
			continue
		}
		amebloPost := &hplink.AmebloPost{
			URL:               post.URL,
			Title:             post.Title,
			PostAt:            post.PostAt,
			Theme:             post.Theme,
			ArtistKey:         artist.Key,
			SettingsURL:       settings.URL,
			IsContentsCrawled: false,
		}
		if settings.MemberKey != "" {
			amebloPost.MemberKey = settings.MemberKey
		} else if m := _guessMember(req, amebloPost, members); m != nil {
			amebloPost.MemberKey = m.Key
		}
		amebloPostList = append(amebloPostList, amebloPost)
		// Keep the key slice aligned with amebloPostList: the previous code
		// passed the full key list to PutMulti, which mismatched the filtered
		// value list whenever already-crawled posts were skipped.
		amebloPostKeys = append(amebloPostKeys, keys[i])
	}
	if _, err = entities.AmebloPost.PutMulti().DatastoreKeys(amebloPostKeys...).Update(req, amebloPostList); err != nil {
		return "", err
	}
	return postList.Next, nil
}
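// _crawlAmebloEntryLists is a hypothetical pagination loop, not code from the
// original task: a sketch of how _crawlAmebloEntryList is meant to be driven.
// Each call returns the URL of the next list page, and crawling stops when an
// empty string comes back. The maxPages parameter is an assumption introduced
// here to bound the crawl.
func _crawlAmebloEntryLists(req *wcg.Request, settings *hplink.CrawlerSettings, maxPages int) error {
	url := settings.URL
	for i := 0; i < maxPages && url != ""; i++ {
		next, err := _crawlAmebloEntryList(req, settings, url)
		if err != nil {
			return err
		}
		url = next
	}
	return nil
}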