func _syncArtistProfiles(req *wcg.Request) ([]*datastore.Key, []*hplink.Artist, error) {
	req.Logger.Infof("Importing the artist list")
	sourceList, err := crawlers.NewHelloProject(urlfetch.NewHTTPClient(req)).Run()
	if err != nil {
		return nil, nil, err
	}
	if len(sourceList) == 0 {
		return nil, nil, errSyncNoArtistInfo
	}
	keys := make([]string, len(sourceList))
	for i, source := range sourceList {
		keys[i] = source.Key
	}
	dskeys, _list := entities.Artist.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	list := _list.([]*hplink.Artist)
	for i, source := range sourceList {
		list[i].Index = source.Index
		list[i].Key = source.Key
		list[i].Name = source.Name
		list[i].Thumbnail = models.Image{
			URL: source.ImageURL,
		}
	}
	if lib.IsProduction() {
		if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
			for i, source := range sourceList {
				cache, err := cacher.Cache(source.ImageURL)
				if err != nil {
					req.Logger.Warnf("Image cache failed: %v", err)
				} else {
					list[i].Thumbnail = *cache.ToImage()
				}
			}
		}
	}
	entities.Artist.PutMulti().DatastoreKeys(dskeys...).Cache(ckAllArtits).MustUpdate(req, list)
	return dskeys, list, nil
}
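// _syncAll is a hypothetical driver, not part of the original source: a
// sketch of how _syncArtistProfiles and _syncMemberProfiles (below) could be
// chained from a sync task. The surrounding task/handler plumbing is assumed,
// not shown.
func _syncAll(req *wcg.Request) error {
	keys, artists, err := _syncArtistProfiles(req)
	if err != nil {
		return err
	}
	for i, artist := range artists {
		if _, _, err := _syncMemberProfiles(req, keys[i], artist); err != nil {
			// Keep going so that one broken artist page does not abort the whole sync.
			req.Logger.Warnf("Member sync failed for %s: %v", artist.Key, err)
		}
	}
	return nil
}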
func (cacher *ImageCacher) openImageStream(url string) (*http.Response, error) {
	client := urlfetch.NewHTTPClient(cacher.req)
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != 200 {
		resp.Body.Close()
		return nil, fmt.Errorf(errInvalidImageStatus, url, resp.Status)
	}
	contentType := resp.Header.Get("content-type")
	contentSize := resp.ContentLength
	if !strings.HasPrefix(contentType, "image/") {
		resp.Body.Close()
		return nil, fmt.Errorf(errInvalidImageContentType, url, contentType)
	}
	// Note: ContentLength is -1 when the length is unknown (e.g. chunked
	// responses), so only an explicitly zero-length body is rejected here.
	if contentSize == 0 {
		resp.Body.Close()
		return nil, fmt.Errorf(errInvalidImageContentSize, url)
	}
	return resp, nil
}
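// Usage sketch (hypothetical, not in the original source): openImageStream
// returns a live response whose Body the caller must close. A minimal
// consumer, assuming io/ioutil is imported and the cacher fields used above:
func (cacher *ImageCacher) readImageBytes(url string) ([]byte, error) {
	resp, err := cacher.openImageStream(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return ioutil.ReadAll(resp.Body)
}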
func _syncMemberProfiles(req *wcg.Request, key *datastore.Key, artist *hplink.Artist) ([]*datastore.Key, []*hplink.Member, error) {
	//
	// Crawling pages.
	//
	req.Logger.Infof("Importing the member list from the artist page ...")
	artistPageInfo, err := crawlers.NewArtist(urlfetch.NewHTTPClient(req), artist.Key).Run()
	if err != nil {
		return nil, nil, err
	}
	keys := make([]string, len(artistPageInfo.Members))
	memberPageInfoList := make([]*crawlers.MemberPageInfo, len(artistPageInfo.Members))
	for i, member := range artistPageInfo.Members {
		req.Logger.Infof("Importing member details for %s", member.Key)
		result, err := crawlers.NewMember(urlfetch.NewHTTPClient(req), artist.Key, member.Key).Run()
		if err != nil {
			return nil, nil, err
		}
		keys[i] = fmt.Sprintf("%s.%s", artist.Key, member.Key)
		memberPageInfoList[i] = result
	}

	// optional task to update crawler settings
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		syncCrawlerSettings(req, key, artistPageInfo)
	}()
	defer wg.Wait()

	//
	// Update Datastore
	//
	// Check existing Member entities to merge with the crawling result.
	memberKeys, _members := entities.Member.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	members := _members.([]*hplink.Member)
	// MemberPublicProfile can always be overwritten.
	profiles := make([]*hplink.MemberPublicProfile, len(members))
	for i := range profiles {
		profiles[i] = &hplink.MemberPublicProfile{}
	}
	for i, member := range artistPageInfo.Members {
		members[i].Key = keys[i]
		members[i].ShortKey = member.Key
		members[i].ArtistKey = artist.Key
		members[i].Name = member.Name
		members[i].Index = member.Index
		members[i].Birthday = memberPageInfoList[i].Birthday
		members[i].Joinday = memberPageInfoList[i].Joinday
		members[i].Images = make([]models.Image, len(memberPageInfoList[i].ImageURLs))
		for j, u := range memberPageInfoList[i].ImageURLs {
			// TODO: integrate blob service
			members[i].Images[j] = models.Image{
				URL: u,
			}
		}
		profiles[i].Key = keys[i]
		profiles[i].Nicknames = memberPageInfoList[i].Nicknames
		profiles[i].BloodType = memberPageInfoList[i].BloodType
		profiles[i].Hometown = memberPageInfoList[i].Hometown
		profiles[i].Skills = memberPageInfoList[i].Skills
		profiles[i].Hobbies = memberPageInfoList[i].Hobbies
		profiles[i].MusicGenres = memberPageInfoList[i].MusicGenres
		profiles[i].Sports = memberPageInfoList[i].Sports
		profiles[i].Motto = memberPageInfoList[i].Motto
	}

	// cache images
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		for _, m := range members {
			for i := range m.Images {
				cache, err := cacher.Cache(m.Images[i].URL)
				if err != nil {
					req.Logger.Warnf("Image cache failed: %v", err)
				} else {
					m.Images[i] = *cache.ToImage()
				}
			}
		}
	}

	entities.Member.PutMulti().DatastoreKeys(memberKeys...).Cache(
		fmt.Sprintf(ckAllMembersTemplate, artist.Key),
	).MustUpdate(req, members)

	profileKeys := make([]*datastore.Key, len(memberKeys))
	for i, key := range memberKeys {
		profileKeys[i] = entities.MemberPublicProfile.NewKey(req, key.StringID(), nil)
	}
	entities.MemberPublicProfile.PutMulti().DatastoreKeys(profileKeys...).Cache(
		fmt.Sprintf(ckAllMembersTemplate, artist.Key),
	).MustUpdate(req, profiles)

	return memberKeys, members, nil
}
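// Sketch (hypothetical helper, not in the original source): members are keyed
// as "<artistKey>.<memberKey>" above, so a member's short key can be recovered
// by trimming the artist prefix. Assumes the "strings" package is imported.
func memberShortKey(artistKey, fullKey string) string {
	return strings.TrimPrefix(fullKey, artistKey+".")
}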
// NewHTTPClient implements Crawler#NewHTTPClient()
func (c *IEPGDataCrawler) NewHTTPClient() *http.Client {
	return urlfetch.NewHTTPClient(c.request)
}
func _crawlAmebloEntryList(req *wcg.Request, settings *hplink.CrawlerSettings, url string) (string, error) {
	_, _artist, err := entities.Artist.Get().Key(settings.ArtistKey).One(req)
	if err != nil {
		return "", err
	}
	if _artist == nil {
		req.Logger.Warnf("No artist found by the key %q (CrawlerSettings: %s)", settings.ArtistKey, settings.URL)
		return "", nil
	}
	artist := _artist.(*hplink.Artist)
	p, err := entities.Member.Query().Filter("ArtistKey=", settings.ArtistKey).Execute(req)
	if err != nil {
		return "", err
	}
	members := p.Data.([]hplink.Member)
	crawler := crawlers.NewAmebloPostCrawler(urlfetch.NewHTTPClient(req))
	postList, err := crawler.RunOnListURL(url)
	if err != nil {
		return "", fmt.Errorf("Error crawling %q - %v", url, err)
	}
	if len(postList.List) == 0 {
		return "", nil
	}
	amebloPostKeys := make([]*datastore.Key, len(postList.List))
	for i, post := range postList.List {
		amebloPostKeys[i] = entities.AmebloPost.NewKey(req, post.URL, nil)
	}
	_, _currentPostList, err := entities.AmebloPost.GetMulti().DatastoreKeys(amebloPostKeys...).List(req)
	if err != nil {
		return "", err
	}
	currentPostList := _currentPostList.([]*hplink.AmebloPost)
	// Collect only the posts that still need an update, keeping the key slice
	// aligned with the value slice passed to PutMulti.
	var updateKeys []*datastore.Key
	var amebloPostList []*hplink.AmebloPost
	for i, post := range postList.List {
		if currentPostList[i] != nil && currentPostList[i].IsContentsCrawled {
			// we don't update entities if they are already crawled by _processAmebloPosts
			continue
		}
		amebloPost := &hplink.AmebloPost{
			URL:               post.URL,
			Title:             post.Title,
			PostAt:            post.PostAt,
			Theme:             post.Theme,
			ArtistKey:         artist.Key,
			SettingsURL:       settings.URL,
			IsContentsCrawled: false,
		}
		if settings.MemberKey != "" {
			amebloPost.MemberKey = settings.MemberKey
		} else if m := _guessMember(req, amebloPost, members); m != nil {
			amebloPost.MemberKey = m.Key
		}
		updateKeys = append(updateKeys, amebloPostKeys[i])
		amebloPostList = append(amebloPostList, amebloPost)
	}
	if _, err = entities.AmebloPost.PutMulti().DatastoreKeys(updateKeys...).Update(req, amebloPostList); err != nil {
		return "", err
	}
	return postList.Next, nil
}
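// Sketch (hypothetical driver, not in the original source): list pages are
// chained via the next URL returned by _crawlAmebloEntryList, so a full crawl
// is a loop until the next URL is empty. maxPages is an assumed safety bound
// against pathological paging.
func _crawlAllAmebloEntryLists(req *wcg.Request, settings *hplink.CrawlerSettings, maxPages int) error {
	url := settings.URL
	for i := 0; i < maxPages && url != ""; i++ {
		next, err := _crawlAmebloEntryList(req, settings, url)
		if err != nil {
			return err
		}
		url = next
	}
	return nil
}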