// Exemplo n.º 1
// 0
// findArtistAndMember resolves the artist and the member identified by the
// request parameters (paramKeyArtist / paramKeyMember) and returns the
// member with its Artist field populated. The two datastore lookups are
// independent, so they run in parallel. Returns nil if either lookup
// finds nothing.
func findArtistAndMember(req *wcg.Request) *hplink.Member {
	var artist *hplink.Artist
	var member *hplink.Member
	// Two independent lookup closures; each writes a distinct variable,
	// so executing them concurrently is race-free.
	var funcs = []func(){
		func() {
			// Artist entities are keyed by the artist parameter alone.
			_, ent := entities.Artist.Get().Key(
				req.Param(paramKeyArtist),
			).MustOne(req)
			if ent != nil {
				artist = ent.(*hplink.Artist)
			}
		},
		func() {
			// Member entities are keyed as "<artist>.<member>".
			_, ent := entities.Member.Get().Key(
				fmt.Sprintf("%s.%s", req.Param(paramKeyArtist), req.Param(paramKeyMember)),
			).MustOne(req)
			if ent != nil {
				member = ent.(*hplink.Member)
			}
		},
	}
	// Run both lookups in parallel. The closures always return nil, so the
	// error result of ParallelSlice carries no information here.
	iterator.ParallelSlice(funcs, func(i int, v func()) error {
		v()
		return nil
	})
	// Both entities must exist; a member without its artist is not
	// returned to callers.
	if artist == nil || member == nil {
		return nil
	}
	member.Artist = artist
	return member
}
// _crawlAmebloPost crawls a single ameblo post page and updates post in
// place with the crawled counters (likes/comments/reblogs), metadata
// (title, theme, post time) and images. Image caching is best-effort:
// failures are logged and leave the corresponding Images entry as its zero
// value, but they never fail the crawl. members is used to guess which
// member the post belongs to when post.MemberKey is not yet set.
// Returns an error only when the post page itself cannot be crawled.
func _crawlAmebloPost(req *wcg.Request, post *hplink.AmebloPost, members []hplink.Member) error {
	crawler := crawlers.NewAmebloPostCrawler(cacheutil.NewURLCacheAwareClient(req, entities.URLCache))
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] crawling URL: %s", post.URL)
	crawled, err := crawler.RunOnPostURL(post.URL)
	if err != nil {
		return err
	}
	post.NumLikes = crawled.NumLikes
	post.NumComments = crawled.NumComments
	post.NumReblogs = crawled.NumReblogs
	post.PostAt = crawled.PostAt
	post.Title = crawled.Title
	post.Theme = crawled.Theme
	post.IsContentsCrawled = true
	post.Images = make([]models.Image, len(crawled.ImageURLs))
	if cacher, cerr := cacheutil.NewImageCacher(req, entities.ImageCache); cerr == nil {
		// Cache every image in parallel, retrying each URL a few times
		// with a fixed wait between attempts. Each goroutine writes only
		// its own post.Images[i], so the parallel iteration is race-free.
		if perr := iterator.ParallelSlice(crawled.ImageURLs, func(i int, url string) error {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] caching image URL %s (on %s)", url, post.URL)
			const retries = 5
			var err error
			var cache *models.ImageCache
			for j := 0; j < retries; j++ {
				if j != 0 {
					req.Logger.Infof("[Task.Crawlers.AmebloPosts] Retry image URL caching: %s (on %s) (%d)", url, post.URL, j)
				}
				cache, err = cacher.Cache(url)
				if err == nil {
					post.Images[i] = *cache.ToImage()
					return nil
				}
				lib.WaitAndEnsureAfter(lib.Now(), 3*time.Second)
			}
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] Failed to cache image URL %s (on %s) - %v", url, post.URL, err)
			return err
		}); perr != nil {
			// Per-URL failures are already logged above; surface the
			// aggregate so the previously-dropped error is visible.
			// Caching is best-effort, so we intentionally do not fail
			// the crawl here.
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] one or more image caches failed (on %s) - %v", post.URL, perr)
		}
	} else {
		// No image cacher could be created; record why, then fall back to
		// storing the raw image URLs without cached copies.
		req.Logger.Warnf("[Task.Crawlers.AmebloPosts] image cacher unavailable - %v", cerr)
		for j, url := range crawled.ImageURLs {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] skip caching image URL %s (on %s)", url, post.URL)
			post.Images[j] = models.Image{
				URL: url,
			}
		}
	}
	// Update MemberKey only if not set.
	if post.MemberKey == "" {
		if m := _guessMember(req, post, members); m != nil {
			post.MemberKey = m.Key
		}
	}
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] finished crawling")
	return nil
}
// Exemplo n.º 3
// 0
// setupAPICrawlerSettings registers the /crawlersettings/* endpoints on the
// application's API router: listing settings by type, ameblo crawl
// statistics, and per-URL get/put/delete of CrawlerSettings entities.
func setupAPICrawlerSettings(app *server.App) {
	var API = app.API()

	// Rejects requests whose :url parameter does not parse as a URL.
	// NOTE(review): returning nil appears to pass control to the next
	// handler in the chain — inferred from its use before entity handlers.
	var urlValidator = server.Handler(func(req *wcg.Request) response.Response {
		_, err := url.Parse(req.Param("url"))
		if err != nil {
			return response.APINotFound
		}
		return nil
	})

	// List settings filtered by the :type path parameter. An unknown type
	// yields FilterValueSkip, i.e. the filter is dropped rather than
	// producing an error.
	API.GET("/crawlersettings/:type/",
		middleware.EntityAll(CrawlerSettings.Query().Filter("Type=", request.Value(func(req *wcg.Request) interface{} {
			t, _ := hplink.ParseCrawlerSettingsType(req.Param("type"))
			if t == hplink.CrawlerSettingsTypeUnknown {
				return entities.FilterValueSkip
			}
			return t
		}))),
	)

	// Per-settings crawl statistics across all ameblo-type settings.
	API.GET("/crawlersettings/ameblo/stats/",
		server.Handler(func(req *wcg.Request) response.Response {
			// post is the JSON shape for the first/last post summaries.
			type post struct {
				URL string    `json:"url"`
				At  time.Time `json:"at"`
			}
			// stats is the JSON shape for one settings entry.
			type stats struct {
				URL               string `json:"url"`
				FirstPost         *post  `json:"first_post,omitempty"`
				LastPost          *post  `json:"last_post,omitempty"`
				TotalPosts        int    `json:"total_posts"`
				CrawledPosts      int    `json:"crawled_posts"`
				ImageFailurePosts int    `json:"image_failure_posts"`
			}
			p := CrawlerSettings.Query().Filter("Type=", hplink.CrawlerSettingsTypeAmeblo).MustExecute(req)
			settings := p.Data.([]hplink.CrawlerSettings)
			s := make([]stats, len(settings))
			// Each iteration writes only its own s[i], so the parallel
			// fan-out over settings is race-free.
			if err := iterator.ParallelSlice(settings, func(i int, v *hplink.CrawlerSettings) error {
				s[i].URL = v.URL
				s[i].TotalPosts = AmebloPost.Query().Filter("SettingsURL=", v.URL).MustCount(req)
				if s[i].TotalPosts > 0 {
					s[i].CrawledPosts = AmebloPost.Query().Filter("SettingsURL=", v.URL).Filter("IsContentsCrawled=", true).MustCount(req)
					// Images.Height == 0 presumably marks posts whose
					// image caching failed — TODO(review): confirm
					// against the crawler that fills Images.
					s[i].ImageFailurePosts = AmebloPost.Query().Filter("SettingsURL=", v.URL).Filter("Images.Height=", 0).MustCount(req)
					// Oldest and newest posts; TotalPosts > 0 guarantees
					// both limit-1 queries return a row, so Head() is
					// safe to assert below.
					pf := AmebloPost.Query().Filter("SettingsURL=", v.URL).Order("PostAt").Limit(1).MustExecute(req)
					pl := AmebloPost.Query().Filter("SettingsURL=", v.URL).Order("-PostAt").Limit(1).MustExecute(req)
					first := pf.Head().(*hplink.AmebloPost)
					last := pl.Head().(*hplink.AmebloPost)
					s[i].FirstPost = &post{
						URL: first.URL, At: first.PostAt,
					}
					s[i].LastPost = &post{
						URL: last.URL, At: last.PostAt,
					}
					return nil
				}
				return nil
			}); err != nil {
				panic(err)
			}
			return response.NewJSONResponse(s)
		}),
	)

	// Fetch a single settings entity keyed by :url.
	API.GET("/crawlersettings/:url.json",
		middleware.EntityGet(CrawlerSettings.Get(), "url"),
	)

	// Create or update a settings entity; artist_key is a required form
	// field and the :url parameter must be a parsable URL.
	API.PUT("/crawlersettings/:url.json",
		urlValidator,
		middleware.ParseForm(func(v *validators.FormValidator) {
			v.Field("artist_key").Required()
		}),
		middleware.EntityPutOrCreate(
			CrawlerSettings.Put(),
			"url",
		),
	)

	// Delete a settings entity keyed by :url.
	API.DELETE("/crawlersettings/:url.json",
		urlValidator,
		middleware.EntityDelete(CrawlerSettings.Delete(), "url"),
	)
}
// runTasksCrawlersAmebloEntryLists crawls ameblo entry-list pages for a set
// of crawler settings and records each crawl's status on its settings
// entity. The settings come either from the "s" query parameter (entity
// keys) or, when absent, from all ameblo-type settings. The page to crawl
// for each comes from the matching "u" parameter, defaulting to the
// settings' entry-list URL. When "fl" is "true", the returned progress
// carries a Next query so the async-task framework follows pagination
// links recursively.
func runTasksCrawlersAmebloEntryLists(req *wcg.Request, task *models.AsyncAPITask) (*models.AsyncAPITaskProgress, error) {
	const FollowLinkKey = "fl"
	const SettingsKey = "s"
	const URLKey = "u"

	var query = req.HTTPRequest().URL.Query()
	var settingsList []*hplink.CrawlerSettings
	var urlList []string
	if settingsKeys, ok := query[SettingsKey]; ok {
		_, _list := entities.CrawlerSettings.GetMulti().Keys(settingsKeys...).MustList(req)
		settingsList = _list.([]*hplink.CrawlerSettings)
	} else {
		// No explicit keys: crawl every ameblo-type settings entity.
		query := entities.CrawlerSettings.Query().Filter("Type=", hplink.CrawlerSettingsTypeAmeblo)
		if pagination := query.MustExecute(req); pagination.Length() > 0 {
			list := pagination.Data.([]hplink.CrawlerSettings)
			settingsList = make([]*hplink.CrawlerSettings, len(list))
			for i := range list {
				settingsList[i] = &list[i]
			}
		}
	}

	var numList = len(settingsList)
	if urls, ok := query[URLKey]; ok {
		// An explicit URL list must pair 1:1 with the settings list.
		if numList != len(urls) {
			return nil, fmt.Errorf("List mismatch - found %d settings but %d urls are specified", numList, len(urls))
		}
		urlList = urls
	} else {
		urlList = make([]string, numList)
		for i := range settingsList {
			urlList[i] = (*hplink.AmebloCrawlerSettings)(settingsList[i]).GetEntryListURL()
		}
	}

	startTime := lib.Now()
	// Per-settings "next page" links discovered by the crawl; empty string
	// means no next page. NOTE(review): v.URL is stored as the settings
	// key — presumably the entity key is the URL itself.
	nextParamSettingsKeys := make([]string, numList)
	nextParamURLs := make([]string, numList)
	err := iterator.ParallelSlice(settingsList, func(i int, v *hplink.CrawlerSettings) error {
		next, err := _crawlAmebloEntryList(req, v, urlList[i])
		if err != nil {
			// Record the failure on the entity; statuses are persisted
			// below even when this function returns the error.
			settingsList[i].Error = []byte(fmt.Sprintf("%v", err))
			settingsList[i].Status = hplink.CrawlerStatusFailure
			settingsList[i].LastRun = lib.Now()
			return err
		}
		settingsList[i].Error = nil
		settingsList[i].Status = hplink.CrawlerStatusSuccess
		settingsList[i].LastRun = lib.Now()
		if next != "" {
			nextParamSettingsKeys[i] = v.URL
			nextParamURLs[i] = next
		}
		return nil
	})
	// Persist crawl statuses regardless of the crawl outcome.
	entities.CrawlerSettings.PutMulti().MustUpdate(req, settingsList)
	if err != nil {
		return nil, err
	}
	if req.Query(FollowLinkKey) != "true" {
		// Single-pass mode: do not follow pagination links.
		return nil, nil
	}
	// fl=true makes a recursive call to follow next links:
	// compact the sparse nextParam* slices into dense ones for the
	// recursive call's query parameters.
	var fixedNextParamSettingsKeys []string
	var fixedNextParamURLs []string
	var hasNext = false
	for i := range nextParamURLs {
		if nextParamURLs[i] != "" {
			hasNext = true
			fixedNextParamSettingsKeys = append(fixedNextParamSettingsKeys, nextParamSettingsKeys[i])
			fixedNextParamURLs = append(fixedNextParamURLs, nextParamURLs[i])
		}
	}
	var progress models.AsyncAPITaskProgress
	var lastProgress = task.LastProgress()
	if lastProgress == nil {
		progress.Current = len(urlList)
		progress.Total = 0
	} else {
		progress.Current = lastProgress.Current + len(urlList)
	}
	if hasNext {
		progress.Next = url.Values{
			FollowLinkKey: []string{"true"},
			SettingsKey:   fixedNextParamSettingsKeys,
			URLKey:        fixedNextParamURLs,
		}
		// Throttle between paginated crawls (configurable, default 2s,
		// capped at 10s).
		wait := configs.GetIntValue(req, "hplink.ameblo_crawler_url_wait", 2, 0, 10)
		lib.WaitAndEnsureAfter(startTime, time.Duration(wait)*time.Second)
	} else {
		// Fixed: previously this was logged unconditionally, even when a
		// next page had just been scheduled.
		req.Logger.Infof("No more URL needs to be crawled.")
	}
	return &progress, nil
}