Beispiel #1
0
// TwitterPublicMessagesByAccount harvests status updates from the timeline of
// every Twitter account configured for each territory.
//
// For each account it requests up to territory.Limits.MaxResultsPages pages
// (10 when that limit is 0) through harvester.TwitterAccountStream. After each
// page, if the harvest state reports harvested items, the state is persisted
// with SetLastHarvestTime so that an interrupted harvest can pick up where it
// left off instead of starting over.
func TwitterPublicMessagesByAccount() {
	for _, territory := range socialHarvest.Config.Harvest.Territories {
		// If different credentials were set for the territory, this will find and set them
		harvester.NewTwitterTerritoryCredentials(territory.Name)

		for _, account := range territory.Accounts.Twitter {
			// Build the request params fresh for every account so nothing
			// carries over from the previous account's pagination.
			params := url.Values{}
			params.Set("include_entities", "true")
			if len(territory.Content.Options.Lang) > 0 {
				params.Set("lang", territory.Content.Options.Lang)
			}
			if len(territory.Content.Options.TwitterGeocode) > 0 {
				params.Set("geocode", territory.Content.Options.TwitterGeocode)
			}

			// An account may be configured by numeric id or by username (both
			// are accepted). Twitter ids are 64-bit, so parse with an explicit
			// 64-bit width: strconv.Atoi is limited to the platform int size
			// and would reject large ids on 32-bit builds, causing them to be
			// sent as a screen name.
			accountParam := "screen_name"
			if _, err := strconv.ParseUint(account, 10, 64); err == nil {
				accountParam = "user_id"
			}

			harvestState := config.HarvestState{
				LastId:         "",
				LastTime:       time.Now(),
				PagesHarvested: 1,
				ItemsHarvested: 0,
			}

			// Limit to 10 pages max by default. Anything more will simply take too long and cause issues.
			maxPages := territory.Limits.MaxResultsPages
			if maxPages == 0 {
				maxPages = 10
			}

			// Fetch up to maxPages pages of results
			for i := 0; i < maxPages; i++ {
				// Only ask for items newer than the last stored harvest id, if there is one.
				lastHarvestId := socialHarvest.Database.GetLastHarvestId(territory.Name, "twitter", "TwitterPublicMessagesByAccount", account)
				if lastHarvestId != "" {
					params.Set("since_id", lastHarvestId)
				}
				// These are re-applied on every page because params is replaced
				// below by whatever the harvester returns.
				params.Set(accountParam, account)
				params.Set("contributor_details", "true")

				params, harvestState = harvester.TwitterAccountStream(territory.Name, harvestState, params)

				// Always save this on each page. Then if something crashes for some reason during a harvest of
				// several pages, we can pick up where we left off rather than starting over again.
				if harvestState.ItemsHarvested > 0 {
					socialHarvest.Database.SetLastHarvestTime(territory.Name, "twitter", "TwitterPublicMessagesByAccount", account, harvestState.LastTime, harvestState.LastId, harvestState.ItemsHarvested)
				}

				// The loop is bounded by the page limit, but that alone could request pages that don't exist.
				// Stop early when the params returned by the harvester carry no "since_id".
				// NOTE(review): presumably TwitterAccountStream leaves "since_id" empty once there are no
				// further results - confirm against the harvester implementation.
				if params.Get("since_id") == "" {
					break
				}
			}
		}
	}
}
Beispiel #2
0
// TwitterPublicMessagesByKeyword searches Twitter for status updates matching
// each keyword configured for each territory.
//
// For each keyword it requests up to territory.Limits.MaxResultsPages pages
// (10 when that limit is 0) through harvester.TwitterSearch. After each page,
// if the harvest state reports harvested items, the state is persisted with
// SetLastHarvestTime so the next harvest can continue from the last stored id.
func TwitterPublicMessagesByKeyword() {
	for _, territory := range socialHarvest.Config.Harvest.Territories {
		// If different credentials were set for the territory, this will find and set them
		harvester.NewTwitterTerritoryCredentials(territory.Name)

		// Search all keywords
		for _, keyword := range territory.Content.Keywords {
			log.Print("Searching for: " + keyword)

			// Build the search params fresh for every keyword. The params are replaced by whatever the
			// harvester returns on each page, so sharing one set across keywords would let one keyword's
			// pagination state (e.g. a leftover "since_id") leak into the next keyword's search.
			params := url.Values{}
			params.Set("include_entities", "true")
			if len(territory.Content.Options.Lang) > 0 {
				params.Set("lang", territory.Content.Options.Lang)
			}
			if len(territory.Content.Options.TwitterGeocode) > 0 {
				params.Set("geocode", territory.Content.Options.TwitterGeocode)
			}

			// A globally set limit in the Social Harvest config (or default of "100")
			if territory.Limits.ResultsPerPage != "" {
				params.Set("count", territory.Limits.ResultsPerPage)
			} else {
				params.Set("count", "100")
			}

			// Keep track of the last id harvested, the number of items harvested, etc. This information is
			// returned from `harvester.TwitterSearch()` on each call in the loop and then gets saved to the
			// harvest series. On the next harvest we can see where we left off so we don't request the same
			// data again from the API. This doesn't guarantee the prevention of dupes of course, but it does
			// decrease unnecessary API calls which helps with rate limiting and efficiency.
			harvestState := config.HarvestState{
				LastId:         "",
				LastTime:       time.Now(),
				PagesHarvested: 1,
				ItemsHarvested: 0,
			}

			// Limit to 10 pages max by default. Anything more will simply take too long and cause issues.
			maxPages := territory.Limits.MaxResultsPages
			if maxPages == 0 {
				maxPages = 10
			}

			// Fetch pages until there are no more or the configured page limit is reached (the limit
			// protects against API rate limits).
			for i := 0; i < maxPages; i++ {
				// Always pass the last stored harvest id as "since_id" when there is one: if nothing has been
				// posted since the last harvest, no results are returned and API calls are saved.
				lastHarvestId := socialHarvest.Database.GetLastHarvestId(territory.Name, "twitter", "TwitterPublicMessagesByKeyword", keyword)
				if lastHarvestId != "" {
					params.Set("since_id", lastHarvestId)
				}

				params, harvestState = harvester.TwitterSearch(territory.Name, harvestState, keyword, params)

				// Save on each page so a crash mid-harvest does not lose the progress made so far.
				if harvestState.ItemsHarvested > 0 {
					socialHarvest.Database.SetLastHarvestTime(territory.Name, "twitter", "TwitterPublicMessagesByKeyword", keyword, harvestState.LastTime, harvestState.LastId, harvestState.ItemsHarvested)
				}

				// The loop is bounded by the page limit, but that alone could request pages that don't exist.
				// Stop early when the params returned by the harvester carry no "since_id".
				// NOTE(review): presumably TwitterSearch leaves "since_id" empty once there are no further
				// results - confirm against the harvester implementation.
				if params.Get("since_id") == "" {
					break
				}
			}
		}
	}
	log.Println("done twitter public message search")
}