// Get status updates from an account's timeline func TwitterPublicMessagesByAccount() { for _, territory := range socialHarvest.Config.Harvest.Territories { // If different credentials were set for the territory, this will find and set them harvester.NewTwitterTerritoryCredentials(territory.Name) for _, account := range territory.Accounts.Twitter { // Build params for search params := url.Values{} params.Set("include_entities", "true") if len(territory.Content.Options.Lang) > 0 { params.Set("lang", territory.Content.Options.Lang) } if len(territory.Content.Options.TwitterGeocode) > 0 { params.Set("geocode", territory.Content.Options.TwitterGeocode) } harvestState := config.HarvestState{ LastId: "", LastTime: time.Now(), PagesHarvested: 1, ItemsHarvested: 0, } // Limit to 10 pages max. Anything more will simply take too long and cause issues. maxPages := territory.Limits.MaxResultsPages if maxPages == 0 { maxPages = 10 } // Fetch X pages of results for i := 0; i < maxPages; i++ { lastHarvestId := socialHarvest.Database.GetLastHarvestId(territory.Name, "twitter", "TwitterPublicMessagesByAccount", account) if lastHarvestId != "" { params.Set("since_id", lastHarvestId) } // Determine if the account is by id or username (both are accepted) if _, err := strconv.Atoi(account); err == nil { params.Set("user_id", account) } else { params.Set("screen_name", account) } params.Set("contributor_details", "true") updatedParams, updatedHarvestState := harvester.TwitterAccountStream(territory.Name, harvestState, params) params = updatedParams harvestState = updatedHarvestState // Always save this on each page. Then if something crashes for some reason during a harvest of several pages, we can pick up where we left off. Rather than starting over again. if harvestState.ItemsHarvested > 0 { socialHarvest.Database.SetLastHarvestTime(territory.Name, "twitter", "TwitterPublicMessagesByAccount", account, harvestState.LastTime, harvestState.LastId, harvestState.ItemsHarvested) } // We also avoid using "break" because the for loop is now based on number of pages to harvest. // But this could lead to harvesting pages taht don't exist, so we should still "break" in that case. // Since every call to FacebookFeed() should return with a new Until value, we'll look to see if it's empty. If so, it was the latest page of results from FB. Break the loop. if params.Get("since_id") == "" { // log.Println("completed search - no more pages of results") break } } } } return }
// Searches Twitter for status updates by territory keyword criteria func TwitterPublicMessagesByKeyword() { for _, territory := range socialHarvest.Config.Harvest.Territories { // If different credentials were set for the territory, this will find and set them harvester.NewTwitterTerritoryCredentials(territory.Name) // Build params for search params := url.Values{} params.Set("include_entities", "true") if len(territory.Content.Options.Lang) > 0 { params.Set("lang", territory.Content.Options.Lang) } if len(territory.Content.Options.TwitterGeocode) > 0 { params.Set("geocode", territory.Content.Options.TwitterGeocode) } // Search all keywords if len(territory.Content.Keywords) > 0 { for _, keyword := range territory.Content.Keywords { log.Print("Searching for: " + keyword) // A globally set limit in the Social Harvest config (or default of "100") if territory.Limits.ResultsPerPage != "" { params.Set("count", territory.Limits.ResultsPerPage) } else { params.Set("count", "100") } // Keep track of the last id harvested, the number of items harvested, etc. This information will be returend from `harvester.TwitterSearch()` // on each call in the loop. We'll just keep incrementing the items and overwriting the last id and time. This information then gets saved to the harvest series. // So then on the next harvest, we can see where we left off so we don't request the same data again from the API. This doesn't guarantee the prevention of dupes // of course, but it does decrease unnecessary API calls which helps with rate limiting and efficiency. harvestState := config.HarvestState{ LastId: "", LastTime: time.Now(), PagesHarvested: 1, ItemsHarvested: 0, } // Limit to 10 pages max. Anything more will simply take too long and cause issues. maxPages := territory.Limits.MaxResultsPages if maxPages == 0 { maxPages = 10 } // Fetch all pages (it keeps going until there are no more, but that could be problematic for API rate limits - so in the Social Harvest config, a limit can be put on number of pages returned) for i := 0; i < maxPages; i++ { // Note: The "since" seems to get removed in the "next" pagination link. // It would have worked perfectly and stopped if they held on to it as a limiter. Now, we need to hold on to it in the harvester and watch. // When results start coming in that have a time older than this "since" value - break the loop (also note, configuration can limit pages too). // However. If nothing has truly been posted since the last harvest, then no results will be returned when passing "since" and that will help a little. // So always pass it. Since we only get the "next" page, we don't need to change it (and it does help particularly with account feeds). lastHarvestId := socialHarvest.Database.GetLastHarvestId(territory.Name, "twitter", "TwitterPublicMessagesByKeyword", keyword) if lastHarvestId != "" { params.Set("since_id", lastHarvestId) } updatedParams, updatedHarvestState := harvester.TwitterSearch(territory.Name, harvestState, keyword, params) params = updatedParams harvestState = updatedHarvestState //log.Println("harvested a page of results from twitter") if harvestState.ItemsHarvested > 0 { socialHarvest.Database.SetLastHarvestTime(territory.Name, "twitter", "TwitterPublicMessagesByKeyword", keyword, harvestState.LastTime, harvestState.LastId, harvestState.ItemsHarvested) } // We also avoid using "break" because the for loop is now based on number of pages to harvest. // But this could lead to harvesting pages taht don't exist, so we should still "break" in that case. // Since every call to FacebookFeed() should return with a new Until value, we'll look to see if it's empty. If so, it was the latest page of results from FB. Break the loop. if params.Get("since_id") == "" { // log.Println("completed search - no more pages of results") break } } } } } log.Println("done twitter public message search") return }