// Search for status updates and just pass the Tweet along (no special mapping required like FacebookPost{} because the Tweet struct is used across multiple API calls unlike Facebook) // All "search" functions (and anything that gets data from an API) will now normalize the data, mapping it to a Social Harvest struct. // This means there will be no way to get the original data from the service (back in the main app or from any other Go package that imports the harvester). // This is fine because if someone wanted the original data, they could use packages like anaconda directly. // What happens now is all data pulled from earch service's API will be sent to a channel (the harvester observer). However, this function should NOT be called in a go-subroutine though. // We don't want to make multiple API calls in parallel (rate limits). // NOTE: The number of items sent to the observer will be returned along with the last message's time and id. The main package can record this in the harvest logs/table. // The harvester will not keep track of this information itself. Its only job is to gather data, send it to the channel and report back on how much was sent (and the last id/time). Period. // It doens't care if the data is stored in a database, logged, or streamed out from an API. It just harvests and sends without looking or caring. // Whereas previously it would be doing the db calls and logging, etc. This has now all been taken care of with the observer. All of these other processes simply subscribe and listen. // // Always passed in first (always): the territory name, and the position in the harvest (HarvestState) ... the rest are going to vary based on the API but typically are the query and options // @return options(for pagination), count of items, last id, last time. func TwitterSearch(territoryName string, harvestState config.HarvestState, query string, options url.Values) (url.Values, config.HarvestState) { searchResults, _ := services.twitter.GetSearch(query, options) // The cool thing about Twitter's API is that we have all the user data we need already. So we make less HTTP requests than when using Facebook's API. for _, tweet := range searchResults.Statuses { //log.Println(tweet) // log.Println("processing a tweet....") tweetCreatedTime, err := time.Parse(time.RubyDate, tweet.CreatedAt) // Only take tweets that have a time (and an ID from Facebook) if err == nil && len(tweet.IdStr) > 0 { harvestState.ItemsHarvested++ // If this is the most recent tweet in the results, set it's date and id (to be returned) so we can continue where we left off in future harvests if harvestState.LastTime.IsZero() || tweetCreatedTime.Unix() > harvestState.LastTime.Unix() { harvestState.LastTime = tweetCreatedTime harvestState.LastId = tweet.IdStr } // determine gender var contributorGender = DetectGender(tweet.User.Name) // TODO: figure out type somehow... var contributorType = DetectContributorType(tweet.User.Name, contributorGender) // While `tweet.Place.CountryCode` exists, city and state/region don't. So that doesn't help much. I suppose we could use it for supporting information send to the geocoder... // But if the `tweet.User.Location` string already has a country code or name in there then I don't want to confuse it. // I thought I'd be able to use more geo data from Twitter, but I guess not for now. Really happy the geocoder is now in memory. var contributorCountry = "" var contributorRegion = "" var contributorCity = "" var contributorCityPopulation = int32(0) var statusLongitude = 0.0 var statusLatitude = 0.0 lng, errLng := tweet.Longitude() if errLng == nil { statusLongitude = lng } lat, errLat := tweet.Latitude() if errLat == nil { statusLatitude = lat } // Contributor location lookup (if no lat/lng was found on the message - try to reduce number of geocode lookups) contributorLat := 0.0 contributorLng := 0.0 if statusLatitude == 0.0 || statusLatitude == 0.0 { // Do not make a request for nothing (there are no 1 character locations either). if len(tweet.User.Location) > 1 { location := services.geocoder.Geocode(tweet.User.Location) contributorLat = location.Latitude contributorLng = location.Longitude contributorRegion = location.Region contributorCity = location.City contributorCityPopulation = location.Population contributorCountry = location.Country } //contributorLat, contributorLng = Geocode(tweet.User.Location) } else { reverseLocation := services.geocoder.ReverseGeocode(statusLatitude, statusLongitude) contributorRegion = reverseLocation.Region contributorCity = reverseLocation.City contributorCityPopulation = reverseLocation.Population contributorCountry = reverseLocation.Country // keep these, no need to change - might change accuracy, etc. contributorLat = statusLatitude contributorLng = statusLongitude } // Contributor geohash var contributorLocationGeoHash = geohash.Encode(contributorLat, contributorLng) // This is produced with empty lat/lng values - don't store it. if contributorLocationGeoHash == "7zzzzzzzzzzz" { contributorLocationGeoHash = "" } // Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored). harvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName) message := config.SocialHarvestMessage{ Time: tweetCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "twitter", ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorVerified: Btoi(tweet.User.Verified), ContributorFollowers: tweet.User.FollowersCount, ContributorStatusesCount: int(tweet.User.StatusesCount), ContributorGender: contributorGender, ContributorType: contributorType, Message: tweet.Text, Sentiment: services.sentimentAnalyzer.Classify(tweet.Text), IsQuestion: Btoi(IsQuestion(tweet.Text, harvestConfig.QuestionRegex)), MessageId: tweet.IdStr, TwitterRetweetCount: tweet.RetweetCount, TwitterFavoriteCount: tweet.FavoriteCount, } go StoreHarvestedData(message) LogJson(message, "messages") // Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two. // Keywords are found across every network, whereas hashtags are only found on a few. // Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited. // Typically for Twitter, not more than 10 keywords are returned. keywords := GetKeywords(tweet.Text, 4, 8) if len(keywords) > 0 { for _, keyword := range keywords { if keyword != "" { keywordHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + keyword) // Again, keyword share the same series/table/collection hashtag := config.SocialHarvestHashtag{ Time: tweetCreatedTime, HarvestId: keywordHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Keyword: keyword, } StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } // shared links if len(tweet.Entities.Urls) > 0 { for _, link := range tweet.Entities.Urls { if len(link.Url) > 0 { // Shared link harvest id has to be different because otherwise only one would be stored sharedLinkHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + link.Expanded_url) linkHostName := "" pUrl, _ := url.Parse(link.Url) linkHostName = pUrl.Host sharedLink := config.SocialHarvestSharedLink{ Time: tweetCreatedTime, HarvestId: sharedLinkHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Url: link.Url, ExpandedUrl: link.Expanded_url, Host: linkHostName, } StoreHarvestedData(sharedLink) LogJson(sharedLink, "shared_links") } } } // more shared links (media entities) if len(tweet.Entities.Media) > 0 { for _, media := range tweet.Entities.Media { if len(media.Url) > 0 { sharedMediaHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + media.Expanded_url) mediaHostName := "" pUrl, _ := url.Parse(media.Url) mediaHostName = pUrl.Host sharedMedia := config.SocialHarvestSharedLink{ Time: tweetCreatedTime, HarvestId: sharedMediaHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Url: media.Url, ExpandedUrl: media.Expanded_url, Host: mediaHostName, Type: media.Type, Source: media.Media_url, } StoreHarvestedData(sharedMedia) LogJson(sharedMedia, "shared_links") } } } // hashtags if len(tweet.Entities.Hashtags) > 0 { for _, tag := range tweet.Entities.Hashtags { if len(tag.Text) > 0 { hashtagHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + tag.Text) hashtag := config.SocialHarvestHashtag{ Time: tweetCreatedTime, HarvestId: hashtagHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Tag: tag.Text, } StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } // mentions if len(tweet.Entities.User_mentions) > 0 { for _, mentionedUser := range tweet.Entities.User_mentions { if len(mentionedUser.Id_str) > 0 { mentionHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + mentionedUser.Id_str) // TODO: add mentioned user info? (another api request) mention := config.SocialHarvestMention{ Time: tweetCreatedTime, HarvestId: mentionHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, MentionedId: mentionedUser.Id_str, MentionedScreenName: mentionedUser.Screen_name, MentionedName: mentionedUser.Name, } StoreHarvestedData(mention) LogJson(mention, "mentions") } } } } else { log.Println("Could not parse the time from the Tweet, so I'm throwing it away!") log.Println(err) } } return options, harvestState }
// Harvests from a specific Twitter account stream func TwitterAccountStream(territoryName string, harvestState config.HarvestState, options url.Values) (url.Values, config.HarvestState) { searchResults, _ := services.twitter.GetUserTimeline(options) // The cool thing about Twitter's API is that we have all the user data we need already. So we make less HTTP requests than when using Facebook's API. for _, tweet := range searchResults { //log.Println(tweet) // log.Println("processing a tweet....") tweetCreatedTime, err := time.Parse(time.RubyDate, tweet.CreatedAt) // Only take tweets that have a time (and an ID from Facebook) if err == nil && len(tweet.IdStr) > 0 { harvestState.ItemsHarvested++ // If this is the most recent tweet in the results, set it's date and id (to be returned) so we can continue where we left off in future harvests if harvestState.LastTime.IsZero() || tweetCreatedTime.Unix() > harvestState.LastTime.Unix() { harvestState.LastTime = tweetCreatedTime harvestState.LastId = tweet.IdStr } // determine gender var contributorGender = DetectGender(tweet.User.Name) // TODO: figure out type somehow... var contributorType = DetectContributorType(tweet.User.Name, contributorGender) var contributorCountry = "" var contributorRegion = "" var contributorCity = "" var contributorCityPopulation = int32(0) var statusLongitude = 0.0 var statusLatitude = 0.0 lng, errLng := tweet.Longitude() if errLng == nil { statusLongitude = lng } lat, errLat := tweet.Latitude() if errLat == nil { statusLatitude = lat } // Contributor location lookup (if no lat/lng was found on the message - try to reduce number of geocode lookups) contributorLat := 0.0 contributorLng := 0.0 if statusLatitude == 0.0 || statusLatitude == 0.0 { // Do not make a request for nothing (there are no 1 character locations either). if len(tweet.User.Location) > 1 { location := services.geocoder.Geocode(tweet.User.Location) contributorLat = location.Latitude contributorLng = location.Longitude contributorRegion = location.Region contributorCity = location.City contributorCityPopulation = location.Population contributorCountry = location.Country } //contributorLat, contributorLng = Geocode(tweet.User.Location) } else { reverseLocation := services.geocoder.ReverseGeocode(statusLatitude, statusLongitude) contributorRegion = reverseLocation.Region contributorCity = reverseLocation.City contributorCityPopulation = reverseLocation.Population contributorCountry = reverseLocation.Country // keep these, no need to change - might change accuracy, etc. contributorLat = statusLatitude contributorLng = statusLongitude } // Contributor geohash var contributorLocationGeoHash = geohash.Encode(contributorLat, contributorLng) // This is produced with empty lat/lng values - don't store it. if contributorLocationGeoHash == "7zzzzzzzzzzz" { contributorLocationGeoHash = "" } // Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored). harvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName) message := config.SocialHarvestMessage{ Time: tweetCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "twitter", ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorVerified: Btoi(tweet.User.Verified), ContributorFollowers: tweet.User.FollowersCount, ContributorStatusesCount: int(tweet.User.StatusesCount), ContributorGender: contributorGender, ContributorType: contributorType, Message: tweet.Text, IsQuestion: Btoi(IsQuestion(tweet.Text, harvestConfig.QuestionRegex)), MessageId: tweet.IdStr, TwitterRetweetCount: tweet.RetweetCount, TwitterFavoriteCount: tweet.FavoriteCount, } // Send to the harvester observer StoreHarvestedData(message) LogJson(message, "messages") // shared links if len(tweet.Entities.Urls) > 0 { for _, link := range tweet.Entities.Urls { if len(link.Url) > 0 { // Shared link harvest id has to be different because otherwise only one would be stored sharedLinkHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + link.Expanded_url) linkHostName := "" pUrl, _ := url.Parse(link.Url) linkHostName = pUrl.Host // TODO: ADD contributor gender, contributor type sharedLink := config.SocialHarvestSharedLink{ Time: tweetCreatedTime, HarvestId: sharedLinkHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Url: link.Url, ExpandedUrl: link.Expanded_url, Host: linkHostName, } // Send to the harvester observer StoreHarvestedData(sharedLink) LogJson(sharedLink, "shared_links") } } } // more shared links (media entities) if len(tweet.Entities.Media) > 0 { for _, media := range tweet.Entities.Media { if len(media.Url) > 0 { sharedMediaHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + media.Expanded_url) mediaHostName := "" pUrl, _ := url.Parse(media.Url) mediaHostName = pUrl.Host // TODO: ADD contributor gender, contributor type sharedMedia := config.SocialHarvestSharedLink{ Time: tweetCreatedTime, HarvestId: sharedMediaHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Url: media.Url, ExpandedUrl: media.Expanded_url, Host: mediaHostName, Type: media.Type, Source: media.Media_url, } // Send to the harvester observer StoreHarvestedData(sharedMedia) LogJson(sharedMedia, "shared_links") } } } // hashtags if len(tweet.Entities.Hashtags) > 0 { for _, tag := range tweet.Entities.Hashtags { if len(tag.Text) > 0 { hashtagHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + tag.Text) // TODO: ADD contributor gender, contributor type hashtag := config.SocialHarvestHashtag{ Time: tweetCreatedTime, HarvestId: hashtagHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Tag: tag.Text, } // Send to the harvester observer StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } // mentions if len(tweet.Entities.User_mentions) > 0 { for _, mentionedUser := range tweet.Entities.User_mentions { if len(mentionedUser.Id_str) > 0 { mentionHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + mentionedUser.Id_str) // TODO: ADD contributor gender, contributor type // and mentioned user info (another api request) mention := config.SocialHarvestMention{ Time: tweetCreatedTime, HarvestId: mentionHarvestId, Territory: territoryName, Network: "twitter", MessageId: tweet.IdStr, ContributorId: tweet.User.IdStr, ContributorScreenName: tweet.User.ScreenName, ContributorName: tweet.User.Name, ContributorLang: tweet.User.Lang, ContributorType: contributorType, ContributorGender: contributorGender, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, MentionedId: mentionedUser.Id_str, MentionedScreenName: mentionedUser.Screen_name, MentionedName: mentionedUser.Name, } // Send to the harvester observer StoreHarvestedData(mention) LogJson(mention, "mentions") } } } } else { log.Println("Could not parse the time from the Tweet, so I'm throwing it away!") log.Println(err) } } return options, harvestState }
// Gets Google+ activities (posts) by searching for a keyword. func GooglePlusActivitySearch(territoryName string, harvestState config.HarvestState, query string, options url.Values) (url.Values, config.HarvestState) { limit, lErr := strconv.ParseInt(options.Get("count"), 10, 64) if lErr != nil { limit = 20 } if limit > 20 { limit = 20 } // If there's a next page token, it'll be used to continue to the next page for this harvest nextPageToken := options.Get("nextPageToken") activities, err := services.googlePlus.Activities.Search(query).MaxResults(limit).PageToken(nextPageToken).Do() if err == nil { // Passed back to whatever called this function, so it can continue with the next page. options.Set("nextPageToken", activities.NextPageToken) for _, item := range activities.Items { itemCreatedTime, err := time.Parse(time.RFC3339, item.Published) // Only take instagrams that have a time if err == nil && len(item.Id) > 0 { harvestState.ItemsHarvested++ // If this is the most recent tweet in the results, set it's date and id (to be returned) so we can continue where we left off in future harvests if harvestState.LastTime.IsZero() || itemCreatedTime.Unix() > harvestState.LastTime.Unix() { harvestState.LastTime = itemCreatedTime harvestState.LastId = item.Id } // Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored). harvestId := GetHarvestMd5(item.Id + "googlePlus" + territoryName) // contributor row (who created the message) // NOTE: This is synchronous...but that's ok because while I'd love to use channels and make a bunch of requests at once, there's rate limits from these APIs... // Plus the contributor info tells us a few things about the message, such as locale. Other series will use this data. contributor, err := services.googlePlus.People.Get(item.Actor.Id).Do() if err != nil { log.Println(err) return options, harvestState } var contributorGender = 0 if contributor.Gender == "male" { contributorGender = 1 } if contributor.Gender == "female" { contributorGender = -1 } var contributorType = DetectContributorType(item.Actor.DisplayName, contributorGender) contributorLanguage := LocaleToLanguageISO(contributor.Language) var itemLat = 0.0 var itemLng = 0.0 // Reverse code to get city, state, country, etc. var contributorCountry = "" var contributorRegion = "" var contributorCity = "" var contributorCityPopulation = int32(0) if item.Location != nil && item.Location.Position != nil { if item.Location.Position.Latitude != 0.0 && item.Location.Position.Longitude != 0.0 { itemLat = item.Location.Position.Latitude itemLng = item.Location.Position.Longitude reverseLocation := services.geocoder.ReverseGeocode(item.Location.Position.Latitude, item.Location.Position.Longitude) contributorRegion = reverseLocation.Region contributorCity = reverseLocation.City contributorCityPopulation = reverseLocation.Population contributorCountry = reverseLocation.Country } } // Geohash var locationGeoHash = geohash.Encode(itemLat, itemLng) // This is produced with empty lat/lng values - don't store it. if locationGeoHash == "7zzzzzzzzzzz" { locationGeoHash = "" } // message row messageRow := config.SocialHarvestMessage{ Time: itemCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "googlePlus", MessageId: item.Id, ContributorId: item.Actor.Id, ContributorScreenName: item.Actor.DisplayName, ContributorName: item.Actor.DisplayName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLang: contributorLanguage, ContributorLongitude: itemLng, ContributorLatitude: itemLat, ContributorGeohash: locationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Message: item.Object.Content, Sentiment: services.sentimentAnalyzer.Classify(item.Object.Content), IsQuestion: Btoi(IsQuestion(item.Object.OriginalContent, harvestConfig.QuestionRegex)), GooglePlusReshares: item.Object.Resharers.TotalItems, GooglePlusOnes: item.Object.Plusoners.TotalItems, } StoreHarvestedData(messageRow) LogJson(messageRow, "messages") // Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two. // Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited. keywords := GetKeywords(item.Object.OriginalContent, 4, 8) if len(keywords) > 0 { for _, keyword := range keywords { if keyword != "" { keywordHarvestId := GetHarvestMd5(item.Id + "googlePlus" + territoryName + keyword) // Again, keyword share the same series/table/collection hashtag := config.SocialHarvestHashtag{ Time: itemCreatedTime, HarvestId: keywordHarvestId, Territory: territoryName, Network: "googlePlus", MessageId: item.Id, ContributorId: item.Actor.Id, ContributorScreenName: item.Actor.DisplayName, ContributorName: item.Actor.DisplayName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLang: contributorLanguage, ContributorLongitude: itemLng, ContributorLatitude: itemLat, ContributorGeohash: locationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Keyword: keyword, } StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } if len(item.Object.Attachments) > 0 { for _, attachment := range item.Object.Attachments { hostName := "" if len(attachment.Url) > 0 { pUrl, _ := url.Parse(attachment.Url) hostName = pUrl.Host } previewImg := "" if attachment.Image != nil { previewImg = attachment.Image.Url } fullImg := "" if attachment.FullImage != nil { fullImg = attachment.FullImage.Url } sharedLinksRow := config.SocialHarvestSharedLink{ Time: itemCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "googlePlus", MessageId: item.Id, ContributorId: item.Actor.Id, ContributorScreenName: item.Actor.DisplayName, ContributorName: item.Actor.DisplayName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLang: contributorLanguage, ContributorLongitude: itemLng, ContributorLatitude: itemLat, ContributorGeohash: locationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Type: attachment.ObjectType, Preview: previewImg, Source: fullImg, Url: attachment.Url, ExpandedUrl: ExpandUrl(attachment.Url), Host: hostName, } StoreHarvestedData(sharedLinksRow) LogJson(sharedLinksRow, "shared_links") } } } } } else { log.Println(err) } return options, harvestState }
// Takes an array of Post structs and converts it to JSON and logs to file (to be picked up by Fluentd, Logstash, Ik, etc.) func FacebookPostsOut(posts []FacebookPost, territoryName string, params FacebookParams) (int, string, time.Time) { var itemsHarvested = 0 var latestId = "" var latestTime time.Time for _, post := range posts { postCreatedTime, err := time.Parse("2006-01-02T15:04:05-0700", post.CreatedTime) // Only take posts that have a time (and an ID from Facebook) if err == nil && len(post.Id) > 0 { itemsHarvested++ // If this is the most recent post in the results, set it's date and id (to be returned) so we can continue where we left off in future harvests if latestTime.IsZero() || postCreatedTime.Unix() > latestTime.Unix() { latestTime = postCreatedTime latestId = post.Id } hostName := "" if len(post.Link) > 0 { pUrl, _ := url.Parse(post.Link) hostName = pUrl.Host } // Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored). harvestId := GetHarvestMd5(post.Id + "facebook" + territoryName) //log.Println(harvestId) // contributor row (who created the message) // NOTE: This is synchronous...but that's ok because while I'd love to use channels and make a bunch of requests at once, there's rate limits from these APIs... // Plus the contributor info tells us a few things about the message, such as locale. Other series will use this data. var contributor = FacebookAccount{} contributor = FacebookGetUserInfo(post.From.Id, params) var contributorGender = 0 if contributor.Gender == "male" { contributorGender = 1 } if contributor.Gender == "female" { contributorGender = -1 } var contributorName = contributor.Name if len(contributor.FirstName) > 0 { contributorName = contributor.FirstName + " " + contributor.LastName } var contributorType = "person" if len(contributor.CompanyOverview) > 0 || len(contributor.Founded) > 0 || len(contributor.Category) > 0 { contributorType = "company" } // Reverse code to get city, state, country, etc. var contributorCountry = "" var contributorRegion = "" var contributorCity = "" var contributorCityPopulation = int32(0) // This isn't always available with Geobed information and while many counties will be, they still need to be decoded with the Geonames data set (id numbers to string names). // When Geobed updates, then Social Harvest can add county information in again. "State" (US state) has also changed to "Region" due to the data sets being used. // A little consistency has been lost, but geocoding is all internal now. Not a bad trade off. // var contributorCounty = "" if contributor.Location.Latitude != 0.0 && contributor.Location.Latitude != 0.0 { reverseLocation := services.geocoder.ReverseGeocode(contributor.Location.Latitude, contributor.Location.Longitude) contributorRegion = reverseLocation.Region contributorCity = reverseLocation.City contributorCountry = reverseLocation.Country contributorCityPopulation = reverseLocation.Population // contributorCounty = reverseLocation.County } // Geohash var locationGeoHash = geohash.Encode(contributor.Location.Latitude, contributor.Location.Longitude) // This is produced with empty lat/lng values - don't store it. if locationGeoHash == "7zzzzzzzzzzz" { locationGeoHash = "" } // TODO: Category (use a classifier in the future for this?) // message row messageRow := config.SocialHarvestMessage{ Time: postCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "facebook", MessageId: post.Id, ContributorId: post.From.Id, ContributorScreenName: post.From.Name, ContributorName: contributorName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLang: LocaleToLanguageISO(contributor.Locale), ContributorLongitude: contributor.Location.Longitude, ContributorLatitude: contributor.Location.Latitude, ContributorGeohash: locationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorLikes: contributor.Likes, Message: post.Message, FacebookShares: post.Shares.Count, Category: contributor.Category, Sentiment: services.sentimentAnalyzer.Classify(post.Message), IsQuestion: Btoi(IsQuestion(post.Message, harvestConfig.QuestionRegex)), } StoreHarvestedData(messageRow) LogJson(messageRow, "messages") // Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two. // Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited. keywords := GetKeywords(post.Message, 4, 8) if len(keywords) > 0 { for _, keyword := range keywords { if keyword != "" { keywordHarvestId := GetHarvestMd5(post.Id + "facebook" + territoryName + keyword) // Again, keyword share the same series/table/collection hashtag := config.SocialHarvestHashtag{ Time: postCreatedTime, HarvestId: keywordHarvestId, Territory: territoryName, Network: "facebook", MessageId: post.Id, ContributorId: post.From.Id, ContributorScreenName: post.From.Name, ContributorName: contributorName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLang: LocaleToLanguageISO(contributor.Locale), ContributorLongitude: contributor.Location.Longitude, ContributorLatitude: contributor.Location.Latitude, ContributorGeohash: locationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Keyword: keyword, } StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } // shared links row // TODO: expand short urls (Facebook doesn't do it for us unfortunately) if len(post.Link) > 0 { sharedLinksRow := config.SocialHarvestSharedLink{ Time: postCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "facebook", MessageId: post.Id, ContributorId: post.From.Id, ContributorScreenName: post.From.Name, ContributorName: contributorName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLang: LocaleToLanguageISO(contributor.Locale), ContributorLongitude: contributor.Location.Longitude, ContributorLatitude: contributor.Location.Latitude, ContributorGeohash: locationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, Type: post.Type, Preview: post.Picture, Source: post.Source, Url: post.Link, ExpandedUrl: ExpandUrl(post.Link), Host: hostName, } StoreHarvestedData(sharedLinksRow) LogJson(sharedLinksRow, "shared_links") } // mentions row (note the harvest id in the following - any post that has multiple subobjects to be stored separately will need a different harvest id, else only one of those subobjects would be stored) for _, tag := range post.StoryTags { for _, mention := range tag { // The harvest id is going to have to be a little different in this case too...Otherwise, we would only get one mention per post. storyTagsMentionHarvestId := GetHarvestMd5(post.Id + mention.Id + territoryName) // TODO: Keep an eye on this, it may add too many API requests... var mentionedContributor = FacebookAccount{} mentionedContributor = FacebookGetUserInfo(mention.Id, params) var mentionedGender = 0 if mentionedContributor.Gender == "male" { mentionedGender = 1 } if mentionedContributor.Gender == "female" { mentionedGender = -1 } var mentionedName = mentionedContributor.Name if len(mentionedContributor.FirstName) > 0 { mentionedName = mentionedContributor.FirstName + " " + mentionedContributor.LastName } var mentionedType = "person" if len(mentionedContributor.CompanyOverview) > 0 || len(mentionedContributor.Founded) > 0 || len(mentionedContributor.Category) > 0 { mentionedType = "company" } var mentionedLocationGeoHash = geohash.Encode(mentionedContributor.Location.Latitude, mentionedContributor.Location.Longitude) // This is produced with empty lat/lng values - don't store it. if mentionedLocationGeoHash == "7zzzzzzzzzzz" { mentionedLocationGeoHash = "" } mentionRow := config.SocialHarvestMention{ Time: postCreatedTime, HarvestId: storyTagsMentionHarvestId, Territory: territoryName, Network: "facebook", MessageId: post.Id, ContributorId: post.From.Id, ContributorScreenName: post.From.Name, ContributorName: contributorName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLongitude: contributor.Location.Longitude, ContributorLatitude: contributor.Location.Latitude, ContributorGeohash: locationGeoHash, ContributorLang: LocaleToLanguageISO(contributor.Locale), MentionedId: mention.Id, MentionedScreenName: mention.Name, MentionedName: mentionedName, MentionedGender: mentionedGender, MentionedType: mentionedType, MentionedLongitude: mentionedContributor.Location.Longitude, MentionedLatitude: mentionedContributor.Location.Latitude, MentionedGeohash: mentionedLocationGeoHash, MentionedLang: LocaleToLanguageISO(mentionedContributor.Locale), } StoreHarvestedData(mentionRow) LogJson(mentionRow, "mentions") } } // Also try MessageTags (which exist on user and page feeds, whereas StoryTags are available on public posts search) for _, tag := range post.MessageTags { for _, mention := range tag { // Same here, the harvest id is going to have to be a little different in this case too...Otherwise, we would only get one mention per post. MessageTagsMentionHarvestId := GetHarvestMd5(post.Id + mention.Id + territoryName) // TODO: Keep an eye on this, it may add too many API requests... // TODO: this is repeated. don't repeat. var mentionedContributor = FacebookAccount{} mentionedContributor = FacebookGetUserInfo(mention.Id, params) var mentionedGender = 0 if mentionedContributor.Gender == "male" { mentionedGender = 1 } if mentionedContributor.Gender == "female" { mentionedGender = -1 } var mentionedName = mentionedContributor.Name if len(mentionedContributor.FirstName) > 0 { mentionedName = mentionedContributor.FirstName + " " + mentionedContributor.LastName } var mentionedType = "person" if len(mentionedContributor.CompanyOverview) > 0 || len(mentionedContributor.Founded) > 0 || len(mentionedContributor.Category) > 0 { mentionedType = "company" } var mentionedLocationGeoHash = geohash.Encode(mentionedContributor.Location.Latitude, mentionedContributor.Location.Longitude) // This is produced with empty lat/lng values - don't store it. if mentionedLocationGeoHash == "7zzzzzzzzzzz" { mentionedLocationGeoHash = "" } mentionRow := config.SocialHarvestMention{ Time: postCreatedTime, HarvestId: MessageTagsMentionHarvestId, Territory: territoryName, Network: "facebook", MessageId: post.Id, ContributorId: post.From.Id, ContributorScreenName: post.From.Name, ContributorName: contributorName, ContributorGender: contributorGender, ContributorType: contributorType, ContributorLongitude: contributor.Location.Longitude, ContributorLatitude: contributor.Location.Latitude, ContributorGeohash: locationGeoHash, ContributorLang: LocaleToLanguageISO(contributor.Locale), MentionedId: mention.Id, MentionedScreenName: mention.Name, MentionedName: mentionedName, MentionedGender: mentionedGender, MentionedType: mentionedType, MentionedLongitude: mentionedContributor.Location.Longitude, MentionedLatitude: mentionedContributor.Location.Latitude, MentionedGeohash: mentionedLocationGeoHash, MentionedLang: LocaleToLanguageISO(mentionedContributor.Locale), } StoreHarvestedData(mentionRow) LogJson(mentionRow, "mentions") } } } else { log.Println("Could not parse the time from the Facebook post, so I'm throwing it away!") log.Println(err) } } // return the number of items harvested return itemsHarvested, latestId, latestTime }
// Get recent Instagram for media related to specific tags on Instagram func InstagramSearch(territoryName string, harvestState config.HarvestState, tag string, options url.Values) (url.Values, config.HarvestState) { count, err := strconv.ParseUint(options.Get("count"), 10, 64) if err != nil { count = 100 } opt := &instagram.Parameters{Count: count} // If there is a starting point (pagination / pick up where last harvest left off) if options.Get("max_tag_id") != "" { opt.MinID = options.Get("min_tag_id") } media, next, err := services.instagram.Tags.RecentMedia(tag, opt) if err == nil { for _, item := range media { instagramCreatedTime := time.Unix(0, item.CreatedTime*int64(time.Second)) // Only take instagrams that have a time if err == nil && len(item.ID) > 0 { harvestState.ItemsHarvested++ // If this is the most recent tweet in the results, set it's date and id (to be returned) so we can continue where we left off in future harvests if harvestState.LastTime.IsZero() || instagramCreatedTime.Unix() > harvestState.LastTime.Unix() { harvestState.LastTime = instagramCreatedTime harvestState.LastId = item.ID } // determine gender var contributorGender = DetectGender(item.User.FullName) // Figure out type (based on if a gender could be detected, name, etc.) var contributorType = DetectContributorType(item.User.FullName, contributorGender) var contributorCountry = "" var contributorRegion = "" var contributorCity = "" var contributorCityPopulation = int32(0) var statusLongitude = 0.0 var statusLatitude = 0.0 if item.Location != nil { statusLatitude = item.Location.Latitude statusLongitude = item.Location.Longitude } // Contributor location lookup (if no lat/lng was found on the message - try to reduce number of geocode lookups) contributorLat := 0.0 contributorLng := 0.0 if statusLatitude != 0.0 && statusLatitude != 0.0 { reverseLocation := services.geocoder.ReverseGeocode(statusLatitude, statusLongitude) contributorRegion = reverseLocation.Region contributorCity = reverseLocation.City contributorCityPopulation = reverseLocation.Population contributorCountry = reverseLocation.Country // They don't provide user location of any sort, so use the status lat/lng. contributorLat = statusLatitude contributorLng = statusLongitude } // Contributor geohash var contributorLocationGeoHash = geohash.Encode(contributorLat, contributorLng) // This is produced with empty lat/lng values - don't store it. if contributorLocationGeoHash == "7zzzzzzzzzzz" { contributorLocationGeoHash = "" } // Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored). harvestId := GetHarvestMd5(item.ID + "instagram" + territoryName) // Retrieve the contributor for the "counts" info (everything else is actually already given with the media - kinda sad to even have to make this request) var contributor, contributorErr = services.instagram.Users.Get(item.User.ID) contributorFollowedByCount := 0 contributorMediaCount := 0 if contributorErr == nil { contributorFollowedByCount = contributor.Counts.FollowedBy contributorMediaCount = contributor.Counts.Media } caption := "" isQuestion := 0 if item.Caption != nil { caption = item.Caption.Text isQuestion = Btoi(IsQuestion(caption, harvestConfig.QuestionRegex)) } message := config.SocialHarvestMessage{ Time: instagramCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "instagram", ContributorId: item.User.ID, ContributorScreenName: item.User.Username, ContributorName: item.User.FullName, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorFollowers: contributorFollowedByCount, ContributorStatusesCount: contributorMediaCount, ContributorGender: contributorGender, ContributorType: contributorType, Message: caption, Sentiment: services.sentimentAnalyzer.Classify(caption), IsQuestion: isQuestion, MessageId: item.ID, LikeCount: item.Likes.Count, } // Send to the harvester observer go StoreHarvestedData(message) LogJson(message, "messages") // Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two. // Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited. keywords := GetKeywords(caption, 4, 8) if len(keywords) > 0 { for _, keyword := range keywords { if keyword != "" { keywordHarvestId := GetHarvestMd5(item.ID + "instagram" + territoryName + keyword) // Again, keyword share the same series/table/collection hashtag := config.SocialHarvestHashtag{ Time: instagramCreatedTime, HarvestId: keywordHarvestId, Territory: territoryName, Network: "instagram", MessageId: item.ID, ContributorId: item.User.ID, ContributorScreenName: item.User.Username, ContributorName: item.User.FullName, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorGender: contributorGender, ContributorType: contributorType, Keyword: keyword, } StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } // shared links (the media in Instagram's case...for data query and aggregation reasons, we aren't treating media as part of the message) // though, less confusing is Instagram's own API which provides a "link" field (and they are always also the expanded version) linkHostName := "" pUrl, _ := url.Parse(item.Link) linkHostName = pUrl.Host // This changes depending on the Type preview := "" source := "" if item.Type == "video" { preview = item.Videos.LowResolution.URL source = item.Videos.StandardResolution.URL } if item.Type == "image" { preview = item.Images.Thumbnail.URL source = item.Images.StandardResolution.URL } sharedLink := config.SocialHarvestSharedLink{ Time: instagramCreatedTime, HarvestId: harvestId, Territory: territoryName, Network: "instagram", MessageId: item.ID, ContributorId: item.User.ID, ContributorScreenName: item.User.Username, ContributorName: item.User.FullName, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorGender: contributorGender, ContributorType: contributorType, Url: item.Link, ExpandedUrl: item.Link, Host: linkHostName, Type: item.Type, Preview: preview, Source: source, } // Send to the harvester observer StoreHarvestedData(sharedLink) LogJson(sharedLink, "shared_links") // hashtags if len(item.Tags) > 0 { for _, tag := range item.Tags { if len(tag) > 0 { hashtagHarvestId := GetHarvestMd5(item.ID + "instagram" + territoryName + tag) // TODO: ADD contributor gender, contributor type hashtag := config.SocialHarvestHashtag{ Time: instagramCreatedTime, HarvestId: hashtagHarvestId, Territory: territoryName, Network: "instagram", MessageId: item.ID, ContributorId: item.User.ID, ContributorScreenName: item.User.Username, ContributorName: item.User.FullName, ContributorLongitude: contributorLng, ContributorLatitude: contributorLat, ContributorGeohash: contributorLocationGeoHash, ContributorCity: contributorCity, ContributorCityPopulation: contributorCityPopulation, ContributorRegion: contributorRegion, ContributorCountry: contributorCountry, ContributorGender: contributorGender, ContributorType: contributorType, Tag: tag, } // Send to the harvester observer StoreHarvestedData(hashtag) LogJson(hashtag, "hashtags") } } } } else { log.Println("Could not parse the time from the Instagram, so I'm throwing it away!") log.Println(err) } // Set it, but it won't be used to make requests in the future if instagramCreatedTime.Unix() > harvestState.LastTime.Unix() { harvestState.LastTime = instagramCreatedTime } } // This is where the id will come from (like Facebook) to be passed back in updated harvestState if next.NextMaxID != "" { harvestState.LastId = next.NextMaxID } // ...and always set it for the params, so the loop can get the next page (and if empty string, it should stop) options.Set("max_tag_id", next.NextMaxID) } return options, harvestState }