Пример #1
0
// TwitterSearch runs a Twitter search for query and normalizes every usable tweet into
// Social Harvest structs: one message row, plus rows for keywords, shared links, media,
// hashtags and mentions. Each row is handed to StoreHarvestedData and LogJson.
//
// Design notes:
//   - All "search" functions (and anything that gets data from an API) normalize the data,
//     mapping it to a Social Harvest struct. The original service payload is not exposed to
//     the caller; anyone needing it can use the underlying client package directly.
//   - This function should NOT be called concurrently for the same service. We don't want
//     to make multiple API calls in parallel (rate limits).
//   - The harvester does not keep track of harvest progress itself. Its only job is to gather
//     data, hand it off and report back how much was harvested (and the last id/time). The
//     caller can record that in the harvest logs/table.
//
// Parameters: territoryName and harvestState are always passed first; query and options are
// forwarded to the Twitter search call.
//
// Returns options as received (this function does not modify them) and harvestState with
// ItemsHarvested incremented per accepted tweet and LastTime/LastId set to the most recent
// accepted tweet. If the search call fails, the error is logged and the inputs are returned
// unchanged.
func TwitterSearch(territoryName string, harvestState config.HarvestState, query string, options url.Values) (url.Values, config.HarvestState) {
	searchResults, searchErr := services.twitter.GetSearch(query, options)
	if searchErr != nil {
		// Report the failure instead of silently discarding it; there is nothing to harvest.
		log.Println(searchErr)
		return options, harvestState
	}

	// The cool thing about Twitter's API is that we have all the user data we need already. So we make fewer HTTP requests than when using Facebook's API.
	for _, tweet := range searchResults.Statuses {
		tweetCreatedTime, err := time.Parse(time.RubyDate, tweet.CreatedAt)
		// Only take tweets that have a parseable time and an ID.
		if err != nil || len(tweet.IdStr) == 0 {
			log.Println("Could not parse the time from the Tweet, so I'm throwing it away!")
			log.Println(err)
			continue
		}

		harvestState.ItemsHarvested++
		// If this is the most recent tweet in the results, set its date and id (to be returned) so we can continue where we left off in future harvests.
		if harvestState.LastTime.IsZero() || tweetCreatedTime.Unix() > harvestState.LastTime.Unix() {
			harvestState.LastTime = tweetCreatedTime
			harvestState.LastId = tweet.IdStr
		}

		// Determine gender from the display name.
		contributorGender := DetectGender(tweet.User.Name)

		// TODO: figure out type somehow...
		contributorType := DetectContributorType(tweet.User.Name, contributorGender)

		// While `tweet.Place.CountryCode` exists, city and state/region don't. So that doesn't help much. It could be used as supporting information sent to the geocoder...
		// But if the `tweet.User.Location` string already has a country code or name in there then we don't want to confuse it.
		contributorCountry := ""
		contributorRegion := ""
		contributorCity := ""
		contributorCityPopulation := int32(0)

		// Coordinates attached to the status itself; they stay 0.0 when the tweet has none.
		var statusLongitude, statusLatitude float64
		if lng, errLng := tweet.Longitude(); errLng == nil {
			statusLongitude = lng
		}
		if lat, errLat := tweet.Latitude(); errLat == nil {
			statusLatitude = lat
		}

		// Contributor location lookup (only geocode the profile location if no lat/lng was found on the message - try to reduce number of geocode lookups).
		var contributorLat, contributorLng float64
		if statusLatitude == 0.0 || statusLongitude == 0.0 {
			// Do not make a request for nothing (there are no 1 character locations either).
			if len(tweet.User.Location) > 1 {
				location := services.geocoder.Geocode(tweet.User.Location)
				contributorLat = location.Latitude
				contributorLng = location.Longitude
				contributorRegion = location.Region
				contributorCity = location.City
				contributorCityPopulation = location.Population
				contributorCountry = location.Country
			}
		} else {
			reverseLocation := services.geocoder.ReverseGeocode(statusLatitude, statusLongitude)
			contributorRegion = reverseLocation.Region
			contributorCity = reverseLocation.City
			contributorCityPopulation = reverseLocation.Population
			contributorCountry = reverseLocation.Country

			// Keep the status coordinates as-is rather than the geocoder's (might change accuracy, etc.).
			contributorLat = statusLatitude
			contributorLng = statusLongitude
		}

		// Contributor geohash
		contributorLocationGeoHash := geohash.Encode(contributorLat, contributorLng)
		// This is produced with empty lat/lng values - don't store it.
		if contributorLocationGeoHash == "7zzzzzzzzzzz" {
			contributorLocationGeoHash = ""
		}

		// Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored).
		harvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName)

		message := config.SocialHarvestMessage{
			Time:                      tweetCreatedTime,
			HarvestId:                 harvestId,
			Territory:                 territoryName,
			Network:                   "twitter",
			ContributorId:             tweet.User.IdStr,
			ContributorScreenName:     tweet.User.ScreenName,
			ContributorName:           tweet.User.Name,
			ContributorLang:           tweet.User.Lang,
			ContributorLongitude:      contributorLng,
			ContributorLatitude:       contributorLat,
			ContributorGeohash:        contributorLocationGeoHash,
			ContributorCity:           contributorCity,
			ContributorCityPopulation: contributorCityPopulation,
			ContributorRegion:         contributorRegion,
			ContributorCountry:        contributorCountry,
			ContributorVerified:       Btoi(tweet.User.Verified),
			ContributorFollowers:      tweet.User.FollowersCount,
			ContributorStatusesCount:  int(tweet.User.StatusesCount),
			ContributorGender:         contributorGender,
			ContributorType:           contributorType,
			Message:                   tweet.Text,
			Sentiment:                 services.sentimentAnalyzer.Classify(tweet.Text),
			IsQuestion:                Btoi(IsQuestion(tweet.Text, harvestConfig.QuestionRegex)),
			MessageId:                 tweet.IdStr,
			TwitterRetweetCount:       tweet.RetweetCount,
			TwitterFavoriteCount:      tweet.FavoriteCount,
		}
		// NOTE(review): this is the only store call in this function dispatched in its own
		// goroutine; every other row below is stored synchronously. Confirm that is intended.
		go StoreHarvestedData(message)
		LogJson(message, "messages")

		// Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two.
		// Keywords are found across every network, whereas hashtags are only found on a few.
		// Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited.
		for _, keyword := range GetKeywords(tweet.Text, 4, 8) {
			if keyword == "" {
				continue
			}
			keywordHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + keyword)

			// Again, keywords share the same series/table/collection as hashtags.
			hashtag := config.SocialHarvestHashtag{
				Time:                      tweetCreatedTime,
				HarvestId:                 keywordHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Keyword:                   keyword,
			}
			StoreHarvestedData(hashtag)
			LogJson(hashtag, "hashtags")
		}

		// shared links
		for _, link := range tweet.Entities.Urls {
			if len(link.Url) == 0 {
				continue
			}
			// Shared link harvest id has to be different because otherwise only one would be stored
			sharedLinkHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + link.Expanded_url)

			// url.Parse returns a nil *URL on failure, so only read Host when parsing succeeded.
			linkHostName := ""
			if pUrl, pErr := url.Parse(link.Url); pErr == nil {
				linkHostName = pUrl.Host
			}

			sharedLink := config.SocialHarvestSharedLink{
				Time:                      tweetCreatedTime,
				HarvestId:                 sharedLinkHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Url:                       link.Url,
				ExpandedUrl:               link.Expanded_url,
				Host:                      linkHostName,
			}
			StoreHarvestedData(sharedLink)
			LogJson(sharedLink, "shared_links")
		}

		// more shared links (media entities)
		for _, media := range tweet.Entities.Media {
			if len(media.Url) == 0 {
				continue
			}
			sharedMediaHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + media.Expanded_url)

			// url.Parse returns a nil *URL on failure, so only read Host when parsing succeeded.
			mediaHostName := ""
			if pUrl, pErr := url.Parse(media.Url); pErr == nil {
				mediaHostName = pUrl.Host
			}

			sharedMedia := config.SocialHarvestSharedLink{
				Time:                      tweetCreatedTime,
				HarvestId:                 sharedMediaHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Url:                       media.Url,
				ExpandedUrl:               media.Expanded_url,
				Host:                      mediaHostName,
				Type:                      media.Type,
				Source:                    media.Media_url,
			}
			StoreHarvestedData(sharedMedia)
			LogJson(sharedMedia, "shared_links")
		}

		// hashtags
		for _, tag := range tweet.Entities.Hashtags {
			if len(tag.Text) == 0 {
				continue
			}
			hashtagHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + tag.Text)

			hashtag := config.SocialHarvestHashtag{
				Time:                      tweetCreatedTime,
				HarvestId:                 hashtagHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Tag:                       tag.Text,
			}
			StoreHarvestedData(hashtag)
			LogJson(hashtag, "hashtags")
		}

		// mentions
		for _, mentionedUser := range tweet.Entities.User_mentions {
			if len(mentionedUser.Id_str) == 0 {
				continue
			}
			mentionHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + mentionedUser.Id_str)

			// TODO: add mentioned user info? (another api request)
			mention := config.SocialHarvestMention{
				Time:                  tweetCreatedTime,
				HarvestId:             mentionHarvestId,
				Territory:             territoryName,
				Network:               "twitter",
				MessageId:             tweet.IdStr,
				ContributorId:         tweet.User.IdStr,
				ContributorScreenName: tweet.User.ScreenName,
				ContributorName:       tweet.User.Name,
				ContributorLang:       tweet.User.Lang,
				ContributorType:       contributorType,
				ContributorGender:     contributorGender,
				ContributorLongitude:  contributorLng,
				ContributorLatitude:   contributorLat,
				ContributorGeohash:    contributorLocationGeoHash,

				MentionedId:         mentionedUser.Id_str,
				MentionedScreenName: mentionedUser.Screen_name,
				MentionedName:       mentionedUser.Name,
			}
			StoreHarvestedData(mention)
			LogJson(mention, "mentions")
		}
	}

	return options, harvestState
}
Пример #2
0
// GooglePlusActivitySearch gets Google+ activities (posts) by searching for a keyword and
// maps each usable activity to Social Harvest structs: one message row, plus rows for
// keywords and attachments (stored as shared links). Each row is handed to
// StoreHarvestedData and LogJson.
//
// options is read for "count" (page size, capped at 20) and "nextPageToken" (continue a
// previous page). On a successful search, "nextPageToken" in options is overwritten with the
// token returned by the API so the caller can request the following page.
//
// Returns options and harvestState with ItemsHarvested incremented per accepted activity and
// LastTime/LastId set to the most recent accepted activity. If the search fails, the error is
// logged and the inputs are returned as they were. If looking up an activity's author fails,
// the error is logged and the function returns immediately with whatever was harvested so far.
func GooglePlusActivitySearch(territoryName string, harvestState config.HarvestState, query string, options url.Values) (url.Values, config.HarvestState) {
	// Page size is capped at 20; a missing, unparseable or non-positive count falls back to the cap.
	limit, lErr := strconv.ParseInt(options.Get("count"), 10, 64)
	if lErr != nil || limit < 1 || limit > 20 {
		limit = 20
	}
	// If there's a next page token, it'll be used to continue to the next page for this harvest
	nextPageToken := options.Get("nextPageToken")

	activities, searchErr := services.googlePlus.Activities.Search(query).MaxResults(limit).PageToken(nextPageToken).Do()
	if searchErr != nil {
		log.Println(searchErr)
		return options, harvestState
	}

	// Passed back to whatever called this function, so it can continue with the next page.
	options.Set("nextPageToken", activities.NextPageToken)

	for _, item := range activities.Items {
		itemCreatedTime, timeErr := time.Parse(time.RFC3339, item.Published)
		// Only take activities that have a parseable time and an id.
		if timeErr != nil || len(item.Id) == 0 {
			log.Println("Could not parse the time (or find an id) for the Google+ activity, so I'm throwing it away!")
			log.Println(timeErr)
			continue
		}

		harvestState.ItemsHarvested++
		// If this is the most recent activity in the results, set its date and id (to be returned) so we can continue where we left off in future harvests.
		if harvestState.LastTime.IsZero() || itemCreatedTime.Unix() > harvestState.LastTime.Unix() {
			harvestState.LastTime = itemCreatedTime
			harvestState.LastId = item.Id
		}

		// Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored).
		harvestId := GetHarvestMd5(item.Id + "googlePlus" + territoryName)

		// Contributor (who created the message).
		// NOTE: This is synchronous on purpose; there are rate limits on these APIs.
		// The contributor info also tells us a few things about the message, such as locale. Other series use this data.
		contributor, peopleErr := services.googlePlus.People.Get(item.Actor.Id).Do()
		if peopleErr != nil {
			log.Println(peopleErr)
			return options, harvestState
		}

		// Gender encoding used here: 1 = male, -1 = female, 0 = anything else.
		contributorGender := 0
		switch contributor.Gender {
		case "male":
			contributorGender = 1
		case "female":
			contributorGender = -1
		}
		contributorType := DetectContributorType(item.Actor.DisplayName, contributorGender)
		contributorLanguage := LocaleToLanguageISO(contributor.Language)

		// Reverse geocode to get city, state, country, etc. - only when the activity carries a non-zero position.
		var itemLat, itemLng float64
		contributorCountry := ""
		contributorRegion := ""
		contributorCity := ""
		contributorCityPopulation := int32(0)
		if item.Location != nil && item.Location.Position != nil {
			position := item.Location.Position
			if position.Latitude != 0.0 && position.Longitude != 0.0 {
				itemLat = position.Latitude
				itemLng = position.Longitude
				reverseLocation := services.geocoder.ReverseGeocode(position.Latitude, position.Longitude)
				contributorRegion = reverseLocation.Region
				contributorCity = reverseLocation.City
				contributorCityPopulation = reverseLocation.Population
				contributorCountry = reverseLocation.Country
			}
		}

		// Geohash
		locationGeoHash := geohash.Encode(itemLat, itemLng)
		// This is produced with empty lat/lng values - don't store it.
		if locationGeoHash == "7zzzzzzzzzzz" {
			locationGeoHash = ""
		}

		// message row
		messageRow := config.SocialHarvestMessage{
			Time:                      itemCreatedTime,
			HarvestId:                 harvestId,
			Territory:                 territoryName,
			Network:                   "googlePlus",
			MessageId:                 item.Id,
			ContributorId:             item.Actor.Id,
			ContributorScreenName:     item.Actor.DisplayName,
			ContributorName:           item.Actor.DisplayName,
			ContributorGender:         contributorGender,
			ContributorType:           contributorType,
			ContributorLang:           contributorLanguage,
			ContributorLongitude:      itemLng,
			ContributorLatitude:       itemLat,
			ContributorGeohash:        locationGeoHash,
			ContributorCity:           contributorCity,
			ContributorCityPopulation: contributorCityPopulation,
			ContributorRegion:         contributorRegion,
			ContributorCountry:        contributorCountry,
			Message:                   item.Object.Content,
			Sentiment:                 services.sentimentAnalyzer.Classify(item.Object.Content),
			IsQuestion:                Btoi(IsQuestion(item.Object.OriginalContent, harvestConfig.QuestionRegex)),
			GooglePlusReshares:        item.Object.Resharers.TotalItems,
			GooglePlusOnes:            item.Object.Plusoners.TotalItems,
		}
		StoreHarvestedData(messageRow)
		LogJson(messageRow, "messages")

		// Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two.
		// Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited.
		for _, keyword := range GetKeywords(item.Object.OriginalContent, 4, 8) {
			if keyword == "" {
				continue
			}
			keywordHarvestId := GetHarvestMd5(item.Id + "googlePlus" + territoryName + keyword)

			// Again, keywords share the same series/table/collection as hashtags.
			hashtag := config.SocialHarvestHashtag{
				Time:                      itemCreatedTime,
				HarvestId:                 keywordHarvestId,
				Territory:                 territoryName,
				Network:                   "googlePlus",
				MessageId:                 item.Id,
				ContributorId:             item.Actor.Id,
				ContributorScreenName:     item.Actor.DisplayName,
				ContributorName:           item.Actor.DisplayName,
				ContributorGender:         contributorGender,
				ContributorType:           contributorType,
				ContributorLang:           contributorLanguage,
				ContributorLongitude:      itemLng,
				ContributorLatitude:       itemLat,
				ContributorGeohash:        locationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Keyword:                   keyword,
			}
			StoreHarvestedData(hashtag)
			LogJson(hashtag, "hashtags")
		}

		// Attachments are stored as shared links.
		for _, attachment := range item.Object.Attachments {
			// url.Parse returns a nil *URL on failure, so only read Host when parsing succeeded.
			hostName := ""
			if len(attachment.Url) > 0 {
				if pUrl, pErr := url.Parse(attachment.Url); pErr == nil {
					hostName = pUrl.Host
				}
			}

			previewImg := ""
			if attachment.Image != nil {
				previewImg = attachment.Image.Url
			}
			fullImg := ""
			if attachment.FullImage != nil {
				fullImg = attachment.FullImage.Url
			}

			// NOTE(review): every attachment reuses the message's harvestId. If the shared links
			// store enforces a unique harvest id, only one attachment per activity would be kept.
			// Left unchanged because altering the id scheme affects already-persisted rows - confirm.
			sharedLinksRow := config.SocialHarvestSharedLink{
				Time:                      itemCreatedTime,
				HarvestId:                 harvestId,
				Territory:                 territoryName,
				Network:                   "googlePlus",
				MessageId:                 item.Id,
				ContributorId:             item.Actor.Id,
				ContributorScreenName:     item.Actor.DisplayName,
				ContributorName:           item.Actor.DisplayName,
				ContributorGender:         contributorGender,
				ContributorType:           contributorType,
				ContributorLang:           contributorLanguage,
				ContributorLongitude:      itemLng,
				ContributorLatitude:       itemLat,
				ContributorGeohash:        locationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Type:                      attachment.ObjectType,
				Preview:                   previewImg,
				Source:                    fullImg,
				Url:                       attachment.Url,
				ExpandedUrl:               ExpandUrl(attachment.Url),
				Host:                      hostName,
			}
			StoreHarvestedData(sharedLinksRow)
			LogJson(sharedLinksRow, "shared_links")
		}
	}

	return options, harvestState
}
Пример #3
0
// TwitterAccountStream harvests from a specific Twitter account's timeline. options is
// forwarded to the user-timeline API call. Every usable tweet is mapped to Social Harvest
// structs: one message row, plus rows for shared links, media, hashtags and mentions. Each
// row is handed to StoreHarvestedData and LogJson.
//
// Returns options as received (this function does not modify them) and harvestState with
// ItemsHarvested incremented per accepted tweet and LastTime/LastId set to the most recent
// accepted tweet. If the timeline call fails, the error is logged and the inputs are returned
// unchanged.
func TwitterAccountStream(territoryName string, harvestState config.HarvestState, options url.Values) (url.Values, config.HarvestState) {
	timeline, timelineErr := services.twitter.GetUserTimeline(options)
	if timelineErr != nil {
		// Report the failure instead of silently discarding it; there is nothing to harvest.
		log.Println(timelineErr)
		return options, harvestState
	}

	// The cool thing about Twitter's API is that we have all the user data we need already. So we make fewer HTTP requests than when using Facebook's API.
	for _, tweet := range timeline {
		tweetCreatedTime, err := time.Parse(time.RubyDate, tweet.CreatedAt)
		// Only take tweets that have a parseable time and an ID.
		if err != nil || len(tweet.IdStr) == 0 {
			log.Println("Could not parse the time from the Tweet, so I'm throwing it away!")
			log.Println(err)
			continue
		}

		harvestState.ItemsHarvested++
		// If this is the most recent tweet in the results, set its date and id (to be returned) so we can continue where we left off in future harvests.
		if harvestState.LastTime.IsZero() || tweetCreatedTime.Unix() > harvestState.LastTime.Unix() {
			harvestState.LastTime = tweetCreatedTime
			harvestState.LastId = tweet.IdStr
		}

		// Determine gender from the display name.
		contributorGender := DetectGender(tweet.User.Name)

		// TODO: figure out type somehow...
		contributorType := DetectContributorType(tweet.User.Name, contributorGender)

		contributorCountry := ""
		contributorRegion := ""
		contributorCity := ""
		contributorCityPopulation := int32(0)

		// Coordinates attached to the status itself; they stay 0.0 when the tweet has none.
		var statusLongitude, statusLatitude float64
		if lng, errLng := tweet.Longitude(); errLng == nil {
			statusLongitude = lng
		}
		if lat, errLat := tweet.Latitude(); errLat == nil {
			statusLatitude = lat
		}

		// Contributor location lookup (only geocode the profile location if no lat/lng was found on the message - try to reduce number of geocode lookups).
		var contributorLat, contributorLng float64
		if statusLatitude == 0.0 || statusLongitude == 0.0 {
			// Do not make a request for nothing (there are no 1 character locations either).
			if len(tweet.User.Location) > 1 {
				location := services.geocoder.Geocode(tweet.User.Location)
				contributorLat = location.Latitude
				contributorLng = location.Longitude
				contributorRegion = location.Region
				contributorCity = location.City
				contributorCityPopulation = location.Population
				contributorCountry = location.Country
			}
		} else {
			reverseLocation := services.geocoder.ReverseGeocode(statusLatitude, statusLongitude)
			contributorRegion = reverseLocation.Region
			contributorCity = reverseLocation.City
			contributorCityPopulation = reverseLocation.Population
			contributorCountry = reverseLocation.Country

			// Keep the status coordinates as-is rather than the geocoder's (might change accuracy, etc.).
			contributorLat = statusLatitude
			contributorLng = statusLongitude
		}

		// Contributor geohash
		contributorLocationGeoHash := geohash.Encode(contributorLat, contributorLng)
		// This is produced with empty lat/lng values - don't store it.
		if contributorLocationGeoHash == "7zzzzzzzzzzz" {
			contributorLocationGeoHash = ""
		}

		// Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored).
		harvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName)

		// NOTE(review): unlike the search harvest for Twitter, no Sentiment is set on this
		// message - confirm whether that omission is intentional.
		message := config.SocialHarvestMessage{
			Time:                      tweetCreatedTime,
			HarvestId:                 harvestId,
			Territory:                 territoryName,
			Network:                   "twitter",
			ContributorId:             tweet.User.IdStr,
			ContributorScreenName:     tweet.User.ScreenName,
			ContributorName:           tweet.User.Name,
			ContributorLang:           tweet.User.Lang,
			ContributorLongitude:      contributorLng,
			ContributorLatitude:       contributorLat,
			ContributorGeohash:        contributorLocationGeoHash,
			ContributorCity:           contributorCity,
			ContributorCityPopulation: contributorCityPopulation,
			ContributorRegion:         contributorRegion,
			ContributorCountry:        contributorCountry,
			ContributorVerified:       Btoi(tweet.User.Verified),
			ContributorFollowers:      tweet.User.FollowersCount,
			ContributorStatusesCount:  int(tweet.User.StatusesCount),
			ContributorGender:         contributorGender,
			ContributorType:           contributorType,
			Message:                   tweet.Text,
			IsQuestion:                Btoi(IsQuestion(tweet.Text, harvestConfig.QuestionRegex)),
			MessageId:                 tweet.IdStr,
			TwitterRetweetCount:       tweet.RetweetCount,
			TwitterFavoriteCount:      tweet.FavoriteCount,
		}
		StoreHarvestedData(message)
		LogJson(message, "messages")

		// shared links
		for _, link := range tweet.Entities.Urls {
			if len(link.Url) == 0 {
				continue
			}
			// Shared link harvest id has to be different because otherwise only one would be stored
			sharedLinkHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + link.Expanded_url)

			// url.Parse returns a nil *URL on failure, so only read Host when parsing succeeded.
			linkHostName := ""
			if pUrl, pErr := url.Parse(link.Url); pErr == nil {
				linkHostName = pUrl.Host
			}

			sharedLink := config.SocialHarvestSharedLink{
				Time:                      tweetCreatedTime,
				HarvestId:                 sharedLinkHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Url:                       link.Url,
				ExpandedUrl:               link.Expanded_url,
				Host:                      linkHostName,
			}
			StoreHarvestedData(sharedLink)
			LogJson(sharedLink, "shared_links")
		}

		// more shared links (media entities)
		for _, media := range tweet.Entities.Media {
			if len(media.Url) == 0 {
				continue
			}
			sharedMediaHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + media.Expanded_url)

			// url.Parse returns a nil *URL on failure, so only read Host when parsing succeeded.
			mediaHostName := ""
			if pUrl, pErr := url.Parse(media.Url); pErr == nil {
				mediaHostName = pUrl.Host
			}

			sharedMedia := config.SocialHarvestSharedLink{
				Time:                      tweetCreatedTime,
				HarvestId:                 sharedMediaHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Url:                       media.Url,
				ExpandedUrl:               media.Expanded_url,
				Host:                      mediaHostName,
				Type:                      media.Type,
				Source:                    media.Media_url,
			}
			StoreHarvestedData(sharedMedia)
			LogJson(sharedMedia, "shared_links")
		}

		// hashtags
		for _, tag := range tweet.Entities.Hashtags {
			if len(tag.Text) == 0 {
				continue
			}
			hashtagHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + tag.Text)

			hashtag := config.SocialHarvestHashtag{
				Time:                      tweetCreatedTime,
				HarvestId:                 hashtagHarvestId,
				Territory:                 territoryName,
				Network:                   "twitter",
				MessageId:                 tweet.IdStr,
				ContributorId:             tweet.User.IdStr,
				ContributorScreenName:     tweet.User.ScreenName,
				ContributorName:           tweet.User.Name,
				ContributorLang:           tweet.User.Lang,
				ContributorType:           contributorType,
				ContributorGender:         contributorGender,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				Tag:                       tag.Text,
			}
			StoreHarvestedData(hashtag)
			LogJson(hashtag, "hashtags")
		}

		// mentions
		for _, mentionedUser := range tweet.Entities.User_mentions {
			if len(mentionedUser.Id_str) == 0 {
				continue
			}
			mentionHarvestId := GetHarvestMd5(tweet.IdStr + "twitter" + territoryName + mentionedUser.Id_str)

			// TODO: add mentioned user info? (another api request)
			mention := config.SocialHarvestMention{
				Time:                  tweetCreatedTime,
				HarvestId:             mentionHarvestId,
				Territory:             territoryName,
				Network:               "twitter",
				MessageId:             tweet.IdStr,
				ContributorId:         tweet.User.IdStr,
				ContributorScreenName: tweet.User.ScreenName,
				ContributorName:       tweet.User.Name,
				ContributorLang:       tweet.User.Lang,
				ContributorType:       contributorType,
				ContributorGender:     contributorGender,
				ContributorLongitude:  contributorLng,
				ContributorLatitude:   contributorLat,
				ContributorGeohash:    contributorLocationGeoHash,

				MentionedId:         mentionedUser.Id_str,
				MentionedScreenName: mentionedUser.Screen_name,
				MentionedName:       mentionedUser.Name,
			}
			StoreHarvestedData(mention)
			LogJson(mention, "mentions")
		}
	}

	return options, harvestState
}
Пример #4
0
// InstagramSearch harvests recent Instagram media for a tag, normalizes each item into
// Social Harvest structs (message, keywords, shared link, hashtags) and sends them to the
// harvester observer via StoreHarvestedData (each is also written out with LogJson).
//
// territoryName is stored on every harvested record and is part of each harvest id.
// harvestState is the running position in the harvest; its ItemsHarvested, LastTime and LastId
// fields are updated and the new state is returned.
// tag is the Instagram tag to look up.
// options carries "count" (defaults to 100 when missing or invalid) and "max_tag_id" (the
// pagination cursor written by a previous call).
//
// It returns options with "max_tag_id" set to the cursor for the next page (an empty string when
// there is none) along with the updated harvest state. If the API request fails, the error is
// logged and both values are returned unchanged.
func InstagramSearch(territoryName string, harvestState config.HarvestState, tag string, options url.Values) (url.Values, config.HarvestState) {
	// Fall back to 100 items when "count" is absent or not a valid unsigned integer.
	count, err := strconv.ParseUint(options.Get("count"), 10, 64)
	if err != nil {
		count = 100
	}
	opt := &instagram.Parameters{Count: count}

	// If there is a starting point (pagination / pick up where last harvest left off), pass along
	// the same cursor that is written back into options at the end of this function.
	if maxTagID := options.Get("max_tag_id"); maxTagID != "" {
		opt.MaxID = maxTagID
	}

	media, next, err := services.instagram.Tags.RecentMedia(tag, opt)
	if err != nil {
		// Nothing was harvested; report the failure and hand back the state untouched.
		log.Println(err)
		return options, harvestState
	}

	for _, item := range media {
		// CreatedTime is a Unix timestamp in seconds.
		instagramCreatedTime := time.Unix(item.CreatedTime, 0)

		// Only take media that has an id (it is needed for the harvest ids and de-duplication).
		if len(item.ID) > 0 {
			harvestState.ItemsHarvested++
			// If this is the most recent media in the results, set its date and id (to be returned) so we can continue where we left off in future harvests
			if harvestState.LastTime.IsZero() || instagramCreatedTime.Unix() > harvestState.LastTime.Unix() {
				harvestState.LastTime = instagramCreatedTime
				harvestState.LastId = item.ID
			}

			// determine gender
			contributorGender := DetectGender(item.User.FullName)

			// Figure out type (based on if a gender could be detected, name, etc.)
			contributorType := DetectContributorType(item.User.FullName, contributorGender)

			contributorCountry := ""
			contributorRegion := ""
			contributorCity := ""
			contributorCityPopulation := int32(0)

			statusLongitude := 0.0
			statusLatitude := 0.0
			if item.Location != nil {
				statusLatitude = item.Location.Latitude
				statusLongitude = item.Location.Longitude
			}

			// Contributor location lookup. Only geocode when the media actually carries coordinates;
			// a location is treated as missing only when BOTH values are zero, so points on the
			// equator or the prime meridian are still used.
			contributorLat := 0.0
			contributorLng := 0.0
			if statusLatitude != 0.0 || statusLongitude != 0.0 {
				reverseLocation := services.geocoder.ReverseGeocode(statusLatitude, statusLongitude)
				contributorRegion = reverseLocation.Region
				contributorCity = reverseLocation.City
				contributorCityPopulation = reverseLocation.Population
				contributorCountry = reverseLocation.Country

				// They don't provide user location of any sort, so use the status lat/lng.
				contributorLat = statusLatitude
				contributorLng = statusLongitude
			}

			// Contributor geohash
			contributorLocationGeoHash := geohash.Encode(contributorLat, contributorLng)
			// This is produced with empty lat/lng values - don't store it.
			if contributorLocationGeoHash == "7zzzzzzzzzzz" {
				contributorLocationGeoHash = ""
			}

			// Generate a harvest_id to avoid potential dupes (a unique index is placed on this field and all insert errors ignored).
			harvestId := GetHarvestMd5(item.ID + "instagram" + territoryName)

			// Retrieve the contributor for the "counts" info (everything else is actually already given with the media - kinda sad to even have to make this request)
			// A failed lookup is not fatal: the counts simply stay at zero.
			contributor, contributorErr := services.instagram.Users.Get(item.User.ID)
			contributorFollowedByCount := 0
			contributorMediaCount := 0
			if contributorErr == nil {
				contributorFollowedByCount = contributor.Counts.FollowedBy
				contributorMediaCount = contributor.Counts.Media
			}

			// Media without a caption is stored with an empty message and is never a question.
			caption := ""
			isQuestion := 0
			if item.Caption != nil {
				caption = item.Caption.Text
				isQuestion = Btoi(IsQuestion(caption, harvestConfig.QuestionRegex))
			}

			message := config.SocialHarvestMessage{
				Time:                      instagramCreatedTime,
				HarvestId:                 harvestId,
				Territory:                 territoryName,
				Network:                   "instagram",
				ContributorId:             item.User.ID,
				ContributorScreenName:     item.User.Username,
				ContributorName:           item.User.FullName,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				ContributorFollowers:      contributorFollowedByCount,
				ContributorStatusesCount:  contributorMediaCount,
				ContributorGender:         contributorGender,
				ContributorType:           contributorType,
				Message:                   caption,
				Sentiment:                 services.sentimentAnalyzer.Classify(caption),
				IsQuestion:                isQuestion,
				MessageId:                 item.ID,
				LikeCount:                 item.Likes.Count,
			}
			// Send to the harvester observer (synchronously, like every other record type here,
			// so no untracked goroutine outlives this call).
			StoreHarvestedData(message)
			LogJson(message, "messages")

			// Keywords are stored on the same collection as hashtags - but under a `keyword` field instead of `tag` field as to not confuse the two.
			// Limit to words 4 characters or more and only return 8 keywords. This could greatly increase the database size if not limited.
			for _, keyword := range GetKeywords(caption, 4, 8) {
				if keyword == "" {
					continue
				}
				keywordHarvestId := GetHarvestMd5(item.ID + "instagram" + territoryName + keyword)

				// Again, keyword share the same series/table/collection
				hashtag := config.SocialHarvestHashtag{
					Time:                      instagramCreatedTime,
					HarvestId:                 keywordHarvestId,
					Territory:                 territoryName,
					Network:                   "instagram",
					MessageId:                 item.ID,
					ContributorId:             item.User.ID,
					ContributorScreenName:     item.User.Username,
					ContributorName:           item.User.FullName,
					ContributorLongitude:      contributorLng,
					ContributorLatitude:       contributorLat,
					ContributorGeohash:        contributorLocationGeoHash,
					ContributorCity:           contributorCity,
					ContributorCityPopulation: contributorCityPopulation,
					ContributorRegion:         contributorRegion,
					ContributorCountry:        contributorCountry,
					ContributorGender:         contributorGender,
					ContributorType:           contributorType,
					Keyword:                   keyword,
				}
				StoreHarvestedData(hashtag)
				LogJson(hashtag, "hashtags")
			}

			// shared links (the media in Instagram's case...for data query and aggregation reasons, we aren't treating media as part of the message)
			// though, less confusing is Instagram's own API which provides a "link" field (and they are always also the expanded version)
			// url.Parse returns a nil *url.URL on failure, so only read the host when parsing succeeded.
			linkHostName := ""
			if parsedLink, parseErr := url.Parse(item.Link); parseErr == nil {
				linkHostName = parsedLink.Host
			}

			// This changes depending on the Type
			preview := ""
			source := ""
			switch item.Type {
			case "video":
				preview = item.Videos.LowResolution.URL
				source = item.Videos.StandardResolution.URL
			case "image":
				preview = item.Images.Thumbnail.URL
				source = item.Images.StandardResolution.URL
			}

			sharedLink := config.SocialHarvestSharedLink{
				Time:                      instagramCreatedTime,
				HarvestId:                 harvestId,
				Territory:                 territoryName,
				Network:                   "instagram",
				MessageId:                 item.ID,
				ContributorId:             item.User.ID,
				ContributorScreenName:     item.User.Username,
				ContributorName:           item.User.FullName,
				ContributorLongitude:      contributorLng,
				ContributorLatitude:       contributorLat,
				ContributorGeohash:        contributorLocationGeoHash,
				ContributorCity:           contributorCity,
				ContributorCityPopulation: contributorCityPopulation,
				ContributorRegion:         contributorRegion,
				ContributorCountry:        contributorCountry,
				ContributorGender:         contributorGender,
				ContributorType:           contributorType,
				Url:                       item.Link,
				ExpandedUrl:               item.Link,
				Host:                      linkHostName,
				Type:                      item.Type,
				Preview:                   preview,
				Source:                    source,
			}
			// Send to the harvester observer
			StoreHarvestedData(sharedLink)
			LogJson(sharedLink, "shared_links")

			// hashtags
			for _, tag := range item.Tags {
				if len(tag) == 0 {
					continue
				}
				hashtagHarvestId := GetHarvestMd5(item.ID + "instagram" + territoryName + tag)

				hashtag := config.SocialHarvestHashtag{
					Time:                      instagramCreatedTime,
					HarvestId:                 hashtagHarvestId,
					Territory:                 territoryName,
					Network:                   "instagram",
					MessageId:                 item.ID,
					ContributorId:             item.User.ID,
					ContributorScreenName:     item.User.Username,
					ContributorName:           item.User.FullName,
					ContributorLongitude:      contributorLng,
					ContributorLatitude:       contributorLat,
					ContributorGeohash:        contributorLocationGeoHash,
					ContributorCity:           contributorCity,
					ContributorCityPopulation: contributorCityPopulation,
					ContributorRegion:         contributorRegion,
					ContributorCountry:        contributorCountry,
					ContributorGender:         contributorGender,
					ContributorType:           contributorType,
					Tag:                       tag,
				}
				// Send to the harvester observer
				StoreHarvestedData(hashtag)
				LogJson(hashtag, "hashtags")
			}

		} else {
			// The only way to get here is a media item without an id.
			log.Println("Instagram media item has no ID, so I'm throwing it away!")
		}

		// Set it, but it won't be used to make requests in the future
		// (this also covers items that were skipped above for lacking an id).
		if instagramCreatedTime.Unix() > harvestState.LastTime.Unix() {
			harvestState.LastTime = instagramCreatedTime
		}
	}

	// This is where the id will come from (like Facebook) to be passed back in updated harvestState
	if next.NextMaxID != "" {
		harvestState.LastId = next.NextMaxID
	}
	// ...and always set it for the params, so the loop can get the next page (and if empty string, it should stop)
	options.Set("max_tag_id", next.NextMaxID)

	return options, harvestState
}