コード例 #1
0
ファイル: geobed.go プロジェクト: RavenB/geobed
// ReverseGeocode returns the closest known city for the given coordinates.
// Proximity is scored by the length of the shared geohash prefix between the
// query point and each stored city; ties go to the larger population. The
// zero-value GeobedCity is returned for empty coordinates or when nothing
// matches.
func (g *GeoBed) ReverseGeocode(lat float64, lng float64) GeobedCity {
	c := GeobedCity{}

	gh := geohash.Encode(lat, lng)
	// This is produced with empty lat/lng values - don't look for anything.
	if gh == "7zzzzzzzzzzz" {
		return c
	}

	// Note: All stored geohashes are 12 characters long, even if the precision
	// on the lat/lng isn't great - the geohash package will center things.
	// Obviously lat/lng like 37, -122 is a guess. That's nowhere near the
	// resolution of a city. Though we're going to allow guesses.
	mostMatched := 0
	for k := range g.c {
		// Index into the slice rather than ranging by value so each candidate
		// struct isn't copied on every iteration of this hot loop.
		v := &g.c[k]

		// Entries imported with empty lat/lng carry an empty geohash; skip
		// them - indexing into "" below would panic. Anything that isn't the
		// full 12 characters can't be prefix-scored against gh either.
		if len(v.Geohash) != len(gh) {
			continue
		}

		// Check the first two characters cheaply to prune most candidates.
		if v.Geohash[0] != gh[0] || v.Geohash[1] != gh[1] {
			continue
		}

		// Score the candidate: 2 plus one point for each additional character
		// of shared prefix. A single character scan replaces the original's
		// repeated full-prefix string comparisons; the relative ordering of
		// scores (and therefore the winner) is unchanged.
		matched := 2
		for i := 2; i < len(gh) && v.Geohash[i] == gh[i]; i++ {
			matched++
		}

		// Tie breakers go to the city with the larger population. (NOTE:
		// there's still a chance that a later pass will uncover a better
		// match.)
		if matched == mostMatched && v.Population > c.Population {
			c = *v
		}
		if matched > mostMatched {
			c = *v
			mostMatched = matched
		}
	}

	return c
}
コード例 #2
0
ファイル: geobed.go プロジェクト: RavenB/geobed
// loadDataSets unzips/unpacks the data sets and loads the data: it walks
// dataSetFiles, parses each supported data set into g.c (cities) and g.co
// (country info), then sorts the city slice and builds the first-letter
// index so later searches can limit their range.
func (g *GeoBed) loadDataSets() {
	locationDedupeIdx = make(map[string]bool)

	for _, f := range dataSetFiles {
		switch f["id"] {
		case "geonamesCities1000":
			// This one is zipped.
			g.loadGeonamesCities(f["path"])
		case "maxmindWorldCities":
			// ...And this one is gzipped (and it has a lot of dupes).
			g.loadMaxMindCities(f["path"])
		case "geonamesCountryInfo":
			// ...And this one is just plain text.
			g.loadCountryInfo(f["path"])
		}
	}

	// Sort []GeobedCity by city names to help with binary search (the City
	// field is the most searched upon field and the matching names can be
	// easily filtered down from there).
	sort.Sort(g.c)

	g.indexCityNames()
}

// loadGeonamesCities opens the zipped Geonames cities1000 archive at path and
// loads every file it contains. A failure to open the archive is fatal, as in
// the original import flow.
func (g *GeoBed) loadGeonamesCities(path string) {
	rz, err := zip.OpenReader(path)
	if err != nil {
		log.Fatal(err)
	}
	defer rz.Close()

	for _, zf := range rz.File {
		g.loadGeonamesCitiesFile(zf)
	}
}

// loadGeonamesCitiesFile parses one tab-delimited file from the Geonames
// archive, appending one GeobedCity per valid 19-field row to g.c. Being a
// separate method means the deferred Close fires per archive member instead
// of accumulating until the whole import finishes.
func (g *GeoBed) loadGeonamesCitiesFile(zf *zip.File) {
	fi, err := zf.Open()
	if err != nil {
		log.Fatal(err)
	}
	defer fi.Close()

	// Geonames uses a tab delineated format and it's not even consistent. No
	// CSV reader understands it (it's an invalid CSV, frankly), so split each
	// row on \t ourselves. strings.SplitN with a limit of 19 is equivalent to
	// the regexp split the original used, without compiling a regexp per line.
	scanner := bufio.NewScanner(fi)
	scanner.Split(bufio.ScanLines)

	for scanner.Scan() {
		fields := strings.SplitN(scanner.Text(), "\t", 19)

		// NOTE: Using a combined GeobedCity struct since not all data sets
		// have the same fields. The entire point is to geocode forward and
		// reverse; bonus information like elevation is superfluous (though it
		// may become configurable via options passed to NewGeobed()).
		if len(fields) != 19 {
			continue
		}
		lat, _ := strconv.ParseFloat(fields[4], 64)
		lng, _ := strconv.ParseFloat(fields[5], 64)
		pop, _ := strconv.Atoi(fields[14])

		gh := geohash.Encode(lat, lng)
		// This is produced with empty lat/lng values - don't store it.
		if gh == "7zzzzzzzzzzz" {
			gh = ""
		}

		var c GeobedCity
		c.City = strings.Trim(fields[1], " ")
		c.CityAlt = fields[3]
		c.Country = fields[8]
		c.Region = fields[10]
		c.Latitude = lat
		c.Longitude = lng
		c.Population = int32(pop)
		c.Geohash = gh

		// Don't include entries without a city name. If we want to geocode
		// the centers of countries and states, then we can do that faster
		// through other means.
		if len(c.City) > 0 {
			g.c = append(g.c, c)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Println(err)
	}
}

// loadMaxMindCities parses the gzipped MaxMind world-cities CSV at path,
// de-duping rows first by country+region+city and then by geohash, and
// appends the survivors to g.c. Open/decompress errors are logged and the
// data set skipped (the original logged and then panicked on a nil reader).
func (g *GeoBed) loadMaxMindCities(path string) {
	maxMindCityDedupeIdx = make(map[string][]string)

	fi, err := os.Open(path)
	if err != nil {
		log.Println(err)
		return
	}
	defer fi.Close()

	fz, err := gzip.NewReader(fi)
	if err != nil {
		log.Println(err)
		return
	}
	defer fz.Close()

	scanner := bufio.NewScanner(fz)
	scanner.Split(bufio.ScanLines)

	for scanner.Scan() {
		fields := strings.Split(scanner.Text(), ",")
		if len(fields) == 7 {
			// Key on country+region+city; later duplicate rows simply
			// overwrite earlier ones.
			idx := fields[0] + fields[3] + fields[1]
			maxMindCityDedupeIdx[idx] = fields
		}
	}
	if err := scanner.Err(); err != nil {
		log.Println(err)
	}

	// Loop the map of fields after dupes have been removed (about 1/5th
	// less... 2.6m vs 3.1m increases lookup performance).
	for _, fields := range maxMindCityDedupeIdx {
		if fields[0] == "" || fields[0] == "0" {
			continue
		}
		// Skip the CSV header row.
		if fields[2] == "AccentCity" {
			continue
		}
		pop, _ := strconv.Atoi(fields[4])
		lat, _ := strconv.ParseFloat(fields[5], 64)
		lng, _ := strconv.ParseFloat(fields[6], 64)

		// MaxMind's data set is a bit dirty. I've seen city names surrounded
		// by parenthesis in a few places.
		cn := strings.Trim(fields[2], " ")
		cn = strings.Trim(cn, "( )")

		// Don't take any city names with erroneous punctuation either.
		if strings.Contains(cn, "!") || strings.Contains(cn, "@") {
			continue
		}

		gh := geohash.Encode(lat, lng)
		// This is produced with empty lat/lng values - don't store it.
		if gh == "7zzzzzzzzzzz" {
			gh = ""
		}

		// Only keep the first city seen at a given geohash.
		if locationDedupeIdx[gh] {
			continue
		}
		locationDedupeIdx[gh] = true

		var c GeobedCity
		c.City = cn
		c.Country = toUpper(fields[0])
		c.Region = fields[3]
		c.Latitude = lat
		c.Longitude = lng
		c.Population = int32(pop)
		c.Geohash = gh

		// Don't include entries without a city name (or country). If we want
		// to geocode the centers of countries and states, then we can do that
		// faster through other means.
		if len(c.City) > 0 && len(c.Country) > 0 {
			g.c = append(g.c, c)
		}
	}

	// Clear out the temporary indexes (set to nil; they do get re-created) so
	// that Go can garbage collect them whenever it feels the need.
	maxMindCityDedupeIdx = nil
	locationDedupeIdx = nil
}

// loadCountryInfo parses the plain-text, tab-delimited Geonames country-info
// file at path, appending one CountryInfo per valid 19-field row to g.co.
func (g *GeoBed) loadCountryInfo(path string) {
	fi, err := os.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	defer fi.Close()

	scanner := bufio.NewScanner(fi)
	scanner.Split(bufio.ScanLines)

	for scanner.Scan() {
		t := scanner.Text()
		// There are a bunch of comment lines in this file starting with #.
		// Also guard against empty lines, which would make t[0] panic.
		if len(t) == 0 || t[0] == '#' {
			continue
		}
		fields := strings.SplitN(t, "\t", 19)
		if len(fields) != 19 {
			continue
		}
		if fields[0] == "" || fields[0] == "0" {
			continue
		}
		isoNumeric, _ := strconv.Atoi(fields[2])
		area, _ := strconv.Atoi(fields[6])
		pop, _ := strconv.Atoi(fields[7])
		gid, _ := strconv.Atoi(fields[16])

		var ci CountryInfo
		ci.ISO = fields[0]
		ci.ISO3 = fields[1]
		ci.ISONumeric = int16(isoNumeric)
		ci.Fips = fields[3]
		ci.Country = fields[4]
		ci.Capital = fields[5]
		ci.Area = int32(area)
		ci.Population = int32(pop)
		ci.Continent = fields[8]
		ci.Tld = fields[9]
		ci.CurrencyCode = fields[10]
		ci.CurrencyName = fields[11]
		ci.Phone = fields[12]
		ci.PostalCodeFormat = fields[13]
		ci.PostalCodeRegex = fields[14]
		ci.Languages = fields[15]
		ci.GeonameId = int32(gid)
		ci.Neighbours = fields[17]
		ci.EquivalentFipsCode = fields[18]

		g.co = append(g.co, ci)
	}
	if err := scanner.Err(); err != nil {
		log.Println(err)
	}
}

// indexCityNames records, for each lowercase first letter of a city name, the
// highest index at which it appears in the (already sorted) g.c slice. This
// lets searches limit their range so they will be faster.
func (g *GeoBed) indexCityNames() {
	cityNameIdx = make(map[string]int)
	for k, v := range g.c {
		// Get the index key for the first character of the city name. Only
		// non-empty city names are ever appended, so v.City[0] is safe.
		ik := toLower(string(v.City[0]))
		// Keep the largest slice index seen for this key.
		if val, ok := cityNameIdx[ik]; !ok || val < k {
			cityNameIdx[ik] = k
		}
	}
}