func _ahocorasick() string { patterns := []string{ "mercury", "venus", "earth", "mars", "jupiter", "saturn", "uranus", "pluto", } m := ahocorasick.NewStringMatcher(patterns) found := m.Match([]byte(`earth`)) return fmt.Sprintln("found patterns", found) }
func getAffiliateLink(link models.Link) string { // Affiliate Window if !(len(ahocorasick.NewStringMatcher(affwinDomainParts).Match([]byte(strings.ToLower(link.Domain)))) == 0) { m := affWinLink{Link: link} if ok, u := m.getDestination(); ok { return u } } // Ebay Partner Network if !(len(ahocorasick.NewStringMatcher(ebayDomainParts).Match([]byte(strings.ToLower(link.Domain)))) == 0) { m := ebayLink{Link: link} if ok, u := m.getDestination(); ok { return u } } // Webgains if !(len(ahocorasick.NewStringMatcher(webgainsDomainParts).Match([]byte(strings.ToLower(link.Domain)))) == 0) { m := webgainsLink{Link: link} if ok, u := m.getDestination(); ok { return u } } // Amazon if !(len(ahocorasick.NewStringMatcher(amazonDomainParts).Match([]byte(strings.ToLower(link.Domain)))) == 0) { m := amazonLink{Link: link} if ok, u := m.getDestination(); ok { return u } } return link.URL }
func main() { // Load urls... urls := createListFromCSVFile("./data/newurls.csv") // Load keywords... dict := createKeywordDictionaryFromCSVFile("./data/keywords.csv") // Classify... keywords := getKeywords(dict) m := ahocorasick.NewStringMatcher(keywords) for _, url := range urls { hits := m.Match([]byte(strings.ToLower(url))) //fmt.Printf("# of hits for %s: %d\n", url, len(hits)) categories := getUniqueCategoriesFromHits(hits, keywords, dict) for _, v := range categories { fmt.Printf("%s,%s\n", url, v) } } }
func (m *Link) rewriteRuleMayExist() bool { // A super-quick pre-check for determining whether we are likely to have a // rewrite rule in the database. This is hard-coded for speed, when you add // a new unique domain rule, add the domain keyword here. This is string // matching and does not use regular expressions. domains := ahocorasick.NewStringMatcher([]string{ "bikely", "bikemap.net", "everytrail.com", "garmin", "google.com", "gpsies.com", "plotaroute.com", "ridewithgps.com", "strava", "vimeo", "youtube", "youtu.be", }) hits := domains.Match([]byte(strings.ToLower(m.Domain))) return !(len(hits) == 0) }
func (extr *Extractor) loadEntitiesFromDatabase(dbConnectionString string) error { db, err := sql.Open("postgres", dbConnectionString) if err != nil { return err } rows, err := db.Query("SELECT id, terms FROM entities") if err != nil { return err } defer rows.Close() var entities Entities for rows.Next() { var id, terms string if err := rows.Scan(&id, &terms); err != nil { return err } if err := entities.addEntity(id, terms); err != nil { return err } } if err := rows.Err(); err != nil { return err } extr.matcher = ahocorasick.NewStringMatcher(entities.terms) logInfo("Loaded", len(entities.terms), "terms") extr.entities = entities return nil }
func affiliateMayExist(domain string) bool { domains := ahocorasick.NewStringMatcher(affDomainParts) hits := domains.Match([]byte(strings.ToLower(domain))) return !(len(hits) == 0) }