Beispiel #1
0
// _ahocorasick is a small demonstration of the ahocorasick matcher: it builds
// a matcher over a fixed list of planet names, runs it against the input
// "earth", and returns the match result formatted on a single line that
// starts with "found patterns".
func _ahocorasick() string {
	matcher := ahocorasick.NewStringMatcher([]string{
		"mercury", "venus", "earth", "mars",
		"jupiter", "saturn", "uranus", "pluto",
	})

	hits := matcher.Match([]byte("earth"))

	return fmt.Sprintln("found patterns", hits)
}
// getAffiliateLink returns the affiliate destination for link when one of the
// known affiliate networks produces one, and link.URL otherwise.
//
// Networks are tried in a fixed order: Affiliate Window, Ebay Partner Network,
// Webgains, Amazon. A network is only consulted when the lower-cased
// link.Domain yields at least one match against that network's domain parts.
// If a consulted network's getDestination reports failure, the search carries
// on with the next network rather than stopping.
func getAffiliateLink(link models.Link) string {
	// Lower-case and convert the domain once; every network check below
	// searches the same bytes.
	domain := []byte(strings.ToLower(link.Domain))

	// matches reports whether a matcher built from parts finds anything in the
	// domain. A fresh matcher is built per check, so no matcher state is
	// shared between calls or goroutines.
	matches := func(parts []string) bool {
		return len(ahocorasick.NewStringMatcher(parts).Match(domain)) > 0
	}

	// Affiliate Window
	if matches(affwinDomainParts) {
		m := affWinLink{Link: link}
		if ok, u := m.getDestination(); ok {
			return u
		}
	}

	// Ebay Partner Network
	if matches(ebayDomainParts) {
		m := ebayLink{Link: link}
		if ok, u := m.getDestination(); ok {
			return u
		}
	}

	// Webgains
	if matches(webgainsDomainParts) {
		m := webgainsLink{Link: link}
		if ok, u := m.getDestination(); ok {
			return u
		}
	}

	// Amazon
	if matches(amazonDomainParts) {
		m := amazonLink{Link: link}
		if ok, u := m.getDestination(); ok {
			return u
		}
	}

	// No network produced a destination: fall back to the original URL.
	return link.URL
}
Beispiel #3
0
// main reads a list of URLs and a keyword dictionary from CSV files under
// ./data, matches every lower-cased URL against the dictionary's keywords, and
// prints one "url,category" line to standard output for each category
// returned by getUniqueCategoriesFromHits.
func main() {
	// Inputs: the URLs to classify and the keyword dictionary.
	urls := createListFromCSVFile("./data/newurls.csv")
	dict := createKeywordDictionaryFromCSVFile("./data/keywords.csv")

	// One matcher over all keywords, reused for every URL.
	keywords := getKeywords(dict)
	matcher := ahocorasick.NewStringMatcher(keywords)

	for _, rawURL := range urls {
		lowered := []byte(strings.ToLower(rawURL))
		matched := matcher.Match(lowered)

		for _, category := range getUniqueCategoriesFromHits(matched, keywords, dict) {
			fmt.Printf("%s,%s\n", rawURL, category)
		}
	}
}
// rewriteRuleMayExist is a fast pre-check that reports whether the link's
// domain might have a rewrite rule stored in the database. It returns true
// when the lower-cased m.Domain yields at least one match against the
// hard-coded keyword list below.
//
// The list is hard-coded for speed: whenever a rule for a new unique domain is
// added, its domain keyword must be added here as well. Matching is plain
// string matching, not regular expressions.
func (m *Link) rewriteRuleMayExist() bool {
	keywords := []string{
		"bikely",
		"bikemap.net",
		"everytrail.com",
		"garmin",
		"google.com",
		"gpsies.com",
		"plotaroute.com",
		"ridewithgps.com",
		"strava",
		"vimeo",
		"youtube",
		"youtu.be",
	}

	matcher := ahocorasick.NewStringMatcher(keywords)
	domain := strings.ToLower(m.Domain)

	return len(matcher.Match([]byte(domain))) > 0
}
Beispiel #5
0
// loadEntitiesFromDatabase reads every row of the "entities" table from the
// postgres database identified by dbConnectionString and installs the result
// on the extractor.
//
// Each row's id and terms are scanned as strings and passed to
// Entities.addEntity. When all rows have been read successfully, a new matcher
// is built from the collected terms and stored in extr.matcher, the number of
// terms is logged, and the collection is stored in extr.entities.
//
// It returns the first error reported by sql.Open, the query, row scanning,
// addEntity, or row iteration. On error, extr is left unmodified.
func (extr *Extractor) loadEntitiesFromDatabase(dbConnectionString string) error {
	db, err := sql.Open("postgres", dbConnectionString)
	if err != nil {
		return err
	}
	// The handle is used only for this one load and is not retained anywhere,
	// so close it on return to release its pooled connections. Deferred calls
	// run last-in-first-out, so rows is closed before db.
	defer db.Close()

	rows, err := db.Query("SELECT id, terms FROM entities")
	if err != nil {
		return err
	}
	defer rows.Close()

	var entities Entities

	for rows.Next() {
		var id, terms string
		if err := rows.Scan(&id, &terms); err != nil {
			return err
		}

		if err := entities.addEntity(id, terms); err != nil {
			return err
		}
	}

	// rows.Next returning false may mean an iteration error rather than the
	// end of the result set; check before trusting the collected data.
	if err := rows.Err(); err != nil {
		return err
	}

	extr.matcher = ahocorasick.NewStringMatcher(entities.terms)
	logInfo("Loaded", len(entities.terms), "terms")

	extr.entities = entities

	return nil
}
// affiliateMayExist reports whether the lower-cased domain yields at least one
// match against a matcher built from affDomainParts.
func affiliateMayExist(domain string) bool {
	matcher := ahocorasick.NewStringMatcher(affDomainParts)
	lowered := strings.ToLower(domain)

	return len(matcher.Match([]byte(lowered))) > 0
}