// search reports whether sequenceToSearchFor occurs in the nucleotide-sequence
// file at fileName.
//
// Every line of the file is passed through compress and the results are
// concatenated into a single buffer, which is indexed with a suffix array. The
// query is compressed the same way before the lookup, so matching happens on
// the compressed representation.
//
// It returns true together with the offset (into the compressed buffer) of one
// match, or false and an empty slice when there is none. If the file cannot be
// read the process is terminated through log.Fatalf.
func search(fileName string, sequenceToSearchFor string) (bool, []int) {
	lines, err := readLines(fileName)
	if err != nil {
		log.Fatalf("readLines: %s", err)
	}

	// Accumulate every compressed line; assigning instead of appending would
	// leave only the last line of the file in the index.
	var master []byte
	for _, line := range lines {
		master = append(master, *(compress(&line))...)
	}

	index := suffixarray.New(master)
	searchBytes := *(compress(&sequenceToSearchFor))

	// Lookup(s, n) returns at most n offsets (every offset when n < 0). Only
	// presence matters here, so a single offset is requested.
	// https://code.google.com/p/go/source/browse/src/pkg/index/suffixarray/suffixarray.go?name=release#174
	offsets := index.Lookup([]byte(searchBytes), 1)

	return len(offsets) > 0, offsets
}
Beispiel #2
0
// main demonstrates a substring lookup with index/suffixarray on a small,
// space-delimited word list.
func main() {
	// The text to index: a handful of words separated by single spaces.
	const text = "a apple sphere atom atmosphere"

	// The suffix array works on bytes, so the text is converted before indexing.
	sa := suffixarray.New([]byte(text))

	// Lookup runs in O(log(N)*len(s) + len(result)) time, where
	//   N      is the size of the indexed data,
	//   s      is the substring being searched for, and
	//   result is the list of offsets at which s occurs in the data.
	//
	// The pattern is passed as bytes too; for example []byte("apple") is
	// [97 112 112 108 101].
	pattern := []byte("sphere")

	// n = -1 asks for every occurrence; the offsets come back unsorted.
	hits := sa.Lookup(pattern, -1)
	fmt.Println(hits)
}
Beispiel #3
0
// _indexSuffixArray concatenates a fixed list of documents into one buffer,
// indexes it with a suffix array and reports which documents contain the
// query "earth" entirely within their own bounds.
func _indexSuffixArray() string {
	const query = "earth"

	docs := []string{
		"mercury", "venus", "earth", "mars",
		"jupiter", "saturn", "uranus", "pluto",
	}

	// ends[k] is the offset just past document k in the concatenated buffer.
	var (
		corpus []byte
		ends   []int
	)
	for _, doc := range docs {
		corpus = append(corpus, doc...)
		ends = append(ends, len(corpus))
	}

	var results []int
	for _, pos := range suffixarray.New(corpus).Lookup([]byte(query), -1) {
		// The first document whose end lies beyond pos is the one pos falls in.
		doc := sort.Search(len(ends), func(k int) bool { return pos < ends[k] })
		// Drop hits that run across a document boundary.
		if pos+len(query) > ends[doc] {
			continue
		}
		results = append(results, doc)
	}

	return fmt.Sprintf("%q is in documents %v\n", query, results)
}
Beispiel #4
0
// NewTruncIndex returns an empty TruncIndex. Both the byte buffer and the
// suffix array start out holding a single space, and the id set is empty.
func NewTruncIndex() *TruncIndex {
	idx := new(TruncIndex)
	idx.index = suffixarray.New([]byte{' '})
	idx.ids = make(map[string]bool)
	idx.bytes = []byte{' '}
	return idx
}
Beispiel #5
0
// main exercises suffixarray from several goroutines at once: every worker
// repeatedly indexes the same random buffer and looks up random patterns in
// it. Nothing is printed; the lookup results are discarded.
func main() {
	const (
		dataLen    = 6e5 // bytes of random data to index
		patternLen = 100 // bytes per random lookup pattern
		rebuilds   = 20  // index constructions per worker
		lookups    = 10  // lookups per constructed index
		workers    = 2   // concurrent goroutines
	)

	data := make([]byte, dataLen)
	for i := range data {
		data[i] = byte(rand.Intn(255))
	}

	// Buffered so that no worker blocks when it signals completion.
	finished := make(chan bool, workers)
	worker := func() {
		for r := 0; r < rebuilds; r++ {
			sa := suffixarray.New(data)
			for l := 0; l < lookups; l++ {
				pattern := make([]byte, patternLen)
				for k := range pattern {
					pattern[k] = byte(rand.Intn(255))
				}
				_ = sa.Lookup(pattern, 10)
			}
		}
		finished <- true
	}

	for w := 0; w < workers; w++ {
		go worker()
	}
	// Wait for one completion signal per worker.
	for w := 0; w < workers; w++ {
		<-finished
	}
}
Beispiel #6
0
// Add registers id through addId while holding the index lock and, when that
// succeeds, rebuilds the suffix array from the updated byte buffer. An error
// from addId is returned unchanged and leaves the suffix array as it was.
func (idx *TruncIndex) Add(id string) error {
	idx.Lock()
	defer idx.Unlock()

	err := idx.addId(id)
	if err == nil {
		idx.index = suffixarray.New(idx.bytes)
	}
	return err
}
// BenchmarkBuildSuffixArray measures joining the dictionary words with \x00
// separators (plus one on each end) and building a suffix array over the
// result. Fetching the word list is excluded from the timing.
func BenchmarkBuildSuffixArray(b *testing.B) {
	words := getDictWords()

	b.ResetTimer()
	for n := b.N; n > 0; n-- {
		joined := strings.Join(words, "\x00")
		_ = suffixarray.New([]byte("\x00" + joined + "\x00"))
	}
}
Beispiel #8
0
// GoSufArray_UserAgentIsClean reports whether UA contains none of the entries
// in waf.bad_robots. The user agent is indexed once with a suffix array and
// each entry is looked up in it; the first hit makes the result false.
func (waf *WAF) GoSufArray_UserAgentIsClean(UA []byte) bool {
	sa := suffixarray.New(UA)

	clean := true
	for _, signature := range waf.bad_robots {
		if hits := sa.Lookup(signature, 1); len(hits) != 0 {
			clean = false
			break
		}
	}
	return clean
}
Beispiel #9
0
// check_url reports whether the ASCII-quoted form of substr is found at
// offset 0 of the ASCII-quoted form of str.
//
// Both arguments are run through strconv.AppendQuoteToASCII, so the
// surrounding double quotes are part of both the pattern and the indexed text.
// NOTE(review): because the closing quote belongs to the pattern, this looks
// like it only succeeds when str equals substr rather than when str merely
// starts with substr — confirm that this is the intended check.
func check_url(substr, str string) bool {
	pattern := strconv.AppendQuoteToASCII(nil, substr)
	text := strconv.AppendQuoteToASCII(nil, str)

	// Lookup returns its offsets in no particular order, so every one of them
	// is inspected instead of trusting the first element to be the smallest.
	for _, off := range suffixarray.New(text).Lookup(pattern, -1) {
		if off == 0 {
			return true
		}
	}
	return false
}
Beispiel #10
0
// benchFast indexes line with fs.New, performs LOOPS single-result lookups of
// substr and prints the total and per-lookup duration under the label "fast".
// Index construction is not timed. It returns the total lookup time.
func benchFast(line string, substr string) time.Duration {
	idx := fs.New([]byte(line))

	began := time.Now()
	for n := 0; n < LOOPS; n++ {
		idx.Lookup([]byte(substr), 1)
	}
	elapsed := time.Since(began)

	fmt.Printf("%10s: %20s\t%16s %10s\n", "fast", substr, elapsed, elapsed/LOOPS)
	return elapsed
}
Beispiel #11
0
// NewTruncIndex builds a TruncIndex pre-populated with ids. The byte buffer
// starts with a space and each id is appended followed by a space; the suffix
// array is built once over the finished buffer.
func NewTruncIndex(ids []string) (idx *TruncIndex) {
	known := make(map[string]bool)
	buf := []byte{' '}
	for _, id := range ids {
		known[id] = true
		buf = append(buf, id...)
		buf = append(buf, ' ')
	}

	return &TruncIndex{
		ids:   known,
		bytes: buf,
		index: suffixarray.New(buf),
	}
}
Beispiel #12
0
// indexFile has the signature of a filepath.WalkFunc. For a regular file
// smaller than 1 MiB whose path's last byte, modulo 4, equals s.id, it reads
// the file and stores a suffix array of its contents in s.files under the
// path with s.base_path trimmed from the front.
//
// Entries that cannot be inspected or read are skipped; the function always
// returns nil so that a walk continues past them.
func (s *Searcher) indexFile(path string, info os.FileInfo, err error) error {
	// A walk reports failures through err, in which case info may be nil;
	// skip such entries (and an empty path) instead of dereferencing them.
	if err != nil || info == nil || path == "" {
		return nil
	}
	// only index 1/4 of the files per server
	if int(path[len(path)-1])%4 != s.id {
		return nil
	}
	if !info.Mode().IsRegular() || info.Size() >= (1<<20) {
		return nil
	}

	data, err := ioutil.ReadFile(path)
	if err != nil {
		// Best effort: an unreadable file is left out rather than being
		// registered with an empty index.
		return nil
	}
	s.files[strings.TrimPrefix(path, s.base_path)] = suffixarray.New(data)
	return nil
}
Beispiel #13
0
// Add registers id in the index. Ids containing a space are rejected because
// the space is the separator inside the byte buffer, and an id that is already
// present is rejected as a duplicate. On success the id followed by a space is
// appended to the buffer and the suffix array is rebuilt.
func (idx *TruncIndex) Add(id string) error {
	if strings.Contains(id, " ") {
		return fmt.Errorf("Illegal character: ' '")
	}

	_, exists := idx.ids[id]
	if exists {
		return fmt.Errorf("Id already exists: %s", id)
	}

	idx.ids[id] = true
	idx.bytes = append(idx.bytes, id...)
	idx.bytes = append(idx.bytes, ' ')
	idx.index = suffixarray.New(idx.bytes)
	return nil
}
// BenchmarkLookupXX measures a single-result Lookup of XXwordToTry in a suffix
// array built over the \x00-separated dictionary words, followed by a call to
// getStringFromIndex for the hit. Building the index is excluded from the
// timing.
func BenchmarkLookupXX(b *testing.B) {
	joined := strings.Join(getDictWords(), "\x00")
	data := []byte("\x00" + joined + "\x00")
	sa := suffixarray.New(data)

	b.ResetTimer()
	for n := b.N; n > 0; n-- {
		hits := sa.Lookup([]byte(XXwordToTry), 1)
		if len(hits) == 0 {
			continue
		}
		_ = getStringFromIndex(data, hits[0])
	}
}
Beispiel #15
0
// Delete removes id from the index. It fails when the id is unknown or when
// idx.lookup cannot locate it; otherwise the id is dropped from the id set,
// the byte range reported by lookup is cut out of the buffer and the suffix
// array is rebuilt.
func (idx *TruncIndex) Delete(id string) error {
	_, known := idx.ids[id]
	if !known {
		return fmt.Errorf("No such id: %s", id)
	}

	start, end, err := idx.lookup(id)
	if err != nil {
		return err
	}

	delete(idx.ids, id)
	// Close the gap by moving everything after the id down over it.
	tail := idx.bytes[end:]
	idx.bytes = append(idx.bytes[:start], tail...)
	idx.index = suffixarray.New(idx.bytes)
	return nil
}
Beispiel #16
0
// Index rebuilds the suffix array from the package names currently registered
// with this instance. Each name is written to a buffer preceded by a \x00
// byte, and the suffix array is built over that buffer.
func (idx *ManualIndex) Index() error {
	buf := new(bytes.Buffer)

	for pkg := range idx.packages {
		if _, err := fmt.Fprintf(buf, "\x00%s", pkg); err != nil {
			return err
		}
	}

	idx.index = suffixarray.New(buf.Bytes())
	return nil
}
Beispiel #17
0
// sort lazily builds the suffix array: when idx.index is still nil, the keys
// of idx.tiles are joined with the zero byte as separator, one zero byte is
// added on each end, and the result is stored in idx.indexed and indexed.
// It does nothing when the index already exists.
func (idx *SuffixIndex) sort() {
	if idx.index != nil {
		return
	}

	keys := make([][]byte, 0, len(idx.tiles))
	for k := range idx.tiles {
		keys = append(keys, []byte(k))
	}

	sep := []byte{zero}
	joined := bytes.Join(keys, sep) // keys separated by zeros
	// Joining two separators around the payload pads it with a zero on each side.
	idx.indexed = bytes.Join([][]byte{sep, sep}, joined)
	idx.index = suffixarray.New(idx.indexed)
}
// TestSuffixArrayFind builds a suffix array over the \x00-separated dictionary
// words, prints the size of its serialized form and checks that the substring
// "yrate" can be found.
func TestSuffixArrayFind(t *testing.T) {
	words := getDictWords()
	data := []byte("\x00" + strings.Join(words, "\x00") + "\x00")
	sa := suffixarray.New(data)

	// The index is serialized only to learn how large it is; a failed write
	// would make the reported size meaningless, so it fails the test.
	var buf bytes.Buffer
	if err := sa.Write(&buf); err != nil {
		t.Fatalf("writing index: %v", err)
	}
	fmt.Println("size:", buf.Len())

	if len(sa.Lookup([]byte("yrate"), 1)) == 0 {
		t.Fatal("not found")
	}
}
Beispiel #19
0
// test cross-checks suffixarray against the reference implementation simple:
// both must report the same offsets of what in data. The suffix array result
// is sorted before the element-wise comparison. Any difference panics.
func test(data, what []byte) {
	got := suffixarray.New(data).Lookup(what, -1)
	want := simple(data, what)

	if len(got) != len(want) {
		panic(fmt.Sprintf("len mismatch: %+v, %+v", got, want))
	}

	// Lookup returns offsets unsorted; order them before comparing.
	sort.Ints(got)
	for i := range got {
		if got[i] != want[i] {
			panic(fmt.Sprintf("data mismatch: %+v, %+v", got, want))
		}
	}
}
Beispiel #20
0
// main indexes the file named by the first command-line argument and logs up
// to 100 offsets at which the byte '*' occurs, together with the byte found at
// each offset. It exits with an error when no file is given or the file
// cannot be read.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: pass the path of the file to index as the first argument")
	}

	sometext, err := ioutil.ReadFile(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}

	index := suffixarray.New(sometext)

	// Let Lookup cap the result at 100 entries; slicing an unbounded result
	// with [0:100] would panic whenever there are fewer matches.
	offsets := index.Lookup([]byte("*"), 100)

	for _, value := range offsets {
		log.Println(value, string(sometext[value]))
	}
}
Beispiel #21
0
// updateSuffixArr rebuilds the autocomplete suffix array from the known
// dictionary, discarding any previous one. It is a no-op unless
// model.UseAutocomplete is set.
//
// A term is included when its corpus count exceeds model.Threshold or it has
// been queried at least once. The selected terms are joined with \x00
// separators (plus one on each end), stored in model.SuffixArrConcat and
// indexed into model.SuffixArr; model.SuffDivergence is reset to 0.
//
// NOTE(review): the model fields are written while only the read lock is
// held — confirm against the callers whether a write lock is required.
func (model *Model) updateSuffixArr() {
	if !model.UseAutocomplete {
		return
	}

	model.RLock()

	terms := make([]string, 0, 1000)
	for term, count := range model.Data {
		// TODO: query threshold?
		if !(count.Corpus > model.Threshold || count.Query > 0) {
			continue
		}
		terms = append(terms, term)
	}

	concat := "\x00" + strings.Join(terms, "\x00") + "\x00"
	model.SuffixArrConcat = concat
	model.SuffixArr = suffixarray.New([]byte(concat))
	model.SuffDivergence = 0

	model.RUnlock()
}
Beispiel #22
0
// The full-text search on golang.org is implemented on top of a suffix array
// [http://t.cn/hBJekg]; presumably because it worked well, the suffix array
// was then added to Go's standard library [http://t.cn/hBJekd].
// http://blog.csdn.net/fxsjy/article/details/6297523
//
// main indexes a short paragraph and, for every occurrence of "hyd", prints
// the text from that occurrence to the end.
func main() {
	fmt.Println("Hello, 世界")
	text := `The Go programming language is an open source project to make programmers more productive. 
	Go is expressive, concise, clean, and efficient. 
	Its concurrency mechanisms make it easy to write programs that get the most out of multicore 
	and networked machines, hyd, while its novel type system enables flexible 
	and modular program construction. 
	Go compiles quickly to machine code yet has the convenience of garbage collection 
	and the power of run-time reflection. It's a fast, statically typed, 
	compiled language that feels like a dynamically typed, interpreted language.`

	sa := suffixarray.New([]byte(text))
	for _, off := range sa.Lookup([]byte("hyd"), -1) {
		fmt.Println(text[off:])
	}

}
Beispiel #23
0
// test_sa demonstrates Lookup with and without a result limit. It indexes a
// repetitive sentence, searches for the two bytes at data[2:4], and prints the
// pattern, all offsets, at most three offsets, and the two bytes found at
// every offset.
func test_sa() {
	data := []byte("i am a test i am a test i am a test i am a test i am a test")
	s := data[2:4]

	// Build the index over the whole sentence.
	index := suffixarray.New(data)

	all := index.Lookup(s, -1)    // every offset at which s occurs in data
	atMost3 := index.Lookup(s, 3) // no more than 3 offsets at which s occurs

	fmt.Println("test_sa")
	fmt.Println(string(s))
	fmt.Println(all)
	fmt.Println(atMost3)
	for _, off := range all {
		fmt.Println(string(data[off : off+2]))
	}
}
Beispiel #24
0
// main times a token iterator built on suffixarray.FindAllIndex. It reads
// "large.in", locates every run of non-whitespace bytes, then calls the
// iterator 100000000 times and logs the elapsed time in nanoseconds. Once the
// tokens are exhausted the iterator keeps returning "". The program exits
// with an error when the input file cannot be read.
func main() {
	raw, err := ioutil.ReadFile("large.in")
	if err != nil {
		log.Fatal(err)
	}
	text := string(raw)

	// Each span is a [start, end) pair of one token, in order of appearance.
	spans := suffixarray.New(raw).FindAllIndex(regexp.MustCompile(`\S+`), -1)
	next := func() string {
		if len(spans) == 0 {
			return ""
		}
		span := spans[0]
		spans = spans[1:]
		return text[span[0]:span[1]]
	}

	start := time.Now()
	for i := 0; i < 100000000; i++ {
		next()
	}
	log.Println(time.Since(start).Nanoseconds())
}
Beispiel #25
0
// main splits a small multi-line string into whitespace-separated tokens with
// suffixarray.FindAllIndex and logs each token in order of appearance.
func main() {
	str := `1234 2 3333 4 5  aaaaa
s
aaa
`
	// Each span is a [start, end) pair of one run of non-whitespace bytes.
	spans := suffixarray.New([]byte(str)).FindAllIndex(regexp.MustCompile("\\S+"), -1)

	// next hands out one token per call and "" once all have been consumed.
	next := func() string {
		if len(spans) == 0 {
			return ""
		}
		span := spans[0]
		spans = spans[1:]
		return str[span[0]:span[1]]
	}

	for {
		token := next()
		if token == "" {
			break
		}
		log.Println(token)
	}
}
Beispiel #26
0
// Convert reads the dictionary file at path, builds a suffix array over the
// names of all its words and writes that index to "morph.dict" in the current
// directory.
//
// The file is consumed through createTokenReader and createDictReader. The
// indexed text is every word name of every group, each followed by '@', with
// one leading '@' in front. An existing "morph.dict" is replaced. Any error
// from opening, parsing, writing or closing is returned.
func Convert(path string) error {
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	defer file.Close()

	d := createDictReader(createTokenReader(file))

	var buffer bytes.Buffer
	_, _ = buffer.WriteRune('@')
	for !d.isDone() {
		g, err := d.nextGroup()
		if err != nil {
			return err
		}
		for _, w := range g.words {
			_, _ = buffer.WriteString(w.name)
			_, _ = buffer.WriteRune('@')
		}
	}
	sa := suffixarray.New(buffer.Bytes())

	// O_TRUNC discards a previous, possibly longer index; without it stale
	// bytes would remain after the end of the newly written data.
	flags := os.O_CREATE | os.O_WRONLY | os.O_TRUNC
	dictFile, err := os.OpenFile("morph.dict", flags, 0666)
	if err != nil {
		return err
	}

	if err := sa.Write(dictFile); err != nil {
		dictFile.Close()
		return err
	}
	// A failed Close can mean the data never reached the disk, so report it.
	return dictFile.Close()
}
// main compares Lookup and FindAllIndex on a small word list: it indexes the
// \x00-separated words, prints the offsets Lookup reports for "an", prints
// what getStringFromIndex returns for each offset, and finally prints the
// matches FindAllIndex reports for the literal regexp "an".
func main() {
	fruits := []string{
		"banana",
		"apple",
		"pear",
		"tangerine",
		"orange",
		"lemon",
		"peach",
		"persimmon",
	}

	// All words go into one byte slice, separated by \x00 bytes (which never
	// occur inside a word), with an extra \x00 at the start and at the end.
	joined := strings.Join(fruits, "\x00")
	data := []byte("\x00" + joined + "\x00")
	sa := suffixarray.New(data)

	hits := sa.Lookup([]byte("an"), -1)
	if len(hits) == 0 {
		fmt.Println("Lookup: not found")
	} else {
		fmt.Println("Lookup returns:", hits)
	}

	// Reconstruct the matches from the offsets found by Lookup.
	for _, off := range hits {
		fmt.Println(getStringFromIndex(data, off))
	}

	// A completely "literal" regexp mirrors the Lookup call above so that the
	// two results can be compared. FindAllIndex accepts arbitrary regexps -
	// but beware of the caveat discussed in the blog post.
	pattern := regexp.MustCompile("an")
	fmt.Println("FindAllIndex returns:", sa.FindAllIndex(pattern, -1))
}
Beispiel #28
0
// NewIndex creates a new index for the .go files
// in the directories given by dirnames.
//
// Directory names are received from dirnames until the channel is closed;
// directories that cannot be read are skipped. When fulltextIndex is set, a
// suffix array is built over x.sources and stored in the returned Index;
// otherwise the Index carries a nil suffix array and a nil file set.
// throttle is passed to NewThrottle unchanged.
func NewIndex(dirnames <-chan string, fulltextIndex bool, throttle float64) *Index {
	var x Indexer
	th := NewThrottle(throttle, 100*time.Millisecond) // run at least 0.1s at a time

	// initialize Indexer
	// (use some reasonably sized maps to start)
	x.fset = token.NewFileSet()
	x.packages = make(map[string]*Pak, 256)
	x.words = make(map[string]*IndexResult, 8192)

	// index all files in the directories given by dirnames
	for dirname := range dirnames {
		list, err := fs.ReadDir(dirname)
		if err != nil {
			continue // ignore this directory
		}
		for _, f := range list {
			// only plain entries are visited; subdirectories are not descended into here
			if !f.IsDir() {
				x.visitFile(dirname, f, fulltextIndex)
			}
			th.Throttle()
		}
	}

	if !fulltextIndex {
		// the file set, the current file, and the sources are
		// not needed after indexing if no text index is built -
		// help GC and clear them
		x.fset = nil
		x.sources.Reset()
		x.current = nil // contains reference to fset!
	}

	// for each word, reduce the RunLists into a LookupResult;
	// also collect the word with its canonical spelling in a
	// word list for later computation of alternative spellings
	words := make(map[string]*LookupResult)
	var wlist RunList
	for w, h := range x.words {
		decls := reduce(h.Decls)
		others := reduce(h.Others)
		words[w] = &LookupResult{
			Decls:  decls,
			Others: others,
		}
		wlist = append(wlist, &wordPair{canonical(w), w})
		th.Throttle()
	}
	x.stats.Words = len(words)

	// reduce the word list {canonical(w), w} into
	// a list of AltWords runs {canonical(w), {w}}
	alist := wlist.reduce(lessWordPair, newAltWords)

	// convert alist into a map of alternative spellings
	// keyed by the canonical spelling
	alts := make(map[string]*AltWords)
	for i := 0; i < len(alist); i++ {
		a := alist[i].(*AltWords)
		alts[a.Canon] = a
	}

	// create text index
	// (suffixes stays nil when no full-text index was requested)
	var suffixes *suffixarray.Index
	if fulltextIndex {
		suffixes = suffixarray.New(x.sources.Bytes())
	}

	return &Index{x.fset, suffixes, words, alts, x.snippets, x.stats}
}
Beispiel #29
0
// NewIndex creates a new index for the .go files provided by the corpus.
//
// Directories come from c.fsDirnames and may be filtered by c.IndexDirectory.
// Each directory is read in its own goroutine and each of its entries is
// visited in a further goroutine; NewIndex waits for all of them before it
// reduces the collected words. When c.IndexFullText is set, a suffix array is
// built over x.sources; otherwise the returned Index carries a nil suffix
// array and a nil file set.
func (c *Corpus) NewIndex() *Index {
	// initialize Indexer
	// (use some reasonably sized maps to start)
	x := &Indexer{
		c:           c,
		fset:        token.NewFileSet(),
		fsOpenGate:  make(chan bool, maxOpenFiles),
		strings:     make(map[string]string),
		packages:    make(map[Pak]*Pak, 256),
		words:       make(map[string]*IndexResult, 8192),
		throttle:    util.NewThrottle(c.throttle(), 100*time.Millisecond), // run at least 0.1s at a time
		importCount: make(map[string]int),
		packagePath: make(map[string]map[string]bool),
		exports:     make(map[string]map[string]SpotKind),
		idents:      make(map[SpotKind]map[string][]Ident, 4),
	}

	// index all files in the directories given by dirnames
	var wg sync.WaitGroup // outstanding ReadDir + visitFile
	// dirGate's buffer size bounds how many directory goroutines run at once:
	// the send below blocks once maxOpenDirs slots are taken.
	dirGate := make(chan bool, maxOpenDirs)
	for dirname := range c.fsDirnames() {
		if c.IndexDirectory != nil && !c.IndexDirectory(dirname) {
			continue
		}
		dirGate <- true
		wg.Add(1)
		go func(dirname string) {
			defer func() { <-dirGate }() // free the slot taken before the goroutine started
			defer wg.Done()

			list, err := c.fs.ReadDir(dirname)
			if err != nil {
				log.Printf("ReadDir(%q): %v; skipping directory", dirname, err)
				return // ignore this directory
			}
			for _, fi := range list {
				// registered before the goroutine starts so wg.Wait cannot miss it
				wg.Add(1)
				go func(fi os.FileInfo) {
					defer wg.Done()
					x.visitFile(dirname, fi)
				}(fi)
			}
		}(dirname)
	}
	wg.Wait()

	if !c.IndexFullText {
		// the file set, the current file, and the sources are
		// not needed after indexing if no text index is built -
		// help GC and clear them
		x.fset = nil
		x.sources.Reset()
		x.current = nil // contains reference to fset!
	}

	// for each word, reduce the RunLists into a LookupResult;
	// also collect the word with its canonical spelling in a
	// word list for later computation of alternative spellings
	words := make(map[string]*LookupResult)
	var wlist RunList
	for w, h := range x.words {
		decls := reduce(h.Decls)
		others := reduce(h.Others)
		words[w] = &LookupResult{
			Decls:  decls,
			Others: others,
		}
		wlist = append(wlist, &wordPair{canonical(w), w})
		x.throttle.Throttle()
	}
	x.stats.Words = len(words)

	// reduce the word list {canonical(w), w} into
	// a list of AltWords runs {canonical(w), {w}}
	alist := wlist.reduce(lessWordPair, newAltWords)

	// convert alist into a map of alternative spellings
	// keyed by the canonical spelling
	alts := make(map[string]*AltWords)
	for i := 0; i < len(alist); i++ {
		a := alist[i].(*AltWords)
		alts[a.Canon] = a
	}

	// create text index
	// (suffixes stays nil when no full-text index was requested)
	var suffixes *suffixarray.Index
	if c.IndexFullText {
		suffixes = suffixarray.New(x.sources.Bytes())
	}

	// sort every identifier list with the byPackage ordering
	for _, idMap := range x.idents {
		for _, ir := range idMap {
			sort.Sort(byPackage(ir))
		}
	}

	return &Index{
		fset:        x.fset,
		suffixes:    suffixes,
		words:       words,
		alts:        alts,
		snippets:    x.snippets,
		stats:       x.stats,
		importCount: x.importCount,
		packagePath: x.packagePath,
		exports:     x.exports,
		idents:      x.idents,
		opts: indexOptions{
			Docs:       x.c.IndexDocs,
			GoCode:     x.c.IndexGoCode,
			FullText:   x.c.IndexFullText,
			MaxResults: x.c.MaxResults,
		},
	}
}
Beispiel #30
0
// NewIndex creates a new index for the .go files
// in the directories given by dirnames.
//
// Directory names are received from dirnames until the channel is closed;
// directories that cannot be read are skipped. When fulltextIndex is set, a
// suffix array is built over x.sources and stored in the returned Index;
// otherwise the Index carries a nil suffix array and a nil file set.
func NewIndex(dirnames <-chan string, fulltextIndex bool) *Index {
	var x Indexer

	// initialize Indexer
	x.fset = token.NewFileSet()
	x.words = make(map[string]*IndexResult)

	// index all files in the directories given by dirnames
	for dirname := range dirnames {
		list, err := ioutil.ReadDir(dirname)
		if err != nil {
			continue // ignore this directory
		}
		for _, f := range list {
			// only plain entries are visited; subdirectories are not descended into here
			if !f.IsDirectory() {
				x.visitFile(dirname, f, fulltextIndex)
			}
		}
	}

	if !fulltextIndex {
		// the file set, the current file, and the sources are
		// not needed after indexing if no text index is built -
		// help GC and clear them
		x.fset = nil
		x.sources.Reset()
		x.current = nil // contains reference to fset!
	}

	// for each word, reduce the RunLists into a LookupResult;
	// also collect the word with its canonical spelling in a
	// word list for later computation of alternative spellings
	words := make(map[string]*LookupResult)
	var wlist RunList
	for w, h := range x.words {
		decls := reduce(&h.Decls)
		others := reduce(&h.Others)
		words[w] = &LookupResult{
			Decls:  decls,
			Others: others,
		}
		wlist.Push(&wordPair{canonical(w), w})
	}
	x.stats.Words = len(words)

	// reduce the word list {canonical(w), w} into
	// a list of AltWords runs {canonical(w), {w}}
	alist := wlist.reduce(lessWordPair, newAltWords)

	// convert alist into a map of alternative spellings
	// keyed by the canonical spelling
	alts := make(map[string]*AltWords)
	for i := 0; i < alist.Len(); i++ {
		a := alist.At(i).(*AltWords)
		alts[a.Canon] = a
	}

	// convert snippet vector into a list
	snippets := make([]*Snippet, x.snippets.Len())
	for i := 0; i < x.snippets.Len(); i++ {
		snippets[i] = x.snippets.At(i).(*Snippet)
	}

	// create text index
	// (suffixes stays nil when no full-text index was requested)
	var suffixes *suffixarray.Index
	if fulltextIndex {
		suffixes = suffixarray.New(x.sources.Bytes())
	}

	return &Index{x.fset, suffixes, words, alts, snippets, x.stats}
}