Code Example #1
// search scans the nucleotide-sequence file at fileName for sequenceToSearchFor, compressing both the data and the query before building a suffix array
func search(fileName string, sequenceToSearchFor string) (bool, []int) {
	var isPresent bool = false
	var master []byte
	var offsets []int

	lines, err := readLines(fileName)
	if err != nil {
		log.Fatalf("readLines: %s", err)
	} else {
		for _, line := range lines {
			// Append each compressed line; overwriting would index only the last line.
			master = append(master, *(compress(&line))...)
		}
	}

	// dat, _ := ioutil.ReadFile("./encoded.txt")
	index := suffixarray.New(master)

	searchString := sequenceToSearchFor
	searchBytes := *(compress(&searchString))

	//https://code.google.com/p/go/source/browse/src/pkg/index/suffixarray/suffixarray.go?name=release#190
	//Gets all matches (n < 0)

	// offsets = index.Lookup(searchBytes, -1)

	//https://code.google.com/p/go/source/browse/src/pkg/index/suffixarray/suffixarray.go?name=release#174
	//Gets at most one match (n = 1)
	offsets = index.Lookup(searchBytes, 1)

	if len(offsets) == 1 {
		isPresent = true
	}
	return isPresent, offsets
}
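readLines and compress are helper functions defined elsewhere in that project and are not shown on this page. A minimal readLines sketch, assuming it only needs to return the file's lines as a slice of strings (uses bufio and os):

// Hypothetical helper assumed by the example above: returns the lines of a file.
func readLines(path string) ([]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var lines []string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	return lines, scanner.Err()
}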
Code Example #2
func main() {
	//A set of words delimited by space
	words := "a apple sphere atom atmosphere"

	//A suffix array created in golang by converting the given string into bytes
	index := suffixarray.New([]byte(words))

	//Lookup Time complexity =  O(log(N)*len(s) + len(result))

	// N : the size of the indexed data
	// s : the substring to be searched for
	// result : the indices into the indexed data at which 's' occurs

	//NOTE
	// Take the following example:
	// var s string = "apple"
	// fmt.Println([]byte(s)) would print [97 112 112 108 101].
	// Go's suffix array works on the byte representation of the substring "s", which is why Lookup takes a []byte.

	offsets1 := index.Lookup([]byte("sphere"), -1) // the list of all indices where s occurs in data

	//Prints unsorted array of integers which are the indices of the given substring
	fmt.Println(offsets1)

}
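To make the second argument of Lookup concrete: n < 0 returns all matches, n >= 0 returns at most n matches, and the offsets come back in arbitrary order. A small self-contained sketch (hypothetical, not part of the example above):

func lookupDemo() {
	data := []byte("a apple sphere atom atmosphere")
	index := suffixarray.New(data)

	all := index.Lookup([]byte("sp"), -1) // every offset where "sp" occurs
	one := index.Lookup([]byte("sp"), 1)  // at most one offset

	fmt.Println(all) // e.g. [24 8] -- "sphere" and "atmosphere"; order is unspecified
	fmt.Println(one) // e.g. [8] or [24]
}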
Code Example #3
File: main.go Project: kyokomi-sandbox/sandbox
func _indexSuffixArray() string {
	docs := []string{
		"mercury", "venus", "earth", "mars",
		"jupiter", "saturn", "uranus", "pluto",
	}

	var data []byte
	var offsets []int

	for _, d := range docs {
		data = append(data, []byte(d)...)
		offsets = append(offsets, len(data))
	}
	sfx := suffixarray.New(data)

	query := "earth"

	idxs := sfx.Lookup([]byte(query), -1)
	var results []int
	for _, idx := range idxs {
		i := sort.Search(len(offsets), func(i int) bool { return offsets[i] > idx })
		if idx+len(query) <= offsets[i] {
			results = append(results, i)
		}
	}

	return fmt.Sprintf("%q is in documents %v\n", query, results)
}
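A note on the mapping step above: offsets[i] holds the end position of document i in the concatenated data, so sort.Search finds the first document whose end lies past the match, and the idx+len(query) <= offsets[i] check rejects matches that straddle a document boundary. For the docs listed, offsets is [7 12 17 21 28 34 40 45]; "earth" matches at byte 12, sort.Search returns 2, and 12+5 <= 17 holds, so document 2 is reported.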
Code Example #4
File: utils.go Project: paulhammond/docker
func NewTruncIndex() *TruncIndex {
	return &TruncIndex{
		index: suffixarray.New([]byte{' '}),
		ids:   make(map[string]bool),
		bytes: []byte{' '},
	}
}
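The TruncIndex methods in examples #4, #6, #11, #13, and #15 come from Docker's utils.go, but the struct definition itself is not reproduced on this page. A rough sketch of what those methods assume (an assumption, not the exact source; the embedded mutex matches the Lock/Unlock calls in example #6):

// Approximate shape of the type the Docker examples operate on.
type TruncIndex struct {
	sync.Mutex
	index *suffixarray.Index // rebuilt from bytes after every Add/Delete
	ids   map[string]bool    // full IDs currently stored
	bytes []byte             // all IDs concatenated, each followed by a space
}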
Code Example #5
File: stw.go Project: isaiah/go_scheduler_talk
func main() {
	const N = 6e5
	const M = 100
	const I = 20
	const J = 10
	const P = 2
	data := make([]byte, N)
	for i := 0; i < N; i++ {
		data[i] = byte(rand.Intn(255))
	}
	done := make(chan bool, P)
	for p := 0; p < P; p++ {
		go func() {
			for i := 0; i < I; i++ {
				suffix := suffixarray.New(data)
				for j := 0; j < J; j++ {
					str := make([]byte, M)
					for m := 0; m < M; m++ {
						str[m] = byte(rand.Intn(255))
					}
					_ = suffix.Lookup(str, 10)
				}
			}
			done <- true
		}()
	}
	for p := 0; p < P; p++ {
		<-done
	}
}
Code Example #6
func (idx *TruncIndex) Add(id string) error {
	idx.Lock()
	defer idx.Unlock()
	if err := idx.addId(id); err != nil {
		return err
	}
	idx.index = suffixarray.New(idx.bytes)
	return nil
}
Code Example #7
func BenchmarkBuildSuffixArray(b *testing.B) {
	words := getDictWords()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		data := []byte("\x00" + strings.Join(words, "\x00") + "\x00")
		_ = suffixarray.New(data)
	}
}
Code Example #8
File: waf.go Project: Zigazou/nataraja
func (waf *WAF) GoSufArray_UserAgentIsClean(UA []byte) bool {
	index := suffixarray.New(UA)
	for _, robot := range waf.bad_robots {
		if len(index.Lookup(robot, 1)) > 0 {
			return false
		}
	}

	return true
}
Code Example #9
File: rest.go Project: hobbeswalsh/cardgame
func check_url(substr, str string) bool {
	var dst []byte
	substr_bytes := strconv.AppendQuoteToASCII(dst, substr)
	str_bytes := strconv.AppendQuoteToASCII(dst, str)
	index := suffixarray.New(str_bytes)
	offsets := index.Lookup(substr_bytes, -1)
	if offsets == nil {
		return false
	}
	return offsets[0] == 0
}
Code Example #10
File: strings.go Project: funkygao/dlogmon
func benchFast(line string, substr string) time.Duration {
	index := fs.New([]byte(line))
	start := time.Now()
	for i := 0; i < LOOPS; i++ {
		index.Lookup([]byte(substr), 1)
	}
	end := time.Now()
	delta := end.Sub(start)
	fmt.Printf("%10s: %20s\t%16s %10s\n", "fast", substr, delta, delta/LOOPS)
	return delta
}
Code Example #11
File: utils.go Project: ChaosCloud/docker
func NewTruncIndex(ids []string) (idx *TruncIndex) {
	idx = &TruncIndex{
		ids:   make(map[string]bool),
		bytes: []byte{' '},
	}
	for _, id := range ids {
		idx.ids[id] = true
		idx.bytes = append(idx.bytes, []byte(id+" ")...)
	}
	idx.index = suffixarray.New(idx.bytes)
	return
}
Code Example #12
File: main.go Project: jstanley0/stripe-ctf-3
func (s *Searcher) indexFile(path string, info os.FileInfo, err error) error {
	// only index 1/4 of the files per server
	if int(path[len(path)-1])%4 != s.id {
		return nil
	}
	if info.Mode().IsRegular() && info.Size() < (1<<20) {
		name := strings.TrimPrefix(path, s.base_path)
		data, _ := ioutil.ReadFile(path)
		s.files[name] = suffixarray.New(data)
	}
	return nil
}
Code Example #13
File: utils.go Project: paulhammond/docker
func (idx *TruncIndex) Add(id string) error {
	if strings.Contains(id, " ") {
		return fmt.Errorf("Illegal character: ' '")
	}
	if _, exists := idx.ids[id]; exists {
		return fmt.Errorf("Id already exists: %s", id)
	}
	idx.ids[id] = true
	idx.bytes = append(idx.bytes, []byte(id+" ")...)
	idx.index = suffixarray.New(idx.bytes)
	return nil
}
Code Example #14
func BenchmarkLookupXX(b *testing.B) {
	words := getDictWords()
	data := []byte("\x00" + strings.Join(words, "\x00") + "\x00")
	sa := suffixarray.New(data)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		indices := sa.Lookup([]byte(XXwordToTry), 1)
		if len(indices) > 0 {
			_ = getStringFromIndex(data, indices[0])
		}
	}
}
Code Example #15
File: utils.go Project: paulhammond/docker
func (idx *TruncIndex) Delete(id string) error {
	if _, exists := idx.ids[id]; !exists {
		return fmt.Errorf("No such id: %s", id)
	}
	before, after, err := idx.lookup(id)
	if err != nil {
		return err
	}
	delete(idx.ids, id)
	idx.bytes = append(idx.bytes[:before], idx.bytes[after:]...)
	idx.index = suffixarray.New(idx.bytes)
	return nil
}
Code Example #16
File: manual_index.go Project: nullstyle/mcdev
// Index builds a new suffixarray for the package names previously registered
// with this instance.
func (idx *ManualIndex) Index() error {
	var buf bytes.Buffer

	for pkg := range idx.packages {
		_, err := fmt.Fprintf(&buf, "\x00%s", pkg)
		if err != nil {
			return err
		}
	}

	idx.index = suffixarray.New(buf.Bytes())
	return nil
}
Code Example #17
File: index.go Project: buckhx/diglet
func (idx *SuffixIndex) sort() {
	if idx.index == nil {
		keys := make([][]byte, len(idx.tiles))
		i := 0
		for k := range idx.tiles {
			keys[i] = []byte(k)
			i++
		}
		d := []byte{zero}
		b := bytes.Join(keys, d)                    //join w/ zeros
		idx.indexed = bytes.Join([][]byte{d, d}, b) //pad w/ zeros
		idx.index = suffixarray.New(idx.indexed)
	}
}
Code Example #18
func TestSuffixArrayFind(t *testing.T) {
	words := getDictWords()
	data := []byte("\x00" + strings.Join(words, "\x00") + "\x00")
	sa := suffixarray.New(data)

	buf := &bytes.Buffer{}
	sa.Write(buf)
	fmt.Println("size:", buf.Len())

	indices := sa.Lookup([]byte("yrate"), 1)
	if indices == nil || len(indices) < 1 {
		t.Fatal("not found")
	}
}
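Example #18 serializes the index with Write; the package's Index.Read restores it. A minimal round-trip sketch (self-contained, independent of the test above):

func roundTrip() {
	data := []byte("\x00banana\x00apple\x00")
	sa := suffixarray.New(data)

	// Serialize the index into an in-memory buffer.
	var buf bytes.Buffer
	if err := sa.Write(&buf); err != nil {
		log.Fatal(err)
	}

	// Restore it into a fresh Index and query it as usual.
	var restored suffixarray.Index
	if err := restored.Read(&buf); err != nil {
		log.Fatal(err)
	}
	fmt.Println(restored.Lookup([]byte("an"), -1)) // same offsets as sa.Lookup would return
}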
Code Example #19
func test(data, what []byte) {
	s := suffixarray.New(data)
	idx0 := s.Lookup(what, -1)
	idx1 := simple(data, what)
	if len(idx0) != len(idx1) {
		panic(fmt.Sprintf("len mismatch: %+v, %+v", idx0, idx1))
	}
	sort.Ints(idx0)
	for i, x := range idx0 {
		if x != idx1[i] {
			panic(fmt.Sprintf("data mismatch: %+v, %+v", idx0, idx1))
		}
	}
}
Code Example #20
File: dogberry.go Project: NovemberFoxtrot/dogberry
func main() {
	sometext, err := ioutil.ReadFile(os.Args[1])

	if err != nil {
		log.Fatalln(err)
	}

	index := suffixarray.New(sometext)

	offsets := index.Lookup([]byte("*"), -1)

	// Print at most the first 100 matches to avoid slicing past the end of offsets.
	limit := 100
	if len(offsets) < limit {
		limit = len(offsets)
	}
	for _, value := range offsets[:limit] {
		log.Println(value, string(sometext[value]))
	}
}
Code Example #21
File: fuzzy.go Project: sparrc/fuzzy
// Takes the known dictionary listing and creates a suffix array
// model for these terms. If a model already existed, it is discarded
func (model *Model) updateSuffixArr() {
	if !model.UseAutocomplete {
		return
	}
	model.RLock()
	termArr := make([]string, 0, 1000)
	for term, count := range model.Data {
		if count.Corpus > model.Threshold || count.Query > 0 { // TODO: query threshold?
			termArr = append(termArr, term)
		}
	}
	model.SuffixArrConcat = "\x00" + strings.Join(termArr, "\x00") + "\x00"
	model.SuffixArr = suffixarray.New([]byte(model.SuffixArrConcat))
	model.SuffDivergence = 0
	model.RUnlock()
}
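The autocomplete lookup itself lives elsewhere in the fuzzy package; the following is only a hypothetical sketch of how a prefix query could be answered from the "\x00"-separated concatenation built above: look up "\x00"+prefix, then read forward to the next separator (uses strings.IndexByte).

// Hypothetical prefix lookup over a "\x00"-separated term concatenation (not the
// fuzzy library's real API). data must be the same string the index was built from.
func completions(sa *suffixarray.Index, data, prefix string, max int) []string {
	var out []string
	for _, idx := range sa.Lookup([]byte("\x00"+prefix), max) {
		start := idx + 1 // skip the leading separator
		end := strings.IndexByte(data[start:], 0)
		out = append(out, data[start:start+end])
	}
	return out
}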
Code Example #22
File: ind.go Project: hyndio/hyd.me
// golang.org's full-text search is implemented with a suffix array [http://t.cn/hBJekg];
// it apparently worked well enough that a suffix array package was added to Go's standard library [http://t.cn/hBJekd].
// http://blog.csdn.net/fxsjy/article/details/6297523
func main() {
	fmt.Println("Hello, 世界")
	str := `The Go programming language is an open source project to make programmers more productive. 
	Go is expressive, concise, clean, and efficient. 
	Its concurrency mechanisms make it easy to write programs that get the most out of multicore 
	and networked machines, hyd, while its novel type system enables flexible 
	and modular program construction. 
	Go compiles quickly to machine code yet has the convenience of garbage collection 
	and the power of run-time reflection. It's a fast, statically typed, 
	compiled language that feels like a dynamically typed, interpreted language.`

	index := suffixarray.New([]byte(str))
	offsets1 := index.Lookup([]byte("hyd"), -1)
	for _, i := range offsets1 {
		fmt.Println(str[i:])
	}

}
Code Example #23
File: hello.go Project: peterwilliams97/go-work
func test_sa() {

	data := []byte("i am a test i am a test i am a test i am a test i am a test")
	s := data[2:4]

	// create index for some data
	index := suffixarray.New(data)

	// lookup byte slice s
	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
	fmt.Println("test_sa")
	fmt.Println(string(s))
	fmt.Println(offsets1)
	fmt.Println(offsets2)
	for _, i := range offsets1 {
		m := data[i : i+2]
		fmt.Println(string(m))
	}
}
Code Example #24
File: next.go Project: nise-nabe/misc-pages
func main() {
	str, _ := ioutil.ReadFile("large.in")
	next := func(str string) func() string {
		reg, _ := regexp.Compile("\\S+")
		is := suffixarray.New([]byte(str)).FindAllIndex(reg, -1)
		return func() (result string) {
			if len(is) < 1 {
				return ""
			}
			result = str[is[0][0]:is[0][1]]
			is = is[1:]
			return
		}
	}(string(str))
	t := time.Now().UnixNano()
	for i := 0; i < 100000000; i++ {
		next()
	}
	log.Println(time.Now().UnixNano() - t)
}
Code Example #25
File: next.go Project: nise-nabe/misc-pages
func main() {
	str := `1234 2 3333 4 5  aaaaa
s
aaa
`
	next := func(str string) func() string {
		reg, _ := regexp.Compile("\\S+")
		is := suffixarray.New([]byte(str)).FindAllIndex(reg, -1)
		return func() (result string) {
			if len(is) < 1 {
				return ""
			}
			result = str[is[0][0]:is[0][1]]
			is = is[1:]
			return
		}
	}(str)
	for x := next(); x != ""; x = next() {
		log.Println(x)
	}
}
Code Example #26
File: convert.go Project: ReanGD/go-web-search
// Convert ...
func Convert(path string) error {
	file, err := os.Open(path)
	if err != nil {
		return err
	}

	defer file.Close()

	d := createDictReader(createTokenReader(file))

	var buffer bytes.Buffer
	_, _ = buffer.WriteRune('@')
	for !d.isDone() {
		g, err := d.nextGroup()
		if err != nil {
			return err
		}
		for _, w := range g.words {
			_, _ = buffer.WriteString(w.name)
			_, _ = buffer.WriteRune('@')
		}
	}
	sa := suffixarray.New(buffer.Bytes())

	flags := os.O_CREATE | os.O_WRONLY
	dictFile, err := os.OpenFile("morph.dict", flags, 0666)
	if err != nil {
		return err
	}

	defer dictFile.Close()

	err = sa.Write(dictFile)
	if err != nil {
		return err
	}

	return nil

}
Code Example #27
func main() {
	words := []string{
		"banana",
		"apple",
		"pear",
		"tangerine",
		"orange",
		"lemon",
		"peach",
		"persimmon",
	}

	// Combine all words into a single byte slice, separated by \x00 bytes (which
	// do not appear in words), adding one on each end too.
	data := []byte("\x00" + strings.Join(words, "\x00") + "\x00")
	sa := suffixarray.New(data)

	indices := sa.Lookup([]byte("an"), -1)
	if len(indices) > 0 {
		fmt.Println("Lookup returns:", indices)
	} else {
		fmt.Println("Lookup: not found")
	}

	// Reconstruct matches from indices found by Lookup.
	for _, idx := range indices {
		fmt.Println(getStringFromIndex(data, idx))
	}

	// Here using a completely "literal" regexp, similar to the usage of Lookup,
	// to compare what the two methods return. FindAllIndex can take an arbitrary
	// regexp - but beware of the caveat discussed in the blog post.
	r := regexp.MustCompile("an")
	matches := sa.FindAllIndex(r, -1)
	fmt.Println("FindAllIndex returns:", matches)
}
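getStringFromIndex is used here and in example #14 but is not shown on this page. A plausible sketch, assuming it recovers the whole word around a match by expanding to the surrounding \x00 separators:

// Hypothetical helper: given an offset returned by Lookup, return the complete
// word containing it by scanning out to the nearest \x00 delimiters.
func getStringFromIndex(data []byte, index int) string {
	var start, end int
	for i := index - 1; i >= 0; i-- {
		if data[i] == 0 {
			start = i + 1
			break
		}
	}
	for i := index + 1; i < len(data); i++ {
		if data[i] == 0 {
			end = i
			break
		}
	}
	return string(data[start:end])
}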
Code Example #28
File: index.go Project: gnanderson/go
// NewIndex creates a new index for the .go files
// in the directories given by dirnames.
//
func NewIndex(dirnames <-chan string, fulltextIndex bool, throttle float64) *Index {
	var x Indexer
	th := NewThrottle(throttle, 100*time.Millisecond) // run at least 0.1s at a time

	// initialize Indexer
	// (use some reasonably sized maps to start)
	x.fset = token.NewFileSet()
	x.packages = make(map[string]*Pak, 256)
	x.words = make(map[string]*IndexResult, 8192)

	// index all files in the directories given by dirnames
	for dirname := range dirnames {
		list, err := fs.ReadDir(dirname)
		if err != nil {
			continue // ignore this directory
		}
		for _, f := range list {
			if !f.IsDir() {
				x.visitFile(dirname, f, fulltextIndex)
			}
			th.Throttle()
		}
	}

	if !fulltextIndex {
		// the file set, the current file, and the sources are
		// not needed after indexing if no text index is built -
		// help GC and clear them
		x.fset = nil
		x.sources.Reset()
		x.current = nil // contains reference to fset!
	}

	// for each word, reduce the RunLists into a LookupResult;
	// also collect the word with its canonical spelling in a
	// word list for later computation of alternative spellings
	words := make(map[string]*LookupResult)
	var wlist RunList
	for w, h := range x.words {
		decls := reduce(h.Decls)
		others := reduce(h.Others)
		words[w] = &LookupResult{
			Decls:  decls,
			Others: others,
		}
		wlist = append(wlist, &wordPair{canonical(w), w})
		th.Throttle()
	}
	x.stats.Words = len(words)

	// reduce the word list {canonical(w), w} into
	// a list of AltWords runs {canonical(w), {w}}
	alist := wlist.reduce(lessWordPair, newAltWords)

	// convert alist into a map of alternative spellings
	alts := make(map[string]*AltWords)
	for i := 0; i < len(alist); i++ {
		a := alist[i].(*AltWords)
		alts[a.Canon] = a
	}

	// create text index
	var suffixes *suffixarray.Index
	if fulltextIndex {
		suffixes = suffixarray.New(x.sources.Bytes())
	}

	return &Index{x.fset, suffixes, words, alts, x.snippets, x.stats}
}
Code Example #29
File: index.go Project: Bosh-for-Cpi/bosh-2605
// NewIndex creates a new index for the .go files provided by the corpus.
func (c *Corpus) NewIndex() *Index {
	// initialize Indexer
	// (use some reasonably sized maps to start)
	x := &Indexer{
		c:           c,
		fset:        token.NewFileSet(),
		fsOpenGate:  make(chan bool, maxOpenFiles),
		strings:     make(map[string]string),
		packages:    make(map[Pak]*Pak, 256),
		words:       make(map[string]*IndexResult, 8192),
		throttle:    util.NewThrottle(c.throttle(), 100*time.Millisecond), // run at least 0.1s at a time
		importCount: make(map[string]int),
		packagePath: make(map[string]map[string]bool),
		exports:     make(map[string]map[string]SpotKind),
		idents:      make(map[SpotKind]map[string][]Ident, 4),
	}

	// index all files in the directories given by dirnames
	var wg sync.WaitGroup // outstanding ReadDir + visitFile
	dirGate := make(chan bool, maxOpenDirs)
	for dirname := range c.fsDirnames() {
		if c.IndexDirectory != nil && !c.IndexDirectory(dirname) {
			continue
		}
		dirGate <- true
		wg.Add(1)
		go func(dirname string) {
			defer func() { <-dirGate }()
			defer wg.Done()

			list, err := c.fs.ReadDir(dirname)
			if err != nil {
				log.Printf("ReadDir(%q): %v; skipping directory", dirname, err)
				return // ignore this directory
			}
			for _, fi := range list {
				wg.Add(1)
				go func(fi os.FileInfo) {
					defer wg.Done()
					x.visitFile(dirname, fi)
				}(fi)
			}
		}(dirname)
	}
	wg.Wait()

	if !c.IndexFullText {
		// the file set, the current file, and the sources are
		// not needed after indexing if no text index is built -
		// help GC and clear them
		x.fset = nil
		x.sources.Reset()
		x.current = nil // contains reference to fset!
	}

	// for each word, reduce the RunLists into a LookupResult;
	// also collect the word with its canonical spelling in a
	// word list for later computation of alternative spellings
	words := make(map[string]*LookupResult)
	var wlist RunList
	for w, h := range x.words {
		decls := reduce(h.Decls)
		others := reduce(h.Others)
		words[w] = &LookupResult{
			Decls:  decls,
			Others: others,
		}
		wlist = append(wlist, &wordPair{canonical(w), w})
		x.throttle.Throttle()
	}
	x.stats.Words = len(words)

	// reduce the word list {canonical(w), w} into
	// a list of AltWords runs {canonical(w), {w}}
	alist := wlist.reduce(lessWordPair, newAltWords)

	// convert alist into a map of alternative spellings
	alts := make(map[string]*AltWords)
	for i := 0; i < len(alist); i++ {
		a := alist[i].(*AltWords)
		alts[a.Canon] = a
	}

	// create text index
	var suffixes *suffixarray.Index
	if c.IndexFullText {
		suffixes = suffixarray.New(x.sources.Bytes())
	}

	for _, idMap := range x.idents {
		for _, ir := range idMap {
			sort.Sort(byPackage(ir))
		}
	}

	return &Index{
		fset:        x.fset,
		suffixes:    suffixes,
		words:       words,
		alts:        alts,
		snippets:    x.snippets,
		stats:       x.stats,
		importCount: x.importCount,
		packagePath: x.packagePath,
		exports:     x.exports,
		idents:      x.idents,
		opts: indexOptions{
			Docs:       x.c.IndexDocs,
			GoCode:     x.c.IndexGoCode,
			FullText:   x.c.IndexFullText,
			MaxResults: x.c.MaxResults,
		},
	}
}
Code Example #30
File: index.go Project: go-nosql/golang
// NewIndex creates a new index for the .go files
// in the directories given by dirnames.
//
func NewIndex(dirnames <-chan string, fulltextIndex bool) *Index {
	var x Indexer

	// initialize Indexer
	x.fset = token.NewFileSet()
	x.words = make(map[string]*IndexResult)

	// index all files in the directories given by dirnames
	for dirname := range dirnames {
		list, err := ioutil.ReadDir(dirname)
		if err != nil {
			continue // ignore this directory
		}
		for _, f := range list {
			if !f.IsDirectory() {
				x.visitFile(dirname, f, fulltextIndex)
			}
		}
	}

	if !fulltextIndex {
		// the file set, the current file, and the sources are
		// not needed after indexing if no text index is built -
		// help GC and clear them
		x.fset = nil
		x.sources.Reset()
		x.current = nil // contains reference to fset!
	}

	// for each word, reduce the RunLists into a LookupResult;
	// also collect the word with its canonical spelling in a
	// word list for later computation of alternative spellings
	words := make(map[string]*LookupResult)
	var wlist RunList
	for w, h := range x.words {
		decls := reduce(&h.Decls)
		others := reduce(&h.Others)
		words[w] = &LookupResult{
			Decls:  decls,
			Others: others,
		}
		wlist.Push(&wordPair{canonical(w), w})
	}
	x.stats.Words = len(words)

	// reduce the word list {canonical(w), w} into
	// a list of AltWords runs {canonical(w), {w}}
	alist := wlist.reduce(lessWordPair, newAltWords)

	// convert alist into a map of alternative spellings
	alts := make(map[string]*AltWords)
	for i := 0; i < alist.Len(); i++ {
		a := alist.At(i).(*AltWords)
		alts[a.Canon] = a
	}

	// convert snippet vector into a list
	snippets := make([]*Snippet, x.snippets.Len())
	for i := 0; i < x.snippets.Len(); i++ {
		snippets[i] = x.snippets.At(i).(*Snippet)
	}

	// create text index
	var suffixes *suffixarray.Index
	if fulltextIndex {
		suffixes = suffixarray.New(x.sources.Bytes())
	}

	return &Index{x.fset, suffixes, words, alts, snippets, x.stats}
}