Пример #1
0
// Fingerprint generates the fingerprint of an HTML from the io.Reader r and a shingle factor.
// Shingle refers to the level of shuffling.
// E.g. with shingle factor =2, input "a", "b", "c" will be converted to "a b", "b c"
func Fingerprint(r io.Reader, shingle int) uint64 {
	if shingle < 1 {
		shingle = 1
	}
	// collect the features via this cf channel.
	cf := make(chan string, 1000)
	cs := make(chan uint64, 1000)
	v := simhash.Vector{}

	// Tokenize and then Generate Features. .
	go func() {
		defer close(cf)
		z := html.NewTokenizer(r)
		// TODO - export the max token count as an function argument.
		count := 0
		for tt := z.Next(); count < 5000 && tt != html.ErrorToken; tt = z.Next() {
			t := z.Token()
			count++
			genFeatures(&t, cf)
		}

	}()

	// Collect the features.
	go func() {
		defer close(cs)
		a := make([][]byte, shingle)
		for f := <-cf; f != ""; f = <-cf {
			// shingle: generate the k-gram token as a single feature.
			a = append(a[1:], []byte(f))
			// fmt.Printf("%#v\n", a)
			// fmt.Printf("%s\n", bytes.Join(a, []byte(" ")))
			cs <- simhash.NewFeature(bytes.Join(a, []byte(" "))).Sum()
			// cs <- simhash.NewFeature([]byte(f)).Sum()
		}
	}()

	// from the checksum (of feature), append to vector.
	for s := <-cs; s != 0; s = <-cs {
		for i := uint8(0); i < 64; i++ {
			bit := ((s >> i) & 1)
			if bit == 1 {
				v[i]++
			} else {
				v[i]--
			}
		}
	}

	return simhash.Fingerprint(v)

}
Пример #2
0
// Compute returns the simhash fingerprint of the HTML document read from r.
//
// Tokens are converted to features by genFeature, consecutive features are
// grouped into shingles (k-grams) of size shingle joined by single spaces, and
// each shingle is hashed and folded into the simhash vector. No shingle is
// emitted until shingle features have been collected. A shingle value below 1
// is treated as 1.
//
// N bounds the number of tokens processed: the counter starts at 1 and the
// loop runs while it is below N, so at most N-1 tokens contribute features.
//
// A tokenizer error other than io.EOF is returned with a zero fingerprint.
// Stopping because the N limit was reached is not an error.
func Compute(r io.Reader, N, shingle int) (uint64, error) {
	if shingle < 1 {
		shingle = 1
	}

	z := html.NewTokenizer(r)

	// Stage 1: tokenize and emit features.
	chFeature := make(chan string, 128)
	go func() {
		defer close(chFeature)
		count := 1
		for tt := z.Next(); count < N && tt != html.ErrorToken; tt = z.Next() {
			t := z.Token()
			count++
			genFeature(&t, chFeature)
		}
	}()

	// Stage 2: slide a window of size shingle over the features and hash it.
	ch := make(chan uint64, 128)
	go func() {
		defer close(ch)

		// Both buffers are allocated once and reused for every shingle.
		window := make([][]byte, 0, shingle)
		joined := make([][]byte, 2*shingle-1)
		space := []byte(" ")
		// Separators sit at the odd indexes and never change.
		for i := 1; i < len(joined); i += 2 {
			joined[i] = space
		}

		for f := range chFeature {
			if len(window) < shingle {
				// Still filling the first window.
				window = append(window, []byte(f))
				if len(window) < shingle {
					continue
				}
			} else {
				// Drop the oldest feature and append the newest.
				copy(window, window[1:])
				window[shingle-1] = []byte(f)
			}
			for i, w := range window {
				joined[2*i] = w
			}
			ch <- hash(joined...)
		}
	}()

	// Stage 3: +1 for every set bit, -1 for every clear bit.
	v := simhash.Vector{}
	for n := range ch {
		for i := uint(0); i < 64; i++ {
			if (n>>i)&1 == 1 {
				v[i]++
			} else {
				v[i]--
			}
		}
	}

	// ch is closed only after the tokenizer goroutine has finished, so reading
	// z.Err() here is safe. It is nil when tokenizing stopped at the N limit
	// rather than on an error token; that case must not be reported as a
	// failure (previously it returned 0 with a nil error).
	if err := z.Err(); err != nil && err != io.EOF {
		return 0, err
	}
	return simhash.Fingerprint(v), nil
}