예제 #1
0
파일: simhash.go 프로젝트: nemowen/golang
// main is a small demonstration of the simhash package. It prints a couple
// of number-formatting samples, computes a simhash for each sample document,
// and prints the value simhash.Compare reports for two hard-coded hashes and
// for pairs of the sample documents.
//
// Parse failures are reported on stdout and execution continues, so a failed
// parse leaves the corresponding value at whatever ParseUint returned.
func main() {
	docs := [][]byte{
		[]byte("Z2X6629402"),
		[]byte("q2X6629302"),
		[]byte("ZyX6629302"),
	}

	fmt.Printf("%b\n", 8)

	// Parse a binary literal and show it in hex. The error is reported the
	// same way as the hex parses below instead of being silently dropped.
	parsed, err := strconv.ParseUint("1001100110111100011011010111101101100111100111000110111010010", 2, 64)
	if err != nil {
		fmt.Println(err.Error())
	}
	fmt.Printf("64:%x\n", parsed)

	// Fingerprint every document, printing each hash in hex and binary.
	hashes := make([]uint64, len(docs))
	for i, d := range docs {
		hashes[i] = simhash.Simhash(simhash.NewWordFeatureSet(d))
		fmt.Printf("Simhash of %s: %x b:%b\n", d, hashes[i], hashes[i])
	}

	// Compare two fixed 64-bit hashes given as hex strings.
	q, err := strconv.ParseUint("13378daf6cf38dd2", 16, 64)
	if err != nil {
		fmt.Println(err.Error())
	}
	q2, err := strconv.ParseUint("f597e9511bbc518d", 16, 64)
	if err != nil {
		fmt.Println(err.Error())
	}
	fmt.Printf("simhash:%d\n", simhash.Compare(q, q2))

	// Compare the first document against each of the other two.
	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[1], simhash.Compare(hashes[0], hashes[1]))
	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[2], simhash.Compare(hashes[0], hashes[2]))
}
예제 #2
0
// TestFingerprint computes fingerprints for three small HTML documents and
// checks that the value reported by simhash.Compare is at most 3 for the
// pairs (s1, s2) and (s2, s3).
//
// Errors from Compute abort the test immediately; otherwise a failed Compute
// could let the distance checks pass without comparing real fingerprints.
func TestFingerprint(t *testing.T) {
	const s1 = `
<html>
<head>
</head>
<body>
<p>Hello, World</p>
</body>
</html>
`
	const s2 = `
<html>
<head>
</head>
<body>
<p>你好,世界</p>
<p>维基 百科</p>
</body>
</html>
`
	const s3 = `
<html>
<head>
</head>
<body>
<p>Hello, World</p>
<p>你好,世界</p>
</body>
</html>
`
	// fingerprint wraps Compute with the arguments used throughout this test
	// and fails fast on error.
	// NOTE(review): the meaning of the 4096 and 2 arguments is defined by
	// Compute, which is not visible here — confirm against its signature.
	fingerprint := func(name, doc string) uint64 {
		t.Helper()
		fp, err := Compute(strings.NewReader(doc), 4096, 2)
		if err != nil {
			t.Fatalf("Compute(%s) failed: %v", name, err)
		}
		return fp
	}

	f1 := fingerprint("s1", s1)
	f2 := fingerprint("s2", s2)
	f3 := fingerprint("s3", s3)

	if d := simhash.Compare(f1, f2); d > 3 {
		t.Errorf("distance should <= 3, actual: %d\n", d)
	}
	if d := simhash.Compare(f2, f3); d > 3 {
		t.Errorf("distance should <= 3, actual: %d\n", d)
	}
}
예제 #3
0
파일: bktree.go 프로젝트: 0xwindows/gryffin
// Distance returns the similarity distance between two fingerprints, as
// reported by simhash.Compare.
func Distance(a, b uint64) uint8 {
	d := simhash.Compare(a, b)
	return d
}
예제 #4
0
파일: bktree.go 프로젝트: fanyang01/crawler
// Distance returns the result of simhash.Compare for the two fingerprints,
// converted to int.
func Distance(a, b uint64) int {
	d := simhash.Compare(a, b)
	return int(d)
}