- 论坛徽章:
- 1
|
我想比较两个字符串的相似度,用到了simhash和hamming distance。网上都是Python和java的例子,我拿来改了改。应该是正确的,但是又没有把握,感觉我的算法和网上的特别是python版本的差距很大。请各位看看。
我的perl版本:- #!/bin/perl
- sub hash{
- my ($input)=@_;
- my @chars=split "",$input;
- my $hash=5381;
- foreach(@chars){
- $hash += ($hash << 5) + ord($_);
- }
- return $hash;
- }
- sub simhash{
- my @tokens=@_;
- my @simhash=();
- foreach (@tokens){
- my $hash=hash($_);
- foreach (0 .. 31){
- my $current_bit=$hash & 0x1;
- if ($current_bit == 0){
- $simhash[$_]--;
- }else {
- $simhash[$_]++;
- }
-
- $hash = $hash >>1;
- }
- }
- my $simhash=0;
- @simhash= reverse @simhash;
- foreach(@simhash){
- if ($_ > 0){
- $simhash=($simhash << 1)+ 0x1;
- }else{
- $simhash=$simhash << 1;
- }
- }
- return $simhash;
- }
- sub hamming_distance{
- my ($ourhash,$otherhash)=@_;
- my $x=$ourhash ^ $otherhash;
- printf "ourhash=%x,otherhash=%x,x=%d\n",$ourhash,$otherhash,$x;
- my $tot=0;
- while ($x) {
- $tot += 1;
- $x &=$x -1;
- }
- return $tot;
- }
- while (chomp ($_ = <STDIN>)){
- my @test=qw (ok are you ready go);
- my @test2=split /\s+/, $_;
- my $sim1=simhash(@test);
- my $sim2=simhash(@test2);
- printf "test=%x,test2=%x\n",$sim1,$sim2;
- my $simi=$sim1 < $sim2 ? $sim1 / $sim2 : $sim2 / $sim1;
- print $simi . "\n";
- my $hamming=hamming_distance($sim1,$sim2);
- print $hamming . "\n";
- #exit(0);
- }
复制代码 下面是网上的python版本:- #!/usr/bin/python
- # coding=utf-8
- class simhash:
-
- #构造函数
- def __init__(self, tokens='', hashbits=128):
- self.hashbits = hashbits
- self.hash = self.simhash(tokens);
-
- #toString函数
- def __str__(self):
- return str(self.hash)
-
- #生成simhash值
- def simhash(self, tokens):
- v = [0] * self.hashbits
- for t in [self._string_hash(x) for x in tokens]: #t为token的普通hash值
- for i in range(self.hashbits):
- bitmask = 1 << i
- if t & bitmask :
- v[i] += 1 #查看当前bit位是否为1,是的话将该位+1
- else:
- v[i] -= 1 #否则的话,该位-1
- fingerprint = 0
- for i in range(self.hashbits):
- if v[i] >= 0:
- fingerprint += 1 << i
- return fingerprint #整个文档的fingerprint为最终各个位>=0的和
-
- #求海明距离
- def hamming_distance(self, other):
- x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
- tot = 0;
- while x :
- tot += 1
- x &= x - 1
- return tot
-
- #求相似度
- def similarity (self, other):
- a = float(self.hash)
- b = float(other.hash)
- if a > b : return b / a
- else: return a / b
-
- #针对source生成hash值 (一个可变长度版本的Python的内置散列)
- def _string_hash(self, source):
- if source == "":
- return 0
- else:
- x = ord(source[0]) << 7
- m = 1000003
- mask = 2 ** self.hashbits - 1
- for c in source:
- x = ((x * m) ^ ord(c)) & mask
- x ^= len(source)
- if x == -1:
- x = -2
- return x
-
- if __name__ == '__main__':
- s = 'This is a test string for testing'
- hash1 = simhash(s.split())
-
- s = 'This is a test string for testing also'
- hash2 = simhash(s.split())
-
- s = 'nai nai ge xiong cao'
- hash3 = simhash(s.split())
-
- print(hash1.hamming_distance(hash2) , " " , hash1.similarity(hash2))
- print(hash1.hamming_distance(hash3) , " " , hash1.similarity(hash3))
复制代码 |
|