# encoding=utf-8
class simhash:
    """SimHash fingerprint of a sequence of string tokens.

    Each token is hashed to an integer; every bit position of the
    fingerprint is then decided by a majority vote over the corresponding
    bits of all token hashes.

    Attributes:
        hashbits: Number of bits in the fingerprint.
        hash: The fingerprint, a non-negative integer below 2**hashbits.
    """

    def __init__(self, tokens='', hashbits=128):
        """Compute and store the fingerprint of *tokens*.

        Args:
            tokens: Iterable of string tokens.  Note that passing a plain
                string iterates over its individual characters.
            hashbits: Width of the fingerprint in bits.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def simhash(self, tokens):
        """Return the SimHash fingerprint of *tokens* as an integer.

        Args:
            tokens: Iterable of string tokens.

        Returns:
            An integer whose bit ``i`` is set when at least as many token
            hashes have bit ``i`` set as have it cleared.  With no tokens
            every vote is zero, so all ``hashbits`` bits are set.
        """
        votes = [0] * self.hashbits
        for token in tokens:
            token_hash = self._string_hash(token)
            for i in range(self.hashbits):
                # A set bit votes +1 for this position, a cleared bit -1.
                if token_hash & (1 << i):
                    votes[i] += 1
                else:
                    votes[i] -= 1

        fingerprint = 0
        for i, vote in enumerate(votes):
            # Ties (vote == 0) count as a set bit.
            if vote >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Return the number of bit positions where the two hashes differ.

        Only the low ``self.hashbits`` bits are compared.

        Args:
            other: Another object exposing an integer ``hash`` attribute.
        """
        mask = (1 << self.hashbits) - 1
        differing = (self.hash ^ other.hash) & mask
        return bin(differing).count("1")

    def similarity(self, other):
        """Return the ratio of the smaller hash value to the larger one.

        Args:
            other: Another object exposing an integer ``hash`` attribute.

        Returns:
            A float; ``1.0`` when both hash values are equal (including
            the case where both are zero, which would otherwise divide
            by zero).
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b

    def _string_hash(self, source):
        """Hash *source* to an integer using a multiplicative string hash.

        A variable-width variant of the classic built-in Python string
        hash: the running value is kept within ``self.hashbits`` bits.

        Args:
            source: The string to hash.

        Returns:
            ``0`` for the empty string, otherwise a non-negative integer.
        """
        if source == "":
            return 0

        multiplier = 1000003
        mask = (1 << self.hashbits) - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        # The value is masked to be non-negative, so the historical
        # "-1 is reserved, map it to -2" special case can never trigger.
        x ^= len(source)
        return x
# Demo: fingerprint three short token lists and compare the first one
# against the other two (Hamming distance, then similarity ratio).
if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())

    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())

    s = 'nai nai ge xiong cao'
    hash3 = simhash(s.split())

    print(hash1.hamming_distance(hash2), "   ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "   ", hash1.similarity(hash3))