hydrus/include/ClientVPTree.py

92 lines
3.6 KiB
Python

import random
import HydrusData
class VPTreeNode( object ):
    """One node of a vantage-point tree built over perceptual hashes.

    A node stores a single phash (its vantage point) and a radius. Every other
    phash handed to the node goes into the inner subtree when its hamming
    distance to the vantage point is <= the radius, and into the outer subtree
    when it is strictly greater.
    """

    # how many phashes are compared all-against-all when choosing a vantage point
    _SAMPLE_SIZE = 50

    def __init__( self, phashes ):
        """Build the subtree holding phashes.

        phashes -- a non-empty collection of hashable phashes that
                   HydrusData.GetHammingDistance can compare. Duplicate values
                   are stored once. An empty collection raises IndexError.
        """

        ghd = HydrusData.GetHammingDistance

        # Work on a de-duplicated list. Copies equal to the vantage point were
        # always filtered out below, and a list made up only of copies of one
        # phash left nothing to compare against and raised IndexError. A real
        # list also keeps random.sample happy when the caller passes a set.
        phashes = list( set( phashes ) )

        if len( phashes ) == 1:

            ( self._phash, ) = phashes
            self._radius = 0

            inner_phashes = []
            outer_phashes = []

        else:

            # we want to choose a good node.
            # a good node is one that doesn't overlap with other circles much

            # get a random sample with big lists, to keep cpu costs down
            use_sample = len( phashes ) > self._SAMPLE_SIZE

            if use_sample:
                phashes_sample = random.sample( phashes, self._SAMPLE_SIZE )
            else:
                phashes_sample = phashes

            # for every sampled phash, the sorted ( distance, other phash ) pairs
            # to every other sampled phash
            all_nodes_comparisons = {
                phash1 : sorted( ( ghd( phash1, phash2 ), phash2 ) for phash2 in phashes_sample if phash2 != phash1 )
                for phash1 in phashes_sample
            }

            # the median of the sorted hamming distances makes a decent radius
            # ( // keeps the index an int whatever division mode is in effect )
            all_nodes_radii = [
                ( comparisons[ len( comparisons ) // 2 ], phash )
                for ( phash, comparisons ) in all_nodes_comparisons.items()
            ]

            all_nodes_radii.sort()

            # let's make our node the phash with the smallest predicted radius
            self._phash = all_nodes_radii[ 0 ][ 1 ]

            if use_sample:

                # the prediction only looked at the sample, so measure the real
                # distances from the chosen vantage point to everything else
                my_hammings = sorted( ( ghd( self._phash, phash ), phash ) for phash in phashes if phash != self._phash )

            else:

                # the sample was the whole list, so this is already complete and sorted
                my_hammings = all_nodes_comparisons[ self._phash ]

            median_index = len( my_hammings ) // 2

            self._radius = my_hammings[ median_index ][ 0 ]

            # lets bump our index up until we actually get outside the radius,
            # so that everything at exactly the radius lands on the inner side
            while median_index + 1 < len( my_hammings ) and my_hammings[ median_index + 1 ][ 0 ] == self._radius:
                median_index += 1

            # now separate my phashes into inside and outside that radius
            inner_phashes = [ phash for ( hamming, phash ) in my_hammings[ : median_index + 1 ] ]
            outer_phashes = [ phash for ( hamming, phash ) in my_hammings[ median_index + 1 : ] ]

        if len( inner_phashes ) == 0:
            self._inner_node = VPTreeNodeEmpty()
        else:
            self._inner_node = VPTreeNode( inner_phashes )

        if len( outer_phashes ) == 0:
            self._outer_node = VPTreeNodeEmpty()
        else:
            self._outer_node = VPTreeNode( outer_phashes )

    def __len__( self ):
        """Return the number of phashes stored in this node and its subtrees."""

        return len( self._inner_node ) + len( self._outer_node ) + 1

    def GetMatches( self, phash, max_hamming ):
        """Return the stored phashes within max_hamming of phash.

        phash -- the phash to search around.
        max_hamming -- the largest hamming distance that counts as a match.

        Returns a list: this node's phash first if it matches, then the inner
        subtree's matches, then the outer subtree's matches.
        """

        hamming_distance_to_me = HydrusData.GetHammingDistance( self._phash, phash )

        matches = []

        if hamming_distance_to_me <= max_hamming:
            matches.append( self._phash )

        # The pruning below relies on the triangle inequality, i.e. it assumes
        # GetHammingDistance behaves as a proper metric.

        # i.e. result could be in inner
        if hamming_distance_to_me <= ( self._radius + max_hamming ):
            matches.extend( self._inner_node.GetMatches( phash, max_hamming ) )

        # i.e. result could be in outer
        if hamming_distance_to_me >= ( self._radius - max_hamming ):
            matches.extend( self._outer_node.GetMatches( phash, max_hamming ) )

        return matches
class VPTreeNodeEmpty( object ):
    """Stand-in for a subtree that holds no phashes.

    It answers the same calls as a populated tree node (len() and GetMatches),
    so code walking the tree does not need to special-case a missing child.
    """

    def __init__( self ):
        """Nothing to set up; an empty node carries no state."""

    def __len__( self ):
        """An empty node contributes no phashes to the count."""

        return 0

    def GetMatches( self, phash, max_hamming ):
        """Return a new empty list, whatever phash and max_hamming are."""

        no_matches = []

        return no_matches