# hydrus/hydrus/client/db/ClientDBSimilarFiles.py

import collections
import random
import sqlite3
import typing

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusDB
from hydrus.core import HydrusDBModule
from hydrus.core import HydrusGlobals as HG

from hydrus.client import ClientThreading
from hydrus.client.db import ClientDBFilesStorage
from hydrus.client.db import ClientDBServices

class ClientDBSimilarFiles( HydrusDBModule.HydrusDBModule ):
    
    def __init__( self, cursor: sqlite3.Cursor, modules_services: ClientDBServices.ClientDBMasterServices, modules_files_storage: ClientDBFilesStorage.ClientDBFilesStorage ):
        
        self.modules_services = modules_services
        self.modules_files_storage = modules_files_storage
        
        HydrusDBModule.HydrusDBModule.__init__( self, 'client similar files', cursor )
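    
    # a quick orientation on the vp-tree in shape_vptree: every row is a node with a centre phash and a radius
    # the 'inner' child subtree holds phashes within hamming distance <= radius of the centre, the 'outer' subtree holds the rest
    # a new leaf descends from the root, going inner or outer at each node by hamming distance, until it finds an empty slot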
    
    def _AddLeaf( self, phash_id, phash ):
        
        result = self._c.execute( 'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;' ).fetchone()
        
        if result is None:
            
            parent_id = None
            
        else:
            
            ( root_node_phash_id, ) = result
            
            ancestors_we_are_inside = []
            ancestors_we_are_outside = []
            
            an_ancestor_is_unbalanced = False
            
            next_ancestor_id = root_node_phash_id
            
            while next_ancestor_id is not None:
                
                ancestor_id = next_ancestor_id
                
                ( ancestor_phash, ancestor_radius, ancestor_inner_id, ancestor_inner_population, ancestor_outer_id, ancestor_outer_population ) = self._c.execute( 'SELECT phash, radius, inner_id, inner_population, outer_id, outer_population FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;', ( ancestor_id, ) ).fetchone()
                
                distance_to_ancestor = HydrusData.Get64BitHammingDistance( phash, ancestor_phash )
                
                if ancestor_radius is None or distance_to_ancestor <= ancestor_radius:
                    
                    ancestors_we_are_inside.append( ancestor_id )
                    ancestor_inner_population += 1
                    next_ancestor_id = ancestor_inner_id
                    
                    if ancestor_inner_id is None:
                        
                        self._c.execute( 'UPDATE shape_vptree SET inner_id = ?, radius = ? WHERE phash_id = ?;', ( phash_id, distance_to_ancestor, ancestor_id ) )
                        
                        parent_id = ancestor_id
                        
                    
                else:
                    
                    ancestors_we_are_outside.append( ancestor_id )
                    ancestor_outer_population += 1
                    next_ancestor_id = ancestor_outer_id
                    
                    if ancestor_outer_id is None:
                        
                        self._c.execute( 'UPDATE shape_vptree SET outer_id = ? WHERE phash_id = ?;', ( phash_id, ancestor_id ) )
                        
                        parent_id = ancestor_id
                        
                    
                
                if not an_ancestor_is_unbalanced and ancestor_inner_population + ancestor_outer_population > 16:
                    
                    larger = max( ancestor_inner_population, ancestor_outer_population )
                    smaller = min( ancestor_inner_population, ancestor_outer_population )
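                    
                    # e.g. populations ( 14, 3 ): 3 / 14 = 0.21 < 0.5, so this branch is lopsided enough to queue for a rebuild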
                    
                    if smaller / larger < 0.5:
                        
                        self._c.execute( 'INSERT OR IGNORE INTO shape_maintenance_branch_regen ( phash_id ) VALUES ( ? );', ( ancestor_id, ) )
                        
                        # we only do this for the eldest ancestor, as the eventual rebalancing will affect all children
                        
                        an_ancestor_is_unbalanced = True
                        
                    
                
            
            self._c.executemany( 'UPDATE shape_vptree SET inner_population = inner_population + 1 WHERE phash_id = ?;', ( ( ancestor_id, ) for ancestor_id in ancestors_we_are_inside ) )
            self._c.executemany( 'UPDATE shape_vptree SET outer_population = outer_population + 1 WHERE phash_id = ?;', ( ( ancestor_id, ) for ancestor_id in ancestors_we_are_outside ) )
            
        
        radius = None
        inner_id = None
        inner_population = 0
        outer_id = None
        outer_population = 0
        
        self._c.execute( 'INSERT OR REPLACE INTO shape_vptree ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) VALUES ( ?, ?, ?, ?, ?, ?, ? );', ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) )
    
    def _GenerateBranch( self, job_key, parent_id, phash_id, phash, children ):
        
        process_queue = collections.deque()
        
        process_queue.append( ( parent_id, phash_id, phash, children ) )
        
        insert_rows = []
        
        num_done = 0
        num_to_do = len( children ) + 1
        
        while len( process_queue ) > 0:
            
            job_key.SetVariable( 'popup_text_2', 'generating new branch -- ' + HydrusData.ConvertValueRangeToPrettyString( num_done, num_to_do ) )
            
            ( parent_id, phash_id, phash, children ) = process_queue.popleft()
            
            if len( children ) == 0:
                
                inner_id = None
                inner_population = 0
                
                outer_id = None
                outer_population = 0
                
                radius = None
                
            else:
                
                children = sorted( ( ( HydrusData.Get64BitHammingDistance( phash, child_phash ), child_id, child_phash ) for ( child_id, child_phash ) in children ) )
                
                median_index = len( children ) // 2
                
                median_radius = children[ median_index ][0]
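                
                # a quick worked example: sorted distances [ 1, 2, 3, 3, 8 ] give median_index 2 and median_radius 3
                # inner takes distances < 3, outer takes > 3, and the two distance-3 ties below join whichever side is currently smaller
                # here that is outer, so the stored radius drops to 2 and the ties correctly fall outside it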
                
                inner_children = [ ( child_id, child_phash ) for ( distance, child_id, child_phash ) in children if distance < median_radius ]
                radius_children = [ ( child_id, child_phash ) for ( distance, child_id, child_phash ) in children if distance == median_radius ]
                outer_children = [ ( child_id, child_phash ) for ( distance, child_id, child_phash ) in children if distance > median_radius ]
                
                if len( inner_children ) <= len( outer_children ):
                    
                    radius = median_radius
                    
                    inner_children.extend( radius_children )
                    
                else:
                    
                    radius = median_radius - 1
                    
                    outer_children.extend( radius_children )
                    
                
                inner_population = len( inner_children )
                outer_population = len( outer_children )
                
                ( inner_id, inner_phash ) = self._PopBestRootNode( inner_children ) #HydrusData.MedianPop( inner_children )
                
                if len( outer_children ) == 0:
                    
                    outer_id = None
                    
                else:
                    
                    ( outer_id, outer_phash ) = self._PopBestRootNode( outer_children ) #HydrusData.MedianPop( outer_children )
                    
                
            
            insert_rows.append( ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) )
            
            if inner_id is not None:
                
                process_queue.append( ( phash_id, inner_id, inner_phash, inner_children ) )
                
            
            if outer_id is not None:
                
                process_queue.append( ( phash_id, outer_id, outer_phash, outer_children ) )
                
            
            num_done += 1
            
        
        job_key.SetVariable( 'popup_text_2', 'branch constructed, now committing' )
        
        self._c.executemany( 'INSERT OR REPLACE INTO shape_vptree ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) VALUES ( ?, ?, ?, ?, ?, ?, ? );', insert_rows )
    
    def _GetInitialIndexGenerationTuples( self ):
        
        index_generation_tuples = []
        
        index_generation_tuples.append( ( 'external_caches.shape_perceptual_hash_map', [ 'hash_id' ], False ) )
        index_generation_tuples.append( ( 'external_caches.shape_vptree', [ 'parent_id' ], False ) )
        
        return index_generation_tuples
    
    def _GetPHashId( self, phash ):
        
        result = self._c.execute( 'SELECT phash_id FROM shape_perceptual_hashes WHERE phash = ?;', ( sqlite3.Binary( phash ), ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO shape_perceptual_hashes ( phash ) VALUES ( ? );', ( sqlite3.Binary( phash ), ) )
            
            phash_id = self._c.lastrowid
            
            self._AddLeaf( phash_id, phash )
            
        else:
            
            ( phash_id, ) = result
            
        
        return phash_id
    
    def _PopBestRootNode( self, node_rows ):
        
        if len( node_rows ) == 1:
            
            root_row = node_rows.pop()
            
            return root_row
            
        
        MAX_VIEWPOINTS = 256
        MAX_SAMPLE = 64
        
        if len( node_rows ) > MAX_VIEWPOINTS:
            
            viewpoints = random.sample( node_rows, MAX_VIEWPOINTS )
            
        else:
            
            viewpoints = node_rows
            
        
        if len( node_rows ) > MAX_SAMPLE:
            
            sample = random.sample( node_rows, MAX_SAMPLE )
            
        else:
            
            sample = node_rows
            
        
        final_scores = []
        
        for ( v_id, v_phash ) in viewpoints:
            
            views = sorted( ( HydrusData.Get64BitHammingDistance( v_phash, s_phash ) for ( s_id, s_phash ) in sample if v_id != s_id ) )
            
            # let's figure out the ratio of left_children to right_children, preferring 1:1, and convert it to a discrete integer score
            
            median_index = len( views ) // 2
            
            radius = views[ median_index ]
            
            num_left = len( [ 1 for view in views if view < radius ] )
            num_radius = len( [ 1 for view in views if view == radius ] )
            num_right = len( [ 1 for view in views if view > radius ] )
            
            if num_left <= num_right:
                
                num_left += num_radius
                
            else:
                
                num_right += num_radius
                
            
            smaller = min( num_left, num_right )
            larger = max( num_left, num_right )
            
            ratio = smaller / larger
            
            ratio_score = int( ratio * MAX_SAMPLE / 2 )
            
            # now let's calc the standard deviation--larger sd tends to mean less sphere overlap when searching
            
            mean_view = sum( views ) / len( views )
            squared_diffs = [ ( view - mean_view ) ** 2 for view in views ]
            sd = ( sum( squared_diffs ) / len( squared_diffs ) ) ** 0.5
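            
            # e.g. num_left 20, num_right 28 gives ratio 20 / 28 = 0.71 and ratio_score int( 0.71 * 32 ) = 22
            # a perfect 1:1 split would score 32, so the sort-and-pop below prefers balance first, then larger sd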
            
            final_scores.append( ( ratio_score, sd, v_id ) )
            
        
        final_scores.sort()
        
        # we now have a list like [ ( 11, 4.0, [id] ), ( 15, 3.7, [id] ), ( 15, 4.3, [id] ) ]
        
        ( ratio_gumpf, sd_gumpf, root_id ) = final_scores.pop()
        
        for ( i, ( v_id, v_phash ) ) in enumerate( node_rows ):
            
            if v_id == root_id:
                
                root_row = node_rows.pop( i )
                
                return root_row
    
    def _RegenerateBranch( self, job_key, phash_id ):
        
        job_key.SetVariable( 'popup_text_2', 'reviewing existing branch' )
        
        # grab everything in the branch
        
        ( parent_id, ) = self._c.execute( 'SELECT parent_id FROM shape_vptree WHERE phash_id = ?;', ( phash_id, ) ).fetchone()
        
        cte_table_name = 'branch ( branch_phash_id )'
        initial_select = 'SELECT ?'
        recursive_select = 'SELECT phash_id FROM shape_vptree, branch ON parent_id = branch_phash_id'
        
        with_clause = 'WITH RECURSIVE ' + cte_table_name + ' AS ( ' + initial_select + ' UNION ALL ' + recursive_select + ')'
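        
        # the assembled statement used just below reads:
        # WITH RECURSIVE branch ( branch_phash_id ) AS
        #     ( SELECT ? UNION ALL SELECT phash_id FROM shape_vptree, branch ON parent_id = branch_phash_id )
        # SELECT branch_phash_id, phash FROM branch, shape_perceptual_hashes ON phash_id = branch_phash_id;
        # i.e. seed the CTE with the given phash_id and repeatedly pull in children to fetch the whole subtree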
        
        unbalanced_nodes = self._c.execute( with_clause + ' SELECT branch_phash_id, phash FROM branch, shape_perceptual_hashes ON phash_id = branch_phash_id;', ( phash_id, ) ).fetchall()
        
        # removal of old branch, maintenance schedule, and orphan phashes
        
        job_key.SetVariable( 'popup_text_2', HydrusData.ToHumanInt( len( unbalanced_nodes ) ) + ' leaves found--now clearing out old branch' )
        
        unbalanced_phash_ids = { p_id for ( p_id, p_h ) in unbalanced_nodes }
        
        self._c.executemany( 'DELETE FROM shape_vptree WHERE phash_id = ?;', ( ( p_id, ) for p_id in unbalanced_phash_ids ) )
        
        self._c.executemany( 'DELETE FROM shape_maintenance_branch_regen WHERE phash_id = ?;', ( ( p_id, ) for p_id in unbalanced_phash_ids ) )
        
        with HydrusDB.TemporaryIntegerTable( self._c, unbalanced_phash_ids, 'phash_id' ) as temp_phash_ids_table_name:
            
            useful_phash_ids = self._STS( self._c.execute( 'SELECT phash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'.format( temp_phash_ids_table_name ) ) )
            
        
        orphan_phash_ids = unbalanced_phash_ids.difference( useful_phash_ids )
        
        self._c.executemany( 'DELETE FROM shape_perceptual_hashes WHERE phash_id = ?;', ( ( p_id, ) for p_id in orphan_phash_ids ) )
        
        useful_nodes = [ row for row in unbalanced_nodes if row[0] in useful_phash_ids ]
        
        useful_population = len( useful_nodes )
        
        # now create the new branch, starting by choosing a new root and updating the parent's left/right reference to that
        
        if useful_population > 0:
            
            ( new_phash_id, new_phash ) = self._PopBestRootNode( useful_nodes ) #HydrusData.RandomPop( useful_nodes )
            
        else:
            
            new_phash_id = None
            
        
        if parent_id is not None:
            
            ( parent_inner_id, ) = self._c.execute( 'SELECT inner_id FROM shape_vptree WHERE phash_id = ?;', ( parent_id, ) ).fetchone()
            
            if parent_inner_id == phash_id:
                
                query = 'UPDATE shape_vptree SET inner_id = ?, inner_population = ? WHERE phash_id = ?;'
                
            else:
                
                query = 'UPDATE shape_vptree SET outer_id = ?, outer_population = ? WHERE phash_id = ?;'
                
            
            self._c.execute( query, ( new_phash_id, useful_population, parent_id ) )
            
        
        if useful_population > 0:
            
            self._GenerateBranch( job_key, parent_id, new_phash_id, new_phash, useful_nodes )
    
    def AssociatePHashes( self, hash_id, phashes ):
        
        phash_ids = set()
        
        for phash in phashes:
            
            phash_id = self._GetPHashId( phash )
            
            phash_ids.add( phash_id )
            
        
        self._c.executemany( 'INSERT OR IGNORE INTO shape_perceptual_hash_map ( phash_id, hash_id ) VALUES ( ?, ? );', ( ( phash_id, hash_id ) for phash_id in phash_ids ) )
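        
        # GetRowCount reflects the rows changed by the statement above, so the search cache only resets when at least one genuinely new mapping was inserted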
        
        if HydrusDB.GetRowCount( self._c ) > 0:
            
            self._c.execute( 'REPLACE INTO shape_search_cache ( hash_id, searched_distance ) VALUES ( ?, ? );', ( hash_id, None ) )
            
        
        return phash_ids
    
    def CreateInitialTables( self ):
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_caches.shape_perceptual_hashes ( phash_id INTEGER PRIMARY KEY, phash BLOB_BYTES UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_caches.shape_perceptual_hash_map ( phash_id INTEGER, hash_id INTEGER, PRIMARY KEY ( phash_id, hash_id ) );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_caches.shape_vptree ( phash_id INTEGER PRIMARY KEY, parent_id INTEGER, radius INTEGER, inner_id INTEGER, inner_population INTEGER, outer_id INTEGER, outer_population INTEGER );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_caches.shape_maintenance_branch_regen ( phash_id INTEGER PRIMARY KEY );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_caches.shape_search_cache ( hash_id INTEGER PRIMARY KEY, searched_distance INTEGER );' )
    
    def DisassociatePHashes( self, hash_id, phash_ids ):
        
        self._c.executemany( 'DELETE FROM shape_perceptual_hash_map WHERE phash_id = ? AND hash_id = ?;', ( ( phash_id, hash_id ) for phash_id in phash_ids ) )
        
        useful_phash_ids = { phash_id for ( phash_id, ) in self._c.execute( 'SELECT phash_id FROM shape_perceptual_hash_map WHERE phash_id IN ' + HydrusData.SplayListForDB( phash_ids ) + ';' ) }
        
        useless_phash_ids = phash_ids.difference( useful_phash_ids )
        
        self._c.executemany( 'INSERT OR IGNORE INTO shape_maintenance_branch_regen ( phash_id ) VALUES ( ? );', ( ( phash_id, ) for phash_id in useless_phash_ids ) )
    
    def GetExpectedTableNames( self ) -> typing.Collection[ str ]:
        
        expected_table_names = []
        
        return expected_table_names
    
    def GetMaintenanceStatus( self ):
        
        searched_distances_to_count = collections.Counter( dict( self._c.execute( 'SELECT searched_distance, COUNT( * ) FROM shape_search_cache GROUP BY searched_distance;' ) ) )
        
        return searched_distances_to_count
    
    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        
        if content_type == HC.CONTENT_TYPE_HASH:
            
            return [ ( 'shape_perceptual_hash_map', 'hash_id' ) ]
            
        
        return []
    
    def MaintainTree( self, maintenance_mode = HC.MAINTENANCE_FORCED, job_key = None, stop_time = None ):
        
        time_started = HydrusData.GetNow()
        pub_job_key = False
        job_key_pubbed = False
        
        if job_key is None:
            
            job_key = ClientThreading.JobKey( cancellable = True )
            
            pub_job_key = True
            
        
        try:
            
            job_key.SetStatusTitle( 'similar files metadata maintenance' )
            
            rebalance_phash_ids = self._STL( self._c.execute( 'SELECT phash_id FROM shape_maintenance_branch_regen;' ) )
            
            num_to_do = len( rebalance_phash_ids )
            
            while len( rebalance_phash_ids ) > 0:
                
                if pub_job_key and not job_key_pubbed and HydrusData.TimeHasPassed( time_started + 5 ):
                    
                    HG.client_controller.pub( 'modal_message', job_key )
                    
                    job_key_pubbed = True
                    
                
                ( i_paused, should_quit ) = job_key.WaitIfNeeded()
                
                should_stop = HG.client_controller.ShouldStopThisWork( maintenance_mode, stop_time = stop_time )
                
                if should_quit or should_stop:
                    
                    return
                    
                
                num_done = num_to_do - len( rebalance_phash_ids )
                
                text = 'rebalancing similar file metadata - ' + HydrusData.ConvertValueRangeToPrettyString( num_done, num_to_do )
                
                HG.client_controller.frame_splash_status.SetSubtext( text )
                job_key.SetVariable( 'popup_text_1', text )
                job_key.SetVariable( 'popup_gauge_1', ( num_done, num_to_do ) )
                
                with HydrusDB.TemporaryIntegerTable( self._c, rebalance_phash_ids, 'phash_id' ) as temp_table_name:
                    
                    # temp phashes to tree
                    ( biggest_phash_id, ) = self._c.execute( 'SELECT phash_id FROM {} CROSS JOIN shape_vptree USING ( phash_id ) ORDER BY inner_population + outer_population DESC;'.format( temp_table_name ) ).fetchone()
                    
                    self._RegenerateBranch( job_key, biggest_phash_id )
                    
                
                rebalance_phash_ids = self._STL( self._c.execute( 'SELECT phash_id FROM shape_maintenance_branch_regen;' ) )
                
            
        finally:
            
            job_key.SetVariable( 'popup_text_1', 'done!' )
            
            job_key.DeleteVariable( 'popup_gauge_1' )
            job_key.DeleteVariable( 'popup_text_2' ) # used in the regenbranch call
            
            job_key.Finish()
            job_key.Delete( 5 )
    
    def MaintenanceDue( self ):
        
        new_options = HG.client_controller.new_options
        
        if new_options.GetBoolean( 'maintain_similar_files_duplicate_pairs_during_idle' ):
            
            search_distance = new_options.GetInteger( 'similar_files_duplicate_pairs_search_distance' )
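            
            # the inner SELECT ... LIMIT 100 caps the scan at 100 rows, so this check stays cheap however much work is outstanding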
            
            ( count, ) = self._c.execute( 'SELECT COUNT( * ) FROM ( SELECT 1 FROM shape_search_cache WHERE searched_distance IS NULL or searched_distance < ? LIMIT 100 );', ( search_distance, ) ).fetchone()
            
            if count >= 100:
                
                return True
                
            
        
        return False
    
    def RegenerateTree( self ):
        
        job_key = ClientThreading.JobKey()
        
        try:
            
            job_key.SetStatusTitle( 'regenerating similar file search data' )
            
            HG.client_controller.pub( 'modal_message', job_key )
            
            job_key.SetVariable( 'popup_text_1', 'purging search info of orphans' )
            
            ( current_files_table_name, deleted_files_table_name, pending_files_table_name, petitioned_files_table_name ) = ClientDBFilesStorage.GenerateFilesTableNames( self.modules_services.combined_local_file_service_id )
            
            self._c.execute( 'DELETE FROM shape_perceptual_hash_map WHERE hash_id NOT IN ( SELECT hash_id FROM {} );'.format( current_files_table_name ) )
            
            job_key.SetVariable( 'popup_text_1', 'gathering all leaves' )
            
            self._c.execute( 'DELETE FROM shape_vptree;' )
            
            all_nodes = self._c.execute( 'SELECT phash_id, phash FROM shape_perceptual_hashes;' ).fetchall()
            
            job_key.SetVariable( 'popup_text_1', HydrusData.ToHumanInt( len( all_nodes ) ) + ' leaves found, now regenerating' )
            
            ( root_id, root_phash ) = self._PopBestRootNode( all_nodes ) #HydrusData.RandomPop( all_nodes )
            
            self._GenerateBranch( job_key, None, root_id, root_phash, all_nodes )
            
        finally:
            
            job_key.SetVariable( 'popup_text_1', 'done!' )
            
            job_key.DeleteVariable( 'popup_text_2' )
            
            job_key.Finish()
            job_key.Delete( 5 )
    
    def ResetSearch( self, hash_ids ):
        
        self._c.executemany( 'UPDATE shape_search_cache SET searched_distance = NULL WHERE hash_id = ?;', ( ( hash_id, ) for hash_id in hash_ids ) )
    
    def Search( self, hash_id, max_hamming_distance ):
        
        if max_hamming_distance == 0:
            
            similar_hash_ids = self._STL( self._c.execute( 'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );', ( hash_id, ) ) )
            
            similar_hash_ids_and_distances = [ ( similar_hash_id, 0 ) for similar_hash_id in similar_hash_ids ]
            
        else:
            
            search_radius = max_hamming_distance
            
            top_node_result = self._c.execute( 'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;' ).fetchone()
            
            if top_node_result is None:
                
                return []
                
            
            ( root_node_phash_id, ) = top_node_result
            
            search = self._STL( self._c.execute( 'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;', ( hash_id, ) ) )
            
            if len( search ) == 0:
                
                return []
                
            
            similar_phash_ids_to_distances = {}
            
            num_cycles = 0
            total_nodes_searched = 0
            
            for search_phash in search:
                
                next_potentials = [ root_node_phash_id ]
                
                while len( next_potentials ) > 0:
                    
                    current_potentials = next_potentials
                    next_potentials = []
                    
                    num_cycles += 1
                    total_nodes_searched += len( current_potentials )
                    
                    for group_of_current_potentials in HydrusData.SplitListIntoChunks( current_potentials, 10000 ):
                        
                        # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                        # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                        # the crash was in sqlite code, again presumably on subsequent fetch
                        # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                        # anyway, we now just get the whole lot of results first and then work on the whole lot
                        
                        '''
                        #old method
                        select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                        
                        results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                        '''
                        
                        with HydrusDB.TemporaryIntegerTable( self._c, group_of_current_potentials, 'phash_id' ) as temp_table_name:
                            
                            # temp phash_ids to actual phashes and tree info
                            results = self._c.execute( 'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'.format( temp_table_name ) ).fetchall()
                            
                        
                        for ( node_phash_id, node_phash, node_radius, inner_phash_id, outer_phash_id ) in results:
                            
                            # first check the node itself--is it similar?
                            
                            node_hamming_distance = HydrusData.Get64BitHammingDistance( search_phash, node_phash )
                            
                            if node_hamming_distance <= search_radius:
                                
                                if node_phash_id in similar_phash_ids_to_distances:
                                    
                                    current_distance = similar_phash_ids_to_distances[ node_phash_id ]
                                    
                                    similar_phash_ids_to_distances[ node_phash_id ] = min( node_hamming_distance, current_distance )
                                    
                                else:
                                    
                                    similar_phash_ids_to_distances[ node_phash_id ] = node_hamming_distance
                                    
                                
                            
                            # now how about its children?
                            
                            if node_radius is not None:
                                
                                # we have two spheres--node and search--their centers separated by node_hamming_distance
                                # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                                # there are four possibles:
                                # (----N----)-(--S--) intersects with outer only - distance between N and S > their radii
                                # (----N---(-)-S--) intersects with both
                                # (----N-(--S-)-) intersects with both
                                # (---(-N-S--)-) intersects with inner only - distance between N and S + radius_S does not exceed radius_N
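                                
                                # e.g. with node_radius 4 and search_radius 2: at separation 7 the spheres are disjoint ( 7 > 4 + 2 ), so only outer can hold matches
                                # at separation 1 the search sphere sits wholly inside ( 1 + 2 <= 4 ), so only inner can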
                                
                                if inner_phash_id is not None:
                                    
                                    spheres_disjoint = node_hamming_distance > ( node_radius + search_radius )
                                    
                                    if not spheres_disjoint: # i.e. they intersect at some point
                                        
                                        next_potentials.append( inner_phash_id )
                                        
                                    
                                
                                if outer_phash_id is not None:
                                    
                                    search_sphere_subset_of_node_sphere = ( node_hamming_distance + search_radius ) <= node_radius
                                    
                                    if not search_sphere_subset_of_node_sphere: # i.e. search sphere intersects with non-node sphere space at some point
                                        
                                        next_potentials.append( outer_phash_id )
                                        
                                    
                                
                            
                        
                    
                
            
            if HG.db_report_mode:
                
                HydrusData.ShowText( 'Similar file search touched {} nodes over {} cycles.'.format( HydrusData.ToHumanInt( total_nodes_searched ), HydrusData.ToHumanInt( num_cycles ) ) )
                
            
            # so, now we have phash_ids and distances. let's map that to actual files.
            # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found
            
            similar_phash_ids = list( similar_phash_ids_to_distances.keys() )
            
            with HydrusDB.TemporaryIntegerTable( self._c, similar_phash_ids, 'phash_id' ) as temp_table_name:
                
                # temp phashes to hash map
                similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict( self._c.execute( 'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'.format( temp_table_name ) ) )
                
            
            similar_hash_ids_to_distances = {}
            
            for ( phash_id, hash_ids ) in similar_phash_ids_to_hash_ids.items():
                
                distance = similar_phash_ids_to_distances[ phash_id ]
                
                for hash_id in hash_ids:
                    
                    if hash_id not in similar_hash_ids_to_distances:
                        
                        similar_hash_ids_to_distances[ hash_id ] = distance
                        
                    else:
                        
                        current_distance = similar_hash_ids_to_distances[ hash_id ]
                        
                        if distance < current_distance:
                            
                            similar_hash_ids_to_distances[ hash_id ] = distance
                            
                        
                    
                
            
            similar_hash_ids_and_distances = list( similar_hash_ids_to_distances.items() )
            
        
        return similar_hash_ids_and_distances
    
    def SetPHashes( self, hash_id, phashes ):
        
        current_phash_ids = self._STS( self._c.execute( 'SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ?;', ( hash_id, ) ) )
        
        if len( current_phash_ids ) > 0:
            
            self.DisassociatePHashes( hash_id, current_phash_ids )
            
        
        if len( phashes ) > 0:
            
            self.AssociatePHashes( hash_id, phashes )
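    

# a minimal usage sketch, assuming the caller already has a live cursor and the sibling modules constructed
# (in the real client this module is built and wired up by the main db boot code, not by hand like this):
#
# module = ClientDBSimilarFiles( cursor, modules_services, modules_files_storage )
# module.CreateInitialTables()
# module.SetPHashes( hash_id, phashes )
# similar = module.Search( hash_id, max_hamming_distance = 4 )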