import collections
import itertools
import random
import sqlite3
import typing

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusExceptions

from hydrus.client import ClientConstants as CC
from hydrus.client import ClientLocation
from hydrus.client import ClientSearch

from hydrus.client.db import ClientDBDefinitionsCache
from hydrus.client.db import ClientDBFilesStorage
from hydrus.client.db import ClientDBModule
from hydrus.client.db import ClientDBSimilarFiles

class ClientDBFilesDuplicates( ClientDBModule.ClientDBModule ):
    
    def __init__( self, cursor: sqlite3.Cursor, modules_files_storage: ClientDBFilesStorage.ClientDBFilesStorage, modules_hashes_local_cache: ClientDBDefinitionsCache.ClientDBCacheLocalHashes, modules_similar_files: ClientDBSimilarFiles.ClientDBSimilarFiles ):
        
        ClientDBModule.ClientDBModule.__init__( self, 'client file duplicates', cursor )
        
        self.modules_files_storage = modules_files_storage
        self.modules_hashes_local_cache = modules_hashes_local_cache
        self.modules_similar_files = modules_similar_files
        
        self._service_ids_to_content_types_to_outstanding_local_processing = collections.defaultdict( dict )
    
    def _GetInitialIndexGenerationDict( self ) -> dict:
        
        index_generation_dict = {}
        
        index_generation_dict[ 'main.duplicate_false_positives' ] = [
            ( [ 'larger_alternates_group_id', 'smaller_alternates_group_id' ], True, 469 )
        ]
        
        index_generation_dict[ 'main.potential_duplicate_pairs' ] = [
            ( [ 'larger_media_id', 'smaller_media_id' ], True, 469 )
        ]
        
        return index_generation_dict
    
    def _GetInitialTableGenerationDict( self ) -> dict:
        
        return {
            'main.alternate_file_groups' : ( 'CREATE TABLE IF NOT EXISTS {} ( alternates_group_id INTEGER PRIMARY KEY );', 469 ),
            'main.alternate_file_group_members' : ( 'CREATE TABLE IF NOT EXISTS {} ( alternates_group_id INTEGER, media_id INTEGER UNIQUE, PRIMARY KEY ( alternates_group_id, media_id ) );', 469 ),
            'main.confirmed_alternate_pairs' : ( 'CREATE TABLE IF NOT EXISTS {} ( smaller_media_id INTEGER, larger_media_id INTEGER, PRIMARY KEY ( smaller_media_id, larger_media_id ) );', 469 ),
            'main.duplicate_files' : ( 'CREATE TABLE IF NOT EXISTS {} ( media_id INTEGER PRIMARY KEY, king_hash_id INTEGER UNIQUE );', 469 ),
            'main.duplicate_file_members' : ( 'CREATE TABLE IF NOT EXISTS {} ( media_id INTEGER, hash_id INTEGER UNIQUE, PRIMARY KEY ( media_id, hash_id ) );', 469 ),
            'main.duplicate_false_positives' : ( 'CREATE TABLE IF NOT EXISTS {} ( smaller_alternates_group_id INTEGER, larger_alternates_group_id INTEGER, PRIMARY KEY ( smaller_alternates_group_id, larger_alternates_group_id ) );', 469 ),
            'main.potential_duplicate_pairs' : ( 'CREATE TABLE IF NOT EXISTS {} ( smaller_media_id INTEGER, larger_media_id INTEGER, distance INTEGER, PRIMARY KEY ( smaller_media_id, larger_media_id ) );', 469 )
        }
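    
    # A rough sketch of the data model the tables above appear to encode (illustrative only; ids made up):
    #
    #   duplicate_files / duplicate_file_members : a 'media' group of files judged duplicates of each other,
    #       e.g. media_id 5 -> member hash_ids { 11, 12, 13 }, with king_hash_id 11 as the best representative.
    #   alternate_file_groups / alternate_file_group_members : media groups that are related but not duplicates,
    #       e.g. alternates_group_id 2 -> media_ids { 5, 8 }.
    #   confirmed_alternate_pairs : specific media pairs a user has explicitly set as alternates.
    #   duplicate_false_positives : pairs of alternates groups a user has said are unrelated.
    #   potential_duplicate_pairs : similar-files search hits awaiting a human decision, with the search distance.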
    
    def DuplicatesAddPotentialDuplicates( self, media_id, potential_duplicate_media_ids_and_distances ):
        
        inserts = []
        
        for ( potential_duplicate_media_id, distance ) in potential_duplicate_media_ids_and_distances:
            
            if potential_duplicate_media_id == media_id:
                
                # already duplicates!
                
                continue
            
            if self.DuplicatesMediasAreFalsePositive( media_id, potential_duplicate_media_id ):
                
                continue
            
            if self.DuplicatesMediasAreConfirmedAlternates( media_id, potential_duplicate_media_id ):
                
                continue
            
            # if they are alternates with different alt label and index, do not add
            # however this _could_ be folded into areconfirmedalts on the setalt event--any other alt with diff label/index also gets added
            
            smaller_media_id = min( media_id, potential_duplicate_media_id )
            larger_media_id = max( media_id, potential_duplicate_media_id )
            
            inserts.append( ( smaller_media_id, larger_media_id, distance ) )
        
        if len( inserts ) > 0:
            
            self._ExecuteMany( 'INSERT OR IGNORE INTO potential_duplicate_pairs ( smaller_media_id, larger_media_id, distance ) VALUES ( ?, ?, ? );', inserts )
    
    def DuplicatesAlternatesGroupsAreFalsePositive( self, alternates_group_id_a, alternates_group_id_b ):
        
        if alternates_group_id_a == alternates_group_id_b:
            
            return False
        
        smaller_alternates_group_id = min( alternates_group_id_a, alternates_group_id_b )
        larger_alternates_group_id = max( alternates_group_id_a, alternates_group_id_b )
        
        result = self._Execute( 'SELECT 1 FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?;', ( smaller_alternates_group_id, larger_alternates_group_id ) ).fetchone()
        
        false_positive_pair_found = result is not None
        
        return false_positive_pair_found
    
    def DuplicatesClearAllFalsePositiveRelations( self, alternates_group_id ):
        
        self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id, alternates_group_id ) )
        
        media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
        
        hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids )
        
        self.modules_similar_files.ResetSearch( hash_ids )
    
    def DuplicatesClearAllFalsePositiveRelationsFromHashes( self, hashes ):
        
        hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
        
        for hash_id in hash_ids:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
                
                if alternates_group_id is not None:
                    
                    self.DuplicatesClearAllFalsePositiveRelations( alternates_group_id )
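    
    # Note on the convention used by DuplicatesAddPotentialDuplicates and DuplicatesAlternatesGroupsAreFalsePositive
    # above (and the other pair methods below): every pair table stores its pair with the smaller id first, so a
    # symmetric relationship between, say, groups 7 and 3 is always written and looked up as ( 3, 7 ). Illustrative
    # sketch with made-up ids:
    #
    #   smaller = min( 7, 3 ) # 3
    #   larger = max( 7, 3 )  # 7
    #   SELECT 1 FROM duplicate_false_positives WHERE smaller_alternates_group_id = 3 AND larger_alternates_group_id = 7;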
    
    def DuplicatesClearFalsePositiveRelationsBetweenGroups( self, alternates_group_ids ):
        
        pairs = list( itertools.combinations( alternates_group_ids, 2 ) )
        
        for ( alternates_group_id_a, alternates_group_id_b ) in pairs:
            
            smaller_alternates_group_id = min( alternates_group_id_a, alternates_group_id_b )
            larger_alternates_group_id = max( alternates_group_id_a, alternates_group_id_b )
            
            self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?;', ( smaller_alternates_group_id, larger_alternates_group_id ) )
        
        for alternates_group_id in alternates_group_ids:
            
            media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
            
            hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids )
            
            self.modules_similar_files.ResetSearch( hash_ids )
    
    def DuplicatesClearFalsePositiveRelationsBetweenGroupsFromHashes( self, hashes ):
        
        alternates_group_ids = set()
        
        for hash in hashes:
            
            hash_id = self.modules_hashes_local_cache.GetHashId( hash )
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
                
                if alternates_group_id is not None:
                    
                    alternates_group_ids.add( alternates_group_id )
        
        if len( alternates_group_ids ) > 1:
            
            self.DuplicatesClearFalsePositiveRelationsBetweenGroups( alternates_group_ids )
    
    def DuplicatesClearPotentialsBetweenMedias( self, media_ids_a, media_ids_b ):
        
        # these two groups of medias now have a false positive or alternates relationship set between them, or they are about to be merged
        # therefore, potentials between them are no longer needed
        # note that we are not eliminating intra-potentials within A or B, only inter-potentials between A and B
        
        all_media_ids = set()
        
        all_media_ids.update( media_ids_a )
        all_media_ids.update( media_ids_b )
        
        with self._MakeTemporaryIntegerTable( all_media_ids, 'media_id' ) as temp_media_ids_table_name:
            
            # keep these separate--older sqlite can't do cross join to an OR ON
            
            # temp media ids to potential pairs
            potential_duplicate_pairs = set( self._Execute( 'SELECT smaller_media_id, larger_media_id FROM {} CROSS JOIN potential_duplicate_pairs ON ( smaller_media_id = media_id );'.format( temp_media_ids_table_name ) ).fetchall() )
            potential_duplicate_pairs.update( self._Execute( 'SELECT smaller_media_id, larger_media_id FROM {} CROSS JOIN potential_duplicate_pairs ON ( larger_media_id = media_id );'.format( temp_media_ids_table_name ) ).fetchall() )
        
        deletees = []
        
        for ( smaller_media_id, larger_media_id ) in potential_duplicate_pairs:
            
            if ( smaller_media_id in media_ids_a and larger_media_id in media_ids_b ) or ( smaller_media_id in media_ids_b and larger_media_id in media_ids_a ):
                
                deletees.append( ( smaller_media_id, larger_media_id ) )
        
        if len( deletees ) > 0:
            
            self._ExecuteMany( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', deletees )
    
    def DuplicatesClearPotentialsBetweenAlternatesGroups( self, alternates_group_id_a, alternates_group_id_b ):
        
        # these groups are being set as false positive. therefore, any potential between them no longer applies
        
        media_ids_a = self.DuplicatesGetAlternateMediaIds( alternates_group_id_a )
        media_ids_b = self.DuplicatesGetAlternateMediaIds( alternates_group_id_b )
        
        self.DuplicatesClearPotentialsBetweenMedias( media_ids_a, media_ids_b )
    
    def DuplicatesDeleteAllPotentialDuplicatePairs( self ):
        
        media_ids = set()
        
        for ( smaller_media_id, larger_media_id ) in self._Execute( 'SELECT smaller_media_id, larger_media_id FROM potential_duplicate_pairs;' ):
            
            media_ids.add( smaller_media_id )
            media_ids.add( larger_media_id )
        
        hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids )
        
        self._Execute( 'DELETE FROM potential_duplicate_pairs;' )
        
        self.modules_similar_files.ResetSearch( hash_ids )
    
    def DuplicatesDissolveAlternatesGroupId( self, alternates_group_id ):
        
        media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
        
        for media_id in media_ids:
            
            self.DuplicatesDissolveMediaId( media_id )
    
    def DuplicatesDissolveAlternatesGroupIdFromHashes( self, hashes ):
        
        hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
        
        for hash_id in hash_ids:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
                
                if alternates_group_id is not None:
                    
                    self.DuplicatesDissolveAlternatesGroupId( alternates_group_id )
    
    def DuplicatesDissolveMediaId( self, media_id ):
        
        self.DuplicatesRemoveAlternateMember( media_id )
        
        self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id ) )
        
        hash_ids = self.DuplicatesGetDuplicateHashIds( media_id )
        
        self._Execute( 'DELETE FROM duplicate_file_members WHERE media_id = ?;', ( media_id, ) )
        self._Execute( 'DELETE FROM duplicate_files WHERE media_id = ?;', ( media_id, ) )
        
        self.modules_similar_files.ResetSearch( hash_ids )
    
    def DuplicatesDissolveMediaIdFromHashes( self, hashes ):
        
        hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
        
        for hash_id in hash_ids:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                self.DuplicatesDissolveMediaId( media_id )
    
    def DuplicatesFilterKingHashIds( self, allowed_hash_ids ):
        
        # can't just pull explicit king_hash_ids, since files that do not have a media_id are still kings
        # kings = hashes - explicitly not kings
        
        if not isinstance( allowed_hash_ids, set ):
            
            allowed_hash_ids = set( allowed_hash_ids )
        
        with self._MakeTemporaryIntegerTable( allowed_hash_ids, 'hash_id' ) as temp_hash_ids_table_name:
            
            explicit_king_hash_ids = self._STS( self._Execute( 'SELECT king_hash_id FROM {} CROSS JOIN duplicate_files ON ( {}.hash_id = duplicate_files.king_hash_id );'.format( temp_hash_ids_table_name, temp_hash_ids_table_name ) ) )
            
            all_duplicate_member_hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {} CROSS JOIN duplicate_file_members USING ( hash_id );'.format( temp_hash_ids_table_name ) ) )
        
        all_non_king_hash_ids = all_duplicate_member_hash_ids.difference( explicit_king_hash_ids )
        
        return allowed_hash_ids.difference( all_non_king_hash_ids )
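    
    # Illustrative sketch of DuplicatesFilterKingHashIds (ids made up): if hash_id 11 is the explicit king of its
    # media group, 12 is a non-king member of that same group, and 99 has never been put in any duplicate group,
    # then DuplicatesFilterKingHashIds( { 11, 12, 99 } ) returns { 11, 99 }--12 is the only id explicitly known
    # not to be a king.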
    
    def DuplicatesFilterMediaIdPairs( self, db_location_context: ClientDBFilesStorage.DBLocationContext, media_id_pairs ):
        
        if len( media_id_pairs ) == 0:
            
            return []
        
        # this is pretty wonked out due to me not wanting to force db_location_context to make a single table
        
        all_media_ids = { i for i in itertools.chain.from_iterable( media_id_pairs ) }
        
        with self._MakeTemporaryIntegerTable( all_media_ids, 'media_id' ) as temp_media_ids_table_name:
            
            hash_ids_to_media_ids = dict( self._Execute( 'SELECT hash_id, media_id FROM {} CROSS JOIN {} USING ( media_id );'.format( temp_media_ids_table_name, 'duplicate_file_members' ) ) )
        
        all_hash_ids = set( hash_ids_to_media_ids.keys() )
        
        good_hash_ids = self.modules_files_storage.FilterHashIds( db_location_context.location_context, all_hash_ids )
        
        good_media_ids = { hash_ids_to_media_ids[ hash_id ] for hash_id in good_hash_ids }
        
        good_media_id_pairs = [ ( smaller_media_id, larger_media_id ) for ( smaller_media_id, larger_media_id ) in media_id_pairs if smaller_media_id in good_media_ids and larger_media_id in good_media_ids ]
        
        return good_media_id_pairs
    
    def DuplicatesGetAlternatesGroupId( self, media_id, do_not_create = False ):
        
        result = self._Execute( 'SELECT alternates_group_id FROM alternate_file_group_members WHERE media_id = ?;', ( media_id, ) ).fetchone()
        
        if result is None:
            
            if do_not_create:
                
                return None
            
            self._Execute( 'INSERT INTO alternate_file_groups DEFAULT VALUES;' )
            
            alternates_group_id = self._GetLastRowId()
            
            self._Execute( 'INSERT INTO alternate_file_group_members ( alternates_group_id, media_id ) VALUES ( ?, ? );', ( alternates_group_id, media_id ) )
            
        else:
            
            ( alternates_group_id, ) = result
        
        return alternates_group_id
    
    def DuplicatesGetAlternateMediaIds( self, alternates_group_id ):
        
        media_ids = self._STS( self._Execute( 'SELECT media_id FROM alternate_file_group_members WHERE alternates_group_id = ?;', ( alternates_group_id, ) ) )
        
        return media_ids
    
    def DuplicatesGetBestKingId( self, media_id, db_location_context: ClientDBFilesStorage.DBLocationContext, allowed_hash_ids = None, preferred_hash_ids = None ):
        
        media_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
        
        if allowed_hash_ids is not None:
            
            media_hash_ids.intersection_update( allowed_hash_ids )
        
        if len( media_hash_ids ) > 0:
            
            king_hash_id = self.DuplicatesGetKingHashId( media_id )
            
            if preferred_hash_ids is not None:
                
                preferred_hash_ids = media_hash_ids.intersection( preferred_hash_ids )
                
                if len( preferred_hash_ids ) > 0:
                    
                    if king_hash_id not in preferred_hash_ids:
                        
                        king_hash_id = random.choice( list( preferred_hash_ids ) )
                    
                    return king_hash_id
            
            if king_hash_id not in media_hash_ids:
                
                king_hash_id = random.choice( list( media_hash_ids ) )
            
            return king_hash_id
        
        return None
    
    def DuplicatesGetDuplicateHashIds( self, media_id, db_location_context: ClientDBFilesStorage.DBLocationContext = None ):
        
        table_join = 'duplicate_file_members'
        
        if db_location_context is not None:
            
            if not db_location_context.SingleTableIsFast():
                
                hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {} WHERE media_id = ?;'.format( table_join ), ( media_id, ) ) )
                
                hash_ids = self.modules_files_storage.FilterHashIds( db_location_context.location_context, hash_ids )
                
                return hash_ids
            
            table_join = db_location_context.GetTableJoinLimitedByFileDomain( table_join )
        
        hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {} WHERE media_id = ?;'.format( table_join ), ( media_id, ) ) )
        
        return hash_ids
    
    def DuplicatesGetDuplicatesHashIds( self, media_ids, db_location_context: ClientDBFilesStorage.DBLocationContext = None ):
        
        with self._MakeTemporaryIntegerTable( media_ids, 'media_id' ) as temp_media_ids_table_name:
            
            table_join = '{} CROSS JOIN {} USING ( media_id )'.format( temp_media_ids_table_name, 'duplicate_file_members' )
            
            if db_location_context is not None:
                
                table_join = db_location_context.GetTableJoinLimitedByFileDomain( table_join )
            
            hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {};'.format( table_join ) ) )
        
        return hash_ids
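    
    # Illustrative sketch of DuplicatesGetBestKingId's selection order (ids made up): for a media group with
    # members { 11, 12, 13 } and stored king 11, it prefers a member from preferred_hash_ids if any survive the
    # allowed/domain filters, then the stored king if it survives, then a random surviving member. So with
    # preferred_hash_ids = { 13 } it would return 13 even though 11 is the king.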
    
    def DuplicatesGetFalsePositiveAlternatesGroupIds( self, alternates_group_id ):
        
        false_positive_alternates_group_ids = set()
        
        results = self._Execute( 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id, alternates_group_id ) ).fetchall()
        
        for ( smaller_alternates_group_id, larger_alternates_group_id ) in results:
            
            false_positive_alternates_group_ids.add( smaller_alternates_group_id )
            false_positive_alternates_group_ids.add( larger_alternates_group_id )
        
        return false_positive_alternates_group_ids
    
    def DuplicatesGetFileDuplicateInfo( self, location_context, hash ):
        
        result_dict = {}
        
        result_dict[ 'is_king' ] = True
        
        hash_id = self.modules_hashes_local_cache.GetHashId( hash )
        
        counter = collections.Counter()
        
        media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
        
        if media_id is not None:
            
            db_location_context = self.modules_files_storage.GetDBLocationContext( location_context )
            
            all_potential_pairs = self._Execute( 'SELECT DISTINCT smaller_media_id, larger_media_id FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id, ) ).fetchall()
            
            potential_pairs = self.DuplicatesFilterMediaIdPairs( db_location_context, all_potential_pairs )
            
            if len( potential_pairs ) > 0:
                
                counter[ HC.DUPLICATE_POTENTIAL ] = len( potential_pairs )
            
            king_hash_id = self.DuplicatesGetKingHashId( media_id )
            
            result_dict[ 'is_king' ] = king_hash_id == hash_id
            
            media_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
            
            num_other_dupe_members = len( media_hash_ids ) - 1
            
            if num_other_dupe_members > 0:
                
                counter[ HC.DUPLICATE_MEMBER ] = num_other_dupe_members
            
            alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
            
            if alternates_group_id is not None:
                
                alt_media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
                
                alt_media_ids.discard( media_id )
                
                for alt_media_id in alt_media_ids:
                    
                    alt_hash_ids = self.DuplicatesGetDuplicateHashIds( alt_media_id, db_location_context = db_location_context )
                    
                    if len( alt_hash_ids ) > 0:
                        
                        counter[ HC.DUPLICATE_ALTERNATE ] += 1
                        
                        smaller_media_id = min( media_id, alt_media_id )
                        larger_media_id = max( media_id, alt_media_id )
                        
                        result = self._Execute( 'SELECT 1 FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) ).fetchone()
                        
                        if result is not None:
                            
                            counter[ HC.DUPLICATE_CONFIRMED_ALTERNATE ] += 1
                
                false_positive_alternates_group_ids = self.DuplicatesGetFalsePositiveAlternatesGroupIds( alternates_group_id )
                
                false_positive_alternates_group_ids.discard( alternates_group_id )
                
                for false_positive_alternates_group_id in false_positive_alternates_group_ids:
                    
                    fp_media_ids = self.DuplicatesGetAlternateMediaIds( false_positive_alternates_group_id )
                    
                    for fp_media_id in fp_media_ids:
                        
                        fp_hash_ids = self.DuplicatesGetDuplicateHashIds( fp_media_id, db_location_context = db_location_context )
                        
                        if len( fp_hash_ids ) > 0:
                            
                            counter[ HC.DUPLICATE_FALSE_POSITIVE ] += 1
        
        result_dict[ 'counts' ] = counter
        
        return result_dict
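    
    # Illustrative sketch of the structure DuplicatesGetFileDuplicateInfo returns (counts made up):
    #
    #   {
    #       'is_king' : False,
    #       'counts' : collections.Counter( { HC.DUPLICATE_MEMBER : 2, HC.DUPLICATE_ALTERNATE : 1, HC.DUPLICATE_POTENTIAL : 3 } )
    #   }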
    
    def DuplicatesGetFileHashesByDuplicateType( self, location_context: ClientLocation.LocationContext, hash: bytes, duplicate_type: int, allowed_hash_ids = None, preferred_hash_ids = None ) -> typing.List[ bytes ]:
        
        hash_id = self.modules_hashes_local_cache.GetHashId( hash )
        
        db_location_context = self.modules_files_storage.GetDBLocationContext( location_context )
        
        dupe_hash_ids = set()
        
        if duplicate_type == HC.DUPLICATE_FALSE_POSITIVE:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
                
                if alternates_group_id is not None:
                    
                    false_positive_alternates_group_ids = self.DuplicatesGetFalsePositiveAlternatesGroupIds( alternates_group_id )
                    
                    false_positive_alternates_group_ids.discard( alternates_group_id )
                    
                    false_positive_media_ids = set()
                    
                    for false_positive_alternates_group_id in false_positive_alternates_group_ids:
                        
                        false_positive_media_ids.update( self.DuplicatesGetAlternateMediaIds( false_positive_alternates_group_id ) )
                    
                    for false_positive_media_id in false_positive_media_ids:
                        
                        best_king_hash_id = self.DuplicatesGetBestKingId( false_positive_media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
                        
                        if best_king_hash_id is not None:
                            
                            dupe_hash_ids.add( best_king_hash_id )
            
        elif duplicate_type == HC.DUPLICATE_ALTERNATE:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
                
                if alternates_group_id is not None:
                    
                    alternates_media_ids = self._STS( self._Execute( 'SELECT media_id FROM alternate_file_group_members WHERE alternates_group_id = ?;', ( alternates_group_id, ) ) )
                    
                    alternates_media_ids.discard( media_id )
                    
                    for alternates_media_id in alternates_media_ids:
                        
                        best_king_hash_id = self.DuplicatesGetBestKingId( alternates_media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
                        
                        if best_king_hash_id is not None:
                            
                            dupe_hash_ids.add( best_king_hash_id )
            
        elif duplicate_type == HC.DUPLICATE_MEMBER:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                media_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
                
                if allowed_hash_ids is not None:
                    
                    media_hash_ids.intersection_update( allowed_hash_ids )
                
                dupe_hash_ids.update( media_hash_ids )
            
        elif duplicate_type == HC.DUPLICATE_KING:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                best_king_hash_id = self.DuplicatesGetBestKingId( media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
                
                if best_king_hash_id is not None:
                    
                    dupe_hash_ids.add( best_king_hash_id )
            
        elif duplicate_type == HC.DUPLICATE_POTENTIAL:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                table_join = self.DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService( db_location_context )
                
                for ( smaller_media_id, larger_media_id ) in self._Execute( 'SELECT smaller_media_id, larger_media_id FROM {} WHERE smaller_media_id = ? OR larger_media_id = ?;'.format( table_join ), ( media_id, media_id ) ).fetchall():
                    
                    if smaller_media_id != media_id:
                        
                        potential_media_id = smaller_media_id
                        
                    else:
                        
                        potential_media_id = larger_media_id
                    
                    best_king_hash_id = self.DuplicatesGetBestKingId( potential_media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
                    
                    if best_king_hash_id is not None:
                        
                        dupe_hash_ids.add( best_king_hash_id )
        
        dupe_hash_ids.discard( hash_id )
        
        dupe_hash_ids = list( dupe_hash_ids )
        
        dupe_hash_ids.insert( 0, hash_id )
        
        dupe_hashes = self.modules_hashes_local_cache.GetHashes( dupe_hash_ids )
        
        return dupe_hashes
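    
    # Illustrative usage sketch of DuplicatesGetFileHashesByDuplicateType (the 'modules_files_duplicates' name and
    # the values are made up): asking for a file's alternates returns the queried hash first, then the best
    # available king of each related media group in the given location context:
    #
    #   hashes = modules_files_duplicates.DuplicatesGetFileHashesByDuplicateType( location_context, hash, HC.DUPLICATE_ALTERNATE )
    #   # -> [ hash, alternate_king_hash_1, alternate_king_hash_2, ... ]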
    
    def DuplicatesGetHashIdsFromDuplicateCountPredicate( self, db_location_context: ClientDBFilesStorage.DBLocationContext, operator, num_relationships, dupe_type ):
        
        # doesn't work for '= 0' or '< 1'
        
        if operator == CC.UNICODE_ALMOST_EQUAL_TO:
            
            lower_bound = 0.8 * num_relationships
            upper_bound = 1.2 * num_relationships
            
            def filter_func( count ):
                
                return lower_bound < count and count < upper_bound
            
        elif operator == '<':
            
            def filter_func( count ):
                
                return count < num_relationships
            
        elif operator == '>':
            
            def filter_func( count ):
                
                return count > num_relationships
            
        elif operator == '=':
            
            def filter_func( count ):
                
                return count == num_relationships
        
        hash_ids = set()
        
        if dupe_type == HC.DUPLICATE_FALSE_POSITIVE:
            
            alternates_group_ids_to_valid_for_file_domain = {}
            alternates_group_ids_to_false_positives = collections.defaultdict( list )
            
            query = 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives;'
            
            for ( alternates_group_id_a, alternates_group_id_b ) in self._Execute( query ):
                
                alternates_group_ids_to_false_positives[ alternates_group_id_a ].append( alternates_group_id_b )
                alternates_group_ids_to_false_positives[ alternates_group_id_b ].append( alternates_group_id_a )
            
            for ( alternates_group_id, false_positive_alternates_group_ids ) in alternates_group_ids_to_false_positives.items():
                
                count = 0
                
                for false_positive_alternates_group_id in false_positive_alternates_group_ids:
                    
                    if false_positive_alternates_group_id not in alternates_group_ids_to_valid_for_file_domain:
                        
                        valid = False
                        
                        fp_media_ids = self.DuplicatesGetAlternateMediaIds( false_positive_alternates_group_id )
                        
                        for fp_media_id in fp_media_ids:
                            
                            fp_hash_ids = self.DuplicatesGetDuplicateHashIds( fp_media_id, db_location_context = db_location_context )
                            
                            if len( fp_hash_ids ) > 0:
                                
                                valid = True
                                
                                break
                        
                        alternates_group_ids_to_valid_for_file_domain[ false_positive_alternates_group_id ] = valid
                    
                    if alternates_group_ids_to_valid_for_file_domain[ false_positive_alternates_group_id ]:
                        
                        count += 1
                
                if filter_func( count ):
                    
                    media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
                    
                    # accumulate across all qualifying groups rather than overwriting
                    hash_ids.update( self.DuplicatesGetDuplicatesHashIds( media_ids, db_location_context = db_location_context ) )
            
        elif dupe_type == HC.DUPLICATE_ALTERNATE:
            
            query = 'SELECT alternates_group_id, COUNT( * ) FROM alternate_file_group_members GROUP BY alternates_group_id;'
            
            results = self._Execute( query ).fetchall()
            
            for ( alternates_group_id, count ) in results:
                
                count -= 1 # num relationships is number of group members - 1
                
                media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
                
                alternates_group_id_hash_ids = []
                
                for media_id in media_ids:
                    
                    media_id_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
                    
                    if len( media_id_hash_ids ) == 0:
                        
                        # this alternate relation does not count for our current file domain, so it should not contribute to the count
                        count -= 1
                        
                    else:
                        
                        alternates_group_id_hash_ids.extend( media_id_hash_ids )
                
                if filter_func( count ):
                    
                    hash_ids.update( alternates_group_id_hash_ids )
            
        elif dupe_type == HC.DUPLICATE_MEMBER:
            
            table_join = db_location_context.GetTableJoinLimitedByFileDomain( 'duplicate_file_members' )
            
            query = 'SELECT media_id, COUNT( * ) FROM {} GROUP BY media_id;'.format( table_join )
            
            media_ids = []
            
            for ( media_id, count ) in self._Execute( query ):
                
                count -= 1
                
                if filter_func( count ):
                    
                    media_ids.append( media_id )
            
            hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids, db_location_context = db_location_context )
            
        elif dupe_type == HC.DUPLICATE_POTENTIAL:
            
            table_join = self.DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService( db_location_context )
            
            smaller_query = 'SELECT smaller_media_id, COUNT( * ) FROM ( SELECT DISTINCT smaller_media_id, larger_media_id FROM {} ) GROUP BY smaller_media_id;'.format( table_join )
            larger_query = 'SELECT larger_media_id, COUNT( * ) FROM ( SELECT DISTINCT smaller_media_id, larger_media_id FROM {} ) GROUP BY larger_media_id;'.format( table_join )
            
            media_ids_to_counts = collections.Counter()
            
            for ( media_id, count ) in self._Execute( smaller_query ):
                
                media_ids_to_counts[ media_id ] += count
            
            for ( media_id, count ) in self._Execute( larger_query ):
                
                media_ids_to_counts[ media_id ] += count
            
            media_ids = [ media_id for ( media_id, count ) in media_ids_to_counts.items() if filter_func( count ) ]
            
            hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids, db_location_context = db_location_context )
        
        return hash_ids
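    
    # Illustrative sketch (values made up): a 'system:num duplicate relationships' predicate such as
    # '> 2 alternates' arrives here as operator = '>', num_relationships = 2, dupe_type = HC.DUPLICATE_ALTERNATE,
    # so filter_func( count ) is simply 'count > 2'; the approximate operator instead keeps counts within +/-20%.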
    
    def DuplicatesGetKingHashId( self, media_id ):
        
        ( king_hash_id, ) = self._Execute( 'SELECT king_hash_id FROM duplicate_files WHERE media_id = ?;', ( media_id, ) ).fetchone()
        
        return king_hash_id
    
    def DuplicatesGetMediaId( self, hash_id, do_not_create = False ):
        
        result = self._Execute( 'SELECT media_id FROM duplicate_file_members WHERE hash_id = ?;', ( hash_id, ) ).fetchone()
        
        if result is None:
            
            if do_not_create:
                
                return None
            
            self._Execute( 'INSERT INTO duplicate_files ( king_hash_id ) VALUES ( ? );', ( hash_id, ) )
            
            media_id = self._GetLastRowId()
            
            self._Execute( 'INSERT INTO duplicate_file_members ( media_id, hash_id ) VALUES ( ?, ? );', ( media_id, hash_id ) )
            
        else:
            
            ( media_id, ) = result
        
        return media_id
    
    def DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( self, pixel_dupes_preference: int, max_hamming_distance: int ):
        
        tables = [ 'potential_duplicate_pairs', 'duplicate_files AS duplicate_files_smaller', 'duplicate_files AS duplicate_files_larger' ]
        join_predicates = [ 'smaller_media_id = duplicate_files_smaller.media_id AND larger_media_id = duplicate_files_larger.media_id' ]
        
        if pixel_dupes_preference != CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED:
            
            join_predicates.append( 'distance <= {}'.format( max_hamming_distance ) )
        
        if pixel_dupes_preference in ( CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED, CC.SIMILAR_FILES_PIXEL_DUPES_EXCLUDED ):
            
            join_predicate_pixel_dupes = 'duplicate_files_smaller.king_hash_id = pixel_hash_map_smaller.hash_id AND duplicate_files_larger.king_hash_id = pixel_hash_map_larger.hash_id AND pixel_hash_map_smaller.pixel_hash_id = pixel_hash_map_larger.pixel_hash_id'
            
            if pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED:
                
                tables.extend( [ 'pixel_hash_map AS pixel_hash_map_smaller', 'pixel_hash_map AS pixel_hash_map_larger' ] )
                
                join_predicates.append( join_predicate_pixel_dupes )
                
            elif pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_EXCLUDED:
                
                # can't do "AND NOT {}", or the join will just give you the million rows where it isn't true. we want 'AND NEVER {}', and quick
                
                select_statement = 'SELECT 1 FROM pixel_hash_map AS pixel_hash_map_smaller, pixel_hash_map as pixel_hash_map_larger ON ( {} )'.format( join_predicate_pixel_dupes )
                
                join_predicates.append( 'NOT EXISTS ( {} )'.format( select_statement ) )
        
        return ( tables, join_predicates )
    
    def DuplicatesGetPotentialDuplicatePairsTableJoinOnEverythingSearchResults( self, db_location_context: ClientDBFilesStorage.DBLocationContext, pixel_dupes_preference: int, max_hamming_distance: int ):
        
        ( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
        
        if not db_location_context.location_context.IsAllKnownFiles():
            
            files_table_name = db_location_context.GetSingleFilesTableName()
            
            tables.extend( [ '{} AS current_files_smaller'.format( files_table_name ), '{} AS current_files_larger'.format( files_table_name ) ] )
            
            join_predicates.append( 'duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id' )
        
        table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
        
        return table_join
    
    def DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService( self, db_location_context: ClientDBFilesStorage.DBLocationContext ):
        
        if db_location_context.location_context.IsAllKnownFiles():
            
            table_join = 'potential_duplicate_pairs'
            
        else:
            
            files_table_name = db_location_context.GetSingleFilesTableName()
            
            table_join = 'potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, {} AS current_files_smaller, duplicate_files AS duplicate_files_larger, {} AS current_files_larger ON ( smaller_media_id = duplicate_files_smaller.media_id AND duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id AND larger_media_id = duplicate_files_larger.media_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id )'.format( files_table_name, files_table_name )
        
        return table_join
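    
    # Illustrative sketch of the string the method above builds for a single-table file domain, assuming a
    # made-up files table called 'current_files_6':
    #
    #   potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, current_files_6 AS current_files_smaller,
    #   duplicate_files AS duplicate_files_larger, current_files_6 AS current_files_larger
    #   ON ( smaller_media_id = duplicate_files_smaller.media_id AND duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id
    #   AND larger_media_id = duplicate_files_larger.media_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id )
    #
    # i.e. a potential pair only counts if both media groups' kings are currently in the file domain.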
    
    def DuplicatesGetPotentialDuplicatePairsTableJoinOnSearchResultsBothFiles( self, results_table_name: str, pixel_dupes_preference: int, max_hamming_distance: int ):
        
        ( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
        
        tables.extend( [ '{} AS results_smaller'.format( results_table_name ), '{} AS results_larger'.format( results_table_name ) ] )
        
        join_predicates.append( 'duplicate_files_smaller.king_hash_id = results_smaller.hash_id AND duplicate_files_larger.king_hash_id = results_larger.hash_id' )
        
        table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
        
        return table_join
    
    def DuplicatesGetPotentialDuplicatePairsTableJoinOnSearchResults( self, db_location_context: ClientDBFilesStorage.DBLocationContext, results_table_name: str, pixel_dupes_preference: int, max_hamming_distance: int ):
        
        # why yes this is a seven table join that involves a mix of duplicated tables, temporary tables, and duplicated temporary tables
        #
        # main thing is, give this guy a search from duplicate filter UI, it'll give you a fast table join that returns potential dupes that match that
        #
        # ( decorative ASCII art omitted )
        
        ( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
        
        if db_location_context.location_context.IsAllKnownFiles():
            
            tables.append( '{} AS results_table_for_this_query'.format( results_table_name ) )
            
            join_predicates.append( '( duplicate_files_smaller.king_hash_id = results_table_for_this_query.hash_id OR duplicate_files_larger.king_hash_id = results_table_for_this_query.hash_id )' )
            
        else:
            
            files_table_name = db_location_context.GetSingleFilesTableName()
            
            tables.extend( [ '{} AS results_table_for_this_query'.format( results_table_name ), '{} AS current_files_for_this_query'.format( files_table_name ) ] )
            
            join_predicate_smaller_matches = '( duplicate_files_smaller.king_hash_id = results_table_for_this_query.hash_id AND duplicate_files_larger.king_hash_id = current_files_for_this_query.hash_id )'
            
            join_predicate_larger_matches = '( duplicate_files_smaller.king_hash_id = current_files_for_this_query.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query.hash_id )'
            
            join_predicates.append( '( {} OR {} )'.format( join_predicate_smaller_matches, join_predicate_larger_matches ) )
        
        table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
        
        return table_join
    
    def DuplicatesGetPotentialDuplicatePairsTableJoinOnSeparateSearchResults( self, results_table_name_1: str, results_table_name_2: str, pixel_dupes_preference: int, max_hamming_distance: int ):
        
        # And taking the above to its logical conclusion with two results sets, one file in xor either
        
        ( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
        
        # we don't have to do any db_location_context jibber-jabber here as long as we stipulate that the two results sets have the same location context, which we'll enforce in UI
        # just like above when 'both files match', we know we are db_location_context cross-referenced since we are intersecting with file searches performed on that search domain
        # so, this is actually a bit simpler than the non-both-files-match one search case!!
        
        tables.extend( [ '{} AS results_table_for_this_query_1'.format( results_table_name_1 ), '{} AS results_table_for_this_query_2'.format( results_table_name_2 ) ] )
        
        one_two = '( duplicate_files_smaller.king_hash_id = results_table_for_this_query_1.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query_2.hash_id )'
        two_one = '( duplicate_files_smaller.king_hash_id = results_table_for_this_query_2.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query_1.hash_id )'
        
        join_predicates.append( '( {} OR {} )'.format( one_two, two_one ) )
        
        table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
        
        return table_join
    
    def DuplicatesMediasAreAlternates( self, media_id_a, media_id_b ):
        
        alternates_group_id_a = self.DuplicatesGetAlternatesGroupId( media_id_a, do_not_create = True )
        
        if alternates_group_id_a is None:
            
            return False
        
        alternates_group_id_b = self.DuplicatesGetAlternatesGroupId( media_id_b, do_not_create = True )
        
        if alternates_group_id_b is None:
            
            return False
        
        return alternates_group_id_a == alternates_group_id_b
    
    def DuplicatesMediasAreConfirmedAlternates( self, media_id_a, media_id_b ):
        
        smaller_media_id = min( media_id_a, media_id_b )
        larger_media_id = max( media_id_a, media_id_b )
        
        result = self._Execute( 'SELECT 1 FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) ).fetchone()
        
        return result is not None
    
    def DuplicatesMediasAreFalsePositive( self, media_id_a, media_id_b ):
        
        alternates_group_id_a = self.DuplicatesGetAlternatesGroupId( media_id_a, do_not_create = True )
        
        if alternates_group_id_a is None:
            
            return False
        
        alternates_group_id_b = self.DuplicatesGetAlternatesGroupId( media_id_b, do_not_create = True )
        
        if alternates_group_id_b is None:
            
            return False
        
        return self.DuplicatesAlternatesGroupsAreFalsePositive( alternates_group_id_a, alternates_group_id_b )
    
    def DuplicatesMergeMedias( self, superior_media_id, mergee_media_id ):
        
        if superior_media_id == mergee_media_id:
            
            return
        
        self.DuplicatesClearPotentialsBetweenMedias( ( superior_media_id, ), ( mergee_media_id, ) )
        
        alternates_group_id = self.DuplicatesGetAlternatesGroupId( superior_media_id )
        mergee_alternates_group_id = self.DuplicatesGetAlternatesGroupId( mergee_media_id )
        
        if alternates_group_id != mergee_alternates_group_id:
            
            if self.DuplicatesAlternatesGroupsAreFalsePositive( alternates_group_id, mergee_alternates_group_id ):
                
                smaller_alternates_group_id = min( alternates_group_id, mergee_alternates_group_id )
                larger_alternates_group_id = max( alternates_group_id, mergee_alternates_group_id )
                
                self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?;', ( smaller_alternates_group_id, larger_alternates_group_id ) )
            
            self.DuplicatesSetAlternates( superior_media_id, mergee_media_id )
        
        self._Execute( 'UPDATE duplicate_file_members SET media_id = ? WHERE media_id = ?;', ( superior_media_id, mergee_media_id ) )
        
        smaller_media_id = min( superior_media_id, mergee_media_id )
        larger_media_id = max( superior_media_id, mergee_media_id )
        
        # ensure the potential merge pair is gone
        
        self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) )
        
        # now merge potentials from the old to the new--however this has complicated tests to stop confirmed alts and so on, so can't just update ids
        
        existing_potential_info_of_mergee_media_id = self._Execute( 'SELECT smaller_media_id, larger_media_id, distance FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( mergee_media_id, mergee_media_id ) ).fetchall()
        
        self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( mergee_media_id, mergee_media_id ) )
        
        for ( smaller_media_id, larger_media_id, distance ) in existing_potential_info_of_mergee_media_id:
            
            if smaller_media_id == mergee_media_id:
                
                media_id_a = superior_media_id
                media_id_b = larger_media_id
                
            else:
                
                media_id_a = smaller_media_id
                media_id_b = superior_media_id
            
            potential_duplicate_media_ids_and_distances = [ ( media_id_b, distance ) ]
            
            self.DuplicatesAddPotentialDuplicates( media_id_a, potential_duplicate_media_ids_and_distances )
        
        # ensure any previous confirmed alt pair is gone
        
        self._Execute( 'DELETE FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) )
        
        # now merge confirmed alts from the old to the new
        
        self._Execute( 'UPDATE OR IGNORE confirmed_alternate_pairs SET smaller_media_id = ? WHERE smaller_media_id = ?;', ( superior_media_id, mergee_media_id ) )
        self._Execute( 'UPDATE OR IGNORE confirmed_alternate_pairs SET larger_media_id = ? WHERE larger_media_id = ?;', ( superior_media_id, mergee_media_id ) )
        
        # and clear out potentials that are now invalid
        
        confirmed_alternate_pairs = self._Execute( 'SELECT smaller_media_id, larger_media_id FROM confirmed_alternate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( superior_media_id, superior_media_id ) ).fetchall()
        
        self._ExecuteMany( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', confirmed_alternate_pairs )
        
        # clear out empty records
        
        self._Execute( 'DELETE FROM alternate_file_group_members WHERE media_id = ?;', ( mergee_media_id, ) )
        
        self._Execute( 'DELETE FROM duplicate_files WHERE media_id = ?;', ( mergee_media_id, ) )
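    
    # Illustrative sketch of the merge above (ids made up): merging mergee media 8 into superior media 5 moves 8's
    # member hash_ids onto 5, re-adds 8's outstanding potentials as potentials against 5 (subject to the usual
    # false-positive/confirmed-alternate checks), repoints 8's confirmed alternate pairs at 5, and finally deletes
    # the now-empty records for 8.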
    
    def DuplicatesRemoveAlternateMember( self, media_id ):
        
        alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
        
        if alternates_group_id is not None:
            
            alternates_media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
            
            self._Execute( 'DELETE FROM alternate_file_group_members WHERE media_id = ?;', ( media_id, ) )
            
            self._Execute( 'DELETE FROM confirmed_alternate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id ) )
            
            if len( alternates_media_ids ) == 1: # i.e. what we just removed was the last of the group
                
                self._Execute( 'DELETE FROM alternate_file_groups WHERE alternates_group_id = ?;', ( alternates_group_id, ) )
                
                self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id, alternates_group_id ) )
            
            hash_ids = self.DuplicatesGetDuplicateHashIds( media_id )
            
            self.modules_similar_files.ResetSearch( hash_ids )
    
    def DuplicatesRemoveAlternateMemberFromHashes( self, hashes ):
        
        hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
        
        for hash_id in hash_ids:
            
            media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
            
            if media_id is not None:
                
                self.DuplicatesRemoveAlternateMember( media_id )
    
    def DuplicatesRemoveMediaIdMember( self, hash_id ):
        
        media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
        
        if media_id is not None:
            
            king_hash_id = self.DuplicatesGetKingHashId( media_id )
            
            if hash_id == king_hash_id:
                
                self.DuplicatesDissolveMediaId( media_id )
                
            else:
                
                self._Execute( 'DELETE FROM duplicate_file_members WHERE hash_id = ?;', ( hash_id, ) )
                
                self.modules_similar_files.ResetSearch( ( hash_id, ) )
    
    def DuplicatesRemoveMediaIdMemberFromHashes( self, hashes ):
        
        hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
        
        for hash_id in hash_ids:
            
            self.DuplicatesRemoveMediaIdMember( hash_id )
    
    def DuplicatesRemovePotentialPairs( self, hash_id ):
        
        media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
        
        if media_id is not None:
            
            self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id ) )
    
    def DuplicatesRemovePotentialPairsFromHashes( self, hashes ):
        
        hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
        
        for hash_id in hash_ids:
            
            self.DuplicatesRemovePotentialPairs( hash_id )
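    
    # Illustrative note on the removal methods above (ids made up): removing a non-king member such as hash_id 12
    # just drops its duplicate_file_members row and resets its similar-files search, while removing the king,
    # hash_id 11, dissolves the whole media group, since a group cannot stand without its king.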
    
    def DuplicatesSetAlternates( self, media_id_a, media_id_b ):
        
        if media_id_a == media_id_b:
            
            return
        
        # let's clear out any outstanding potentials. whether this is a valid or not connection, we don't want to see it again
        
        self.DuplicatesClearPotentialsBetweenMedias( ( media_id_a, ), ( media_id_b, ) )
        
        # now check if we should be making a new relationship
        
        alternates_group_id_a = self.DuplicatesGetAlternatesGroupId( media_id_a )
        alternates_group_id_b = self.DuplicatesGetAlternatesGroupId( media_id_b )
        
        if self.DuplicatesAlternatesGroupsAreFalsePositive( alternates_group_id_a, alternates_group_id_b ):
            
            return
        
        # write a confirmed result so this can't come up again due to subsequent re-searching etc...
        # in future, I can tune this to consider alternate labels and indices. alternates with different labels and indices are not appropriate for potentials, so we can add more rows here
        
        smaller_media_id = min( media_id_a, media_id_b )
        larger_media_id = max( media_id_a, media_id_b )
        
        self._Execute( 'INSERT OR IGNORE INTO confirmed_alternate_pairs ( smaller_media_id, larger_media_id ) VALUES ( ?, ? );', ( smaller_media_id, larger_media_id ) )
        
        if alternates_group_id_a == alternates_group_id_b:
            
            return
        
        # ok, they are currently not alternates, so we need to merge B into A
        
        # first, for all false positive relationships that A already has, clear out potentials between B and those fps before it moves over
        
        false_positive_pairs = self._Execute( 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id_a, alternates_group_id_a ) ).fetchall()
        
        for ( smaller_false_positive_alternates_group_id, larger_false_positive_alternates_group_id ) in false_positive_pairs:
            
            if smaller_false_positive_alternates_group_id == alternates_group_id_a:
                
                self.DuplicatesClearPotentialsBetweenAlternatesGroups( alternates_group_id_b, larger_false_positive_alternates_group_id )
                
            else:
                
                self.DuplicatesClearPotentialsBetweenAlternatesGroups( smaller_false_positive_alternates_group_id, alternates_group_id_b )
        
        # first, update all B to A
        
        self._Execute( 'UPDATE alternate_file_group_members SET alternates_group_id = ? WHERE alternates_group_id = ?;', ( alternates_group_id_a, alternates_group_id_b ) )
        
        # move false positive records for B to A
        
        false_positive_pairs = self._Execute( 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id_b, alternates_group_id_b ) ).fetchall()
        
        self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id_b, alternates_group_id_b ) )
        
        for ( smaller_false_positive_alternates_group_id, larger_false_positive_alternates_group_id ) in false_positive_pairs:
            
            if smaller_false_positive_alternates_group_id == alternates_group_id_b:
                
                self.DuplicatesSetFalsePositive( alternates_group_id_a, larger_false_positive_alternates_group_id )
                
            else:
                
                self.DuplicatesSetFalsePositive( smaller_false_positive_alternates_group_id, alternates_group_id_a )
        
        # remove master record
        
        self._Execute( 'DELETE FROM alternate_file_groups WHERE alternates_group_id = ?;', ( alternates_group_id_b, ) )
        
        # pubsub to refresh alternates info for alternates_group_id_a and _b goes here
    
    def DuplicatesSetFalsePositive( self, alternates_group_id_a, alternates_group_id_b ):
        
        if alternates_group_id_a == alternates_group_id_b:
            
            return
        
        self.DuplicatesClearPotentialsBetweenAlternatesGroups( alternates_group_id_a, alternates_group_id_b )
        
        smaller_alternates_group_id = min( alternates_group_id_a, alternates_group_id_b )
        larger_alternates_group_id = max( alternates_group_id_a, alternates_group_id_b )
        
        self._Execute( 'INSERT OR IGNORE INTO duplicate_false_positives ( smaller_alternates_group_id, larger_alternates_group_id ) VALUES ( ?, ? );', ( smaller_alternates_group_id, larger_alternates_group_id ) )
    
    def DuplicatesSetKing( self, king_hash_id, media_id ):
        
        self._Execute( 'UPDATE duplicate_files SET king_hash_id = ? WHERE media_id = ?;', ( king_hash_id, media_id ) )
    
    def DuplicatesSetKingFromHash( self, hash ):
        
        hash_id = self.modules_hashes_local_cache.GetHashId( hash )
        
        media_id = self.DuplicatesGetMediaId( hash_id )
        
        self.DuplicatesSetKing( hash_id, media_id )
    
    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        
        tables_and_columns = []
        
        if content_type == HC.CONTENT_TYPE_HASH:
            
            tables_and_columns.append( ( 'file_maintenance_jobs', 'hash_id' ) )
        
        return tables_and_columns
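
# Illustrative usage sketch, not part of the original module; the surrounding names ( 'modules_files_duplicates',
# 'hash_id', 'other_media_id', 'distance' ) are assumed for the example. The module is constructed alongside its
# sibling db modules and then driven by higher-level duplicate-processing code, roughly like:
#
#   modules_files_duplicates = ClientDBFilesDuplicates( cursor, modules_files_storage, modules_hashes_local_cache, modules_similar_files )
#
#   media_id = modules_files_duplicates.DuplicatesGetMediaId( hash_id )
#   modules_files_duplicates.DuplicatesAddPotentialDuplicates( media_id, [ ( other_media_id, distance ) ] )
#   modules_files_duplicates.DuplicatesSetAlternates( media_id, other_media_id )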