# hydrus/hydrus/client/db/ClientDBFilesDuplicates.py


import collections
import itertools
import random
import sqlite3
import typing
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusExceptions
from hydrus.client import ClientConstants as CC
from hydrus.client import ClientLocation
from hydrus.client import ClientSearch
from hydrus.client.db import ClientDBDefinitionsCache
from hydrus.client.db import ClientDBFilesStorage
from hydrus.client.db import ClientDBModule
from hydrus.client.db import ClientDBSimilarFiles
class ClientDBFilesDuplicates( ClientDBModule.ClientDBModule ):
def __init__(
self,
cursor: sqlite3.Cursor,
modules_files_storage: ClientDBFilesStorage.ClientDBFilesStorage,
modules_hashes_local_cache: ClientDBDefinitionsCache.ClientDBCacheLocalHashes,
modules_similar_files: ClientDBSimilarFiles.ClientDBSimilarFiles
):
ClientDBModule.ClientDBModule.__init__( self, 'client file duplicates', cursor )
self.modules_files_storage = modules_files_storage
self.modules_hashes_local_cache = modules_hashes_local_cache
self.modules_similar_files = modules_similar_files
self._service_ids_to_content_types_to_outstanding_local_processing = collections.defaultdict( dict )
def _GetInitialIndexGenerationDict( self ) -> dict:
index_generation_dict = {}
index_generation_dict[ 'main.duplicate_false_positives' ] = [
( [ 'larger_alternates_group_id', 'smaller_alternates_group_id' ], True, 469 )
]
index_generation_dict[ 'main.potential_duplicate_pairs' ] = [
( [ 'larger_media_id', 'smaller_media_id' ], True, 469 )
]
return index_generation_dict
def _GetInitialTableGenerationDict( self ) -> dict:
return {
'main.alternate_file_groups' : ( 'CREATE TABLE IF NOT EXISTS {} ( alternates_group_id INTEGER PRIMARY KEY );', 469 ),
'main.alternate_file_group_members' : ( 'CREATE TABLE IF NOT EXISTS {} ( alternates_group_id INTEGER, media_id INTEGER UNIQUE, PRIMARY KEY ( alternates_group_id, media_id ) );', 469 ),
'main.confirmed_alternate_pairs' : ( 'CREATE TABLE IF NOT EXISTS {} ( smaller_media_id INTEGER, larger_media_id INTEGER, PRIMARY KEY ( smaller_media_id, larger_media_id ) );', 469 ),
'main.duplicate_files' : ( 'CREATE TABLE IF NOT EXISTS {} ( media_id INTEGER PRIMARY KEY, king_hash_id INTEGER UNIQUE );', 469 ),
'main.duplicate_file_members' : ( 'CREATE TABLE IF NOT EXISTS {} ( media_id INTEGER, hash_id INTEGER UNIQUE, PRIMARY KEY ( media_id, hash_id ) );', 469 ),
'main.duplicate_false_positives' : ( 'CREATE TABLE IF NOT EXISTS {} ( smaller_alternates_group_id INTEGER, larger_alternates_group_id INTEGER, PRIMARY KEY ( smaller_alternates_group_id, larger_alternates_group_id ) );', 469 ),
'main.potential_duplicate_pairs' : ( 'CREATE TABLE IF NOT EXISTS {} ( smaller_media_id INTEGER, larger_media_id INTEGER, distance INTEGER, PRIMARY KEY ( smaller_media_id, larger_media_id ) );', 469 )
}
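# a rough, non-authoritative sketch of how these tables relate, inferred from the definitions above:
#
#   duplicate_files              : one row per duplicate group: media_id -> king_hash_id (the group's best representative)
#   duplicate_file_members       : hash_id -> media_id, i.e. which duplicate group each file sits in
#   alternate_file_groups        : one row per alternates group
#   alternate_file_group_members : media_id -> alternates_group_id, i.e. which alternates group each duplicate group sits in
#   confirmed_alternate_pairs    : ( smaller_media_id, larger_media_id ) pairs the user has confirmed as alternates
#   duplicate_false_positives    : ( smaller_alternates_group_id, larger_alternates_group_id ) pairs confirmed as unrelated
#   potential_duplicate_pairs    : ( smaller_media_id, larger_media_id, distance ) pairs queued up for the duplicate filter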
def DuplicatesAddPotentialDuplicates( self, media_id, potential_duplicate_media_ids_and_distances ):
inserts = []
for ( potential_duplicate_media_id, distance ) in potential_duplicate_media_ids_and_distances:
if potential_duplicate_media_id == media_id: # already duplicates!
continue
if self.DuplicatesMediasAreFalsePositive( media_id, potential_duplicate_media_id ):
continue
if self.DuplicatesMediasAreConfirmedAlternates( media_id, potential_duplicate_media_id ):
continue
# if they are alternates with different alt label and index, do not add
# however this _could_ be folded into areconfirmedalts on the setalt event--any other alt with diff label/index also gets added
smaller_media_id = min( media_id, potential_duplicate_media_id )
larger_media_id = max( media_id, potential_duplicate_media_id )
inserts.append( ( smaller_media_id, larger_media_id, distance ) )
if len( inserts ) > 0:
self._ExecuteMany( 'INSERT OR IGNORE INTO potential_duplicate_pairs ( smaller_media_id, larger_media_id, distance ) VALUES ( ?, ?, ? );', inserts )
def DuplicatesAlternatesGroupsAreFalsePositive( self, alternates_group_id_a, alternates_group_id_b ):
if alternates_group_id_a == alternates_group_id_b:
return False
smaller_alternates_group_id = min( alternates_group_id_a, alternates_group_id_b )
larger_alternates_group_id = max( alternates_group_id_a, alternates_group_id_b )
result = self._Execute( 'SELECT 1 FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?;', ( smaller_alternates_group_id, larger_alternates_group_id ) ).fetchone()
false_positive_pair_found = result is not None
return false_positive_pair_found
def DuplicatesClearAllFalsePositiveRelations( self, alternates_group_id ):
self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id, alternates_group_id ) )
media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids )
self.modules_similar_files.ResetSearch( hash_ids )
def DuplicatesClearAllFalsePositiveRelationsFromHashes( self, hashes ):
hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
for hash_id in hash_ids:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
self.DuplicatesClearAllFalsePositiveRelations( alternates_group_id )
def DuplicatesClearFalsePositiveRelationsBetweenGroups( self, alternates_group_ids ):
pairs = list( itertools.combinations( alternates_group_ids, 2 ) )
for ( alternates_group_id_a, alternates_group_id_b ) in pairs:
smaller_alternates_group_id = min( alternates_group_id_a, alternates_group_id_b )
larger_alternates_group_id = max( alternates_group_id_a, alternates_group_id_b )
self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?;', ( smaller_alternates_group_id, larger_alternates_group_id ) )
for alternates_group_id in alternates_group_ids:
media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids )
self.modules_similar_files.ResetSearch( hash_ids )
def DuplicatesClearFalsePositiveRelationsBetweenGroupsFromHashes( self, hashes ):
alternates_group_ids = set()
for hash in hashes:
hash_id = self.modules_hashes_local_cache.GetHashId( hash )
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
alternates_group_ids.add( alternates_group_id )
if len( alternates_group_ids ) > 1:
self.DuplicatesClearFalsePositiveRelationsBetweenGroups( alternates_group_ids )
def DuplicatesClearPotentialsBetweenMedias( self, media_ids_a, media_ids_b ):
# these two groups of medias now have a false positive or alternates relationship set between them, or they are about to be merged
# therefore, potentials between them are no longer needed
# note that we are not eliminating intra-potentials within A or B, only inter-potentials between A and B
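# for illustration: with media_ids_a = { 1, 2 } and media_ids_b = { 3 }, the pairs ( 1, 3 ) and ( 2, 3 ) get deleted below, but ( 1, 2 ) survives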
all_media_ids = set()
all_media_ids.update( media_ids_a )
all_media_ids.update( media_ids_b )
with self._MakeTemporaryIntegerTable( all_media_ids, 'media_id' ) as temp_media_ids_table_name:
# keep these separate--older sqlite can't do cross join to an OR ON
# temp media ids to potential pairs
potential_duplicate_pairs = set( self._Execute( 'SELECT smaller_media_id, larger_media_id FROM {} CROSS JOIN potential_duplicate_pairs ON ( smaller_media_id = media_id );'.format( temp_media_ids_table_name ) ).fetchall() )
potential_duplicate_pairs.update( self._Execute( 'SELECT smaller_media_id, larger_media_id FROM {} CROSS JOIN potential_duplicate_pairs ON ( larger_media_id = media_id );'.format( temp_media_ids_table_name ) ).fetchall() )
deletees = []
for ( smaller_media_id, larger_media_id ) in potential_duplicate_pairs:
if ( smaller_media_id in media_ids_a and larger_media_id in media_ids_b ) or ( smaller_media_id in media_ids_b and larger_media_id in media_ids_a ):
deletees.append( ( smaller_media_id, larger_media_id ) )
if len( deletees ) > 0:
self._ExecuteMany( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', deletees )
def DuplicatesClearPotentialsBetweenAlternatesGroups( self, alternates_group_id_a, alternates_group_id_b ):
# these groups are being set as false positive. therefore, any potential between them no longer applies
media_ids_a = self.DuplicatesGetAlternateMediaIds( alternates_group_id_a )
media_ids_b = self.DuplicatesGetAlternateMediaIds( alternates_group_id_b )
self.DuplicatesClearPotentialsBetweenMedias( media_ids_a, media_ids_b )
def DuplicatesDeleteAllPotentialDuplicatePairs( self ):
media_ids = set()
for ( smaller_media_id, larger_media_id ) in self._Execute( 'SELECT smaller_media_id, larger_media_id FROM potential_duplicate_pairs;' ):
media_ids.add( smaller_media_id )
media_ids.add( larger_media_id )
hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids )
self._Execute( 'DELETE FROM potential_duplicate_pairs;' )
self.modules_similar_files.ResetSearch( hash_ids )
def DuplicatesDissolveAlternatesGroupId( self, alternates_group_id ):
media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
for media_id in media_ids:
self.DuplicatesDissolveMediaId( media_id )
def DuplicatesDissolveAlternatesGroupIdFromHashes( self, hashes ):
hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
for hash_id in hash_ids:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
self.DuplicatesDissolveAlternatesGroupId( alternates_group_id )
def DuplicatesDissolveMediaId( self, media_id ):
self.DuplicatesRemoveAlternateMember( media_id )
self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id ) )
hash_ids = self.DuplicatesGetDuplicateHashIds( media_id )
self._Execute( 'DELETE FROM duplicate_file_members WHERE media_id = ?;', ( media_id, ) )
self._Execute( 'DELETE FROM duplicate_files WHERE media_id = ?;', ( media_id, ) )
self.modules_similar_files.ResetSearch( hash_ids )
def DuplicatesDissolveMediaIdFromHashes( self, hashes ):
hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
for hash_id in hash_ids:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
self.DuplicatesDissolveMediaId( media_id )
def DuplicatesFilterKingHashIds( self, allowed_hash_ids ):
# can't just pull explicit king_hash_ids, since files that do not have a media_id are still kings
# kings = hashes - explicitly not kings
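# illustrative example: with allowed = { 1, 2, 3 }, where 1 is an explicit king, 2 is a non-king member of some group, and 3 has no duplicate group at all, this returns { 1, 3 }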
if not isinstance( allowed_hash_ids, set ):
allowed_hash_ids = set( allowed_hash_ids )
with self._MakeTemporaryIntegerTable( allowed_hash_ids, 'hash_id' ) as temp_hash_ids_table_name:
explicit_king_hash_ids = self._STS( self._Execute( 'SELECT king_hash_id FROM {} CROSS JOIN duplicate_files ON ( {}.hash_id = duplicate_files.king_hash_id );'.format( temp_hash_ids_table_name, temp_hash_ids_table_name ) ) )
all_duplicate_member_hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {} CROSS JOIN duplicate_file_members USING ( hash_id );'.format( temp_hash_ids_table_name ) ) )
all_non_king_hash_ids = all_duplicate_member_hash_ids.difference( explicit_king_hash_ids )
return allowed_hash_ids.difference( all_non_king_hash_ids )
def DuplicatesFilterMediaIdPairs( self, db_location_context: ClientDBFilesStorage.DBLocationContext, media_id_pairs ):
if len( media_id_pairs ) == 0:
return []
# this is pretty wonked out due to me not wanting to force db_location_context to make a single table
all_media_ids = { i for i in itertools.chain.from_iterable( media_id_pairs ) }
with self._MakeTemporaryIntegerTable( all_media_ids, 'media_id' ) as temp_media_ids_table_name:
hash_ids_to_media_ids = dict( self._Execute( 'SELECT hash_id, media_id FROM {} CROSS JOIN {} USING ( media_id );'.format( temp_media_ids_table_name, 'duplicate_file_members' ) ) )
all_hash_ids = set( hash_ids_to_media_ids.keys() )
good_hash_ids = self.modules_files_storage.FilterHashIds( db_location_context.location_context, all_hash_ids )
good_media_ids = { hash_ids_to_media_ids[ hash_id ] for hash_id in good_hash_ids }
good_media_id_pairs = [ ( smaller_media_id, larger_media_id ) for ( smaller_media_id, larger_media_id ) in media_id_pairs if smaller_media_id in good_media_ids and larger_media_id in good_media_ids ]
return good_media_id_pairs
def DuplicatesGetAlternatesGroupId( self, media_id, do_not_create = False ):
result = self._Execute( 'SELECT alternates_group_id FROM alternate_file_group_members WHERE media_id = ?;', ( media_id, ) ).fetchone()
if result is None:
if do_not_create:
return None
self._Execute( 'INSERT INTO alternate_file_groups DEFAULT VALUES;' )
alternates_group_id = self._GetLastRowId()
self._Execute( 'INSERT INTO alternate_file_group_members ( alternates_group_id, media_id ) VALUES ( ?, ? );', ( alternates_group_id, media_id ) )
else:
( alternates_group_id, ) = result
return alternates_group_id
def DuplicatesGetAlternateMediaIds( self, alternates_group_id ):
media_ids = self._STS( self._Execute( 'SELECT media_id FROM alternate_file_group_members WHERE alternates_group_id = ?;', ( alternates_group_id, ) ) )
return media_ids
def DuplicatesGetBestKingId( self, media_id, db_location_context: ClientDBFilesStorage.DBLocationContext, allowed_hash_ids = None, preferred_hash_ids = None ):
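# summary comment (added for orientation): restrict to the given file domain/allowed set first; if any preferred_hash_ids qualify, return the king when it is preferred, else a random preferred hash;
# otherwise return the king when it is in the domain, else a random in-domain member; return None when nothing qualifies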
media_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
if allowed_hash_ids is not None:
media_hash_ids.intersection_update( allowed_hash_ids )
if len( media_hash_ids ) > 0:
king_hash_id = self.DuplicatesGetKingHashId( media_id )
if preferred_hash_ids is not None:
preferred_hash_ids = media_hash_ids.intersection( preferred_hash_ids )
if len( preferred_hash_ids ) > 0:
if king_hash_id not in preferred_hash_ids:
king_hash_id = random.choice( list( preferred_hash_ids ) )
return king_hash_id
if king_hash_id not in media_hash_ids:
king_hash_id = random.choice( list( media_hash_ids ) )
return king_hash_id
return None
def DuplicatesGetDuplicateHashIds( self, media_id, db_location_context: ClientDBFilesStorage.DBLocationContext = None ):
table_join = 'duplicate_file_members'
if db_location_context is not None:
if not db_location_context.SingleTableIsFast():
hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {} WHERE media_id = ?;'.format( table_join ), ( media_id, ) ) )
hash_ids = self.modules_files_storage.FilterHashIds( db_location_context.location_context, hash_ids )
return hash_ids
table_join = db_location_context.GetTableJoinLimitedByFileDomain( table_join )
hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {} WHERE media_id = ?;'.format( table_join ), ( media_id, ) ) )
return hash_ids
def DuplicatesGetDuplicatesHashIds( self, media_ids, db_location_context: ClientDBFilesStorage.DBLocationContext = None ):
with self._MakeTemporaryIntegerTable( media_ids, 'media_id' ) as temp_media_ids_table_name:
table_join = '{} CROSS JOIN {} USING ( media_id )'.format( temp_media_ids_table_name, 'duplicate_file_members' )
if db_location_context is not None:
table_join = db_location_context.GetTableJoinLimitedByFileDomain( table_join )
hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM {};'.format( table_join ) ) )
return hash_ids
def DuplicatesGetFalsePositiveAlternatesGroupIds( self, alternates_group_id ):
false_positive_alternates_group_ids = set()
results = self._Execute( 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id, alternates_group_id ) ).fetchall()
for ( smaller_alternates_group_id, larger_alternates_group_id ) in results:
false_positive_alternates_group_ids.add( smaller_alternates_group_id )
false_positive_alternates_group_ids.add( larger_alternates_group_id )
return false_positive_alternates_group_ids
def DuplicatesGetFileDuplicateInfo( self, location_context, hash ):
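# the return shape is roughly (illustrative values): { 'is_king' : True, 'counts' : collections.Counter( { HC.DUPLICATE_MEMBER : 2, HC.DUPLICATE_POTENTIAL : 1 } ) }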
result_dict = {}
result_dict[ 'is_king' ] = True
hash_id = self.modules_hashes_local_cache.GetHashId( hash )
counter = collections.Counter()
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
db_location_context = self.modules_files_storage.GetDBLocationContext( location_context )
all_potential_pairs = self._Execute( 'SELECT DISTINCT smaller_media_id, larger_media_id FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id, ) ).fetchall()
potential_pairs = self.DuplicatesFilterMediaIdPairs( db_location_context, all_potential_pairs )
if len( potential_pairs ) > 0:
counter[ HC.DUPLICATE_POTENTIAL ] = len( potential_pairs )
king_hash_id = self.DuplicatesGetKingHashId( media_id )
result_dict[ 'is_king' ] = king_hash_id == hash_id
media_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
num_other_dupe_members = len( media_hash_ids ) - 1
if num_other_dupe_members > 0:
counter[ HC.DUPLICATE_MEMBER ] = num_other_dupe_members
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
alt_media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
alt_media_ids.discard( media_id )
for alt_media_id in alt_media_ids:
alt_hash_ids = self.DuplicatesGetDuplicateHashIds( alt_media_id, db_location_context = db_location_context )
if len( alt_hash_ids ) > 0:
counter[ HC.DUPLICATE_ALTERNATE ] += 1
smaller_media_id = min( media_id, alt_media_id )
larger_media_id = max( media_id, alt_media_id )
result = self._Execute( 'SELECT 1 FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) ).fetchone()
if result is not None:
counter[ HC.DUPLICATE_CONFIRMED_ALTERNATE ] += 1
false_positive_alternates_group_ids = self.DuplicatesGetFalsePositiveAlternatesGroupIds( alternates_group_id )
false_positive_alternates_group_ids.discard( alternates_group_id )
for false_positive_alternates_group_id in false_positive_alternates_group_ids:
fp_media_ids = self.DuplicatesGetAlternateMediaIds( false_positive_alternates_group_id )
for fp_media_id in fp_media_ids:
fp_hash_ids = self.DuplicatesGetDuplicateHashIds( fp_media_id, db_location_context = db_location_context )
if len( fp_hash_ids ) > 0:
counter[ HC.DUPLICATE_FALSE_POSITIVE ] += 1
result_dict[ 'counts' ] = counter
return result_dict
def DuplicatesGetFileHashesByDuplicateType( self, location_context: ClientLocation.LocationContext, hash: bytes, duplicate_type: int, allowed_hash_ids = None, preferred_hash_ids = None ) -> typing.List[ bytes ]:
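# note (added): whatever duplicate_type is asked for, the queried hash itself always comes first in the returned list--see the insert( 0, hash_id ) at the bottom of this method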
hash_id = self.modules_hashes_local_cache.GetHashId( hash )
db_location_context = self.modules_files_storage.GetDBLocationContext( location_context )
dupe_hash_ids = set()
if duplicate_type == HC.DUPLICATE_FALSE_POSITIVE:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
false_positive_alternates_group_ids = self.DuplicatesGetFalsePositiveAlternatesGroupIds( alternates_group_id )
false_positive_alternates_group_ids.discard( alternates_group_id )
false_positive_media_ids = set()
for false_positive_alternates_group_id in false_positive_alternates_group_ids:
false_positive_media_ids.update( self.DuplicatesGetAlternateMediaIds( false_positive_alternates_group_id ) )
for false_positive_media_id in false_positive_media_ids:
best_king_hash_id = self.DuplicatesGetBestKingId( false_positive_media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
if best_king_hash_id is not None:
dupe_hash_ids.add( best_king_hash_id )
elif duplicate_type == HC.DUPLICATE_ALTERNATE:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
alternates_media_ids = self._STS( self._Execute( 'SELECT media_id FROM alternate_file_group_members WHERE alternates_group_id = ?;', ( alternates_group_id, ) ) )
alternates_media_ids.discard( media_id )
for alternates_media_id in alternates_media_ids:
best_king_hash_id = self.DuplicatesGetBestKingId( alternates_media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
if best_king_hash_id is not None:
dupe_hash_ids.add( best_king_hash_id )
elif duplicate_type == HC.DUPLICATE_MEMBER:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
media_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
if allowed_hash_ids is not None:
media_hash_ids.intersection_update( allowed_hash_ids )
dupe_hash_ids.update( media_hash_ids )
elif duplicate_type == HC.DUPLICATE_KING:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
best_king_hash_id = self.DuplicatesGetBestKingId( media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
if best_king_hash_id is not None:
dupe_hash_ids.add( best_king_hash_id )
elif duplicate_type == HC.DUPLICATE_POTENTIAL:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
table_join = self.DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService( db_location_context )
for ( smaller_media_id, larger_media_id ) in self._Execute( 'SELECT smaller_media_id, larger_media_id FROM {} WHERE smaller_media_id = ? OR larger_media_id = ?;'.format( table_join ), ( media_id, media_id ) ).fetchall():
if smaller_media_id != media_id:
potential_media_id = smaller_media_id
else:
potential_media_id = larger_media_id
best_king_hash_id = self.DuplicatesGetBestKingId( potential_media_id, db_location_context, allowed_hash_ids = allowed_hash_ids, preferred_hash_ids = preferred_hash_ids )
if best_king_hash_id is not None:
dupe_hash_ids.add( best_king_hash_id )
dupe_hash_ids.discard( hash_id )
dupe_hash_ids = list( dupe_hash_ids )
dupe_hash_ids.insert( 0, hash_id )
dupe_hashes = self.modules_hashes_local_cache.GetHashes( dupe_hash_ids )
return dupe_hashes
def DuplicatesGetHashIdsFromDuplicateCountPredicate( self, db_location_context: ClientDBFilesStorage.DBLocationContext, operator, num_relationships, dupe_type ):
# doesn't work for '= 0' or '< 1'
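# (added note) every branch below counts rows that actually exist in the relationship tables, so a file with zero relationships of the given type never
# appears in the GROUP BY results at all and can never be matched by '= 0' or '< 1'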
if operator == CC.UNICODE_ALMOST_EQUAL_TO:
lower_bound = 0.8 * num_relationships
upper_bound = 1.2 * num_relationships
def filter_func( count ):
return lower_bound < count and count < upper_bound
elif operator == '<':
def filter_func( count ):
return count < num_relationships
elif operator == '>':
def filter_func( count ):
return count > num_relationships
elif operator == '=':
def filter_func( count ):
return count == num_relationships
hash_ids = set()
if dupe_type == HC.DUPLICATE_FALSE_POSITIVE:
alternates_group_ids_to_valid_for_file_domain = {}
alternates_group_ids_to_false_positives = collections.defaultdict( list )
query = 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives;'
for ( alternates_group_id_a, alternates_group_id_b ) in self._Execute( query ):
alternates_group_ids_to_false_positives[ alternates_group_id_a ].append( alternates_group_id_b )
alternates_group_ids_to_false_positives[ alternates_group_id_b ].append( alternates_group_id_a )
for ( alternates_group_id, false_positive_alternates_group_ids ) in alternates_group_ids_to_false_positives.items():
count = 0
for false_positive_alternates_group_id in false_positive_alternates_group_ids:
if false_positive_alternates_group_id not in alternates_group_ids_to_valid_for_file_domain:
valid = False
fp_media_ids = self.DuplicatesGetAlternateMediaIds( false_positive_alternates_group_id )
for fp_media_id in fp_media_ids:
fp_hash_ids = self.DuplicatesGetDuplicateHashIds( fp_media_id, db_location_context = db_location_context )
if len( fp_hash_ids ) > 0:
valid = True
break
alternates_group_ids_to_valid_for_file_domain[ false_positive_alternates_group_id ] = valid
if alternates_group_ids_to_valid_for_file_domain[ false_positive_alternates_group_id ]:
count += 1
if filter_func( count ):
media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
hash_ids.update( self.DuplicatesGetDuplicatesHashIds( media_ids, db_location_context = db_location_context ) )
elif dupe_type == HC.DUPLICATE_ALTERNATE:
query = 'SELECT alternates_group_id, COUNT( * ) FROM alternate_file_group_members GROUP BY alternates_group_id;'
results = self._Execute( query ).fetchall()
for ( alternates_group_id, count ) in results:
count -= 1 # num relationships is number group members - 1
media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
alternates_group_id_hash_ids = []
for media_id in media_ids:
media_id_hash_ids = self.DuplicatesGetDuplicateHashIds( media_id, db_location_context = db_location_context )
if len( media_id_hash_ids ) == 0:
# this alternate relation does not count for our current file domain, so it should not contribute to the count
count -= 1
else:
alternates_group_id_hash_ids.extend( media_id_hash_ids )
if filter_func( count ):
hash_ids.update( alternates_group_id_hash_ids )
elif dupe_type == HC.DUPLICATE_MEMBER:
table_join = db_location_context.GetTableJoinLimitedByFileDomain( 'duplicate_file_members' )
query = 'SELECT media_id, COUNT( * ) FROM {} GROUP BY media_id;'.format( table_join )
media_ids = []
for ( media_id, count ) in self._Execute( query ):
count -= 1
if filter_func( count ):
media_ids.append( media_id )
hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids, db_location_context = db_location_context )
elif dupe_type == HC.DUPLICATE_POTENTIAL:
table_join = self.DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService( db_location_context )
smaller_query = 'SELECT smaller_media_id, COUNT( * ) FROM ( SELECT DISTINCT smaller_media_id, larger_media_id FROM {} ) GROUP BY smaller_media_id;'.format( table_join )
larger_query = 'SELECT larger_media_id, COUNT( * ) FROM ( SELECT DISTINCT smaller_media_id, larger_media_id FROM {} ) GROUP BY larger_media_id;'.format( table_join )
media_ids_to_counts = collections.Counter()
for ( media_id, count ) in self._Execute( smaller_query ):
media_ids_to_counts[ media_id ] += count
for ( media_id, count ) in self._Execute( larger_query ):
media_ids_to_counts[ media_id ] += count
media_ids = [ media_id for ( media_id, count ) in media_ids_to_counts.items() if filter_func( count ) ]
hash_ids = self.DuplicatesGetDuplicatesHashIds( media_ids, db_location_context = db_location_context )
return hash_ids
def DuplicatesGetKingHashId( self, media_id ):
( king_hash_id, ) = self._Execute( 'SELECT king_hash_id FROM duplicate_files WHERE media_id = ?;', ( media_id, ) ).fetchone()
return king_hash_id
def DuplicatesGetMediaId( self, hash_id, do_not_create = False ):
result = self._Execute( 'SELECT media_id FROM duplicate_file_members WHERE hash_id = ?;', ( hash_id, ) ).fetchone()
if result is None:
if do_not_create:
return None
self._Execute( 'INSERT INTO duplicate_files ( king_hash_id ) VALUES ( ? );', ( hash_id, ) )
media_id = self._GetLastRowId()
self._Execute( 'INSERT INTO duplicate_file_members ( media_id, hash_id ) VALUES ( ?, ? );', ( media_id, hash_id ) )
else:
( media_id, ) = result
return media_id
def DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( self, pixel_dupes_preference: int, max_hamming_distance: int ):
tables = [
'potential_duplicate_pairs',
'duplicate_files AS duplicate_files_smaller',
'duplicate_files AS duplicate_files_larger'
]
join_predicates = [ 'smaller_media_id = duplicate_files_smaller.media_id AND larger_media_id = duplicate_files_larger.media_id' ]
if pixel_dupes_preference != CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED:
join_predicates.append( 'distance <= {}'.format( max_hamming_distance ) )
if pixel_dupes_preference in ( CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED, CC.SIMILAR_FILES_PIXEL_DUPES_EXCLUDED ):
join_predicate_pixel_dupes = 'duplicate_files_smaller.king_hash_id = pixel_hash_map_smaller.hash_id AND duplicate_files_larger.king_hash_id = pixel_hash_map_larger.hash_id AND pixel_hash_map_smaller.pixel_hash_id = pixel_hash_map_larger.pixel_hash_id'
if pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED:
tables.extend( [
'pixel_hash_map AS pixel_hash_map_smaller',
'pixel_hash_map AS pixel_hash_map_larger'
] )
join_predicates.append( join_predicate_pixel_dupes )
elif pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_EXCLUDED:
# can't do "AND NOT {}", or the join will just give you the million rows where it isn't true. we want 'AND NEVER {}', and quick
select_statement = 'SELECT 1 FROM pixel_hash_map AS pixel_hash_map_smaller, pixel_hash_map as pixel_hash_map_larger ON ( {} )'.format( join_predicate_pixel_dupes )
join_predicates.append( 'NOT EXISTS ( {} )'.format( select_statement ) )
return ( tables, join_predicates )
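# for illustration, with pixel dupes neither required nor excluded and max_hamming_distance = 4, callers end up joining roughly:
#   potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, duplicate_files AS duplicate_files_larger
#   ON ( smaller_media_id = duplicate_files_smaller.media_id AND larger_media_id = duplicate_files_larger.media_id AND distance <= 4 )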
def DuplicatesGetPotentialDuplicatePairsTableJoinOnEverythingSearchResults( self, db_location_context: ClientDBFilesStorage.DBLocationContext, pixel_dupes_preference: int, max_hamming_distance: int ):
( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
if not db_location_context.location_context.IsAllKnownFiles():
files_table_name = db_location_context.GetSingleFilesTableName()
tables.extend( [
'{} AS current_files_smaller'.format( files_table_name ),
'{} AS current_files_larger'.format( files_table_name )
] )
join_predicates.append( 'duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id' )
table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
return table_join
def DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService( self, db_location_context: ClientDBFilesStorage.DBLocationContext ):
if db_location_context.location_context.IsAllKnownFiles():
table_join = 'potential_duplicate_pairs'
else:
files_table_name = db_location_context.GetSingleFilesTableName()
table_join = 'potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, {} AS current_files_smaller, duplicate_files AS duplicate_files_larger, {} AS current_files_larger ON ( smaller_media_id = duplicate_files_smaller.media_id AND duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id AND larger_media_id = duplicate_files_larger.media_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id )'.format( files_table_name, files_table_name )
return table_join
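# illustrative usage, as seen elsewhere in this module: the returned fragment is dropped straight into queries such as
#   'SELECT smaller_media_id, larger_media_id FROM {} WHERE smaller_media_id = ? OR larger_media_id = ?;'.format( table_join )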
def DuplicatesGetPotentialDuplicatePairsTableJoinOnSearchResultsBothFiles( self, results_table_name: str, pixel_dupes_preference: int, max_hamming_distance: int ):
( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
tables.extend( [
'{} AS results_smaller'.format( results_table_name ),
'{} AS results_larger'.format( results_table_name )
] )
join_predicates.append( 'duplicate_files_smaller.king_hash_id = results_smaller.hash_id AND duplicate_files_larger.king_hash_id = results_larger.hash_id' )
table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
return table_join
def DuplicatesGetPotentialDuplicatePairsTableJoinOnSearchResults( self, db_location_context: ClientDBFilesStorage.DBLocationContext, results_table_name: str, pixel_dupes_preference: int, max_hamming_distance: int ):
# why yes this is a seven table join that involves a mix of duplicated tables, temporary tables, and duplicated temporary tables
#
# main thing is, give this guy a search from the duplicate filter UI and it'll give you a fast table join that returns the potential dupes that match that search
#
# ████████████████████████████████████████████████████████████████████████
# ████████████████████████████████████████████████████████████████████████
# ██████████████████████████████████▓█████████████████████████████████████
# ██████████████████████████████████▒▓████████████████████████████████████
# █████████████████████████████▓▒▓▓▒░░▒░██████████████████████████████████
# ███████████████████████████▓▒▒░░░░ ▒▓███████▓▓▓██████████████████████
# █████████████████████▓▒▓▓▓█ ▒ ▓████▓▓▓▓▓██████████████████████
# █████████████████▓▓▓▓▓░ ░ ░░ ░▓█▓▓▓██▓▓██████████████████████
# █████████████████▓▓▓▒░▒▒▒ █▒ ░▓▓▓█████▓▓▓▓██████████████████████
# █████████████████▓▓▒░░ ░▒ ░▒█▓░▒▓▓▓█████▒▒▒▒▒▓█████████████████████
# ████████████████████▓▒░ ░ ░▒▒▓▓▓██▓▓█▓░ ░░▓▓▒▓▓▒▓▓██████████████████
# ██████████████████████▒░░░░ ▒▓▓▓▓▒▓▓▓▓██▓▓░▓█▓▓▓▓▓▓▓▓▓▓████████████████
# ████████████▓▒█▓███▓▓▒▓░▒░░▒▓▓▓▓▓▒▒░░ ░▒▓▓████▓ ▓▓░░▒▓▓ ░▒▒████████████
# ████████████▒▒████▓░ ░▒▒▒▓██▓▓▒▒▒▒░░ ▒▓▓▒ ░▒░░▓▒ ▒████████████
# ████████████▒▓▓▓█▓░▒▒░▒▓███▓▓▒░░░░ ░░ ░░░▒ ▒▓▒▒▒░▒▒ ▓███████████
# █████████████▒▓▓▓▒▒▓▓▒▓███▓▓▓▒▒░░░░░ ░░▒▓▓ ▒▒░░░ ▓███████████
# ██████████████▓▓▓▓███▓██▓▓▓▓▓▒▒░░░░ ░ ░▓░ ░░ ░▓█████████████
# ███████████████▓▓██▒▓█▓▓▓▓▓▓▒▒░░░░ ░░ ▒▓░ ▓██████████████
# █████████████████▓▒▓█▓▓▓▓▓▓▓▓▒▒▒▒░░▒▒▒ ░▒█▒ ▓████████████████
# ████████████████▓░▒██▓▓▓▓▓▓▓▓▓▒▒▒░░▒▒▒▓▒▒ ░▒▓▓▒▒░░▒░▓██████████████████
# ██████████████▓░▓████▓▓▓▓▓▓▓▓▒▒░░░▒░░░▒▒ ▒▓▓▓ ░▒▓▓▓ ▒█████████████████
# ██████████████▓▓▓██████▓▓▓▓▓▓▒ ░▒▓▒░▓▓ ░ ░▒ ▒░▒▒▒▒▓▒ ▓██████████████
# ██████████████▓▒░▒▒ ▓█▓▓▓▓▓▓▓▓▓▓▒░▒▒▒░▒▒░░░░ ▓▒░░ ░████▓███████████
# █████████████████░ ▓█▓██████████▓░░ ░▒▓█████▓ ▒░░ ░▓▓▒▓██░░▓█████████
# █████████████████▒ ▒█▓▓▓██████████▓▓█▓████████▓ ▒░▒▒░▒ ░███ ▓████████
# ██████████████████▒ ▒█▓▓▓██████████▒ ███████████ ░▓▒ ▒████▒ ████████
# █████████████████████▓▓▒▓██▓███████░ ▒▒████████▒░███▒ ░▓▓▓▓▒▒███████████
# ███████████████████████▒▒███████▓▓▓▓▒ ░▓██████ ▒████▒▓▓▓▓▒▓████████████
# █████████████████████▓▓▓▓▓▓▓▓▓▓▓▓█████ ▒▒▓▒▒ ▓██▓ ▒████████████
# ██████████████████████▓▓▓▓▓▓▓█▓▓▓██████ ▒██▓░░░ ▒ ░▓█▓▒▒█████████████
# ███████████████████████▓▓▓▓▓▓█▓▓▓██▓██▓ ░▓███▓▓▓░ ▓███████████████████
# ████████████████████████▓███▓▓▓▓▓▓█▓█▓ ░ ░▓█ ▒░░▒ ▓███████████████████
# █████████████████████████▓▓████▓▓▓▓▓ ▒█░ ▓█▓▓████████████████████
# ████████████████████████▓█▓██▓▓▓▓▓▒▓ ▓▒ ▒█████████████████████████
# ████████████████████████▓▓███▓▓▓▒▓▒▓░▒░ ▓░░ ██████████████████████████
# ████████████████████████▓▓▓▓▓█▓▓▓▒░░░░░ ▒ ▒██████████████████████████
# █████████████████████████▓▓▓▓▓▓▓█▓▓▓▓▒░ ░░ ▒███████████████████████████
# ███████████████████████████▓▓▓▓▓▓▓▓▓▓▒ ▓████████████████████████████
# ████████████████████████████▓▓▓▓▓▒▒ ▒░ ██████████████████████████████
# ██████████████████████████████▓▓▓▒ ▒███████████████████████████████
# ███████████████████████████████▓▓▒░ ▓████████████████████████████████
# ████████████████████████████████████████████████████████████████████████
# ████████████████████████████████████████████████████████████████████████
#
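# to sketch the shape (illustrative): on top of the base potential_duplicate_pairs/duplicate_files join, a specific file domain adds
# results_table_for_this_query and current_files_for_this_query, and the ON clause requires one king to be in the search results and the
# other to at least be in the file domain, in either order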
( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
if db_location_context.location_context.IsAllKnownFiles():
tables.append( '{} AS results_table_for_this_query'.format( results_table_name ) )
join_predicates.append( '( duplicate_files_smaller.king_hash_id = results_table_for_this_query.hash_id OR duplicate_files_larger.king_hash_id = results_table_for_this_query.hash_id )' )
else:
files_table_name = db_location_context.GetSingleFilesTableName()
tables.extend( [
'{} AS results_table_for_this_query'.format( results_table_name ),
'{} AS current_files_for_this_query'.format( files_table_name )
] )
join_predicate_smaller_matches = '( duplicate_files_smaller.king_hash_id = results_table_for_this_query.hash_id AND duplicate_files_larger.king_hash_id = current_files_for_this_query.hash_id )'
join_predicate_larger_matches = '( duplicate_files_smaller.king_hash_id = current_files_for_this_query.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query.hash_id )'
join_predicates.append( '( {} OR {} )'.format( join_predicate_smaller_matches, join_predicate_larger_matches ) )
table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
return table_join
def DuplicatesGetPotentialDuplicatePairsTableJoinOnSeparateSearchResults( self, results_table_name_1: str, results_table_name_2: str, pixel_dupes_preference: int, max_hamming_distance: int ):
#
# And taking the above to its logical conclusion with two results sets: one file of each pair must be in one results set and the other file in the other, in either order
#
( tables, join_predicates ) = self.DuplicatesGetPotentialDuplicatePairsTableJoinGetInitialTablesAndPreds( pixel_dupes_preference, max_hamming_distance )
# we don't have to do any db_location_context jibber-jabber here as long as we stipulate that the two results sets have the same location context, which we'll enforce in UI
# just like above when 'both files match', we know we are db_location_context cross-referenced since we are intersecting with file searches performed on that search domain
# so, this is actually a bit simpler than the non-both-files-match one search case!!
tables.extend( [
'{} AS results_table_for_this_query_1'.format( results_table_name_1 ),
'{} AS results_table_for_this_query_2'.format( results_table_name_2 )
] )
one_two = '( duplicate_files_smaller.king_hash_id = results_table_for_this_query_1.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query_2.hash_id )'
two_one = '( duplicate_files_smaller.king_hash_id = results_table_for_this_query_2.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query_1.hash_id )'
join_predicates.append( '( {} OR {} )'.format( one_two, two_one ) )
table_join = '{} ON ( {} )'.format( ', '.join( tables ), ' AND '.join( join_predicates ) )
return table_join
def DuplicatesMediasAreAlternates( self, media_id_a, media_id_b ):
alternates_group_id_a = self.DuplicatesGetAlternatesGroupId( media_id_a, do_not_create = True )
if alternates_group_id_a is None:
return False
alternates_group_id_b = self.DuplicatesGetAlternatesGroupId( media_id_b, do_not_create = True )
if alternates_group_id_b is None:
return False
return alternates_group_id_a == alternates_group_id_b
def DuplicatesMediasAreConfirmedAlternates( self, media_id_a, media_id_b ):
smaller_media_id = min( media_id_a, media_id_b )
larger_media_id = max( media_id_a, media_id_b )
result = self._Execute( 'SELECT 1 FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) ).fetchone()
return result is not None
def DuplicatesMediasAreFalsePositive( self, media_id_a, media_id_b ):
alternates_group_id_a = self.DuplicatesGetAlternatesGroupId( media_id_a, do_not_create = True )
if alternates_group_id_a is None:
return False
alternates_group_id_b = self.DuplicatesGetAlternatesGroupId( media_id_b, do_not_create = True )
if alternates_group_id_b is None:
return False
return self.DuplicatesAlternatesGroupsAreFalsePositive( alternates_group_id_a, alternates_group_id_b )
def DuplicatesMergeMedias( self, superior_media_id, mergee_media_id ):
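# broad steps, as a summary comment added for orientation: clear potentials between the two groups, reconcile/merge their alternates groups,
# move the mergee's file members onto the superior media_id, re-add the mergee's surviving potentials against the superior id (re-running the false-positive/confirmed-alternate checks),
# migrate its confirmed-alternate rows, and finally delete its now-empty alternates-member and duplicate_files records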
if superior_media_id == mergee_media_id:
return
self.DuplicatesClearPotentialsBetweenMedias( ( superior_media_id, ), ( mergee_media_id, ) )
alternates_group_id = self.DuplicatesGetAlternatesGroupId( superior_media_id )
mergee_alternates_group_id = self.DuplicatesGetAlternatesGroupId( mergee_media_id )
if alternates_group_id != mergee_alternates_group_id:
if self.DuplicatesAlternatesGroupsAreFalsePositive( alternates_group_id, mergee_alternates_group_id ):
smaller_alternates_group_id = min( alternates_group_id, mergee_alternates_group_id )
larger_alternates_group_id = max( alternates_group_id, mergee_alternates_group_id )
self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?;', ( smaller_alternates_group_id, larger_alternates_group_id ) )
self.DuplicatesSetAlternates( superior_media_id, mergee_media_id )
self._Execute( 'UPDATE duplicate_file_members SET media_id = ? WHERE media_id = ?;', ( superior_media_id, mergee_media_id ) )
smaller_media_id = min( superior_media_id, mergee_media_id )
larger_media_id = max( superior_media_id, mergee_media_id )
# ensure the potential merge pair is gone
self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) )
# now merge potentials from the old to the new--however this has complicated tests to stop confirmed alts and so on, so can't just update ids
existing_potential_info_of_mergee_media_id = self._Execute( 'SELECT smaller_media_id, larger_media_id, distance FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( mergee_media_id, mergee_media_id ) ).fetchall()
self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( mergee_media_id, mergee_media_id ) )
for ( smaller_media_id, larger_media_id, distance ) in existing_potential_info_of_mergee_media_id:
if smaller_media_id == mergee_media_id:
media_id_a = superior_media_id
media_id_b = larger_media_id
else:
media_id_a = smaller_media_id
media_id_b = superior_media_id
potential_duplicate_media_ids_and_distances = [ ( media_id_b, distance ) ]
self.DuplicatesAddPotentialDuplicates( media_id_a, potential_duplicate_media_ids_and_distances )
# ensure any previous confirmed alt pair is gone
self._Execute( 'DELETE FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', ( smaller_media_id, larger_media_id ) )
# now merge confirmed alts from the old to the new
self._Execute( 'UPDATE OR IGNORE confirmed_alternate_pairs SET smaller_media_id = ? WHERE smaller_media_id = ?;', ( superior_media_id, mergee_media_id ) )
self._Execute( 'UPDATE OR IGNORE confirmed_alternate_pairs SET larger_media_id = ? WHERE larger_media_id = ?;', ( superior_media_id, mergee_media_id ) )
# and clear out potentials that are now invalid
confirmed_alternate_pairs = self._Execute( 'SELECT smaller_media_id, larger_media_id FROM confirmed_alternate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( superior_media_id, superior_media_id ) ).fetchall()
self._ExecuteMany( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?;', confirmed_alternate_pairs )
# clear out empty records
self._Execute( 'DELETE FROM alternate_file_group_members WHERE media_id = ?;', ( mergee_media_id, ) )
self._Execute( 'DELETE FROM duplicate_files WHERE media_id = ?;', ( mergee_media_id, ) )
def DuplicatesRemoveAlternateMember( self, media_id ):
alternates_group_id = self.DuplicatesGetAlternatesGroupId( media_id, do_not_create = True )
if alternates_group_id is not None:
alternates_media_ids = self.DuplicatesGetAlternateMediaIds( alternates_group_id )
self._Execute( 'DELETE FROM alternate_file_group_members WHERE media_id = ?;', ( media_id, ) )
self._Execute( 'DELETE FROM confirmed_alternate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id ) )
if len( alternates_media_ids ) == 1: # i.e. what we just removed was the last of the group
self._Execute( 'DELETE FROM alternate_file_groups WHERE alternates_group_id = ?;', ( alternates_group_id, ) )
self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id, alternates_group_id ) )
hash_ids = self.DuplicatesGetDuplicateHashIds( media_id )
self.modules_similar_files.ResetSearch( hash_ids )
def DuplicatesRemoveAlternateMemberFromHashes( self, hashes ):
hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
for hash_id in hash_ids:
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
self.DuplicatesRemoveAlternateMember( media_id )
def DuplicatesRemoveMediaIdMember( self, hash_id ):
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
king_hash_id = self.DuplicatesGetKingHashId( media_id )
if hash_id == king_hash_id:
self.DuplicatesDissolveMediaId( media_id )
else:
self._Execute( 'DELETE FROM duplicate_file_members WHERE hash_id = ?;', ( hash_id, ) )
self.modules_similar_files.ResetSearch( ( hash_id, ) )
def DuplicatesRemoveMediaIdMemberFromHashes( self, hashes ):
hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
for hash_id in hash_ids:
self.DuplicatesRemoveMediaIdMember( hash_id )
def DuplicatesRemovePotentialPairs( self, hash_id ):
media_id = self.DuplicatesGetMediaId( hash_id, do_not_create = True )
if media_id is not None:
self._Execute( 'DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?;', ( media_id, media_id ) )
def DuplicatesRemovePotentialPairsFromHashes( self, hashes ):
hash_ids = self.modules_hashes_local_cache.GetHashIds( hashes )
for hash_id in hash_ids:
self.DuplicatesRemovePotentialPairs( hash_id )
def DuplicatesSetAlternates( self, media_id_a, media_id_b ):
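# summary (added): clear potentials between the pair; unless their alternates groups are marked false positive against each other, record them as a
# confirmed alternate pair and, if the groups differ, fold group B into group A, carrying B's false positive records across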
if media_id_a == media_id_b:
return
# let's clear out any outstanding potentials. whether this is a valid or not connection, we don't want to see it again
self.DuplicatesClearPotentialsBetweenMedias( ( media_id_a, ), ( media_id_b, ) )
# now check if we should be making a new relationship
alternates_group_id_a = self.DuplicatesGetAlternatesGroupId( media_id_a )
alternates_group_id_b = self.DuplicatesGetAlternatesGroupId( media_id_b )
if self.DuplicatesAlternatesGroupsAreFalsePositive( alternates_group_id_a, alternates_group_id_b ):
return
# write a confirmed result so this can't come up again due to subsequent re-searching etc...
# in future, I can tune this to consider alternate labels and indices. alternates with different labels and indices are not appropriate for potentials, so we can add more rows here
smaller_media_id = min( media_id_a, media_id_b )
larger_media_id = max( media_id_a, media_id_b )
self._Execute( 'INSERT OR IGNORE INTO confirmed_alternate_pairs ( smaller_media_id, larger_media_id ) VALUES ( ?, ? );', ( smaller_media_id, larger_media_id ) )
if alternates_group_id_a == alternates_group_id_b:
return
# ok, they are currently not alternates, so we need to merge B into A
# first, for all false positive relationships that A already has, clear out potentials between B and those fps before it moves over
false_positive_pairs = self._Execute( 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id_a, alternates_group_id_a ) )
for ( smaller_false_positive_alternates_group_id, larger_false_positive_alternates_group_id ) in false_positive_pairs:
if smaller_false_positive_alternates_group_id == alternates_group_id_a:
self.DuplicatesClearPotentialsBetweenAlternatesGroups( alternates_group_id_b, larger_false_positive_alternates_group_id )
else:
self.DuplicatesClearPotentialsBetweenAlternatesGroups( smaller_false_positive_alternates_group_id, alternates_group_id_b )
# first, update all B to A
self._Execute( 'UPDATE alternate_file_group_members SET alternates_group_id = ? WHERE alternates_group_id = ?;', ( alternates_group_id_a, alternates_group_id_b ) )
# move false positive records for B to A
false_positive_pairs = self._Execute( 'SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id_b, alternates_group_id_b ) )
self._Execute( 'DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?;', ( alternates_group_id_b, alternates_group_id_b ) )
for ( smaller_false_positive_alternates_group_id, larger_false_positive_alternates_group_id ) in false_positive_pairs:
if smaller_false_positive_alternates_group_id == alternates_group_id_b:
self.DuplicatesSetFalsePositive( alternates_group_id_a, larger_false_positive_alternates_group_id )
else:
self.DuplicatesSetFalsePositive( smaller_false_positive_alternates_group_id, alternates_group_id_a )
# remove master record
self._Execute( 'DELETE FROM alternate_file_groups WHERE alternates_group_id = ?;', ( alternates_group_id_b, ) )
# pubsub to refresh alternates info for alternates_group_id_a and _b goes here
def DuplicatesSetFalsePositive( self, alternates_group_id_a, alternates_group_id_b ):
if alternates_group_id_a == alternates_group_id_b:
return
self.DuplicatesClearPotentialsBetweenAlternatesGroups( alternates_group_id_a, alternates_group_id_b )
smaller_alternates_group_id = min( alternates_group_id_a, alternates_group_id_b )
larger_alternates_group_id = max( alternates_group_id_a, alternates_group_id_b )
self._Execute( 'INSERT OR IGNORE INTO duplicate_false_positives ( smaller_alternates_group_id, larger_alternates_group_id ) VALUES ( ?, ? );', ( smaller_alternates_group_id, larger_alternates_group_id ) )
def DuplicatesSetKing( self, king_hash_id, media_id ):
self._Execute( 'UPDATE duplicate_files SET king_hash_id = ? WHERE media_id = ?;', ( king_hash_id, media_id ) )
def DuplicatesSetKingFromHash( self, hash ):
hash_id = self.modules_hashes_local_cache.GetHashId( hash )
media_id = self.DuplicatesGetMediaId( hash_id )
self.DuplicatesSetKing( hash_id, media_id )
def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
tables_and_columns = []
if content_type == HC.CONTENT_TYPE_HASH:
tables_and_columns.append( ( 'file_maintenance_jobs', 'hash_id' ) )
return tables_and_columns