2022-01-12 22:14:50 +00:00
import collections
import itertools
import random
import sqlite3
import typing
from hydrus . core import HydrusConstants as HC
from hydrus . core import HydrusExceptions
from hydrus . client import ClientConstants as CC
2022-01-19 21:28:59 +00:00
from hydrus . client import ClientLocation
2022-01-12 22:14:50 +00:00
from hydrus . client import ClientSearch
from hydrus . client . db import ClientDBDefinitionsCache
from hydrus . client . db import ClientDBFilesStorage
from hydrus . client . db import ClientDBModule
from hydrus . client . db import ClientDBSimilarFiles
class ClientDBFilesDuplicates ( ClientDBModule . ClientDBModule ) :
def __init__(
    self,
    cursor: sqlite3.Cursor,
    modules_files_storage: ClientDBFilesStorage.ClientDBFilesStorage,
    modules_hashes_local_cache: ClientDBDefinitionsCache.ClientDBCacheLocalHashes,
    modules_similar_files: ClientDBSimilarFiles.ClientDBSimilarFiles
):
    """Initialise the base module and remember the collaborating modules.

    cursor is passed straight to the base ClientDBModule constructor together
    with this module's name. The three module arguments are stored on the
    instance for use by the Duplicates* methods.
    """
    # base class first: it receives the module name and the shared cursor
    ClientDBModule.ClientDBModule.__init__(self, ' client file duplicates ', cursor)

    # collaborating db modules
    self.modules_files_storage = modules_files_storage
    self.modules_hashes_local_cache = modules_hashes_local_cache
    self.modules_similar_files = modules_similar_files

    # two-level mapping; inner dicts are created on first access
    self._service_ids_to_content_types_to_outstanding_local_processing = collections.defaultdict(dict)
def _GetInitialIndexGenerationDict ( self ) - > dict :
index_generation_dict = { }
index_generation_dict [ ' main.duplicate_false_positives ' ] = [
( [ ' larger_alternates_group_id ' , ' smaller_alternates_group_id ' ] , True , 469 )
]
index_generation_dict [ ' main.potential_duplicate_pairs ' ] = [
( [ ' larger_media_id ' , ' smaller_media_id ' ] , True , 469 )
]
return index_generation_dict
def _GetInitialTableGenerationDict ( self ) - > dict :
return {
' main.alternate_file_groups ' : ( ' CREATE TABLE IF NOT EXISTS {} ( alternates_group_id INTEGER PRIMARY KEY ); ' , 469 ) ,
' main.alternate_file_group_members ' : ( ' CREATE TABLE IF NOT EXISTS {} ( alternates_group_id INTEGER, media_id INTEGER UNIQUE, PRIMARY KEY ( alternates_group_id, media_id ) ); ' , 469 ) ,
' main.confirmed_alternate_pairs ' : ( ' CREATE TABLE IF NOT EXISTS {} ( smaller_media_id INTEGER, larger_media_id INTEGER, PRIMARY KEY ( smaller_media_id, larger_media_id ) ); ' , 469 ) ,
' main.duplicate_files ' : ( ' CREATE TABLE IF NOT EXISTS {} ( media_id INTEGER PRIMARY KEY, king_hash_id INTEGER UNIQUE ); ' , 469 ) ,
' main.duplicate_file_members ' : ( ' CREATE TABLE IF NOT EXISTS {} ( media_id INTEGER, hash_id INTEGER UNIQUE, PRIMARY KEY ( media_id, hash_id ) ); ' , 469 ) ,
' main.duplicate_false_positives ' : ( ' CREATE TABLE IF NOT EXISTS {} ( smaller_alternates_group_id INTEGER, larger_alternates_group_id INTEGER, PRIMARY KEY ( smaller_alternates_group_id, larger_alternates_group_id ) ); ' , 469 ) ,
' main.potential_duplicate_pairs ' : ( ' CREATE TABLE IF NOT EXISTS {} ( smaller_media_id INTEGER, larger_media_id INTEGER, distance INTEGER, PRIMARY KEY ( smaller_media_id, larger_media_id ) ); ' , 469 )
}
def DuplicatesAddPotentialDuplicates(self, media_id, potential_duplicate_media_ids_and_distances):
    """Record potential-duplicate pairs between media_id and the given medias.

    potential_duplicate_media_ids_and_distances is an iterable of
    ( media_id, distance ) tuples. A candidate is skipped when it is media_id
    itself, or when the pair is already known to be false positive or
    confirmed alternates. Remaining pairs are written with INSERT OR IGNORE as
    ( smaller id, larger id, distance ).
    """
    rows = []

    for (other_media_id, distance) in potential_duplicate_media_ids_and_distances:

        if other_media_id == media_id:
            # same media: already duplicates!
            continue

        already_decided = (
            self.DuplicatesMediasAreFalsePositive(media_id, other_media_id)
            or self.DuplicatesMediasAreConfirmedAlternates(media_id, other_media_id)
        )

        if already_decided:
            continue

        # if they are alternates with different alt label and index, do not add
        # however this _could_ be folded into areconfirmedalts on the setalt event--any other alt with diff label/index also gets added

        # pairs are always stored smaller-first
        rows.append((min(media_id, other_media_id), max(media_id, other_media_id), distance))

    if rows:
        self._ExecuteMany(' INSERT OR IGNORE INTO potential_duplicate_pairs ( smaller_media_id, larger_media_id, distance ) VALUES ( ?, ?, ? ); ', rows)
def DuplicatesAlternatesGroupsAreFalsePositive(self, alternates_group_id_a, alternates_group_id_b):
    """Return True if a false-positive row exists between the two groups.

    A group is never false positive with itself, so equal ids return False
    without touching the database.
    """
    if alternates_group_id_a == alternates_group_id_b:
        return False

    # rows are keyed ( smaller id, larger id )
    pair = (
        min(alternates_group_id_a, alternates_group_id_b),
        max(alternates_group_id_a, alternates_group_id_b),
    )

    row = self._Execute(' SELECT 1 FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?; ', pair).fetchone()

    return row is not None
def DuplicatesClearAllFalsePositiveRelations(self, alternates_group_id):
    """Delete every false-positive row that involves the given group.

    Afterwards the hash ids of all medias in the group are handed to the
    similar-files module's ResetSearch.
    NOTE(review): presumably so those files get searched for potentials
    again -- confirm against ClientDBSimilarFiles.
    """
    # the group can sit on either side of a stored pair
    both_sides = (alternates_group_id, alternates_group_id)

    self._Execute(' DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?; ', both_sides)

    group_media_ids = self.DuplicatesGetAlternateMediaIds(alternates_group_id)

    self.modules_similar_files.ResetSearch(self.DuplicatesGetDuplicatesHashIds(group_media_ids))
def DuplicatesClearAllFalsePositiveRelationsFromHashes(self, hashes):
    """Clear all false-positive relations for the group of each given hash.

    Hashes with no media id, or whose media has no alternates group, are
    skipped; nothing is created along the way.
    """
    for hash_id in self.modules_hashes_local_cache.GetHashIds(hashes):

        media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

        if media_id is None:
            continue

        alternates_group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

        if alternates_group_id is None:
            continue

        self.DuplicatesClearAllFalsePositiveRelations(alternates_group_id)
def DuplicatesClearFalsePositiveRelationsBetweenGroups(self, alternates_group_ids):
    """Delete false-positive rows between every pair of the given groups.

    Only relations among the given groups are removed. Afterwards, for each
    group, the hash ids of its medias are passed to the similar-files module's
    ResetSearch.
    """
    delete_query = ' DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?; '

    for (group_id_a, group_id_b) in itertools.combinations(alternates_group_ids, 2):

        # rows are keyed ( smaller id, larger id )
        self._Execute(delete_query, (min(group_id_a, group_id_b), max(group_id_a, group_id_b)))

    for alternates_group_id in alternates_group_ids:

        group_media_ids = self.DuplicatesGetAlternateMediaIds(alternates_group_id)

        group_hash_ids = self.DuplicatesGetDuplicatesHashIds(group_media_ids)

        self.modules_similar_files.ResetSearch(group_hash_ids)
def DuplicatesClearFalsePositiveRelationsBetweenGroupsFromHashes(self, hashes):
    """Clear false-positive relations between the groups of the given hashes.

    Each hash is resolved to its media id and then to its alternates group,
    without creating either. If more than one distinct group is found, the
    false-positive relations between those groups are cleared.

    Every hash in hashes is examined. Looking up a single value can never
    yield more than one group, so nothing would ever be cleared.
    """
    alternates_group_ids = set()

    for hash_id in self.modules_hashes_local_cache.GetHashIds(hashes):

        media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

        if media_id is None:
            continue

        alternates_group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

        if alternates_group_id is not None:
            alternates_group_ids.add(alternates_group_id)

    # a relation needs two different groups
    if len(alternates_group_ids) > 1:
        self.DuplicatesClearFalsePositiveRelationsBetweenGroups(alternates_group_ids)
def DuplicatesClearPotentialsBetweenMedias(self, media_ids_a, media_ids_b):
    """Delete potential-duplicate pairs that link a media in A to one in B.

    these two groups of medias now have a false positive or alternates
    relationship set between them, or they are about to be merged, so
    potentials between them are no longer needed. Only inter-potentials
    between A and B are removed; intra-potentials within A or within B stay.
    """
    all_media_ids = set()

    all_media_ids.update(media_ids_a)
    all_media_ids.update(media_ids_b)

    with self._MakeTemporaryIntegerTable(all_media_ids, ' media_id ') as temp_media_ids_table_name:

        # keep these separate--older sqlite can't do cross join to an OR ON

        # temp media ids to potential pairs, matching on the smaller side...
        smaller_side_query = ' SELECT smaller_media_id, larger_media_id FROM {} CROSS JOIN potential_duplicate_pairs ON ( smaller_media_id = media_id ); '.format(temp_media_ids_table_name)

        candidate_pairs = set(self._Execute(smaller_side_query).fetchall())

        # ...and then on the larger side
        larger_side_query = ' SELECT smaller_media_id, larger_media_id FROM {} CROSS JOIN potential_duplicate_pairs ON ( larger_media_id = media_id ); '.format(temp_media_ids_table_name)

        candidate_pairs.update(self._Execute(larger_side_query).fetchall())

    def crosses_a_and_b(smaller_media_id, larger_media_id):
        # one end in A and the other in B, in either orientation
        return (smaller_media_id in media_ids_a and larger_media_id in media_ids_b) or (smaller_media_id in media_ids_b and larger_media_id in media_ids_a)

    deletees = [
        (smaller_media_id, larger_media_id)
        for (smaller_media_id, larger_media_id) in candidate_pairs
        if crosses_a_and_b(smaller_media_id, larger_media_id)
    ]

    if deletees:
        self._ExecuteMany(' DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?; ', deletees)
def DuplicatesClearPotentialsBetweenAlternatesGroups(self, alternates_group_id_a, alternates_group_id_b):
    """Clear potential pairs between the medias of two alternates groups.

    these groups are being set as false positive, therefore any potential
    between them no longer applies.
    """
    self.DuplicatesClearPotentialsBetweenMedias(
        self.DuplicatesGetAlternateMediaIds(alternates_group_id_a),
        self.DuplicatesGetAlternateMediaIds(alternates_group_id_b),
    )
def DuplicatesDeleteAllPotentialDuplicatePairs(self):
    """Empty potential_duplicate_pairs and reset search for the files involved.

    The hash ids of every media that appeared in a pair are collected before
    the table is emptied, then passed to the similar-files module's
    ResetSearch.
    """
    pair_rows = self._Execute(' SELECT smaller_media_id, larger_media_id FROM potential_duplicate_pairs; ')

    # flatten both columns into one set of media ids
    media_ids = {media_id for pair in pair_rows for media_id in pair}

    hash_ids = self.DuplicatesGetDuplicatesHashIds(media_ids)

    self._Execute(' DELETE FROM potential_duplicate_pairs; ')

    self.modules_similar_files.ResetSearch(hash_ids)
def DuplicatesDissolveAlternatesGroupId(self, alternates_group_id):
    """Dissolve every media that belongs to the given alternates group."""
    # the member list is fetched once, up front, before any media is dissolved
    member_media_ids = self.DuplicatesGetAlternateMediaIds(alternates_group_id)

    for member_media_id in member_media_ids:
        self.DuplicatesDissolveMediaId(member_media_id)
def DuplicatesDissolveAlternatesGroupIdFromHashes(self, hashes):
    """Dissolve the alternates group of each given hash, where one exists.

    Hashes with no media id, or whose media has no alternates group, are
    skipped; nothing is created along the way.
    """
    for hash_id in self.modules_hashes_local_cache.GetHashIds(hashes):

        media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

        if media_id is None:
            continue

        alternates_group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

        if alternates_group_id is None:
            continue

        self.DuplicatesDissolveAlternatesGroupId(alternates_group_id)
def DuplicatesDissolveMediaId(self, media_id):
    """Remove a media group entirely and reset search for its files.

    The media is removed from its alternates group, its potential pairs are
    deleted, and its member and king rows are deleted. The member hash ids
    are read before the member rows go, then passed to the similar-files
    module's ResetSearch.
    """
    self.DuplicatesRemoveAlternateMember(media_id)

    # the media can sit on either side of a potential pair
    self._Execute(' DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ', (media_id, media_id))

    # must be read now: the next statement deletes the rows this comes from
    member_hash_ids = self.DuplicatesGetDuplicateHashIds(media_id)

    only_media_id = (media_id,)

    self._Execute(' DELETE FROM duplicate_file_members WHERE media_id = ?; ', only_media_id)
    self._Execute(' DELETE FROM duplicate_files WHERE media_id = ?; ', only_media_id)

    self.modules_similar_files.ResetSearch(member_hash_ids)
def DuplicatesDissolveMediaIdFromHashes(self, hashes):
    """Dissolve the media group of each given hash, where one exists."""
    for hash_id in self.modules_hashes_local_cache.GetHashIds(hashes):

        # do_not_create: a hash without a media group has nothing to dissolve
        media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

        if media_id is None:
            continue

        self.DuplicatesDissolveMediaId(media_id)
def DuplicatesFilterKingHashIds(self, allowed_hash_ids):
    """Return the subset of allowed_hash_ids that are kings.

    can't just pull explicit king_hash_ids, since files that do not have a
    media_id are still kings. So: kings = hashes - explicitly not kings, where
    'explicitly not king' means a row in duplicate_file_members but no
    king_hash_id row in duplicate_files.

    allowed_hash_ids may be any iterable; a non-set is copied into a set.
    NOTE(review): assumes self._STS returns a set -- confirm.
    """
    candidate_hash_ids = allowed_hash_ids if isinstance(allowed_hash_ids, set) else set(allowed_hash_ids)

    with self._MakeTemporaryIntegerTable(candidate_hash_ids, ' hash_id ') as temp_hash_ids_table_name:

        kings_query = ' SELECT king_hash_id FROM {} CROSS JOIN duplicate_files ON ( {} .hash_id = duplicate_files.king_hash_id ); '.format(temp_hash_ids_table_name, temp_hash_ids_table_name)

        explicit_king_hash_ids = self._STS(self._Execute(kings_query))

        members_query = ' SELECT hash_id FROM {} CROSS JOIN duplicate_file_members USING ( hash_id ); '.format(temp_hash_ids_table_name)

        all_duplicate_member_hash_ids = self._STS(self._Execute(members_query))

    # members of a duplicate group that are not that group's king
    non_king_hash_ids = all_duplicate_member_hash_ids.difference(explicit_king_hash_ids)

    return candidate_hash_ids.difference(non_king_hash_ids)
2022-05-25 21:30:53 +00:00
def DuplicatesFilterMediaIdPairs(self, db_location_context: ClientDBFilesStorage.DBLocationContext, media_id_pairs):
    """Keep only the pairs whose two medias both pass the location filter.

    A media counts as good when at least one of its member hash ids comes
    back from modules_files_storage.FilterHashIds for the context's
    location_context. Returns a list of ( smaller_media_id, larger_media_id )
    tuples in input order; an empty input returns an empty list.
    """
    if len(media_id_pairs) == 0:
        return []

    # this is pretty wonked out due to me not wanting to force db_location_context to make a single table

    all_media_ids = set(itertools.chain.from_iterable(media_id_pairs))

    with self._MakeTemporaryIntegerTable(all_media_ids, ' media_id ') as temp_media_ids_table_name:

        members_query = ' SELECT hash_id, media_id FROM {} CROSS JOIN {} USING ( media_id ); '.format(temp_media_ids_table_name, ' duplicate_file_members ')

        hash_ids_to_media_ids = dict(self._Execute(members_query))

    candidate_hash_ids = set(hash_ids_to_media_ids.keys())

    good_hash_ids = self.modules_files_storage.FilterHashIds(db_location_context.location_context, candidate_hash_ids)

    good_media_ids = {hash_ids_to_media_ids[hash_id] for hash_id in good_hash_ids}

    good_media_id_pairs = []

    for (smaller_media_id, larger_media_id) in media_id_pairs:

        # both ends of the pair must be good
        if smaller_media_id in good_media_ids and larger_media_id in good_media_ids:
            good_media_id_pairs.append((smaller_media_id, larger_media_id))

    return good_media_id_pairs
2022-01-12 22:14:50 +00:00
def DuplicatesGetAlternatesGroupId(self, media_id, do_not_create=False):
    """Return the alternates group id that media_id belongs to.

    If the media has no group: returns None when do_not_create is true,
    otherwise inserts a new group, adds the media as its member and returns
    the new id.
    """
    row = self._Execute(' SELECT alternates_group_id FROM alternate_file_group_members WHERE media_id = ?; ', (media_id,)).fetchone()

    if row is not None:

        (alternates_group_id,) = row

        return alternates_group_id

    if do_not_create:
        return None

    # no group yet: make a fresh one and put this media in it
    self._Execute(' INSERT INTO alternate_file_groups DEFAULT VALUES; ')

    alternates_group_id = self._GetLastRowId()

    self._Execute(' INSERT INTO alternate_file_group_members ( alternates_group_id, media_id ) VALUES ( ?, ? ); ', (alternates_group_id, media_id))

    return alternates_group_id
def DuplicatesGetAlternateMediaIds(self, alternates_group_id):
    """Return the media ids that are members of the given alternates group.

    The result is whatever self._STS builds from the single-column rows.
    NOTE(review): callers use set methods on it, so presumably a set -- confirm.
    """
    member_rows = self._Execute(' SELECT media_id FROM alternate_file_group_members WHERE alternates_group_id = ?; ', (alternates_group_id,))

    return self._STS(member_rows)
2022-01-19 21:28:59 +00:00
def DuplicatesGetBestKingId(self, media_id, db_location_context: ClientDBFilesStorage.DBLocationContext, allowed_hash_ids=None, preferred_hash_ids=None):
    """Pick the best hash id to represent media_id within the given limits.

    Candidates are the media's member hash ids in db_location_context,
    intersected with allowed_hash_ids when given. Returns None if no
    candidate remains. Otherwise:

    - if any candidates are also in preferred_hash_ids, the real king is
      returned when it is one of them, else a random preferred candidate
    - else the real king is returned when it is a candidate, else a random
      candidate
    """
    candidate_hash_ids = self.DuplicatesGetDuplicateHashIds(media_id, db_location_context=db_location_context)

    if allowed_hash_ids is not None:
        candidate_hash_ids.intersection_update(allowed_hash_ids)

    if len(candidate_hash_ids) == 0:
        return None

    king_hash_id = self.DuplicatesGetKingHashId(media_id)

    if preferred_hash_ids is not None:

        preferred_candidates = candidate_hash_ids.intersection(preferred_hash_ids)

        if len(preferred_candidates) > 0:

            if king_hash_id in preferred_candidates:
                return king_hash_id

            # the real king is not preferred, so any preferred member will do
            return random.choice(list(preferred_candidates))

    if king_hash_id in candidate_hash_ids:
        return king_hash_id

    # the real king is not available here, so any available member will do
    return random.choice(list(candidate_hash_ids))
2022-01-19 21:28:59 +00:00
def DuplicatesGetDuplicateHashIds(self, media_id, db_location_context: ClientDBFilesStorage.DBLocationContext = None):
    """Return the member hash ids of one media group.

    With no db_location_context, every member is returned. With one, the
    result is limited to that context: by joining against its table when
    SingleTableIsFast() is true, otherwise by fetching all members and
    passing them through modules_files_storage.FilterHashIds.
    """
    members_table = ' duplicate_file_members '
    query_template = ' SELECT hash_id FROM {} WHERE media_id = ?; '

    if db_location_context is None:
        return self._STS(self._Execute(query_template.format(members_table), (media_id,)))

    if db_location_context.SingleTableIsFast():

        # let the database do the limiting via a join
        limited_table_join = db_location_context.GetTableJoinLimitedByFileDomain(members_table)

        return self._STS(self._Execute(query_template.format(limited_table_join), (media_id,)))

    # no fast single table: fetch everything, then filter afterwards
    unfiltered_hash_ids = self._STS(self._Execute(query_template.format(members_table), (media_id,)))

    return self.modules_files_storage.FilterHashIds(db_location_context.location_context, unfiltered_hash_ids)
2022-01-19 21:28:59 +00:00
def DuplicatesGetDuplicatesHashIds(self, media_ids, db_location_context: ClientDBFilesStorage.DBLocationContext = None):
    """Return the member hash ids of several media groups at once.

    media_ids are loaded into a temporary integer table and joined to
    duplicate_file_members. When db_location_context is given, the join is
    further limited through its GetTableJoinLimitedByFileDomain.
    """
    with self._MakeTemporaryIntegerTable(media_ids, ' media_id ') as temp_media_ids_table_name:

        table_join = ' {} CROSS JOIN {} USING ( media_id ) '.format(temp_media_ids_table_name, ' duplicate_file_members ')

        limit_to_domain = db_location_context is not None

        if limit_to_domain:
            table_join = db_location_context.GetTableJoinLimitedByFileDomain(table_join)

        # must run inside the with-block, while the temp table is available
        select_query = ' SELECT hash_id FROM {} ; '.format(table_join)

        hash_ids = self._STS(self._Execute(select_query))

    return hash_ids
def DuplicatesGetFalsePositiveAlternatesGroupIds(self, alternates_group_id):
    """Return every group id appearing in a false-positive row with this group.

    Both columns of each matching row are collected, so when at least one row
    matches, the returned set also contains alternates_group_id itself.
    Callers that want only the other groups must discard it.
    """
    pair_rows = self._Execute(' SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?; ', (alternates_group_id, alternates_group_id)).fetchall()

    return {group_id for pair in pair_rows for group_id in pair}
2022-01-19 21:28:59 +00:00
def DuplicatesGetFileDuplicateInfo(self, location_context, hash):
    """Summarise the duplicate relationships of one file.

    Returns a dict with two entries: an is-king flag (True unless the file
    has a media group whose king is a different hash) and a
    collections.Counter keyed by HC.DUPLICATE_* type. Counts only include
    relations with at least one file in location_context. A file with no
    media group gets an empty Counter.
    """
    result_dict = {}

    result_dict[' is_king '] = True

    counter = collections.Counter()

    hash_id = self.modules_hashes_local_cache.GetHashId(hash)

    media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

    if media_id is not None:

        db_location_context = self.modules_files_storage.GetDBLocationContext(location_context)

        def has_files_in_domain(some_media_id):
            # true when the media has at least one member in our file domain
            member_hash_ids = self.DuplicatesGetDuplicateHashIds(some_media_id, db_location_context=db_location_context)
            return len(member_hash_ids) > 0

        # potentials

        all_potential_pairs = self._Execute(' SELECT DISTINCT smaller_media_id, larger_media_id FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ', (media_id, media_id,)).fetchall()

        potential_pairs = self.DuplicatesFilterMediaIdPairs(db_location_context, all_potential_pairs)

        num_potential_pairs = len(potential_pairs)

        if num_potential_pairs > 0:
            counter[HC.DUPLICATE_POTENTIAL] = num_potential_pairs

        # king and members

        result_dict[' is_king '] = self.DuplicatesGetKingHashId(media_id) == hash_id

        media_hash_ids = self.DuplicatesGetDuplicateHashIds(media_id, db_location_context=db_location_context)

        num_other_dupe_members = len(media_hash_ids) - 1

        if num_other_dupe_members > 0:
            counter[HC.DUPLICATE_MEMBER] = num_other_dupe_members

        alternates_group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

        if alternates_group_id is not None:

            # alternates, and how many of them are confirmed

            alt_media_ids = self.DuplicatesGetAlternateMediaIds(alternates_group_id)

            alt_media_ids.discard(media_id)

            for alt_media_id in alt_media_ids:

                if not has_files_in_domain(alt_media_id):
                    continue

                counter[HC.DUPLICATE_ALTERNATE] += 1

                pair = (min(media_id, alt_media_id), max(media_id, alt_media_id))

                confirmed_row = self._Execute(' SELECT 1 FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?; ', pair).fetchone()

                if confirmed_row is not None:
                    counter[HC.DUPLICATE_CONFIRMED_ALTERNATE] += 1

            # false positives: one count per media in each related group

            false_positive_alternates_group_ids = self.DuplicatesGetFalsePositiveAlternatesGroupIds(alternates_group_id)

            false_positive_alternates_group_ids.discard(alternates_group_id)

            for false_positive_alternates_group_id in false_positive_alternates_group_ids:

                for fp_media_id in self.DuplicatesGetAlternateMediaIds(false_positive_alternates_group_id):

                    if has_files_in_domain(fp_media_id):
                        counter[HC.DUPLICATE_FALSE_POSITIVE] += 1

    result_dict[' counts '] = counter

    return result_dict
2022-01-19 21:28:59 +00:00
def DuplicatesGetFileHashesByDuplicateType(self, location_context: ClientLocation.LocationContext, hash, duplicate_type, allowed_hash_ids=None, preferred_hash_ids=None):
    """Return the hashes related to the given file by duplicate_type.

    The given file's own hash id always comes first in the list handed to
    modules_hashes_local_cache.GetHashes. For every type except
    HC.DUPLICATE_MEMBER, each related media contributes one hash id chosen by
    DuplicatesGetBestKingId, which applies allowed_hash_ids and
    preferred_hash_ids. For HC.DUPLICATE_MEMBER, all member hash ids of the
    file's own media are used, limited by allowed_hash_ids only.

    An unrecognised duplicate_type, or a file with no media group, yields
    just the file itself.
    """
    hash_id = self.modules_hashes_local_cache.GetHashId(hash)

    db_location_context = self.modules_files_storage.GetDBLocationContext(location_context)

    dupe_hash_ids = set()

    def add_best_king_of(some_media_id):
        # one representative per related media, or nothing if none qualifies
        best_king_hash_id = self.DuplicatesGetBestKingId(some_media_id, db_location_context, allowed_hash_ids=allowed_hash_ids, preferred_hash_ids=preferred_hash_ids)

        if best_king_hash_id is not None:
            dupe_hash_ids.add(best_king_hash_id)

    handled_types = (
        HC.DUPLICATE_FALSE_POSITIVE,
        HC.DUPLICATE_ALTERNATE,
        HC.DUPLICATE_MEMBER,
        HC.DUPLICATE_KING,
        HC.DUPLICATE_POTENTIAL,
    )

    # the media lookup only happens for types we know how to handle
    media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True) if duplicate_type in handled_types else None

    if media_id is not None:

        if duplicate_type == HC.DUPLICATE_FALSE_POSITIVE:

            alternates_group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

            if alternates_group_id is not None:

                false_positive_alternates_group_ids = self.DuplicatesGetFalsePositiveAlternatesGroupIds(alternates_group_id)

                false_positive_alternates_group_ids.discard(alternates_group_id)

                false_positive_media_ids = set()

                for false_positive_alternates_group_id in false_positive_alternates_group_ids:
                    false_positive_media_ids.update(self.DuplicatesGetAlternateMediaIds(false_positive_alternates_group_id))

                for false_positive_media_id in false_positive_media_ids:
                    add_best_king_of(false_positive_media_id)

        elif duplicate_type == HC.DUPLICATE_ALTERNATE:

            alternates_group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

            if alternates_group_id is not None:

                alternates_media_ids = self._STS(self._Execute(' SELECT media_id FROM alternate_file_group_members WHERE alternates_group_id = ?; ', (alternates_group_id,)))

                alternates_media_ids.discard(media_id)

                for alternates_media_id in alternates_media_ids:
                    add_best_king_of(alternates_media_id)

        elif duplicate_type == HC.DUPLICATE_MEMBER:

            media_hash_ids = self.DuplicatesGetDuplicateHashIds(media_id, db_location_context=db_location_context)

            if allowed_hash_ids is not None:
                media_hash_ids.intersection_update(allowed_hash_ids)

            dupe_hash_ids.update(media_hash_ids)

        elif duplicate_type == HC.DUPLICATE_KING:

            add_best_king_of(media_id)

        else:

            # HC.DUPLICATE_POTENTIAL
            table_join = self.DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService(db_location_context)

            potential_rows = self._Execute(' SELECT smaller_media_id, larger_media_id FROM {} WHERE smaller_media_id = ? OR larger_media_id = ?; '.format(table_join), (media_id, media_id)).fetchall()

            for (smaller_media_id, larger_media_id) in potential_rows:

                # whichever end of the pair is not us
                potential_media_id = smaller_media_id if smaller_media_id != media_id else larger_media_id

                add_best_king_of(potential_media_id)

    # the file itself always goes first, exactly once
    dupe_hash_ids.discard(hash_id)

    ordered_hash_ids = [hash_id]
    ordered_hash_ids.extend(dupe_hash_ids)

    return self.modules_hashes_local_cache.GetHashes(ordered_hash_ids)
2022-01-19 21:28:59 +00:00
def DuplicatesGetHashIdsFromDuplicateCountPredicate(self, db_location_context: ClientDBFilesStorage.DBLocationContext, operator, num_relationships, dupe_type):
    """Return hash ids of files whose count of dupe_type relations matches.

    operator is CC.UNICODE_ALMOST_EQUAL_TO (strictly between 0.8x and 1.2x of
    num_relationships) or one of the less-than / greater-than / equals
    operator strings compared below. dupe_type is one of
    HC.DUPLICATE_FALSE_POSITIVE, HC.DUPLICATE_ALTERNATE, HC.DUPLICATE_MEMBER
    or HC.DUPLICATE_POTENTIAL; any other value returns an empty set.

    doesn't work for '= 0' or '< 1': only files that have at least one row
    in the relevant table are ever considered.

    NOTE(review): an unrecognised operator leaves filter_func undefined, so
    the first attempt to use it raises NameError/UnboundLocalError -- confirm
    callers only pass the four operators handled here.
    """
    if operator == CC.UNICODE_ALMOST_EQUAL_TO:

        lower_bound = 0.8 * num_relationships
        upper_bound = 1.2 * num_relationships

        def filter_func(count):
            return lower_bound < count and count < upper_bound

    elif operator == ' < ':

        def filter_func(count):
            return count < num_relationships

    elif operator == ' > ':

        def filter_func(count):
            return count > num_relationships

    elif operator == ' = ':

        def filter_func(count):
            return count == num_relationships

    hash_ids = set()

    if dupe_type == HC.DUPLICATE_FALSE_POSITIVE:

        # cache of 'does this group have any file in our file domain?'
        alternates_group_ids_to_valid_for_file_domain = {}

        alternates_group_ids_to_false_positives = collections.defaultdict(list)

        query = ' SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives; '

        # the relation is symmetric, so register it on both sides
        for (alternates_group_id_a, alternates_group_id_b) in self._Execute(query):

            alternates_group_ids_to_false_positives[alternates_group_id_a].append(alternates_group_id_b)
            alternates_group_ids_to_false_positives[alternates_group_id_b].append(alternates_group_id_a)

        for (alternates_group_id, false_positive_alternates_group_ids) in alternates_group_ids_to_false_positives.items():

            count = 0

            for false_positive_alternates_group_id in false_positive_alternates_group_ids:

                if false_positive_alternates_group_id not in alternates_group_ids_to_valid_for_file_domain:

                    valid = False

                    fp_media_ids = self.DuplicatesGetAlternateMediaIds(false_positive_alternates_group_id)

                    for fp_media_id in fp_media_ids:

                        fp_hash_ids = self.DuplicatesGetDuplicateHashIds(fp_media_id, db_location_context=db_location_context)

                        if len(fp_hash_ids) > 0:

                            # one visible file is enough
                            valid = True

                            break

                    alternates_group_ids_to_valid_for_file_domain[false_positive_alternates_group_id] = valid

                if alternates_group_ids_to_valid_for_file_domain[false_positive_alternates_group_id]:
                    count += 1

            if filter_func(count):

                media_ids = self.DuplicatesGetAlternateMediaIds(alternates_group_id)

                # accumulate across groups: plain assignment here would throw away
                # every matching group except the last one iterated
                hash_ids.update(self.DuplicatesGetDuplicatesHashIds(media_ids, db_location_context=db_location_context))

    elif dupe_type == HC.DUPLICATE_ALTERNATE:

        query = ' SELECT alternates_group_id, COUNT( * ) FROM alternate_file_group_members GROUP BY alternates_group_id; '

        results = self._Execute(query).fetchall()

        for (alternates_group_id, count) in results:

            count -= 1  # num relationships is number group members - 1

            media_ids = self.DuplicatesGetAlternateMediaIds(alternates_group_id)

            alternates_group_id_hash_ids = []

            for media_id in media_ids:

                media_id_hash_ids = self.DuplicatesGetDuplicateHashIds(media_id, db_location_context=db_location_context)

                if len(media_id_hash_ids) == 0:

                    # this alternate relation does not count for our current file domain, so it should not contribute to the count
                    count -= 1

                else:

                    alternates_group_id_hash_ids.extend(media_id_hash_ids)

            if filter_func(count):
                hash_ids.update(alternates_group_id_hash_ids)

    elif dupe_type == HC.DUPLICATE_MEMBER:

        table_join = db_location_context.GetTableJoinLimitedByFileDomain(' duplicate_file_members ')

        query = ' SELECT media_id, COUNT( * ) FROM {} GROUP BY media_id; '.format(table_join)

        media_ids = []

        for (media_id, count) in self._Execute(query):

            count -= 1  # a file's relations are the other members of its group

            if filter_func(count):
                media_ids.append(media_id)

        hash_ids = self.DuplicatesGetDuplicatesHashIds(media_ids, db_location_context=db_location_context)

    elif dupe_type == HC.DUPLICATE_POTENTIAL:

        table_join = self.DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService(db_location_context)

        smaller_query = ' SELECT smaller_media_id, COUNT( * ) FROM ( SELECT DISTINCT smaller_media_id, larger_media_id FROM {} ) GROUP BY smaller_media_id; '.format(table_join)
        larger_query = ' SELECT larger_media_id, COUNT( * ) FROM ( SELECT DISTINCT smaller_media_id, larger_media_id FROM {} ) GROUP BY larger_media_id; '.format(table_join)

        # a media can appear on either side of a pair, so sum both views
        media_ids_to_counts = collections.Counter()

        for (media_id, count) in self._Execute(smaller_query):
            media_ids_to_counts[media_id] += count

        for (media_id, count) in self._Execute(larger_query):
            media_ids_to_counts[media_id] += count

        media_ids = [media_id for (media_id, count) in media_ids_to_counts.items() if filter_func(count)]

        hash_ids = self.DuplicatesGetDuplicatesHashIds(media_ids, db_location_context=db_location_context)

    return hash_ids
def DuplicatesGetKingHashId(self, media_id):
    """Return the king hash id stored for media_id in duplicate_files.

    There is no handling for a missing row: if the query finds nothing, the
    unpacking below raises TypeError.
    """
    row = self._Execute(' SELECT king_hash_id FROM duplicate_files WHERE media_id = ?; ', (media_id,)).fetchone()

    (king_hash_id,) = row

    return king_hash_id
def DuplicatesGetMediaId(self, hash_id, do_not_create=False):
    """Return the media id that hash_id is a member of.

    If the hash has no media group: returns None when do_not_create is true,
    otherwise inserts a new duplicate_files row with this hash as king, adds
    the hash as its member and returns the new media id.
    """
    row = self._Execute(' SELECT media_id FROM duplicate_file_members WHERE hash_id = ?; ', (hash_id,)).fetchone()

    if row is not None:

        (media_id,) = row

        return media_id

    if do_not_create:
        return None

    # no group yet: this file becomes king of a fresh single-member group
    self._Execute(' INSERT INTO duplicate_files ( king_hash_id ) VALUES ( ? ); ', (hash_id,))

    media_id = self._GetLastRowId()

    self._Execute(' INSERT INTO duplicate_file_members ( media_id, hash_id ) VALUES ( ?, ? ); ', (media_id, hash_id))

    return media_id
2022-01-19 21:28:59 +00:00
def DuplicatesGetPotentialDuplicatePairsTableJoinOnEverythingSearchResults(self, db_location_context: ClientDBFilesStorage.DBLocationContext, pixel_dupes_preference: int, max_hamming_distance: int):
    """Build the 'tables ON ( predicate )' text for potential pairs with no search-result filter.

    The join always links potential_duplicate_pairs to the duplicate_files row
    of each side and caps distance at max_hamming_distance. Unless the location
    is 'all known files', both kings must also appear in the location's single
    files table. pixel_dupes_preference can further require, or exclude, pairs
    whose kings share a pixel_hash_id in pixel_hash_map.
    """
    table_list = ' potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, duplicate_files AS duplicate_files_larger '

    predicate = ' smaller_media_id = duplicate_files_smaller.media_id AND larger_media_id = duplicate_files_larger.media_id AND distance <= {} '.format(max_hamming_distance)

    if not db_location_context.location_context.IsAllKnownFiles():
        current_files_table = db_location_context.GetSingleFilesTableName()

        table_list = ' {} , {} AS current_files_smaller, {} AS current_files_larger '.format(table_list, current_files_table, current_files_table)

        predicate = ' {} AND duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id '.format(predicate)

    # both kings map to the same pixel hash
    same_pixels_predicate = ' duplicate_files_smaller.king_hash_id = pixel_hash_map_smaller.hash_id AND duplicate_files_larger.king_hash_id = pixel_hash_map_larger.hash_id AND pixel_hash_map_smaller.pixel_hash_id = pixel_hash_map_larger.pixel_hash_id '

    if pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED:
        table_list = ' {} , pixel_hash_map AS pixel_hash_map_smaller, pixel_hash_map AS pixel_hash_map_larger '.format(table_list)

        predicate = ' {} AND {} '.format(predicate, same_pixels_predicate)

    elif pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_EXCLUDED:
        # a plain "AND NOT ..." inside the join would match every non-equal pixel row, so the
        # exclusion is phrased as a NOT EXISTS subquery instead
        exists_subquery = ' SELECT 1 FROM pixel_hash_map AS pixel_hash_map_smaller, pixel_hash_map as pixel_hash_map_larger ON ( {} ) '.format(same_pixels_predicate)

        predicate = ' {} AND NOT EXISTS ( {} ) '.format(predicate, exists_subquery)

    return ' {} ON ( {} ) '.format(table_list, predicate)
2022-01-19 21:28:59 +00:00
def DuplicatesGetPotentialDuplicatePairsTableJoinOnFileService(self, db_location_context: ClientDBFilesStorage.DBLocationContext):
    """Build the table join that limits potential pairs to a file location.

    For 'all known files' the bare potential_duplicate_pairs table is returned.
    Otherwise the pairs are joined through duplicate_files so that the king of
    each side must be present in the location's single files table.
    """
    if db_location_context.location_context.IsAllKnownFiles():
        return ' potential_duplicate_pairs '

    current_files_table = db_location_context.GetSingleFilesTableName()

    join_template = ' potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, {} AS current_files_smaller, duplicate_files AS duplicate_files_larger, {} AS current_files_larger ON ( smaller_media_id = duplicate_files_smaller.media_id AND duplicate_files_smaller.king_hash_id = current_files_smaller.hash_id AND larger_media_id = duplicate_files_larger.media_id AND duplicate_files_larger.king_hash_id = current_files_larger.hash_id ) '

    return join_template.format(current_files_table, current_files_table)
2022-01-19 21:28:59 +00:00
def DuplicatesGetPotentialDuplicatePairsTableJoinOnSearchResults(self, db_location_context: ClientDBFilesStorage.DBLocationContext, results_table_name: str, both_files_match: bool, pixel_dupes_preference: int, max_hamming_distance: int):
    """Build the 'tables ON ( predicate )' text for potential pairs that match a file search.

    Give this a table of search results (as in the duplicate filter UI) and it
    returns a join over potential_duplicate_pairs, two aliases of
    duplicate_files, one or two aliases of the results table, optionally the
    location's files table and optionally two aliases of pixel_hash_map.

    both_files_match requires both kings to be in results_table_name. Otherwise
    one king must be in the results; when the location is not 'all known
    files', the other king must be in the location's single files table.
    Pairs are capped at max_hamming_distance, and pixel_dupes_preference can
    require or exclude pairs whose kings share a pixel_hash_id.
    """
    pair_tables = ' potential_duplicate_pairs, duplicate_files AS duplicate_files_smaller, duplicate_files AS duplicate_files_larger '

    media_to_hashes_predicate = ' smaller_media_id = duplicate_files_smaller.media_id AND larger_media_id = duplicate_files_larger.media_id AND distance <= {} '.format(max_hamming_distance)

    if both_files_match:
        all_tables = ' {} , {} AS results_smaller, {} AS results_larger '.format(pair_tables, results_table_name, results_table_name)

        allowed_results_predicate = ' duplicate_files_smaller.king_hash_id = results_smaller.hash_id AND duplicate_files_larger.king_hash_id = results_larger.hash_id '

    elif db_location_context.location_context.IsAllKnownFiles():
        all_tables = ' {} , {} AS results_table_for_this_query '.format(pair_tables, results_table_name)

        allowed_results_predicate = ' ( duplicate_files_smaller.king_hash_id = results_table_for_this_query.hash_id OR duplicate_files_larger.king_hash_id = results_table_for_this_query.hash_id ) '

    else:
        current_files_table = db_location_context.GetSingleFilesTableName()

        all_tables = ' {} , {} AS results_table_for_this_query, {} AS current_files_for_this_query '.format(pair_tables, results_table_name, current_files_table)

        # one side is a search result, the other side merely has to be in the file domain
        smaller_is_result = ' ( duplicate_files_smaller.king_hash_id = results_table_for_this_query.hash_id AND duplicate_files_larger.king_hash_id = current_files_for_this_query.hash_id ) '

        larger_is_result = ' ( duplicate_files_smaller.king_hash_id = current_files_for_this_query.hash_id AND duplicate_files_larger.king_hash_id = results_table_for_this_query.hash_id ) '

        allowed_results_predicate = ' ( {} OR {} ) '.format(smaller_is_result, larger_is_result)

    # both kings map to the same pixel hash
    same_pixels_predicate = ' duplicate_files_smaller.king_hash_id = pixel_hash_map_smaller.hash_id AND duplicate_files_larger.king_hash_id = pixel_hash_map_larger.hash_id AND pixel_hash_map_smaller.pixel_hash_id = pixel_hash_map_larger.pixel_hash_id '

    if pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_REQUIRED:
        all_tables = ' {} , pixel_hash_map AS pixel_hash_map_smaller, pixel_hash_map AS pixel_hash_map_larger '.format(all_tables)

        allowed_results_predicate = ' {} AND {} '.format(allowed_results_predicate, same_pixels_predicate)

    elif pixel_dupes_preference == CC.SIMILAR_FILES_PIXEL_DUPES_EXCLUDED:
        # a plain "AND NOT ..." inside the join would match every non-equal pixel row, so the
        # exclusion is phrased as a NOT EXISTS subquery instead
        exists_subquery = ' SELECT 1 FROM pixel_hash_map AS pixel_hash_map_smaller, pixel_hash_map as pixel_hash_map_larger ON ( {} ) '.format(same_pixels_predicate)

        allowed_results_predicate = ' {} AND NOT EXISTS ( {} ) '.format(allowed_results_predicate, exists_subquery)

    full_predicate = ' {} AND {} '.format(media_to_hashes_predicate, allowed_results_predicate)

    return ' {} ON ( {} ) '.format(all_tables, full_predicate)
def DuplicatesMediasAreAlternates(self, media_id_a, media_id_b):
    """Return True when both medias already belong to the same alternates group.

    Groups are looked up with do_not_create=True; a media without a group is
    never an alternate of anything.
    """
    group_id_a = self.DuplicatesGetAlternatesGroupId(media_id_a, do_not_create=True)

    if group_id_a is None:
        return False

    group_id_b = self.DuplicatesGetAlternatesGroupId(media_id_b, do_not_create=True)

    return group_id_b is not None and group_id_a == group_id_b
def DuplicatesMediasAreConfirmedAlternates(self, media_id_a, media_id_b):
    """Return True when confirmed_alternate_pairs holds a row for this pair (in either argument order)."""
    # rows are stored with the smaller id first
    ordered_pair = (min(media_id_a, media_id_b), max(media_id_a, media_id_b))

    row = self._Execute(' SELECT 1 FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?; ', ordered_pair).fetchone()

    if row is None:
        return False

    return True
def DuplicatesMediasAreFalsePositive(self, media_id_a, media_id_b):
    """Return whether the alternates groups of the two medias are recorded as false positive.

    Groups are looked up with do_not_create=True; if either media has no
    alternates group the answer is False.
    """
    group_ids = []

    for media_id in (media_id_a, media_id_b):
        group_id = self.DuplicatesGetAlternatesGroupId(media_id, do_not_create=True)

        if group_id is None:
            return False

        group_ids.append(group_id)

    (group_id_a, group_id_b) = group_ids

    return self.DuplicatesAlternatesGroupsAreFalsePositive(group_id_a, group_id_b)
def DuplicatesMergeMedias(self, superior_media_id, mergee_media_id):
    """Fold the mergee media group into the superior media group.

    All members of mergee_media_id are moved to superior_media_id, the
    mergee's potential pairs and confirmed alternate pairs are carried over
    to the superior, and the mergee's now-empty records are deleted.
    Merging a media into itself is a no-op.
    """
    if superior_media_id == mergee_media_id:
        return

    self.DuplicatesClearPotentialsBetweenMedias((superior_media_id,), (mergee_media_id,))

    alternates_group_id = self.DuplicatesGetAlternatesGroupId(superior_media_id)
    mergee_alternates_group_id = self.DuplicatesGetAlternatesGroupId(mergee_media_id)

    if alternates_group_id != mergee_alternates_group_id:
        if self.DuplicatesAlternatesGroupsAreFalsePositive(alternates_group_id, mergee_alternates_group_id):
            # the false positive record has to go first, or DuplicatesSetAlternates would bail out
            smaller_alternates_group_id = min(alternates_group_id, mergee_alternates_group_id)
            larger_alternates_group_id = max(alternates_group_id, mergee_alternates_group_id)

            self._Execute(' DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? AND larger_alternates_group_id = ?; ', (smaller_alternates_group_id, larger_alternates_group_id))

        self.DuplicatesSetAlternates(superior_media_id, mergee_media_id)

    self._Execute(' UPDATE duplicate_file_members SET media_id = ? WHERE media_id = ?; ', (superior_media_id, mergee_media_id))

    # the ( superior, mergee ) pair in stored order; these two names must keep this meaning for the
    # rest of the method, so the loop below deliberately uses different variable names
    smaller_media_id = min(superior_media_id, mergee_media_id)
    larger_media_id = max(superior_media_id, mergee_media_id)

    # ensure the potential merge pair is gone
    self._Execute(' DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?; ', (smaller_media_id, larger_media_id))

    # now merge potentials from the old to the new--however this has complicated tests to stop confirmed alts and so on, so can't just update ids
    existing_potential_info_of_mergee_media_id = self._Execute(' SELECT smaller_media_id, larger_media_id, distance FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ', (mergee_media_id, mergee_media_id)).fetchall()

    self._Execute(' DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ', (mergee_media_id, mergee_media_id))

    for (potential_smaller_media_id, potential_larger_media_id, distance) in existing_potential_info_of_mergee_media_id:
        # swap the mergee for the superior on whichever side it sat
        if potential_smaller_media_id == mergee_media_id:
            media_id_a = superior_media_id
            media_id_b = potential_larger_media_id
        else:
            media_id_a = potential_smaller_media_id
            media_id_b = superior_media_id

        potential_duplicate_media_ids_and_distances = [(media_id_b, distance)]

        self.DuplicatesAddPotentialDuplicates(media_id_a, potential_duplicate_media_ids_and_distances)

    # ensure any previous confirmed alt pair is gone
    # (if this row survived, the updates below would rewrite it into a ( superior, superior ) self-pair)
    self._Execute(' DELETE FROM confirmed_alternate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?; ', (smaller_media_id, larger_media_id))

    # now merge confirmed alts from the old to the new
    self._Execute(' UPDATE OR IGNORE confirmed_alternate_pairs SET smaller_media_id = ? WHERE smaller_media_id = ?; ', (superior_media_id, mergee_media_id))
    self._Execute(' UPDATE OR IGNORE confirmed_alternate_pairs SET larger_media_id = ? WHERE larger_media_id = ?; ', (superior_media_id, mergee_media_id))

    # and clear out potentials that are now invalid
    confirmed_alternate_pairs = self._Execute(' SELECT smaller_media_id, larger_media_id FROM confirmed_alternate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ', (superior_media_id, superior_media_id)).fetchall()

    self._ExecuteMany(' DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? AND larger_media_id = ?; ', confirmed_alternate_pairs)

    # clear out empty records
    self._Execute(' DELETE FROM alternate_file_group_members WHERE media_id = ?; ', (mergee_media_id,))
    self._Execute(' DELETE FROM duplicate_files WHERE media_id = ?; ', (mergee_media_id,))
def DuplicatesRemoveAlternateMember ( self , media_id ) :
# Detach media_id from its alternates group, drop its confirmed alternate pairs, and queue its
# files for a fresh similar-files search.
# look the group up without creating one; a media with no group has no membership to remove
alternates_group_id = self . DuplicatesGetAlternatesGroupId ( media_id , do_not_create = True )
if alternates_group_id is not None :
# fetched before the membership row is deleted, so the list still counts media_id itself
alternates_media_ids = self . DuplicatesGetAlternateMediaIds ( alternates_group_id )
self . _Execute ( ' DELETE FROM alternate_file_group_members WHERE media_id = ?; ' , ( media_id , ) )
self . _Execute ( ' DELETE FROM confirmed_alternate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ' , ( media_id , media_id ) )
if len ( alternates_media_ids ) == 1 : # i.e. what we just removed was the last of the group
# the group is now empty: remove its master row and any false positive records that mention it
self . _Execute ( ' DELETE FROM alternate_file_groups WHERE alternates_group_id = ?; ' , ( alternates_group_id , ) )
self . _Execute ( ' DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?; ' , ( alternates_group_id , alternates_group_id ) )
# hand every file of this media back to the similar files module via ResetSearch
hash_ids = self . DuplicatesGetDuplicateHashIds ( media_id )
self . modules_similar_files . ResetSearch ( hash_ids )
def DuplicatesRemoveAlternateMemberFromHashes(self, hashes):
    """Run DuplicatesRemoveAlternateMember for the media of every given hash that already has one."""
    for hash_id in self.modules_hashes_local_cache.GetHashIds(hashes):
        media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

        if media_id is None:
            # this file was never given a media group, so there is nothing to remove
            continue

        self.DuplicatesRemoveAlternateMember(media_id)
def DuplicatesRemoveMediaIdMember ( self , hash_id ) :
# Remove a single file from the media group it belongs to, if it belongs to one.
# look the group up without creating one
media_id = self . DuplicatesGetMediaId ( hash_id , do_not_create = True )
if media_id is not None :
king_hash_id = self . DuplicatesGetKingHashId ( media_id )
if hash_id == king_hash_id :
# the file is the king of its group, so the whole media id is handed to DuplicatesDissolveMediaId
self . DuplicatesDissolveMediaId ( media_id )
else :
# a non-king member: only its membership row is deleted, then the file is passed to ResetSearch
self . _Execute ( ' DELETE FROM duplicate_file_members WHERE hash_id = ?; ' , ( hash_id , ) )
self . modules_similar_files . ResetSearch ( ( hash_id , ) )
def DuplicatesRemoveMediaIdMemberFromHashes(self, hashes):
    """Run DuplicatesRemoveMediaIdMember for the hash_id of every given hash."""
    resolved_hash_ids = self.modules_hashes_local_cache.GetHashIds(hashes)

    for resolved_hash_id in resolved_hash_ids:
        self.DuplicatesRemoveMediaIdMember(resolved_hash_id)
def DuplicatesRemovePotentialPairs(self, hash_id):
    """Delete every potential_duplicate_pairs row that involves the media of hash_id.

    Nothing happens when the file has no media group yet (none is created).
    """
    media_id = self.DuplicatesGetMediaId(hash_id, do_not_create=True)

    if media_id is None:
        return

    # the media can sit on either side of a stored pair
    self._Execute(' DELETE FROM potential_duplicate_pairs WHERE smaller_media_id = ? OR larger_media_id = ?; ', (media_id, media_id))
def DuplicatesRemovePotentialPairsFromHashes(self, hashes):
    """Run DuplicatesRemovePotentialPairs for the hash_id of every given hash."""
    resolved_hash_ids = self.modules_hashes_local_cache.GetHashIds(hashes)

    for resolved_hash_id in resolved_hash_ids:
        self.DuplicatesRemovePotentialPairs(resolved_hash_id)
def DuplicatesSetAlternates(self, media_id_a, media_id_b):
    """Record that two medias are alternates and merge B's alternates group into A's.

    Any potential pair between the two medias is cleared first. If their
    alternates groups are already marked false positive, nothing else
    happens. Otherwise a confirmed_alternate_pairs row is written and, when
    the groups differ, every member and false positive record of B's group
    is moved to A's group and B's group row is deleted.
    """
    if media_id_a == media_id_b:
        return

    # let's clear out any outstanding potentials. whether this is a valid or not connection, we don't want to see it again
    self.DuplicatesClearPotentialsBetweenMedias((media_id_a,), (media_id_b,))

    # now check if we should be making a new relationship
    alternates_group_id_a = self.DuplicatesGetAlternatesGroupId(media_id_a)
    alternates_group_id_b = self.DuplicatesGetAlternatesGroupId(media_id_b)

    if self.DuplicatesAlternatesGroupsAreFalsePositive(alternates_group_id_a, alternates_group_id_b):
        return

    # write a confirmed result so this can't come up again due to subsequent re-searching etc...
    # in future, I can tune this to consider alternate labels and indices. alternates with different labels and indices are not appropriate for potentials, so we can add more rows here
    smaller_media_id = min(media_id_a, media_id_b)
    larger_media_id = max(media_id_a, media_id_b)

    self._Execute(' INSERT OR IGNORE INTO confirmed_alternate_pairs ( smaller_media_id, larger_media_id ) VALUES ( ?, ? ); ', (smaller_media_id, larger_media_id))

    if alternates_group_id_a == alternates_group_id_b:
        return

    # ok, they are currently not alternates, so we need to merge B into A

    # first, for all false positive relationships that A already has, clear out potentials between B and those fps before it moves over
    # the rows are materialised with fetchall() because the helper called inside the loop may run
    # further statements; a result that is still being iterated would be lost if they reuse the cursor
    false_positive_pairs = self._Execute(' SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?; ', (alternates_group_id_a, alternates_group_id_a)).fetchall()

    for (smaller_false_positive_alternates_group_id, larger_false_positive_alternates_group_id) in false_positive_pairs:
        if smaller_false_positive_alternates_group_id == alternates_group_id_a:
            self.DuplicatesClearPotentialsBetweenAlternatesGroups(alternates_group_id_b, larger_false_positive_alternates_group_id)
        else:
            self.DuplicatesClearPotentialsBetweenAlternatesGroups(smaller_false_positive_alternates_group_id, alternates_group_id_b)

    # first, update all B to A
    self._Execute(' UPDATE alternate_file_group_members SET alternates_group_id = ? WHERE alternates_group_id = ?; ', (alternates_group_id_a, alternates_group_id_b))

    # move false positive records for B to A
    # fetchall() is essential here: the DELETE on the next line runs before the rows are iterated
    false_positive_pairs = self._Execute(' SELECT smaller_alternates_group_id, larger_alternates_group_id FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?; ', (alternates_group_id_b, alternates_group_id_b)).fetchall()

    self._Execute(' DELETE FROM duplicate_false_positives WHERE smaller_alternates_group_id = ? OR larger_alternates_group_id = ?; ', (alternates_group_id_b, alternates_group_id_b))

    for (smaller_false_positive_alternates_group_id, larger_false_positive_alternates_group_id) in false_positive_pairs:
        if smaller_false_positive_alternates_group_id == alternates_group_id_b:
            self.DuplicatesSetFalsePositive(alternates_group_id_a, larger_false_positive_alternates_group_id)
        else:
            self.DuplicatesSetFalsePositive(smaller_false_positive_alternates_group_id, alternates_group_id_a)

    # remove master record
    self._Execute(' DELETE FROM alternate_file_groups WHERE alternates_group_id = ?; ', (alternates_group_id_b,))

    # pubsub to refresh alternates info for alternates_group_id_a and _b goes here
def DuplicatesSetFalsePositive(self, alternates_group_id_a, alternates_group_id_b):
    """Record two different alternates groups as a false positive pair.

    Potentials between the two groups are cleared first; the pair is stored
    with the smaller group id first. Passing the same group twice does nothing.
    """
    if alternates_group_id_a != alternates_group_id_b:
        self.DuplicatesClearPotentialsBetweenAlternatesGroups(alternates_group_id_a, alternates_group_id_b)

        ordered_pair = (
            min(alternates_group_id_a, alternates_group_id_b),
            max(alternates_group_id_a, alternates_group_id_b),
        )

        self._Execute(' INSERT OR IGNORE INTO duplicate_false_positives ( smaller_alternates_group_id, larger_alternates_group_id ) VALUES ( ?, ? ); ', ordered_pair)
def DuplicatesSetKing(self, king_hash_id, media_id):
    """Overwrite the king_hash_id stored in duplicate_files for media_id."""
    update_query = ' UPDATE duplicate_files SET king_hash_id = ? WHERE media_id = ?; '

    self._Execute(update_query, (king_hash_id, media_id))
def DuplicatesSetKingFromHash(self, hash):
    """Make the file with the given hash the king of its media group.

    The media group is looked up with DuplicatesGetMediaId without
    do_not_create, so the lookup is allowed to create one.
    """
    new_king_hash_id = self.modules_hashes_local_cache.GetHashId(hash)

    owning_media_id = self.DuplicatesGetMediaId(new_king_hash_id)

    self.DuplicatesSetKing(new_king_hash_id, owning_media_id)
def GetTablesAndColumnsThatUseDefinitions(self, content_type: int) -> typing.List[typing.Tuple[str, str]]:
    """Return the ( table name, column name ) pairs reported for the given content type.

    Only HC.CONTENT_TYPE_HASH yields an entry; every other content type gets an
    empty list.
    """
    # NOTE(review): the single pair reported is file_maintenance_jobs.hash_id, while the queries in
    # this module read hash ids from duplicate_files.king_hash_id and duplicate_file_members.hash_id
    # -- confirm whether those are what should be listed here.
    if content_type != HC.CONTENT_TYPE_HASH:
        return []

    return [(' file_maintenance_jobs ', ' hash_id ')]