hydrus/hydrus/client/ClientDuplicates.py

588 lines
24 KiB
Python
Raw Normal View History

2020-05-20 21:36:02 +00:00
import collections
2020-12-09 22:18:48 +00:00
import threading
import time
2020-05-20 21:36:02 +00:00
2020-04-22 21:00:35 +00:00
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusSerialisable
2021-04-07 21:26:45 +00:00
from hydrus.core import HydrusTags
2018-05-16 20:09:50 +00:00
2020-07-29 20:52:44 +00:00
from hydrus.client import ClientConstants as CC
2020-12-09 22:18:48 +00:00
from hydrus.client import ClientThreading
from hydrus.client.metadata import ClientTags
2020-07-29 20:52:44 +00:00
2020-12-09 22:18:48 +00:00
class DuplicatesManager( object ):
my_instance = None
def __init__( self ):
DuplicatesManager.my_instance = self
self._similar_files_maintenance_status = None
self._currently_refreshing_maintenance_numbers = False
self._refresh_maintenance_numbers = True
self._currently_doing_potentials_search = False
self._lock = threading.Lock()
@staticmethod
def instance() -> 'DuplicatesManager':
if DuplicatesManager.my_instance is None:
DuplicatesManager()
return DuplicatesManager.my_instance
def GetMaintenanceNumbers( self ):
with self._lock:
if self._refresh_maintenance_numbers and not self._currently_refreshing_maintenance_numbers:
self._refresh_maintenance_numbers = False
self._currently_refreshing_maintenance_numbers = True
HG.client_controller.pub( 'new_similar_files_maintenance_numbers' )
HG.client_controller.CallToThread( self.THREADRefreshMaintenanceNumbers )
return ( self._similar_files_maintenance_status, self._currently_refreshing_maintenance_numbers, self._currently_doing_potentials_search )
def RefreshMaintenanceNumbers( self ):
with self._lock:
self._refresh_maintenance_numbers = True
HG.client_controller.pub( 'new_similar_files_maintenance_numbers' )
2020-12-16 22:29:51 +00:00
def NotifyNewPotentialsSearchNumbers( self ):
HG.client_controller.pub( 'new_similar_files_potentials_search_numbers' )
2020-12-09 22:18:48 +00:00
def StartPotentialsSearch( self ):
with self._lock:
if self._currently_doing_potentials_search or self._similar_files_maintenance_status is None:
return
self._currently_doing_potentials_search = True
HG.client_controller.CallToThreadLongRunning( self.THREADSearchPotentials )
def THREADRefreshMaintenanceNumbers( self ):
try:
similar_files_maintenance_status = HG.client_controller.Read( 'similar_files_maintenance_status' )
with self._lock:
self._similar_files_maintenance_status = similar_files_maintenance_status
if self._refresh_maintenance_numbers:
self._refresh_maintenance_numbers = False
HG.client_controller.CallToThread( self.THREADRefreshMaintenanceNumbers )
else:
self._currently_refreshing_maintenance_numbers = False
self._refresh_maintenance_numbers = False
HG.client_controller.pub( 'new_similar_files_maintenance_numbers' )
except:
self._currently_refreshing_maintenance_numbers = False
HG.client_controller.pub( 'new_similar_files_maintenance_numbers' )
raise
def THREADSearchPotentials( self ):
try:
search_distance = HG.client_controller.new_options.GetInteger( 'similar_files_duplicate_pairs_search_distance' )
with self._lock:
if self._similar_files_maintenance_status is None:
return
searched_distances_to_count = self._similar_files_maintenance_status
total_num_files = sum( searched_distances_to_count.values() )
num_searched = sum( ( count for ( value, count ) in searched_distances_to_count.items() if value is not None and value >= search_distance ) )
all_files_searched = num_searched >= total_num_files
if all_files_searched:
return # no work to do
num_searched_estimate = num_searched
HG.client_controller.pub( 'new_similar_files_maintenance_numbers' )
job_key = ClientThreading.JobKey( cancellable = True )
2021-06-09 20:28:09 +00:00
job_key.SetStatusTitle( 'searching for potential duplicates' )
2020-12-09 22:18:48 +00:00
HG.client_controller.pub( 'message', job_key )
still_work_to_do = True
while still_work_to_do:
search_distance = HG.client_controller.new_options.GetInteger( 'similar_files_duplicate_pairs_search_distance' )
2021-06-09 20:28:09 +00:00
start_time = HydrusData.GetNowPrecise()
2020-12-09 22:18:48 +00:00
( still_work_to_do, num_done ) = HG.client_controller.WriteSynchronous( 'maintain_similar_files_search_for_potential_duplicates', search_distance, maintenance_mode = HC.MAINTENANCE_FORCED, job_key = job_key, work_time_float = 0.5 )
2021-06-09 20:28:09 +00:00
time_it_took = HydrusData.GetNowPrecise() - start_time
2020-12-09 22:18:48 +00:00
num_searched_estimate += num_done
2021-07-14 20:42:19 +00:00
if num_searched_estimate > total_num_files:
similar_files_maintenance_status = HG.client_controller.Read( 'similar_files_maintenance_status' )
if similar_files_maintenance_status is None:
break
with self._lock:
self._similar_files_maintenance_status = similar_files_maintenance_status
searched_distances_to_count = self._similar_files_maintenance_status
total_num_files = max( num_searched_estimate, sum( searched_distances_to_count.values() ) )
2020-12-09 22:18:48 +00:00
text = 'searching: {}'.format( HydrusData.ConvertValueRangeToPrettyString( num_searched_estimate, total_num_files ) )
job_key.SetVariable( 'popup_text_1', text )
job_key.SetVariable( 'popup_gauge_1', ( num_searched_estimate, total_num_files ) )
if job_key.IsCancelled() or HG.model_shutdown:
break
2021-07-14 20:42:19 +00:00
time.sleep( min( 5, time_it_took ) ) # ideally 0.5s, but potentially longer
2020-12-09 22:18:48 +00:00
job_key.Delete()
finally:
with self._lock:
self._currently_doing_potentials_search = False
self.RefreshMaintenanceNumbers()
2020-12-16 22:29:51 +00:00
self.NotifyNewPotentialsSearchNumbers()
2020-12-09 22:18:48 +00:00
2018-05-16 20:09:50 +00:00
class DuplicateActionOptions( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_DUPLICATE_ACTION_OPTIONS
SERIALISABLE_NAME = 'Duplicate Action Options'
2019-05-22 22:35:06 +00:00
SERIALISABLE_VERSION = 4
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
def __init__( self, tag_service_actions = None, rating_service_actions = None, sync_archive = False, sync_urls_action = None ):
2018-05-16 20:09:50 +00:00
if tag_service_actions is None:
tag_service_actions = []
if rating_service_actions is None:
rating_service_actions = []
HydrusSerialisable.SerialisableBase.__init__( self )
self._tag_service_actions = tag_service_actions
self._rating_service_actions = rating_service_actions
self._sync_archive = sync_archive
self._sync_urls_action = sync_urls_action
def _GetSerialisableInfo( self ):
if HG.client_controller.IsBooted():
services_manager = HG.client_controller.services_manager
2020-03-11 21:52:11 +00:00
self._tag_service_actions = [ ( service_key, action, tag_filter ) for ( service_key, action, tag_filter ) in self._tag_service_actions if services_manager.ServiceExists( service_key ) and services_manager.GetServiceType( service_key ) in HC.REAL_TAG_SERVICES ]
2018-05-16 20:09:50 +00:00
self._rating_service_actions = [ ( service_key, action ) for ( service_key, action ) in self._rating_service_actions if services_manager.ServiceExists( service_key ) and services_manager.GetServiceType( service_key ) in ( HC.LOCAL_RATING_LIKE, HC.LOCAL_RATING_NUMERICAL ) ]
2019-01-09 22:59:03 +00:00
serialisable_tag_service_actions = [ ( service_key.hex(), action, tag_filter.GetSerialisableTuple() ) for ( service_key, action, tag_filter ) in self._tag_service_actions ]
serialisable_rating_service_actions = [ ( service_key.hex(), action ) for ( service_key, action ) in self._rating_service_actions ]
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
return ( serialisable_tag_service_actions, serialisable_rating_service_actions, self._sync_archive, self._sync_urls_action )
2018-05-16 20:09:50 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
2019-05-22 22:35:06 +00:00
( serialisable_tag_service_actions, serialisable_rating_service_actions, self._sync_archive, self._sync_urls_action ) = serialisable_info
2018-05-16 20:09:50 +00:00
2019-01-09 22:59:03 +00:00
self._tag_service_actions = [ ( bytes.fromhex( serialisable_service_key ), action, HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_filter ) ) for ( serialisable_service_key, action, serialisable_tag_filter ) in serialisable_tag_service_actions ]
self._rating_service_actions = [ ( bytes.fromhex( serialisable_service_key ), action ) for ( serialisable_service_key, action ) in serialisable_rating_service_actions ]
2018-05-16 20:09:50 +00:00
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( serialisable_service_actions, delete_second_file ) = old_serialisable_info
tag_service_actions = []
rating_service_actions = []
# As the client isn't booted when this is loaded in options, there isn't a good way to figure out tag from rating
# So, let's just dupe and purge later on, in serialisation
for ( service_key_encoded, action ) in serialisable_service_actions:
2019-01-09 22:59:03 +00:00
service_key = bytes.fromhex( service_key_encoded )
2018-05-16 20:09:50 +00:00
2021-04-07 21:26:45 +00:00
tag_filter = HydrusTags.TagFilter()
2018-05-16 20:09:50 +00:00
tag_service_actions.append( ( service_key, action, tag_filter ) )
rating_service_actions.append( ( service_key, action ) )
2019-01-09 22:59:03 +00:00
serialisable_tag_service_actions = [ ( service_key.hex(), action, tag_filter.GetSerialisableTuple() ) for ( service_key, action, tag_filter ) in tag_service_actions ]
serialisable_rating_service_actions = [ ( service_key.hex(), action ) for ( service_key, action ) in rating_service_actions ]
2018-05-16 20:09:50 +00:00
sync_archive = delete_second_file
delete_both_files = False
new_serialisable_info = ( serialisable_tag_service_actions, serialisable_rating_service_actions, delete_second_file, sync_archive, delete_both_files )
return ( 2, new_serialisable_info )
if version == 2:
( serialisable_tag_service_actions, serialisable_rating_service_actions, delete_second_file, sync_archive, delete_both_files ) = old_serialisable_info
sync_urls_action = None
new_serialisable_info = ( serialisable_tag_service_actions, serialisable_rating_service_actions, delete_second_file, sync_archive, delete_both_files, sync_urls_action )
return ( 3, new_serialisable_info )
2019-05-22 22:35:06 +00:00
if version == 3:
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
( serialisable_tag_service_actions, serialisable_rating_service_actions, delete_second_file, sync_archive, delete_both_files, sync_urls_action ) = old_serialisable_info
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
new_serialisable_info = ( serialisable_tag_service_actions, serialisable_rating_service_actions, sync_archive, sync_urls_action )
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
return ( 4, new_serialisable_info )
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
def SetTuple( self, tag_service_actions, rating_service_actions, sync_archive, sync_urls_action ):
2018-05-16 20:09:50 +00:00
self._tag_service_actions = tag_service_actions
self._rating_service_actions = rating_service_actions
self._sync_archive = sync_archive
self._sync_urls_action = sync_urls_action
def ToTuple( self ):
2019-05-22 22:35:06 +00:00
return ( self._tag_service_actions, self._rating_service_actions, self._sync_archive, self._sync_urls_action )
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
def ProcessPairIntoContentUpdates( self, first_media, second_media, delete_first = False, delete_second = False, delete_both = False, file_deletion_reason = None ):
2019-04-10 22:50:53 +00:00
if file_deletion_reason is None:
file_deletion_reason = 'unknown reason'
2018-05-16 20:09:50 +00:00
2018-11-07 23:09:40 +00:00
service_keys_to_content_updates = collections.defaultdict( list )
2018-05-16 20:09:50 +00:00
first_hashes = first_media.GetHashes()
second_hashes = second_media.GetHashes()
#
services_manager = HG.client_controller.services_manager
for ( service_key, action, tag_filter ) in self._tag_service_actions:
content_updates = []
try:
service = services_manager.GetService( service_key )
except HydrusExceptions.DataMissing:
continue
service_type = service.GetServiceType()
if service_type == HC.LOCAL_TAG:
add_content_action = HC.CONTENT_UPDATE_ADD
elif service_type == HC.TAG_REPOSITORY:
add_content_action = HC.CONTENT_UPDATE_PEND
2019-10-02 23:38:59 +00:00
first_tags = first_media.GetTagsManager().GetCurrentAndPending( service_key, ClientTags.TAG_DISPLAY_STORAGE )
second_tags = second_media.GetTagsManager().GetCurrentAndPending( service_key, ClientTags.TAG_DISPLAY_STORAGE )
2018-05-16 20:09:50 +00:00
2019-01-23 22:19:16 +00:00
first_tags = tag_filter.Filter( first_tags )
second_tags = tag_filter.Filter( second_tags )
2018-05-16 20:09:50 +00:00
if action == HC.CONTENT_MERGE_ACTION_TWO_WAY_MERGE:
2019-01-23 22:19:16 +00:00
first_needs = second_tags.difference( first_tags )
second_needs = first_tags.difference( second_tags )
2018-05-16 20:09:50 +00:00
content_updates.extend( ( HydrusData.ContentUpdate( HC.CONTENT_TYPE_MAPPINGS, add_content_action, ( tag, first_hashes ) ) for tag in first_needs ) )
content_updates.extend( ( HydrusData.ContentUpdate( HC.CONTENT_TYPE_MAPPINGS, add_content_action, ( tag, second_hashes ) ) for tag in second_needs ) )
elif action == HC.CONTENT_MERGE_ACTION_COPY:
2019-01-23 22:19:16 +00:00
first_needs = second_tags.difference( first_tags )
2018-05-16 20:09:50 +00:00
content_updates.extend( ( HydrusData.ContentUpdate( HC.CONTENT_TYPE_MAPPINGS, add_content_action, ( tag, first_hashes ) ) for tag in first_needs ) )
elif service_type == HC.LOCAL_TAG and action == HC.CONTENT_MERGE_ACTION_MOVE:
2019-01-23 22:19:16 +00:00
first_needs = second_tags.difference( first_tags )
2018-05-16 20:09:50 +00:00
content_updates.extend( ( HydrusData.ContentUpdate( HC.CONTENT_TYPE_MAPPINGS, add_content_action, ( tag, first_hashes ) ) for tag in first_needs ) )
2019-01-23 22:19:16 +00:00
content_updates.extend( ( HydrusData.ContentUpdate( HC.CONTENT_TYPE_MAPPINGS, HC.CONTENT_UPDATE_DELETE, ( tag, second_hashes ) ) for tag in second_tags ) )
2018-05-16 20:09:50 +00:00
if len( content_updates ) > 0:
2018-11-07 23:09:40 +00:00
service_keys_to_content_updates[ service_key ].extend( content_updates )
2018-05-16 20:09:50 +00:00
2019-02-27 23:03:30 +00:00
def worth_updating_rating( source_rating, dest_rating ):
if source_rating is not None:
if dest_rating is None or source_rating > dest_rating:
return True
return False
2018-05-16 20:09:50 +00:00
for ( service_key, action ) in self._rating_service_actions:
content_updates = []
try:
service = services_manager.GetService( service_key )
except HydrusExceptions.DataMissing:
continue
first_current_value = first_media.GetRatingsManager().GetRating( service_key )
second_current_value = second_media.GetRatingsManager().GetRating( service_key )
if action == HC.CONTENT_MERGE_ACTION_TWO_WAY_MERGE:
2019-02-27 23:03:30 +00:00
if worth_updating_rating( first_current_value, second_current_value ):
2018-05-16 20:09:50 +00:00
2019-02-27 23:03:30 +00:00
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_RATINGS, HC.CONTENT_UPDATE_ADD, ( first_current_value, second_hashes ) ) )
2018-05-16 20:09:50 +00:00
2019-02-27 23:03:30 +00:00
elif worth_updating_rating( second_current_value, first_current_value ):
2018-05-16 20:09:50 +00:00
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_RATINGS, HC.CONTENT_UPDATE_ADD, ( second_current_value, first_hashes ) ) )
elif action == HC.CONTENT_MERGE_ACTION_COPY:
2019-02-27 23:03:30 +00:00
if worth_updating_rating( second_current_value, first_current_value ):
2018-05-16 20:09:50 +00:00
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_RATINGS, HC.CONTENT_UPDATE_ADD, ( second_current_value, first_hashes ) ) )
elif action == HC.CONTENT_MERGE_ACTION_MOVE:
if second_current_value is not None:
2019-02-27 23:03:30 +00:00
if worth_updating_rating( second_current_value, first_current_value ):
2018-05-16 20:09:50 +00:00
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_RATINGS, HC.CONTENT_UPDATE_ADD, ( second_current_value, first_hashes ) ) )
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_RATINGS, HC.CONTENT_UPDATE_ADD, ( None, second_hashes ) ) )
if len( content_updates ) > 0:
2018-11-07 23:09:40 +00:00
service_keys_to_content_updates[ service_key ].extend( content_updates )
2018-05-16 20:09:50 +00:00
#
if self._sync_archive:
if first_media.HasInbox() and second_media.HasArchive():
content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, first_hashes )
2018-11-07 23:09:40 +00:00
service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
2018-05-16 20:09:50 +00:00
elif first_media.HasArchive() and second_media.HasInbox():
content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, second_hashes )
2018-11-07 23:09:40 +00:00
service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
2018-05-16 20:09:50 +00:00
#
if self._sync_urls_action is not None:
first_urls = set( first_media.GetLocationsManager().GetURLs() )
second_urls = set( second_media.GetLocationsManager().GetURLs() )
content_updates = []
if self._sync_urls_action == HC.CONTENT_MERGE_ACTION_TWO_WAY_MERGE:
first_needs = second_urls.difference( first_urls )
second_needs = first_urls.difference( second_urls )
2018-10-17 21:00:09 +00:00
if len( first_needs ) > 0:
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( first_needs, first_hashes ) ) )
if len( second_needs ) > 0:
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( second_needs, second_hashes ) ) )
2018-05-16 20:09:50 +00:00
elif self._sync_urls_action == HC.CONTENT_MERGE_ACTION_COPY:
first_needs = second_urls.difference( first_urls )
2018-10-17 21:00:09 +00:00
if len( first_needs ) > 0:
content_updates.append( HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( first_needs, first_hashes ) ) )
2018-05-16 20:09:50 +00:00
if len( content_updates ) > 0:
2018-11-07 23:09:40 +00:00
service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].extend( content_updates )
2018-05-16 20:09:50 +00:00
#
deletee_media = []
2019-05-22 22:35:06 +00:00
if delete_first or delete_second or delete_both:
2018-05-16 20:09:50 +00:00
2019-05-22 22:35:06 +00:00
if delete_first or delete_both:
2018-05-16 20:09:50 +00:00
deletee_media.append( first_media )
2019-05-22 22:35:06 +00:00
if delete_second or delete_both:
2019-04-10 22:50:53 +00:00
2019-05-22 22:35:06 +00:00
deletee_media.append( second_media )
2019-04-10 22:50:53 +00:00
2018-05-16 20:09:50 +00:00
2021-05-19 21:30:28 +00:00
delete_lock_for_archived_files = HG.client_controller.new_options.GetBoolean( 'delete_lock_for_archived_files' )
2018-05-16 20:09:50 +00:00
for media in deletee_media:
2021-05-19 21:30:28 +00:00
if delete_lock_for_archived_files and not media.HasInbox():
continue
2021-04-28 21:43:16 +00:00
if media.GetLocationsManager().IsTrashed():
2018-05-16 20:09:50 +00:00
2021-04-28 21:43:16 +00:00
deletee_service_keys = ( CC.COMBINED_LOCAL_FILE_SERVICE_KEY, )
2018-05-16 20:09:50 +00:00
else:
2021-04-28 21:43:16 +00:00
local_file_service_keys = HG.client_controller.services_manager.GetServiceKeys( ( HC.LOCAL_FILE_DOMAIN, ) )
deletee_service_keys = media.GetLocationsManager().GetCurrent().intersection( local_file_service_keys )
2018-05-16 20:09:50 +00:00
2021-04-28 21:43:16 +00:00
for deletee_service_key in deletee_service_keys:
2018-05-16 20:09:50 +00:00
2019-04-10 22:50:53 +00:00
content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_DELETE, media.GetHashes(), reason = file_deletion_reason )
2018-05-16 20:09:50 +00:00
service_keys_to_content_updates[ deletee_service_key ].append( content_update )
#
2018-11-07 23:09:40 +00:00
return service_keys_to_content_updates
2018-05-16 20:09:50 +00:00
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_DUPLICATE_ACTION_OPTIONS ] = DuplicateActionOptions