import os
import time
import unittest

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusGlobals as HG

from hydrus.client import ClientConstants as CC
from hydrus.client import ClientLocation
from hydrus.client import ClientSearch
from hydrus.client.db import ClientDB
from hydrus.client.importing import ClientImportFiles
from hydrus.client.importing.options import FileImportOptions

from hydrus.test import TestController

class TestClientDBDuplicates( unittest.TestCase ):
    
    @classmethod
    def _clear_db( cls ):
        
        cls._delete_db()
        
        # class variable
        cls._db = ClientDB.DB( HG.test_controller, TestController.DB_DIR, 'client' )
        
    
    @classmethod
    def _delete_db( cls ):
        
        cls._db.Shutdown()
        
        while not cls._db.LoopIsFinished():
            
            time.sleep( 0.1 )
            
        
        db_filenames = list( cls._db._db_filenames.values() )
        
        for filename in db_filenames:
            
            path = os.path.join( TestController.DB_DIR, filename )
            
            os.remove( path )
            
        
        del cls._db
        
    
    @classmethod
    def setUpClass( cls ):
        
        cls._db = ClientDB.DB( HG.test_controller, TestController.DB_DIR, 'client' )
        
        HG.test_controller.SetRead( 'hash_status', ClientImportFiles.FileImportStatus.STATICGetUnknownStatus() )
        
    
    @classmethod
    def tearDownClass( cls ):
        
        cls._delete_db()
        
    
    def _read( self, action, *args, **kwargs ):
        
        return TestClientDBDuplicates._db.Read( action, *args, **kwargs )
        
    
    def _write( self, action, *args, **kwargs ):
        
        return TestClientDBDuplicates._db.Write( action, True, *args, **kwargs )
        
    
    def _get_group_potential_count( self, file_duplicate_types_to_counts ):
        
        num_potentials = len( self._all_hashes ) - 1
        
        num_potentials -= len( self._our_main_dupe_group_hashes ) - 1
        num_potentials -= len( self._our_second_dupe_group_hashes ) - 1
        num_potentials -= len( self._our_alt_dupe_group_hashes ) - 1
        num_potentials -= len( self._our_fp_dupe_group_hashes ) - 1
        
        if HC.DUPLICATE_FALSE_POSITIVE in file_duplicate_types_to_counts:
            
            # this would not work if the fp group had multiple alt members
            
            num_potentials -= file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ]
            
        
        if HC.DUPLICATE_ALTERNATE in file_duplicate_types_to_counts:
            
            num_potentials -= file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ]
            
        
        return num_potentials
        
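    # A rough sanity check of the arithmetic above: every file starts out as a
    # potential pair of the queried file, i.e. len( all ) - 1 potentials. Each
    # extra member of a confirmed group collapses one potential, hence the
    # 'len( group ) - 1' subtractions, and each confirmed false positive or
    # alternate relationship removes roughly one more. The 'expected - 1' slack
    # allowed in some assertions below reflects that this estimate is not exact.
    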
    def _import_and_find_dupes( self ):
        
        perceptual_hash = os.urandom( 8 )
        
        # fake-import the files with the perceptual_hash
        
        ( size, mime, width, height, duration, num_frames, has_audio, num_words ) = ( 65535, HC.IMAGE_JPEG, 640, 480, None, None, False, None )
        
        file_import_options = FileImportOptions.FileImportOptions()
        file_import_options.SetIsDefault( True )
        
        for hash in self._all_hashes:
            
            fake_file_import_job = ClientImportFiles.FileImportJob( 'fake path', file_import_options )
            
            fake_file_import_job._pre_import_file_status = ClientImportFiles.FileImportStatus( CC.STATUS_UNKNOWN, hash )
            fake_file_import_job._file_info = ( size, mime, width, height, duration, num_frames, has_audio, num_words )
            fake_file_import_job._extra_hashes = ( b'abcd', b'abcd', b'abcd' )
            fake_file_import_job._perceptual_hashes = [ perceptual_hash ]
            fake_file_import_job._file_import_options = FileImportOptions.FileImportOptions()
            
            self._write( 'import_file', fake_file_import_job )
            
        
        # run search maintenance
        
        self._write( 'maintain_similar_files_tree' )
        
        self._write( 'maintain_similar_files_search_for_potential_duplicates', 0 )
        
    
    def _test_initial_state( self ):
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertEqual( num_potentials, self._expected_num_potentials )
        
        result = self._read( 'random_potential_duplicate_hashes', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertEqual( len( result ), len( self._all_hashes ) )
        self.assertEqual( set( result ), self._all_hashes )
        
        filtering_pairs = self._read( 'duplicate_pairs_for_filtering', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        for ( a, b ) in filtering_pairs:
            
            self.assertIn( a.GetHash(), self._all_hashes )
            self.assertIn( b.GetHash(), self._all_hashes )
            
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[0] )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 1 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[0], HC.DUPLICATE_POTENTIAL )
        
        self.assertEqual( result[0], self._dupe_hashes[0] )
        self.assertEqual( set( result ), self._all_hashes )
        
    
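    # Each row written to 'duplicate_pair_status' below is
    # ( duplicate_type, hash_a, hash_b, dict ), the dict being empty everywhere
    # in this test. Setting HC.DUPLICATE_BETTER folds hash_b into hash_a's
    # duplicate group, so each write here is mirrored by an update to the
    # relevant self._our_..._dupe_group_hashes set.
    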
    def _test_initial_better_worse( self ):
        
        row = ( HC.DUPLICATE_BETTER, self._king_hash, self._dupe_hashes[1], {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        self._our_main_dupe_group_hashes.add( self._dupe_hashes[1] )
        
        row = ( HC.DUPLICATE_BETTER, self._dupe_hashes[1], self._dupe_hashes[2], {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        self._our_main_dupe_group_hashes.add( self._dupe_hashes[2] )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self._num_free_agents -= 1
        self._expected_num_potentials -= self._num_free_agents
        
        self._num_free_agents -= 1
        self._expected_num_potentials -= self._num_free_agents
        
        self.assertEqual( num_potentials, self._expected_num_potentials )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[1] )
        
        self.assertEqual( result[ 'is_king' ], False )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[2] )
        
        self.assertEqual( result[ 'is_king' ], False )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[1], HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._dupe_hashes[1], self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[1], HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._dupe_hashes[1] )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[2], HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._dupe_hashes[2], self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[2], HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._dupe_hashes[2] )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
    
    def _test_initial_king_usurp( self ):
        
        self._old_king_hash = self._king_hash
        self._king_hash = self._dupe_hashes[3]
        
        row = ( HC.DUPLICATE_BETTER, self._king_hash, self._old_king_hash, {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        self._our_main_dupe_group_hashes.add( self._king_hash )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self._num_free_agents -= 1
        self._expected_num_potentials -= self._num_free_agents
        
        self.assertEqual( num_potentials, self._expected_num_potentials )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._old_king_hash )
        
        self.assertEqual( result[ 'is_king' ], False )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._old_king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._old_king_hash, self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._old_king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._old_king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
    
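    # Same-quality pairs merge groups just as 'better' pairs do, but they should
    # leave the existing king in place rather than crowning a new one.
    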
    def _test_initial_same_quality( self ):
        
        row = ( HC.DUPLICATE_SAME_QUALITY, self._king_hash, self._dupe_hashes[4], {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        self._our_main_dupe_group_hashes.add( self._dupe_hashes[4] )
        
        row = ( HC.DUPLICATE_SAME_QUALITY, self._old_king_hash, self._dupe_hashes[5], {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        self._our_main_dupe_group_hashes.add( self._dupe_hashes[5] )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self._num_free_agents -= 1
        self._expected_num_potentials -= self._num_free_agents
        
        self._num_free_agents -= 1
        self._expected_num_potentials -= self._num_free_agents
        
        self.assertEqual( num_potentials, self._expected_num_potentials )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[4] )
        
        self.assertEqual( result[ 'is_king' ], False )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[5] )
        
        self.assertEqual( result[ 'is_king' ], False )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[4], HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._dupe_hashes[4], self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[4], HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._dupe_hashes[4] )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[5], HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._dupe_hashes[5], self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._dupe_hashes[5], HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._dupe_hashes[5] )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
    
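    # An explicit 'duplicate_set_king' write should be able to promote any
    # member to king, including one that only joined via a same-quality pair.
    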
    def _test_explicit_set_new_king( self ):
        
        self._write( 'duplicate_set_king', self._dupe_hashes[5] )
        
        self._king_hash = self._dupe_hashes[5]
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
    
    def _test_establish_second_group( self ):
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._second_group_king_hash, self._second_group_dupe_hashes[1], {} ) )
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._second_group_king_hash, self._second_group_dupe_hashes[2], {} ) )
        rows.append( ( HC.DUPLICATE_BETTER, self._second_group_king_hash, self._second_group_dupe_hashes[3], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        self._our_second_dupe_group_hashes.add( self._second_group_dupe_hashes[1] )
        self._our_second_dupe_group_hashes.add( self._second_group_dupe_hashes[2] )
        self._our_second_dupe_group_hashes.add( self._second_group_dupe_hashes[3] )
        
    
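    # The next two tests 'poach' files from the second group: once a
    # second-group member is recorded as worse than or the same quality as a
    # main-group file, it should migrate into the main group, and both
    # membership sets track the move.
    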
    def _test_poach_better( self ):
        
        # better than not the king
        
        row = ( HC.DUPLICATE_BETTER, self._king_hash, self._second_group_dupe_hashes[1], {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        self._our_second_dupe_group_hashes.discard( self._second_group_dupe_hashes[1] )
        self._our_main_dupe_group_hashes.add( self._second_group_dupe_hashes[1] )
        
        self._write( 'maintain_similar_files_search_for_potential_duplicates', 0 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        # TODO: sometimes this is 20 instead of 21
        # my guess is this is some complicated relationships due to random population of this test
        # the answer is to rewrite this monstrosity so the tests are simpler to understand and pull apart
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._second_group_king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_second_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._second_group_king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._second_group_king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._second_group_king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._second_group_king_hash )
        self.assertEqual( set( result ), self._our_second_dupe_group_hashes )
        
    
    def _test_poach_same( self ):
        
        # not the king is the same as not the king
        
        row = ( HC.DUPLICATE_SAME_QUALITY, self._old_king_hash, self._second_group_dupe_hashes[2], {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        self._our_second_dupe_group_hashes.discard( self._second_group_dupe_hashes[2] )
        self._our_main_dupe_group_hashes.add( self._second_group_dupe_hashes[2] )
        
        self._write( 'maintain_similar_files_search_for_potential_duplicates', 0 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        # TODO: sometimes this is 20 instead of 21
        # my guess is this is some complicated relationships due to random population of this test
        # the answer is to rewrite this monstrosity so the tests are simpler to understand and pull apart
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._second_group_king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_second_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._second_group_king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._second_group_king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._second_group_king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._second_group_king_hash )
        self.assertEqual( set( result ), self._our_second_dupe_group_hashes )
        
    
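    # Group merge: the first batch of rows below builds four little two-file
    # groups from dupe_hashes 6-13, and the second batch ties each of them to
    # the main group, so all eight files should end up as main-group members.
    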
    def _test_group_merge( self ):
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._dupe_hashes[6], self._dupe_hashes[7], {} ) )
        rows.append( ( HC.DUPLICATE_BETTER, self._dupe_hashes[8], self._dupe_hashes[9], {} ) )
        rows.append( ( HC.DUPLICATE_BETTER, self._dupe_hashes[10], self._dupe_hashes[11], {} ) )
        rows.append( ( HC.DUPLICATE_BETTER, self._dupe_hashes[12], self._dupe_hashes[13], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._old_king_hash, self._dupe_hashes[6], {} ) )
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._king_hash, self._dupe_hashes[8], {} ) )
        rows.append( ( HC.DUPLICATE_BETTER, self._old_king_hash, self._dupe_hashes[10], {} ) )
        rows.append( ( HC.DUPLICATE_BETTER, self._king_hash, self._dupe_hashes[12], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        self._our_main_dupe_group_hashes.update( ( self._dupe_hashes[ i ] for i in range( 6, 14 ) ) )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 2 )
        
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_KING )
        
        self.assertEqual( result, [ self._king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_MEMBER )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), self._our_main_dupe_group_hashes )
        
    
    def _test_establish_false_positive_group( self ):
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._false_positive_king_hash, self._similar_looking_false_positive_hashes[1], {} ) )
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._false_positive_king_hash, self._similar_looking_false_positive_hashes[2], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        self._our_fp_dupe_group_hashes.add( self._similar_looking_false_positive_hashes[1] )
        self._our_fp_dupe_group_hashes.add( self._similar_looking_false_positive_hashes[2] )
        
    
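    # A false positive relationship records that two groups are not duplicates
    # of each other after all. The assertions here and in
    # _test_expand_false_positive suggest it is tracked per group rather than
    # per file: members added to either side later still count under
    # HC.DUPLICATE_FALSE_POSITIVE.
    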
    def _test_false_positive( self ):
        
        row = ( HC.DUPLICATE_FALSE_POSITIVE, self._king_hash, self._false_positive_king_hash, {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 3 )
        
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._false_positive_king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 3 )
        
        expected = self._get_group_potential_count( file_duplicate_types_to_counts )
        self.assertIn( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], ( expected, expected - 1 ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_fp_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_FALSE_POSITIVE )
        
        self.assertEqual( result, [ self._king_hash, self._false_positive_king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._false_positive_king_hash, HC.DUPLICATE_FALSE_POSITIVE )
        
        self.assertEqual( result, [ self._false_positive_king_hash, self._king_hash ] )
        
    
    def _test_establish_alt_group( self ):
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._alternate_king_hash, self._similar_looking_alternate_hashes[1], {} ) )
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._alternate_king_hash, self._similar_looking_alternate_hashes[2], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        self._our_alt_dupe_group_hashes.add( self._similar_looking_alternate_hashes[1] )
        self._our_alt_dupe_group_hashes.add( self._similar_looking_alternate_hashes[2] )
        
    
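    # Alternates link two duplicate groups as related-but-intentionally-different
    # files. Confirming the pair should bump both HC.DUPLICATE_ALTERNATE and
    # HC.DUPLICATE_CONFIRMED_ALTERNATE to 1 on each side without merging the
    # groups themselves.
    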
    def _test_alt( self ):
        
        row = ( HC.DUPLICATE_ALTERNATE, self._king_hash, self._alternate_king_hash, {} )
        
        self._write( 'duplicate_pair_status', [ row ] )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 5 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._alternate_king_hash, HC.DUPLICATE_POTENTIAL )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._alternate_king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 5 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_alt_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_ALTERNATE )
        
        self.assertEqual( result, [ self._king_hash, self._alternate_king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._alternate_king_hash, HC.DUPLICATE_ALTERNATE )
        
        self.assertEqual( result, [ self._alternate_king_hash, self._king_hash ] )
        
    
    def _test_expand_false_positive( self ):
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._false_positive_king_hash, self._similar_looking_false_positive_hashes[3], {} ) )
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._false_positive_king_hash, self._similar_looking_false_positive_hashes[4], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        self._our_fp_dupe_group_hashes.add( self._similar_looking_false_positive_hashes[3] )
        self._our_fp_dupe_group_hashes.add( self._similar_looking_false_positive_hashes[4] )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 5 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._false_positive_king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 3 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_fp_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 2 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_FALSE_POSITIVE )
        
        self.assertEqual( result[0], self._king_hash )
        self.assertEqual( set( result ), { self._king_hash, self._false_positive_king_hash } )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._false_positive_king_hash, HC.DUPLICATE_FALSE_POSITIVE )
        
        self.assertEqual( result[0], self._false_positive_king_hash )
        self.assertEqual( set( result ), { self._false_positive_king_hash, self._king_hash, self._alternate_king_hash } )
        
    
    def _test_expand_alt( self ):
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._alternate_king_hash, self._similar_looking_alternate_hashes[3], {} ) )
        rows.append( ( HC.DUPLICATE_SAME_QUALITY, self._alternate_king_hash, self._similar_looking_alternate_hashes[4], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        pixel_dupes_preference = CC.SIMILAR_FILES_PIXEL_DUPES_ALLOWED
        max_hamming_distance = 4
        dupe_search_type = CC.DUPE_SEARCH_BOTH_FILES_MATCH_ONE_SEARCH
        
        num_potentials = self._read( 'potential_duplicates_count', self._file_search_context_1, self._file_search_context_2, dupe_search_type, pixel_dupes_preference, max_hamming_distance )
        
        self.assertLess( num_potentials, self._expected_num_potentials )
        
        self._expected_num_potentials = num_potentials
        
        self._our_alt_dupe_group_hashes.add( self._similar_looking_alternate_hashes[3] )
        self._our_alt_dupe_group_hashes.add( self._similar_looking_alternate_hashes[4] )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 5 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._alternate_king_hash )
        
        self.assertEqual( result[ 'is_king' ], True )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 5 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_alt_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash, HC.DUPLICATE_ALTERNATE )
        
        self.assertEqual( result, [ self._king_hash, self._alternate_king_hash ] )
        
        result = self._read( 'file_duplicate_hashes', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._alternate_king_hash, HC.DUPLICATE_ALTERNATE )
        
        self.assertEqual( result, [ self._alternate_king_hash, self._king_hash ] )
        
    
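    # Teardown, from narrowest to broadest: delete this group's potential pairs,
    # remove a single member, clear false positives, remove one file from the
    # alternates group, then dissolve the whole alternates group and finally the
    # duplicate group itself, at which point the counts dict should be empty.
    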
    def _test_dissolve( self ):
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 5 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_POTENTIAL ], self._get_group_potential_count( file_duplicate_types_to_counts ) )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        # remove potentials
        
        self._write( 'remove_potential_pairs', ( self._king_hash, ) )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 4 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        # remove member
        
        self._write( 'remove_duplicates_member', ( self._dupe_hashes[7], ) )
        
        self._our_main_dupe_group_hashes.discard( self._dupe_hashes[7] )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 4 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_FALSE_POSITIVE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        # clear fps
        
        self._write( 'clear_false_positive_relations', ( self._king_hash, ) )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 3 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        # remove alt
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_ALTERNATE, self._king_hash, self._false_positive_king_hash, {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 3 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 2 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 2 )
        
        self._write( 'remove_alternates_member', ( self._false_positive_king_hash, ) )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 3 )
        
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_MEMBER ], len( self._our_main_dupe_group_hashes ) - 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_ALTERNATE ], 1 )
        self.assertEqual( file_duplicate_types_to_counts[ HC.DUPLICATE_CONFIRMED_ALTERNATE ], 1 )
        
        # dissolve alt
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_ALTERNATE, self._king_hash, self._false_positive_king_hash, {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        self._write( 'dissolve_alternates_group', ( self._king_hash, ) )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 0 )
        
        # dissolve group
        
        rows = []
        
        rows.append( ( HC.DUPLICATE_BETTER, self._king_hash, self._dupe_hashes[1], {} ) )
        
        self._write( 'duplicate_pair_status', rows )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 1 )
        
        self._write( 'dissolve_duplicates_group', ( self._king_hash, ) )
        
        result = self._read( 'file_duplicate_info', ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY ), self._king_hash )
        
        file_duplicate_types_to_counts = result[ 'counts' ]
        
        self.assertEqual( len( file_duplicate_types_to_counts ), 0 )
        
    
    def test_duplicates( self ):
        
        self._dupe_hashes = [ HydrusData.GenerateKey() for i in range( 16 ) ]
        self._second_group_dupe_hashes = [ HydrusData.GenerateKey() for i in range( 4 ) ]
        self._similar_looking_alternate_hashes = [ HydrusData.GenerateKey() for i in range( 5 ) ]
        self._similar_looking_false_positive_hashes = [ HydrusData.GenerateKey() for i in range( 5 ) ]
        
        self._all_hashes = set()
        
        self._all_hashes.update( self._dupe_hashes )
        self._all_hashes.update( self._second_group_dupe_hashes )
        self._all_hashes.update( self._similar_looking_alternate_hashes )
        self._all_hashes.update( self._similar_looking_false_positive_hashes )
        
        self._king_hash = self._dupe_hashes[0]
        self._second_group_king_hash = self._second_group_dupe_hashes[0]
        self._false_positive_king_hash = self._similar_looking_false_positive_hashes[0]
        self._alternate_king_hash = self._similar_looking_alternate_hashes[0]
        
        self._our_main_dupe_group_hashes = set( [ self._king_hash ] )
        self._our_second_dupe_group_hashes = set( [ self._second_group_king_hash ] )
        self._our_alt_dupe_group_hashes = set( [ self._alternate_king_hash ] )
        self._our_fp_dupe_group_hashes = set( [ self._false_positive_king_hash ] )
        
        n = len( self._all_hashes )
        
        self._num_free_agents = n
        
        # the initial number of pair combinations is n * ( n - 1 ) / 2
        self._expected_num_potentials = int( n * ( n - 1 ) / 2 )
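        # a quick worked check of that formula: 16 + 4 + 5 + 5 = 30 files all
        # share one perceptual hash, so every file should initially pair with
        # every other, giving 30 * 29 / 2 = 435 potentials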
        
        size_pred = ClientSearch.Predicate( ClientSearch.PREDICATE_TYPE_SYSTEM_SIZE, ( '=', 65535, HydrusData.ConvertUnitToInt( 'B' ) ) )
        png_pred = ClientSearch.Predicate( ClientSearch.PREDICATE_TYPE_SYSTEM_MIME, ( HC.IMAGE_PNG, ) )
        
        location_context = ClientLocation.LocationContext.STATICCreateSimple( CC.LOCAL_FILE_SERVICE_KEY )
        
        self._file_search_context_1 = ClientSearch.FileSearchContext( location_context = location_context, predicates = [ size_pred ] )
        self._file_search_context_2 = ClientSearch.FileSearchContext( location_context = location_context, predicates = [ png_pred ] )
        
        self._import_and_find_dupes()
        
        self._test_initial_state()
        
        self._test_initial_better_worse()
        self._test_initial_king_usurp()
        self._test_initial_same_quality()
        
        self._test_explicit_set_new_king()
        
        self._test_establish_second_group()
        self._test_poach_better()
        self._test_poach_same()
        self._test_group_merge()
        
        self._test_establish_false_positive_group()
        self._test_false_positive()
        
        self._test_establish_alt_group()
        self._test_alt()
        
        self._test_expand_false_positive()
        self._test_expand_alt()
        
        self._test_dissolve()