hydrus/hydrus/client/db/ClientDBMappingsStorage.py

435 lines
20 KiB
Python

import sqlite3
import typing
from hydrus.core import HydrusConstants as HC
from hydrus.client.db import ClientDBModule
from hydrus.client.db import ClientDBServices
def DoingAFileJoinTagSearchIsFaster( estimated_file_row_count, estimated_tag_row_count ):
    """
    Decide whether a tag search limited to an already-known set of files should be driven from the file side.
    
    Background, from experimental profiling:
    
    - Sometimes we want a tag search when we already hold a superset of the file results
      (e.g. 'get all of these files that are tagged with samus').
    - One option is to search using the tag outer-join-loop/indices and intersect/difference in python.
    - The other is to make a temp file table and CROSS JOIN or EXISTS or similar, so the file
      outer-join-loop/indices drive the query.
    - Looking up mappings via the file indices is generally about 2.5 times as slow per row, and setting
      up the temp table plus misc overhead costs roughly another 0.1.
    - We do this estimate ourselves rather than leaning on sqlite's own analysis because that sometimes
      fails once the queries get complex.
    
    So the file-driven search only wins when ( file rows * 2.6 ) is strictly less than the tag rows.
    
    :param estimated_file_row_count: estimated number of rows in the known file result
    :param estimated_tag_row_count: estimated number of rows the tag-driven search would touch
    :return: True if the file-join approach is expected to be faster
    """
    
    per_row_slowdown = 2.5
    setup_overhead = 0.1
    
    file_cost_multiplier = per_row_slowdown + setup_overhead
    
    projected_file_join_cost = estimated_file_row_count * file_cost_multiplier
    
    return projected_file_join_cost < estimated_tag_row_count
def GenerateMappingsTableNames( service_id: int ) -> typing.Tuple[ str, str, str, str ]:
    """
    Build the four storage mappings table names for a tag service.
    
    :param service_id: the service id, stringified and used as the table name suffix
    :return: ( current, deleted, pending, petitioned ) table names, each carrying the 'external_mappings.' prefix
    """
    
    name_templates = (
        'external_mappings.current_mappings_{}',
        'external_mappings.deleted_mappings_{}',
        'external_mappings.pending_mappings_{}',
        'external_mappings.petitioned_mappings_{}'
    )
    
    service_suffix = str( service_id )
    
    ( current_name, deleted_name, pending_name, petitioned_name ) = ( template.format( service_suffix ) for template in name_templates )
    
    return ( current_name, deleted_name, pending_name, petitioned_name )
def GenerateSpecificDisplayMappingsCacheTableNames( file_service_id, tag_service_id ):
    """
    Build the display mappings cache table names for a ( file service, tag service ) pair.
    
    :param file_service_id: first half of the '{file}_{tag}' table name suffix
    :param tag_service_id: second half of the '{file}_{tag}' table name suffix
    :return: ( display current, display pending ) table names, each carrying the 'external_caches.' prefix
    """
    
    name_templates = (
        'external_caches.specific_display_current_mappings_cache_{}',
        'external_caches.specific_display_pending_mappings_cache_{}'
    )
    
    pair_suffix = '{}_{}'.format( file_service_id, tag_service_id )
    
    return tuple( template.format( pair_suffix ) for template in name_templates )
def GenerateSpecificMappingsCacheTableNames( file_service_id, tag_service_id ):
    """
    Build the mappings cache table names for a ( file service, tag service ) pair.
    
    There is no petitioned cache table; only three names come back.
    
    :param file_service_id: first half of the '{file}_{tag}' table name suffix
    :param tag_service_id: second half of the '{file}_{tag}' table name suffix
    :return: ( current, deleted, pending ) cache table names, each carrying the 'external_caches.' prefix
    """
    
    name_templates = (
        'external_caches.specific_current_mappings_cache_{}',
        'external_caches.specific_deleted_mappings_cache_{}',
        'external_caches.specific_pending_mappings_cache_{}'
    )
    
    pair_suffix = '{}_{}'.format( file_service_id, tag_service_id )
    
    return tuple( template.format( pair_suffix ) for template in name_templates )
class ClientDBMappingsStorage( ClientDBModule.ClientDBModule ):
def __init__( self, cursor: sqlite3.Cursor, modules_services: ClientDBServices.ClientDBMasterServices ):
    """
    Keep a reference to the services module and initialise the base module.
    
    :param cursor: the sqlite cursor handed on to the ClientDBModule base class
    :param modules_services: services module, used by this class to look up service ids
    """
    
    # set before the base __init__ runs, so the attribute already exists during base initialisation
    self.modules_services = modules_services
    
    ClientDBModule.ClientDBModule.__init__( self, 'client mappings storage', cursor )
def _GetServiceIndexGenerationDict( self, service_id ) -> dict:
    """
    Describe the indices wanted on the four storage mappings tables of a service.
    
    Every table gets the same single entry: a unique index on ( hash_id, tag_id ), marked with 400.
    
    :param service_id: the tag service whose table names are used as keys
    :return: dict of table name -> list of ( columns, unique, version_added ) tuples
    """
    
    # a fresh list (and column list) is built per table so no two entries share a mutable object
    return {
        table_name : [ ( [ 'hash_id', 'tag_id' ], True, 400 ) ]
        for table_name in GenerateMappingsTableNames( service_id )
    }
def _GetServiceTableGenerationDict( self, service_id ) -> dict:
    """
    Describe how to create the four storage mappings tables of a service.
    
    The create queries leave a '{}' placeholder where the table name goes.
    The petitioned table carries an extra reason_id column; the other three share one schema.
    
    :param service_id: the tag service whose table names are used as keys
    :return: dict of table name -> ( create query without name, version_added )
    """
    
    version_added = 400
    
    mappings_create_query = 'CREATE TABLE IF NOT EXISTS {} ( tag_id INTEGER, hash_id INTEGER, PRIMARY KEY ( tag_id, hash_id ) ) WITHOUT ROWID;'
    petitioned_create_query = 'CREATE TABLE IF NOT EXISTS {} ( tag_id INTEGER, hash_id INTEGER, reason_id INTEGER, PRIMARY KEY ( tag_id, hash_id ) ) WITHOUT ROWID;'
    
    ( current_name, deleted_name, pending_name, petitioned_name ) = GenerateMappingsTableNames( service_id )
    
    table_generation_dict = { table_name : ( mappings_create_query, version_added ) for table_name in ( current_name, deleted_name, pending_name ) }
    
    table_generation_dict[ petitioned_name ] = ( petitioned_create_query, version_added )
    
    return table_generation_dict
def _GetServiceIdsWeGenerateDynamicTablesFor( self ):
    """
    Return the service ids that get per-service mappings tables.
    
    :return: whatever the services module returns for HC.REAL_TAG_SERVICES
    """
    
    real_tag_service_ids = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )
    
    return real_tag_service_ids
def ClearMappingsTables( self, service_id: int ):
    """
    Delete every row from the four storage mappings tables of a service, leaving the tables in place.
    
    :param service_id: the tag service whose tables are emptied
    """
    
    # names arrive in ( current, deleted, pending, petitioned ) order, and are cleared in that order
    for table_name in GenerateMappingsTableNames( service_id ):
        
        self._Execute( 'DELETE FROM {};'.format( table_name ) )
def DropMappingsTables( self, service_id: int ):
    """
    Drop the four storage mappings tables of a service, tolerating any that do not exist.
    
    :param service_id: the tag service whose tables are dropped
    """
    
    # names arrive in ( current, deleted, pending, petitioned ) order, and are dropped in that order
    for table_name in GenerateMappingsTableNames( service_id ):
        
        self._Execute( 'DROP TABLE IF EXISTS {};'.format( table_name ) )
def FilterExistingUpdateMappings( self, tag_service_id, mappings_ids, action ):
    """
    Cull an update's rows down to the ( tag_id, hash_ids ) pairs that are not redundant with what is already stored.
    
    What counts as valid depends on the action:
    
    - ADD: hashes not already in the current table
    - DELETE: hashes not already in the deleted table
    - PEND: hashes in neither the current nor the pending table
    - RESCIND_PEND: hashes that are in the pending table
    - PETITION: hashes not already in the petitioned table (petitioning a mapping that does not exist yet is allowed)
    - RESCIND_PETITION: hashes that are in the petitioned table
    - anything else: nothing is valid
    
    :param tag_service_id: the tag service whose storage tables are consulted
    :param mappings_ids: rows indexed as ( tag_id, hash_ids ), with a reason_id at index 2 for petitions
    :param action: one of the HC.CONTENT_UPDATE_* constants
    :return: mappings_ids itself when it is empty, otherwise a new list of ( tag_id, valid_hash_ids ) rows
             (with reason_id appended for PETITION); rows with no valid hashes are left out
    """
    
    if len( mappings_ids ) == 0:
        
        return mappings_ids
        
    
    ( current_table, deleted_table, pending_table, petitioned_table ) = GenerateMappingsTableNames( tag_service_id )
    
    # ( action, tables to consult in order, True if a stored row is what makes the hash valid )
    action_rules = (
        ( HC.CONTENT_UPDATE_ADD, ( current_table, ), False ),
        ( HC.CONTENT_UPDATE_DELETE, ( deleted_table, ), False ),
        ( HC.CONTENT_UPDATE_PEND, ( current_table, pending_table ), False ),
        ( HC.CONTENT_UPDATE_RESCIND_PEND, ( pending_table, ), True ),
        ( HC.CONTENT_UPDATE_PETITION, ( petitioned_table, ), False ),
        ( HC.CONTENT_UPDATE_RESCIND_PETITION, ( petitioned_table, ), True )
    )
    
    # None means an unrecognised action, for which no hash is ever valid
    rule = None
    
    for ( rule_action, rule_tables, rule_keep_if_stored ) in action_rules:
        
        if action == rule_action:
            
            rule = ( rule_tables, rule_keep_if_stored )
            
            break
            
        
    
    single_lookup_query = 'SELECT 1 FROM {} WHERE tag_id = ? AND hash_id = ?;'
    joined_lookup_query = 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id ) WHERE tag_id = ?;'
    
    culled_mappings_ids = []
    
    for row in mappings_ids:
        
        # rows are indexed rather than unpacked because petitions carry a third 'reason_id' item
        tag_id = row[0]
        hash_ids = row[1]
        
        num_hash_ids = len( hash_ids )
        
        if num_hash_ids == 0:
            
            continue
            
        
        if num_hash_ids == 1:
            
            # one hash: a direct primary key probe per table, no temp table needed
            ( hash_id, ) = hash_ids
            
            if rule is None:
                
                valid_hash_ids = set()
                
            else:
                
                ( tables_to_check, keep_if_stored ) = rule
                
                is_stored = False
                
                # stop at the first table that has the row; later tables are then never queried
                for table_name in tables_to_check:
                    
                    if self._Execute( single_lookup_query.format( table_name ), ( tag_id, hash_id ) ).fetchone() is not None:
                        
                        is_stored = True
                        
                        break
                        
                    
                
                if is_stored != keep_if_stored:
                    
                    continue
                    
                
                # the caller's own container is passed through untouched on this path
                valid_hash_ids = hash_ids
                
            
        else:
            
            # several hashes: load them into a temp table and join against storage
            with self._MakeTemporaryIntegerTable( hash_ids, 'hash_id' ) as temp_hash_ids_table_name:
                
                if rule is None:
                    
                    valid_hash_ids = set()
                    
                else:
                    
                    ( tables_to_check, keep_if_stored ) = rule
                    
                    ( first_table, *further_tables ) = tables_to_check
                    
                    stored_hash_ids = self._STS( self._Execute( joined_lookup_query.format( temp_hash_ids_table_name, first_table ), ( tag_id, ) ) )
                    
                    for table_name in further_tables:
                        
                        stored_hash_ids.update( self._STI( self._Execute( joined_lookup_query.format( temp_hash_ids_table_name, table_name ), ( tag_id, ) ) ) )
                        
                    
                    if keep_if_stored:
                        
                        valid_hash_ids = stored_hash_ids
                        
                    else:
                        
                        valid_hash_ids = set( hash_ids ).difference( stored_hash_ids )
                        
                    
                
            
        
        if len( valid_hash_ids ) > 0:
            
            if action == HC.CONTENT_UPDATE_PETITION:
                
                culled_mappings_ids.append( ( tag_id, valid_hash_ids, row[2] ) )
                
            else:
                
                culled_mappings_ids.append( ( tag_id, valid_hash_ids ) )
                
            
        
    
    return culled_mappings_ids
def GenerateMappingsTables( self, service_id: int ):
    """
    Create the storage mappings tables for a service, and then their indices.
    
    :param service_id: the tag service to create tables and indices for
    """
    
    # tables first; the version_added half of each entry is not needed here
    for ( table_name, ( create_query_without_name, _ ) ) in self._GetServiceTableGenerationDict( service_id ).items():
        
        self._CreateTable( create_query_without_name, table_name )
        
    
    # the index dict is only fetched once every table has been created
    flat_index_rows = self._FlattenIndexGenerationDict( self._GetServiceIndexGenerationDict( service_id ) )
    
    for ( table_name, columns, unique, _ ) in flat_index_rows:
        
        self._CreateIndex( table_name, columns, unique = unique )
def GetCurrentFilesCount( self, service_id: int ) -> int:
    """
    Count the distinct files (hash_ids) that have a current mapping on a service.
    
    :param service_id: the tag service to count for
    :return: the COUNT( DISTINCT hash_id ) of the service's current mappings table
    """
    
    ( current_table, _, _, _ ) = GenerateMappingsTableNames( service_id )
    
    count_query = 'SELECT COUNT( DISTINCT hash_id ) FROM {};'.format( current_table )
    
    ( num_files, ) = self._Execute( count_query ).fetchone()
    
    return num_files
def GetDeletedMappingsCount( self, service_id: int ) -> int:
    """
    Count the rows in a service's deleted mappings table.
    
    :param service_id: the tag service to count for
    :return: the COUNT( * ) of the service's deleted mappings table
    """
    
    ( _, deleted_table, _, _ ) = GenerateMappingsTableNames( service_id )
    
    count_query = 'SELECT COUNT( * ) FROM {};'.format( deleted_table )
    
    ( num_mappings, ) = self._Execute( count_query ).fetchone()
    
    return num_mappings
def GetFastestStorageMappingTableNames( self, file_service_id: int, tag_service_id: int ):
    """
    Map each content status to the table name to read mappings from for a file/tag service pair.
    
    For the combined file service, all four statuses point at the tag service's storage tables.
    For any other file service, current/deleted/pending point at the specific cache tables instead,
    while petitioned still points at the storage table (there is no petitioned cache table).
    
    :param file_service_id: the file domain being searched
    :param tag_service_id: the tag service being searched
    :return: dict of HC.CONTENT_STATUS_* -> table name
    """
    
    ( current_table, deleted_table, pending_table, petitioned_table ) = GenerateMappingsTableNames( tag_service_id )
    
    statuses_to_table_names = {
        HC.CONTENT_STATUS_CURRENT : current_table,
        HC.CONTENT_STATUS_DELETED : deleted_table,
        HC.CONTENT_STATUS_PENDING : pending_table,
        HC.CONTENT_STATUS_PETITIONED : petitioned_table
    }
    
    if file_service_id != self.modules_services.combined_file_service_id:
        
        ( cache_current_table, cache_deleted_table, cache_pending_table ) = GenerateSpecificMappingsCacheTableNames( file_service_id, tag_service_id )
        
        statuses_to_table_names.update( {
            HC.CONTENT_STATUS_CURRENT : cache_current_table,
            HC.CONTENT_STATUS_DELETED : cache_deleted_table,
            HC.CONTENT_STATUS_PENDING : cache_pending_table
        } )
        
    
    return statuses_to_table_names
def GetPendingMappingsCount( self, service_id: int ) -> int:
    """
    Count the rows in a service's pending mappings table.
    
    :param service_id: the tag service to count for
    :return: the COUNT( * ) of the service's pending mappings table
    """
    
    ( _, _, pending_table, _ ) = GenerateMappingsTableNames( service_id )
    
    count_query = 'SELECT COUNT( * ) FROM {};'.format( pending_table )
    
    ( num_mappings, ) = self._Execute( count_query ).fetchone()
    
    return num_mappings
def GetPetitionedMappingsCount( self, service_id: int ) -> int:
    """
    Count the rows in a service's petitioned mappings table.
    
    :param service_id: the tag service to count for
    :return: the COUNT( * ) of the service's petitioned mappings table
    """
    
    ( _, _, _, petitioned_table ) = GenerateMappingsTableNames( service_id )
    
    count_query = 'SELECT COUNT( * ) FROM {};'.format( petitioned_table )
    
    ( num_mappings, ) = self._Execute( count_query ).fetchone()
    
    return num_mappings
def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
    """
    List the ( table name, column name ) pairs in this module that hold ids of the given definition type.
    
    For HC.CONTENT_TYPE_HASH that is the hash_id column, and for HC.CONTENT_TYPE_TAG the tag_id column,
    of all four storage mappings tables of every service returned for HC.REAL_TAG_SERVICES.
    Any other content type yields an empty list.
    
    :param content_type: an HC.CONTENT_TYPE_* constant
    :return: list of ( table name, column name ) tuples
    """
    
    if content_type == HC.CONTENT_TYPE_HASH:
        
        column_name = 'hash_id'
        
    elif content_type == HC.CONTENT_TYPE_TAG:
        
        column_name = 'tag_id'
        
    else:
        
        # nothing in the mappings tables refers to other definition types
        return []
        
    
    tables_and_columns = []
    
    for service_id in self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES ):
        
        # table names come back in ( current, deleted, pending, petitioned ) order
        tables_and_columns.extend( ( table_name, column_name ) for table_name in GenerateMappingsTableNames( service_id ) )
        
    
    return tables_and_columns