hydrus/hydrus/client/db/ClientDBTagSearch.py

1393 lines
59 KiB
Python

import collections
import sqlite3
import typing
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusDB
from hydrus.core import HydrusDBBase
from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusTags
from hydrus.client import ClientConstants as CC
from hydrus.client import ClientSearch
from hydrus.client.db import ClientDBMappingsCounts
from hydrus.client.db import ClientDBMappingsStorage
from hydrus.client.db import ClientDBMaster
from hydrus.client.db import ClientDBModule
from hydrus.client.db import ClientDBServices
from hydrus.client.db import ClientDBTagDisplay
from hydrus.client.db import ClientDBTagSiblings
from hydrus.client.metadata import ClientTags
# SQLite can handle integers from -( 2 ** 63 ) up to ( 2 ** 63 ) - 1, both ends included
MIN_CACHED_INTEGER = - ( 1 << 63 )
MAX_CACHED_INTEGER = ( 1 << 63 ) - 1

def CanCacheInteger( num ):
    """Return True when num lies inside [ MIN_CACHED_INTEGER, MAX_CACHED_INTEGER ], inclusive."""
    
    if num < MIN_CACHED_INTEGER:
        
        # too negative to be stored
        return False
        
    
    return num <= MAX_CACHED_INTEGER
def ConvertWildcardToSQLiteLikeParameter( wildcard ):
    """Turn a '*' style wildcard string into a parameter for SQLite LIKE by swapping every '*' for '%'.
    
    NOTE(review): any '%' or '_' already in the text is passed through untouched. LIKE presumably treats
    those as wildcards as well, since the calling queries visible in this file use no ESCAPE clause -- confirm
    that is acceptable.
    """
    
    # splitting on '*' and re-joining with '%' is the same as replacing each '*' in place
    pieces = wildcard.split( '*' )
    
    return '%'.join( pieces )
def GenerateCombinedFilesIntegerSubtagsTableName( tag_service_id ):
    """Build the 'external_caches' name of the combined-files integer subtags cache table for a tag service."""
    
    return 'external_caches.combined_files_integer_subtags_cache_{}'.format( tag_service_id )
def GenerateCombinedFilesSubtagsFTS4TableName( tag_service_id ):
    """Build the 'external_caches' name of the combined-files subtags FTS4 cache table for a tag service."""
    
    return 'external_caches.combined_files_subtags_fts4_cache_{}'.format( tag_service_id )
def GenerateCombinedFilesSubtagsSearchableMapTableName( tag_service_id ):
    """Build the 'external_caches' name of the combined-files searchable subtags map table for a tag service."""
    
    return 'external_caches.combined_files_subtags_searchable_map_cache_{}'.format( tag_service_id )
def GenerateCombinedFilesTagsTableName( tag_service_id ):
    """Build the 'external_caches' name of the combined-files tags cache table for a tag service."""
    
    return 'external_caches.combined_files_tags_cache_{}'.format( tag_service_id )
def GenerateCombinedTagsTagsTableName( file_service_id ):
    """Build the 'external_caches' name of the combined-tags tags cache table for a file service."""
    
    return 'external_caches.combined_tags_tags_cache_{}'.format( file_service_id )
def GenerateSpecificIntegerSubtagsTableName( file_service_id, tag_service_id ):
    """Build the 'external_caches' name of the integer subtags cache table for one ( file service, tag service ) pair."""
    
    # the name ends in '<file_service_id>_<tag_service_id>'
    return 'external_caches.specific_integer_subtags_cache_{}_{}'.format( file_service_id, tag_service_id )
def GenerateSpecificSubtagsFTS4TableName( file_service_id, tag_service_id ):
    """Build the 'external_caches' name of the subtags FTS4 cache table for one ( file service, tag service ) pair."""
    
    # the name ends in '<file_service_id>_<tag_service_id>'
    return 'external_caches.specific_subtags_fts4_cache_{}_{}'.format( file_service_id, tag_service_id )
def GenerateSpecificSubtagsSearchableMapTableName( file_service_id, tag_service_id ):
    """Build the 'external_caches' name of the searchable subtags map table for one ( file service, tag service ) pair."""
    
    # the name ends in '<file_service_id>_<tag_service_id>'
    return 'external_caches.specific_subtags_searchable_map_cache_{}_{}'.format( file_service_id, tag_service_id )
def GenerateSpecificTagsTableName( file_service_id, tag_service_id ):
    """Build the 'external_caches' name of the tags cache table for one ( file service, tag service ) pair."""
    
    # the name ends in '<file_service_id>_<tag_service_id>'
    return 'external_caches.specific_tags_cache_{}_{}'.format( file_service_id, tag_service_id )
def WildcardHasFTS4SearchableCharacters( wildcard: str ):
    """Return True if at least one character of wildcard is alphanumeric, has a code point >= 128, or is '*'.
    
    An empty string gives False.
    """
    
    # fts4 says it can do alphanumeric or unicode with a value >= 128
    return any( char == '*' or char.isalnum() or ord( char ) >= 128 for char in wildcard )
class ClientDBTagSearch( ClientDBModule.ClientDBModule ):
CAN_REPOPULATE_ALL_MISSING_DATA = True
def __init__(
    self,
    cursor: sqlite3.Cursor,
    modules_services: ClientDBServices.ClientDBMasterServices,
    modules_tags: ClientDBMaster.ClientDBMasterTags,
    modules_tag_display: ClientDBTagDisplay.ClientDBTagDisplay,
    modules_tag_siblings: ClientDBTagSiblings.ClientDBTagSiblings,
    modules_mappings_counts: ClientDBMappingsCounts.ClientDBMappingsCounts
):
    """Store the collaborating db modules, initialise the base module as 'client tag search', and start with no missing pairs."""
    
    # the collaborators are assigned before the base initialiser runs, as in the original ordering
    self.modules_mappings_counts = modules_mappings_counts
    self.modules_tag_siblings = modules_tag_siblings
    self.modules_tag_display = modules_tag_display
    self.modules_tags = modules_tags
    self.modules_services = modules_services
    
    ClientDBModule.ClientDBModule.__init__( self, 'client tag search', cursor )
    
    # ( file_service_id, tag_service_id ) pairs; filled by _RepairRepopulateTables
    self._missing_tag_search_service_pairs = set()
def _GetServiceIndexGenerationDictSingle( self, file_service_id, tag_service_id ) -> dict:
    """Describe the indices wanted on the tag search cache tables of one ( file service, tag service ) pair.
    
    Returns a dict of table name -> list of ( column names, unique, version_added ) tuples. The FTS4
    subtags table gets no entry, so its name is not looked up here.
    """
    
    tags_table_name = self.GetTagsTableName( file_service_id, tag_service_id )
    subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName( file_service_id, tag_service_id )
    integer_subtags_table_name = self.GetIntegerSubtagsTableName( file_service_id, tag_service_id )
    
    index_generation_dict = {}
    
    index_generation_dict[ tags_table_name ] = [
        ( [ 'namespace_id', 'subtag_id' ], True, 465 ),
        ( [ 'subtag_id' ], False, 465 )
    ]
    
    index_generation_dict[ subtags_searchable_map_table_name ] = [
        ( [ 'searchable_subtag_id' ], False, 465 )
    ]
    
    index_generation_dict[ integer_subtags_table_name ] = [
        ( [ 'integer_subtag' ], False, 465 )
    ]
    
    return index_generation_dict
def _GetServiceIndexGenerationDict( self, service_id ) -> dict:
    """Merge the per-file-domain index generation dicts for the tag service service_id.
    
    The file domains are every service of type HC.FILE_SERVICES_WITH_SPECIFIC_MAPPING_CACHES plus the
    combined file service.
    """
    
    file_service_ids = list( self.modules_services.GetServiceIds( HC.FILE_SERVICES_WITH_SPECIFIC_MAPPING_CACHES ) )
    file_service_ids.append( self.modules_services.combined_file_service_id )
    
    merged_index_dict = {}
    
    for file_service_id in file_service_ids:
        
        # service_id is the tag service here
        merged_index_dict.update( self._GetServiceIndexGenerationDictSingle( file_service_id, service_id ) )
        
    
    return merged_index_dict
def _GetServiceTableGenerationDictSingle( self, file_service_id, tag_service_id ):
    """Describe the four tag search cache tables of one ( file service, tag service ) pair.
    
    Returns a dict of table name -> ( CREATE statement with a '{}' placeholder for the name, version_added ).
    """
    
    table_dict = {}
    
    # tag_id -> its namespace and subtag
    table_dict[ self.GetTagsTableName( file_service_id, tag_service_id ) ] = (
        'CREATE TABLE IF NOT EXISTS {} ( tag_id INTEGER PRIMARY KEY, namespace_id INTEGER, subtag_id INTEGER );',
        465
    )
    
    # full-text index of subtag text
    table_dict[ self.GetSubtagsFTS4TableName( file_service_id, tag_service_id ) ] = (
        'CREATE VIRTUAL TABLE IF NOT EXISTS {} USING fts4( subtag );',
        465
    )
    
    # subtag_id -> the id of its searchable form
    table_dict[ self.GetSubtagsSearchableMapTableName( file_service_id, tag_service_id ) ] = (
        'CREATE TABLE IF NOT EXISTS {} ( subtag_id INTEGER PRIMARY KEY, searchable_subtag_id INTEGER );',
        465
    )
    
    # subtag_id -> its integer value
    table_dict[ self.GetIntegerSubtagsTableName( file_service_id, tag_service_id ) ] = (
        'CREATE TABLE IF NOT EXISTS {} ( subtag_id INTEGER PRIMARY KEY, integer_subtag INTEGER );',
        465
    )
    
    return table_dict
def _GetServiceTableGenerationDict( self, service_id ) -> dict:
    """Merge the per-file-domain table generation dicts for the tag service service_id.
    
    The file domains are every service of type HC.FILE_SERVICES_WITH_SPECIFIC_MAPPING_CACHES plus the
    combined file service.
    """
    
    file_service_ids = list( self.modules_services.GetServiceIds( HC.FILE_SERVICES_WITH_SPECIFIC_MAPPING_CACHES ) )
    file_service_ids.append( self.modules_services.combined_file_service_id )
    
    merged_table_dict = {}
    
    for file_service_id in file_service_ids:
        
        # service_id is the tag service here
        merged_table_dict.update( self._GetServiceTableGenerationDictSingle( file_service_id, service_id ) )
        
    
    return merged_table_dict
def _GetServiceIdsWeGenerateDynamicTablesFor( self ):
    """Return the service ids of type HC.REAL_TAG_SERVICES, as reported by the services module."""
    
    real_tag_service_ids = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )
    
    return real_tag_service_ids
def _RepairRepopulateTables( self, table_names, cursor_transaction_wrapper: HydrusDBBase.DBCursorTransactionWrapper ):
    """Record which ( file service, tag service ) pairs own any of the given tables.
    
    Every pair whose generated table names overlap table_names is added to
    self._missing_tag_search_service_pairs. Nothing is repopulated here, and cursor_transaction_wrapper is not used.
    """
    
    file_service_ids = list( self.modules_services.GetServiceIds( HC.FILE_SERVICES_WITH_SPECIFIC_TAG_LOOKUP_CACHES ) )
    file_service_ids.append( self.modules_services.combined_file_service_id )
    
    tag_service_ids = list( self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES ) )
    
    for tag_service_id in tag_service_ids:
        
        for file_service_id in file_service_ids:
            
            # iterating the generation dict yields its keys, i.e. the table names
            owned_table_names = set( self._GetServiceTableGenerationDictSingle( file_service_id, tag_service_id ) )
            
            if owned_table_names.isdisjoint( table_names ):
                
                continue
                
            
            self._missing_tag_search_service_pairs.add( ( file_service_id, tag_service_id ) )
def AddTags( self, file_service_id, tag_service_id, tag_ids ):
    """Add tag_ids to the tag search caches of one ( file service, tag service ) pair.
    
    Tags already present are ignored. For the genuinely new ones, the subtag text goes into the FTS4 table
    in its searchable form, the searchable map gets a row when that form differs from the raw subtag, and
    decimal subtags that fit the cacheable range go into the integer subtags table. On the combined file
    service, the SERVICE_INFO_NUM_TAGS row in service_info is increased by the number of new tags.
    """
    
    if len( tag_ids ) == 0:
        
        return
        
    
    tags_table_name = self.GetTagsTableName( file_service_id, tag_service_id )
    
    insert_tag_query = 'INSERT OR IGNORE INTO {} ( tag_id, namespace_id, subtag_id ) SELECT tag_id, namespace_id, subtag_id FROM tags WHERE tag_id = ?;'.format( tags_table_name )
    
    new_tag_ids = set()
    
    for tag_id in tag_ids:
        
        self._Execute( insert_tag_query, ( tag_id, ) )
        
        # a positive row count means the insert was not ignored, so the tag is new to this cache
        if self._GetRowCount() > 0:
            
            new_tag_ids.add( tag_id )
            
        
    
    if len( new_tag_ids ) == 0:
        
        return
        
    
    if file_service_id == self.modules_services.combined_file_service_id:
        
        self._Execute( 'UPDATE service_info SET info = info + ? WHERE service_id = ? AND info_type = ?;', ( len( new_tag_ids ), tag_service_id, HC.SERVICE_INFO_NUM_TAGS ) )
        
    
    with self._MakeTemporaryIntegerTable( new_tag_ids, 'tag_id' ) as temp_tag_ids_table_name:
        
        # temp tags to fast tag definitions to subtags
        select_subtags_query = 'SELECT subtag_id, subtag FROM {} CROSS JOIN {} USING ( tag_id ) CROSS JOIN subtags USING ( subtag_id );'.format( temp_tag_ids_table_name, tags_table_name )
        
        subtag_rows = self._Execute( select_subtags_query ).fetchall()
        
        subtags_fts4_table_name = self.GetSubtagsFTS4TableName( file_service_id, tag_service_id )
        subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName( file_service_id, tag_service_id )
        integer_subtags_table_name = self.GetIntegerSubtagsTableName( file_service_id, tag_service_id )
        
        insert_map_query = 'INSERT OR IGNORE INTO {} ( subtag_id, searchable_subtag_id ) VALUES ( ?, ? );'.format( subtags_searchable_map_table_name )
        insert_fts4_query = 'INSERT OR IGNORE INTO {} ( docid, subtag ) VALUES ( ?, ? );'.format( subtags_fts4_table_name )
        insert_integer_query = 'INSERT OR IGNORE INTO {} ( subtag_id, integer_subtag ) VALUES ( ?, ? );'.format( integer_subtags_table_name )
        
        for ( subtag_id, subtag ) in subtag_rows:
            
            searchable_subtag = ClientSearch.ConvertSubtagToSearchable( subtag )
            
            # only subtags whose searchable form differs need a map row
            if searchable_subtag != subtag:
                
                searchable_subtag_id = self.modules_tags.GetSubtagId( searchable_subtag )
                
                self._Execute( insert_map_query, ( subtag_id, searchable_subtag_id ) )
                
            
            self._Execute( insert_fts4_query, ( subtag_id, searchable_subtag ) )
            
            if not subtag.isdecimal():
                
                continue
                
            
            try:
                
                integer_subtag = int( subtag )
                
                if CanCacheInteger( integer_subtag ):
                    
                    self._Execute( insert_integer_query, ( subtag_id, integer_subtag ) )
                    
                
            except ValueError:
                
                # not convertible after all, so this subtag simply gets no integer row
                pass
def DeleteTags( self, file_service_id, tag_service_id, tag_ids ):
    """Remove tag_ids from the tag search caches of one ( file service, tag service ) pair.
    
    Tags that GetChainsMembers reports as chain members are never removed. After the tag rows go, any
    subtag no longer used by a remaining tag is removed from the FTS4, searchable map and integer subtag
    tables. On the combined file service, the SERVICE_INFO_NUM_TAGS row in service_info is reduced by the
    number of rows deleted.
    """
    
    if len( tag_ids ) == 0:
        
        return
        
    
    if not isinstance( tag_ids, set ):
        
        tag_ids = set( tag_ids )
        
    
    # we always include all chained guys regardless of count
    chained_tag_ids = self.modules_tag_display.GetChainsMembers( ClientTags.TAG_DISPLAY_ACTUAL, tag_service_id, tag_ids )
    
    tag_ids = tag_ids.difference( chained_tag_ids )
    
    tags_table_name = self.GetTagsTableName( file_service_id, tag_service_id )
    subtags_fts4_table_name = self.GetSubtagsFTS4TableName( file_service_id, tag_service_id )
    subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName( file_service_id, tag_service_id )
    integer_subtags_table_name = self.GetIntegerSubtagsTableName( file_service_id, tag_service_id )
    
    with self._MakeTemporaryIntegerTable( tag_ids, 'tag_id' ) as temp_tag_ids_table_name:
        
        # temp tag ids to tag definitions. this has to be read before the rows are deleted
        select_subtag_ids_query = 'SELECT subtag_id FROM {} CROSS JOIN {} USING ( tag_id );'.format( temp_tag_ids_table_name, tags_table_name )
        
        subtag_ids = self._STS( self._Execute( select_subtag_ids_query ) )
        
        self._ExecuteMany( 'DELETE FROM {} WHERE tag_id = ?;'.format( tags_table_name ), ( ( tag_id, ) for tag_id in tag_ids ) )
        
        num_deleted = self._GetRowCount()
        
        if num_deleted <= 0:
            
            return
            
        
        if file_service_id == self.modules_services.combined_file_service_id:
            
            self._Execute( 'UPDATE service_info SET info = info - ? WHERE service_id = ? AND info_type = ?;', ( num_deleted, tag_service_id, HC.SERVICE_INFO_NUM_TAGS ) )
            
        
        # subtags may exist under other namespaces, so exclude those that do
        with self._MakeTemporaryIntegerTable( subtag_ids, 'subtag_id' ) as temp_subtag_ids_table_name:
            
            select_survivors_query = 'SELECT subtag_id FROM {} CROSS JOIN {} USING ( subtag_id );'.format( temp_subtag_ids_table_name, tags_table_name )
            
            still_existing_subtag_ids = self._STS( self._Execute( select_survivors_query ) )
            
        
        deletee_subtag_ids = subtag_ids.difference( still_existing_subtag_ids )
        
        # the FTS4 table is keyed on docid, the other two on subtag_id
        deletions = (
            ( subtags_fts4_table_name, 'DELETE FROM {} WHERE docid = ?;' ),
            ( subtags_searchable_map_table_name, 'DELETE FROM {} WHERE subtag_id = ?;' ),
            ( integer_subtags_table_name, 'DELETE FROM {} WHERE subtag_id = ?;' )
        )
        
        for ( table_name, delete_query ) in deletions:
            
            self._ExecuteMany( delete_query.format( table_name ), ( ( subtag_id, ) for subtag_id in deletee_subtag_ids ) )
def Drop( self, file_service_id, tag_service_id ):
    """Drop the tags, FTS4 subtags, searchable map and integer subtags tables of one ( file service, tag service ) pair, if they exist."""
    
    table_name_getters = (
        self.GetTagsTableName,
        self.GetSubtagsFTS4TableName,
        self.GetSubtagsSearchableMapTableName,
        self.GetIntegerSubtagsTableName
    )
    
    for get_table_name in table_name_getters:
        
        table_name = get_table_name( file_service_id, tag_service_id )
        
        self._Execute( 'DROP TABLE IF EXISTS {};'.format( table_name ) )
def FilterExistingTagIds( self, file_service_id, tag_service_id, tag_ids_table_name ):
    """Return the tag_ids from the table tag_ids_table_name that also exist in this pair's tags cache table."""
    
    query = 'SELECT tag_id FROM {} CROSS JOIN {} USING ( tag_id );'.format(
        tag_ids_table_name,
        self.GetTagsTableName( file_service_id, tag_service_id )
    )
    
    cursor = self._Execute( query )
    
    return self._STS( cursor )
def Generate( self, file_service_id, tag_service_id ):
    """Create the tag search cache tables of one ( file service, tag service ) pair, then their indices."""
    
    table_generation_dict = self._GetServiceTableGenerationDictSingle( file_service_id, tag_service_id )
    
    for table_name in table_generation_dict:
        
        # version_added is carried in the generation dict but is not needed to create the table
        ( create_query_without_name, version_added ) = table_generation_dict[ table_name ]
        
        self._CreateTable( create_query_without_name, table_name )
        
    
    index_generation_dict = self._GetServiceIndexGenerationDictSingle( file_service_id, tag_service_id )
    
    for index_row in self._FlattenIndexGenerationDict( index_generation_dict ):
        
        ( table_name, columns, unique, version_added ) = index_row
        
        self._CreateIndex( table_name, columns, unique = unique )
def GetAllTagIds( self, leaf: ClientDBServices.FileSearchContextLeaf, job_key = None ):
    """Fetch every tag_id in the tags cache table of the leaf's ( file service, tag service ) domain.
    
    The cursor is read in chunks of 1024 through HydrusDB.ReadFromCancellableCursor. If job_key is given
    and reports cancelled after the read, an empty set is returned instead.
    """
    
    select_phrase = self.GetQueryPhraseForTagIds( leaf.file_service_id, leaf.tag_service_id )
    
    cursor = self._Execute( '{};'.format( select_phrase ) )
    
    cancelled_hook = job_key.IsCancelled if job_key is not None else None
    
    fetched_tag_ids = self._STS( HydrusDB.ReadFromCancellableCursor( cursor, 1024, cancelled_hook = cancelled_hook ) )
    
    if job_key is not None and job_key.IsCancelled():
        
        return set()
        
    
    # hand back a fresh set holding whatever was fetched
    all_tag_ids = set()
    
    all_tag_ids.update( fetched_tag_ids )
    
    return all_tag_ids
def GetAutocompletePredicates(
    self,
    tag_display_type: int,
    file_search_context: ClientSearch.FileSearchContext,
    search_text: str = '',
    exact_match = False,
    inclusive = True,
    search_namespaces_into_full_tags = False,
    zero_count_ok = False,
    job_key = None
):
    """Produce the merged autocomplete predicates for search_text in the given file search context.
    
    For every leaf of the search context branch: find the matching tag ids, fetch their counts in chunks
    of 1000, and turn the counted ids into predicates. All predicates are merged at the end. An empty list
    is returned when the context is both 'all known tags' and 'all known files', and whenever job_key
    reports cancelled.
    """
    
    # TODO: So I think I should interleave this, perhaps with the SearchLeaf object, or just as GetHashIdsFromTag now does, for each tag service. don't throw 'all known tags' down to lower methods
    # _Then_, you do the GeneratePredicatesFromTagIdsAndCounts for each tag service in turn (don't worry, it is quick since services won't share tags much), and then you can do some clever sibling counting
    # For instance, if we search for A on a domain where one tag service has A->B, we return the B results. Well, let's increment the A (x) count according to that, based on each service!
    # and then obviously a nice big merge at the end
    
    def search_is_cancelled():
        
        return job_key is not None and job_key.IsCancelled()
        
    
    location_context = file_search_context.GetLocationContext()
    tag_context = file_search_context.GetTagContext()
    
    display_tag_service_id = self.modules_services.GetServiceId( tag_context.display_service_key )
    
    if tag_context.IsAllKnownTags() and location_context.IsAllKnownFiles():
        
        return []
        
    
    include_current = tag_context.include_current_tags
    include_pending = tag_context.include_pending_tags
    
    # 'char' -> 'character:samus aran'
    also_search_namespaces = ':' not in search_text and search_namespaces_into_full_tags and not exact_match
    
    all_predicates = []
    
    file_search_context_branch = self.modules_services.GetFileSearchContextBranch( file_search_context )
    
    for leaf in file_search_context_branch.IterateLeaves():
        
        tag_ids = self.GetAutocompleteTagIds( tag_display_type, leaf, search_text, exact_match, job_key = job_key )
        
        if also_search_namespaces:
            
            special_search_text = '{}*:*'.format( search_text )
            
            tag_ids.update( self.GetAutocompleteTagIds( tag_display_type, leaf, special_search_text, exact_match, job_key = job_key ) )
            
        
        if search_is_cancelled():
            
            return []
            
        
        domain_is_cross_referenced = leaf.file_service_id != self.modules_services.combined_deleted_file_service_id
        
        for group_of_tag_ids in HydrusData.SplitIteratorIntoChunks( tag_ids, 1000 ):
            
            if search_is_cancelled():
                
                return []
                
            
            ids_to_count = self.modules_mappings_counts.GetCounts(
                tag_display_type,
                leaf.tag_service_id,
                leaf.file_service_id,
                group_of_tag_ids,
                include_current,
                include_pending,
                domain_is_cross_referenced = domain_is_cross_referenced,
                zero_count_ok = zero_count_ok,
                job_key = job_key
            )
            
            if len( ids_to_count ) > 0:
                
                all_predicates.extend( self.modules_tag_display.GeneratePredicatesFromTagIdsAndCounts( tag_display_type, display_tag_service_id, ids_to_count, inclusive, job_key = job_key ) )
                
            
        
        if search_is_cancelled():
            
            return []
            
        
    
    return ClientSearch.MergePredicates( all_predicates )
def GetAutocompleteTagIds( self, tag_display_type: int, leaf: ClientDBServices.FileSearchContextLeaf, search_text, exact_match, job_key = None ):
    """Return the set of tag_ids in the leaf's domain that match search_text, plus their sibling chain members.
    
    search_text is split into namespace and subtag; either part may contain '*' wildcards. An empty set is
    returned when search_text or its subtag part is empty, when exact_match is requested on text that
    contains a wildcard, when a wildcard-free namespace is unknown, or when job_key reports cancelled
    during the sibling fetch. Every path returns a set, so callers can safely call .update() on the result.
    """
    
    if search_text == '':
        
        return set()
        
    
    ( namespace, half_complete_searchable_subtag ) = HydrusTags.SplitTag( search_text )
    
    if half_complete_searchable_subtag == '':
        
        return set()
        
    
    if exact_match:
        
        if '*' in namespace or '*' in half_complete_searchable_subtag:
            
            # this used to hand back a list while every other exit returns a set
            return set()
            
        
    
    if '*' in namespace:
        
        namespace_ids = self.GetNamespaceIdsFromWildcard( namespace )
        
    else:
        
        if not self.modules_tags.NamespaceExists( namespace ):
            
            return set()
            
        
        namespace_ids = ( self.modules_tags.GetNamespaceId( namespace ), )
        
    
    if half_complete_searchable_subtag == '*':
        
        if namespace == '':
            
            # hellmode 'get all tags' search
            tag_ids = self.GetAllTagIds( leaf, job_key = job_key )
            
        else:
            
            tag_ids = self.GetTagIdsFromNamespaceIds( leaf, namespace_ids, job_key = job_key )
            
        
    else:
        
        tag_ids = set()
        
        with self._MakeTemporaryIntegerTable( [], 'subtag_id' ) as temp_subtag_ids_table_name:
            
            self.GetSubtagIdsFromWildcardIntoTable( leaf.file_service_id, leaf.tag_service_id, half_complete_searchable_subtag, temp_subtag_ids_table_name, job_key = job_key )
            
            if namespace == '':
                
                loop_of_tag_ids = self.GetTagIdsFromSubtagIdsTable( leaf.file_service_id, leaf.tag_service_id, temp_subtag_ids_table_name, job_key = job_key )
                
            else:
                
                with self._MakeTemporaryIntegerTable( namespace_ids, 'namespace_id' ) as temp_namespace_ids_table_name:
                    
                    loop_of_tag_ids = self.GetTagIdsFromNamespaceIdsSubtagIdsTables( leaf.file_service_id, leaf.tag_service_id, temp_namespace_ids_table_name, temp_subtag_ids_table_name, job_key = job_key )
                    
                
            
            tag_ids.update( loop_of_tag_ids )
            
        
    
    # now fetch siblings, add to set
    
    if not isinstance( tag_ids, set ):
        
        tag_ids = set( tag_ids )
        
    
    # snapshot the direct matches, because tag_ids grows while we walk them in batches
    tag_ids_without_siblings = list( tag_ids )
    
    for batch_of_tag_ids in HydrusData.SplitListIntoChunks( tag_ids_without_siblings, 10240 ):
        
        with self._MakeTemporaryIntegerTable( batch_of_tag_ids, 'tag_id' ) as temp_tag_ids_table_name:
            
            if job_key is not None and job_key.IsCancelled():
                
                return set()
                
            
            with self._MakeTemporaryIntegerTable( [], 'ideal_tag_id' ) as temp_ideal_tag_ids_table_name:
                
                # batch tag ids -> the ideals of any chains they belong to
                self.modules_tag_siblings.FilterChainedIdealsIntoTable( ClientTags.TAG_DISPLAY_ACTUAL, leaf.tag_service_id, temp_tag_ids_table_name, temp_ideal_tag_ids_table_name )
                
                with self._MakeTemporaryIntegerTable( [], 'tag_id' ) as temp_chained_tag_ids_table_name:
                    
                    # ideals -> every member of those chains
                    self.modules_tag_siblings.GetChainsMembersFromIdealsTables( ClientTags.TAG_DISPLAY_ACTUAL, leaf.tag_service_id, temp_ideal_tag_ids_table_name, temp_chained_tag_ids_table_name )
                    
                    tag_ids.update( self._STI( self._Execute( 'SELECT tag_id FROM {};'.format( temp_chained_tag_ids_table_name ) ) ) )
                    
                
            
        
    
    return tag_ids
def GetIntegerSubtagsTableName( self, file_service_id, tag_service_id ):
    """Return the integer subtags cache table name for one ( file service, tag service ) pair.
    
    The combined file service has its own per-tag-service table. A file service that
    FileServiceIsCoveredByAllLocalFiles reports as covered uses the table of the combined local file service.
    """
    
    services = self.modules_services
    
    if file_service_id == services.combined_file_service_id:
        
        return GenerateCombinedFilesIntegerSubtagsTableName( tag_service_id )
        
    
    if services.FileServiceIsCoveredByAllLocalFiles( file_service_id ):
        
        file_service_id = services.combined_local_file_service_id
        
    
    return GenerateSpecificIntegerSubtagsTableName( file_service_id, tag_service_id )
def GetMappingTables( self, tag_display_type, file_service_key: bytes, tag_context: ClientSearch.TagContext ):
    """Return just the mapping table names from GetMappingAndTagTables, in the same order."""
    
    mapping_table_names = []
    
    for ( mapping_table_name, tag_table_name ) in self.GetMappingAndTagTables( tag_display_type, file_service_key, tag_context ):
        
        # the paired tags table name is not wanted here
        mapping_table_names.append( mapping_table_name )
        
    
    return mapping_table_names
def GetMappingAndTagTables( self, tag_display_type, file_service_key: bytes, tag_context: ClientSearch.TagContext ):
    """Return ( mappings table name, tags table name ) pairs for the given file domain and tag context.
    
    The combined tag service key expands to every service of type HC.REAL_TAG_SERVICES. Current-status
    pairs come first if tag_context.include_current_tags, followed by pending-status pairs if
    tag_context.include_pending_tags. On a specific file service, a tag_display_type that is neither
    TAG_DISPLAY_STORAGE nor TAG_DISPLAY_ACTUAL contributes no pairs.
    """
    
    file_service_id = self.modules_services.GetServiceId( file_service_key )
    
    if tag_context.service_key == CC.COMBINED_TAG_SERVICE_KEY:
        
        tag_service_ids = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )
        
    else:
        
        tag_service_ids = [ self.modules_services.GetServiceId( tag_context.service_key ) ]
        
    
    current_pairs = []
    pending_pairs = []
    
    for tag_service_id in tag_service_ids:
        
        tags_table_name = self.GetTagsTableName( file_service_id, tag_service_id )
        
        if file_service_id == self.modules_services.combined_file_service_id:
            
            # yo this does not support ClientTags.TAG_DISPLAY_ACTUAL--big tricky problem
            ( current_name, _deleted_name, pending_name, _petitioned_name ) = ClientDBMappingsStorage.GenerateMappingsTableNames( tag_service_id )
            
        elif tag_display_type == ClientTags.TAG_DISPLAY_STORAGE:
            
            ( current_name, _deleted_name, pending_name ) = ClientDBMappingsStorage.GenerateSpecificMappingsCacheTableNames( file_service_id, tag_service_id )
            
        elif tag_display_type == ClientTags.TAG_DISPLAY_ACTUAL:
            
            ( current_name, pending_name ) = ClientDBMappingsStorage.GenerateSpecificDisplayMappingsCacheTableNames( file_service_id, tag_service_id )
            
        else:
            
            # unrecognised display type on a specific file domain: nothing to add for this service
            continue
            
        
        current_pairs.append( ( current_name, tags_table_name ) )
        pending_pairs.append( ( pending_name, tags_table_name ) )
        
    
    table_names = list( current_pairs ) if tag_context.include_current_tags else []
    
    if tag_context.include_pending_tags:
        
        table_names.extend( pending_pairs )
        
    
    return table_names
def GetMissingTagSearchServicePairs( self ):
    """Return the live set of ( file_service_id, tag_service_id ) pairs recorded by _RepairRepopulateTables (not a copy)."""
    
    missing_pairs = self._missing_tag_search_service_pairs
    
    return missing_pairs
def GetNamespaceIdsFromWildcard( self, namespace_wildcard ):
    """Return the list of namespace_ids whose namespace matches namespace_wildcard.
    
    '*' alone returns every namespace. Any other text containing '*' is matched with SQLite LIKE, where
    '*' stands for any run of characters and every other character is literal. Text with no '*' is an
    exact lookup that returns a one-item list, or an empty list when the namespace is not known.
    """
    
    if namespace_wildcard == '*':
        
        return self._STL( self._Execute( 'SELECT namespace_id FROM namespaces;' ) )
        
    
    if '*' in namespace_wildcard:
        
        # LIKE gives '%' and '_' special meaning, but '*' is the only wildcard we offer. escape anything the
        # user typed literally before converting, otherwise 'a_b*' would also match 'axb'
        escaped_wildcard = namespace_wildcard.replace( '\\', '\\\\' ).replace( '%', '\\%' ).replace( '_', '\\_' )
        
        # same '*' -> '%' conversion as ConvertWildcardToSQLiteLikeParameter
        like_param = escaped_wildcard.replace( '*', '%' )
        
        return self._STL( self._Execute( "SELECT namespace_id FROM namespaces WHERE namespace LIKE ? ESCAPE '\\';", ( like_param, ) ) )
        
    
    if self.modules_tags.NamespaceExists( namespace_wildcard ):
        
        return [ self.modules_tags.GetNamespaceId( namespace_wildcard ) ]
        
    
    return []
def GetQueryPhraseForTagIds( self, file_service_id, tag_service_id ):
    """Return a 'SELECT tag_id FROM <tags table>' phrase, with no trailing semicolon, for one ( file service, tag service ) pair."""
    
    return 'SELECT tag_id FROM {table}'.format( table = self.GetTagsTableName( file_service_id, tag_service_id ) )
def _ExecuteSubtagWildcardSearch( self, subtags_fts4_table_name, subtag_wildcard ):
    """Run the docid search for a '*'-bearing subtag wildcard on one FTS4 subtags table and return the result of self._Execute.
    
    Shared by GetSubtagIdsFromWildcard and GetSubtagIdsFromWildcardIntoTable so that both pick their query
    the same way. subtag_wildcard is expected to contain at least one '*'.
    """
    
    if subtag_wildcard == '*':
        
        # hellmode, but shouldn't be called normally
        return self._Execute( 'SELECT docid FROM {};'.format( subtags_fts4_table_name ) )
        
    
    wildcard_has_fts4_searchable_characters = WildcardHasFTS4SearchableCharacters( subtag_wildcard )
    
    if ClientSearch.IsComplexWildcard( subtag_wildcard ) or not wildcard_has_fts4_searchable_characters:
        
        # FTS4 does not support complex wildcards, so instead we'll search our raw subtags
        # however, since we want to search 'searchable' text, we use the 'searchable subtags map' to cross between real and searchable
        
        like_param = ConvertWildcardToSQLiteLikeParameter( subtag_wildcard )
        
        if subtag_wildcard.startswith( '*' ) or not wildcard_has_fts4_searchable_characters:
            
            # this is a SCAN, but there we go
            # a potential optimisation here, in future, is to store fts4 of subtags reversed, then for '*amus', we can just search that reverse cache for 'suma*'
            # and this would only double the size of the fts4 cache, the largest cache in the whole db! a steal!
            # it also would not fix '*amu*', but with some cleverness could speed up '*amus ar*'
            
            query = 'SELECT docid FROM {} WHERE subtag LIKE ?;'.format( subtags_fts4_table_name )
            
            return self._Execute( query, ( like_param, ) )
            
        
        # we have an optimisation here--rather than searching all subtags for bl*ah, let's search all the bl* subtags for bl*ah!
        
        prefix_fts4_wildcard = subtag_wildcard.split( '*' )[0]
        
        prefix_fts4_wildcard_param = '"{}*"'.format( prefix_fts4_wildcard )
        
        query = 'SELECT docid FROM {} WHERE subtag MATCH ? AND subtag LIKE ?;'.format( subtags_fts4_table_name )
        
        return self._Execute( query, ( prefix_fts4_wildcard_param, like_param ) )
        
    
    # we want the " " wrapping our search text to keep whitespace words connected and in order
    # "samus ar*" should not match "around samus"
    
    # simple 'sam*' style subtag, so we can search fts4 no prob
    
    subtags_fts4_param = '"{}"'.format( subtag_wildcard )
    
    return self._Execute( 'SELECT docid FROM {} WHERE subtag MATCH ?;'.format( subtags_fts4_table_name ), ( subtags_fts4_param, ) )

def GetSubtagIdsFromWildcard( self, file_service_id: int, tag_service_id: int, subtag_wildcard, job_key = None ):
    """Return the set of subtag_ids in the given domain whose searchable text matches subtag_wildcard.
    
    The combined tag service expands to every service of type HC.REAL_TAG_SERVICES, and results are
    unioned across the services searched. Text with a '*' is searched on the FTS4 subtags table; text
    without one is an exact lookup through the searchable subtags map. An empty set is returned if job_key
    reports cancelled.
    """
    
    if tag_service_id == self.modules_services.combined_tag_service_id:
        
        search_tag_service_ids = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )
        
    else:
        
        search_tag_service_ids = ( tag_service_id, )
        
    
    cancelled_hook = None if job_key is None else job_key.IsCancelled
    
    result_subtag_ids = set()
    
    for search_tag_service_id in search_tag_service_ids:
        
        if '*' in subtag_wildcard:
            
            subtags_fts4_table_name = self.GetSubtagsFTS4TableName( file_service_id, search_tag_service_id )
            
            cursor = self._ExecuteSubtagWildcardSearch( subtags_fts4_table_name, subtag_wildcard )
            
            loop_of_subtag_ids = self._STL( HydrusDB.ReadFromCancellableCursor( cursor, 1024, cancelled_hook = cancelled_hook ) )
            
        else:
            
            # old notes from before we had searchable subtag map. I deleted that map once, albeit in an older and less efficient form. *don't delete it again, it has use*
            #
            # NOTE: doing a subtag = 'blah' lookup on subtags_fts4 tables is ultra slow, lmao!
            # attempts to match '/a/' to 'a' with clever FTS4 MATCHing (i.e. a MATCH on a*\b, then an '= a') proved not super successful
            # in testing, it was still a bit slow. my guess is it is still iterating through all the nodes for ^a*, the \b just makes it a bit more efficient sometimes
            # in tests '^a\b' was about twice as fast as 'a*', so the \b might not even be helping at all
            # so, I decided to move back to a lean and upgraded searchable subtag map, and here we are
            
            subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName( file_service_id, search_tag_service_id )
            
            searchable_subtag = subtag_wildcard
            
            if self.modules_tags.SubtagExists( searchable_subtag ):
                
                searchable_subtag_id = self.modules_tags.GetSubtagId( searchable_subtag )
                
                loop_of_subtag_ids = self._STS( self._Execute( 'SELECT subtag_id FROM {} WHERE searchable_subtag_id = ?;'.format( subtags_searchable_map_table_name ), ( searchable_subtag_id, ) ) )
                
                # the searchable subtag matches itself as well as everything mapped to it
                loop_of_subtag_ids.add( searchable_subtag_id )
                
            else:
                
                loop_of_subtag_ids = set()
                
            
        
        if job_key is not None and job_key.IsCancelled():
            
            return set()
            
        
        result_subtag_ids.update( loop_of_subtag_ids )
        
    
    return result_subtag_ids

def GetSubtagIdsFromWildcardIntoTable( self, file_service_id: int, tag_service_id: int, subtag_wildcard, subtag_id_table_name, job_key = None ):
    """Insert into subtag_id_table_name the subtag_ids in the given domain whose searchable text matches subtag_wildcard.
    
    This runs the same search as GetSubtagIdsFromWildcard, but writes the ids into the caller's table
    instead of returning them. If job_key reports cancelled, the table is emptied and the method returns early.
    """
    
    if tag_service_id == self.modules_services.combined_tag_service_id:
        
        search_tag_service_ids = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )
        
    else:
        
        search_tag_service_ids = ( tag_service_id, )
        
    
    cancelled_hook = None if job_key is None else job_key.IsCancelled
    
    for search_tag_service_id in search_tag_service_ids:
        
        if '*' in subtag_wildcard:
            
            subtags_fts4_table_name = self.GetSubtagsFTS4TableName( file_service_id, search_tag_service_id )
            
            cursor = self._ExecuteSubtagWildcardSearch( subtags_fts4_table_name, subtag_wildcard )
            
            # the rows are handed straight to the insert as they come off the cursor
            loop_of_subtag_id_tuples = HydrusDB.ReadFromCancellableCursor( cursor, 1024, cancelled_hook = cancelled_hook )
            
            self._ExecuteMany( 'INSERT OR IGNORE INTO {} ( subtag_id ) VALUES ( ? );'.format( subtag_id_table_name ), loop_of_subtag_id_tuples )
            
        else:
            
            # old notes from before we had searchable subtag map. I deleted that map once, albeit in an older and less efficient form. *don't delete it again, it has use*
            #
            # NOTE: doing a subtag = 'blah' lookup on subtags_fts4 tables is ultra slow, lmao!
            # attempts to match '/a/' to 'a' with clever FTS4 MATCHing (i.e. a MATCH on a*\b, then an '= a') proved not super successful
            # in testing, it was still a bit slow. my guess is it is still iterating through all the nodes for ^a*, the \b just makes it a bit more efficient sometimes
            # in tests '^a\b' was about twice as fast as 'a*', so the \b might not even be helping at all
            # so, I decided to move back to a lean and upgraded searchable subtag map, and here we are
            
            searchable_subtag = subtag_wildcard
            
            if self.modules_tags.SubtagExists( searchable_subtag ):
                
                searchable_subtag_id = self.modules_tags.GetSubtagId( searchable_subtag )
                
                # the searchable subtag matches itself as well as everything mapped to it
                self._Execute( 'INSERT OR IGNORE INTO {} ( subtag_id ) VALUES ( ? );'.format( subtag_id_table_name ), ( searchable_subtag_id, ) )
                
                subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName( file_service_id, search_tag_service_id )
                
                self._Execute( 'INSERT OR IGNORE INTO {} ( subtag_id ) SELECT subtag_id FROM {} WHERE searchable_subtag_id = ?;'.format( subtag_id_table_name, subtags_searchable_map_table_name ), ( searchable_subtag_id, ) )
                
            
        
        if job_key is not None and job_key.IsCancelled():
            
            # do not leave partial results behind for the caller
            self._Execute( 'DELETE FROM {};'.format( subtag_id_table_name ) )
            
            return
def GetSubtagsFTS4TableName( self, file_service_id, tag_service_id ):
    """
    Return the name of the subtags FTS4 table for this file/tag service pair.

    The combined file service maps to the per-tag-service 'combined files' table.
    Any other file service maps to a 'specific' table; if
    FileServiceIsCoveredByAllLocalFiles says the service is covered, the table is
    looked up under the combined local file service id rather than its own id.
    """

    services = self.modules_services

    if file_service_id == services.combined_file_service_id:

        return GenerateCombinedFilesSubtagsFTS4TableName( tag_service_id )

    # covered services are pointed at the combined local file service's table
    if services.FileServiceIsCoveredByAllLocalFiles( file_service_id ):

        specific_file_service_id = services.combined_local_file_service_id

    else:

        specific_file_service_id = file_service_id

    return GenerateSpecificSubtagsFTS4TableName( specific_file_service_id, tag_service_id )
def GetSubtagsSearchableMapTableName( self, file_service_id, tag_service_id ):
    """
    Return the name of the searchable-subtag map table for this file/tag service pair.

    The combined file service maps to the per-tag-service 'combined files' table.
    Any other file service maps to a 'specific' table; if
    FileServiceIsCoveredByAllLocalFiles says the service is covered, the table is
    looked up under the combined local file service id rather than its own id.
    """

    services = self.modules_services

    if file_service_id == services.combined_file_service_id:

        return GenerateCombinedFilesSubtagsSearchableMapTableName( tag_service_id )

    # covered services are pointed at the combined local file service's table
    if services.FileServiceIsCoveredByAllLocalFiles( file_service_id ):

        specific_file_service_id = services.combined_local_file_service_id

    else:

        specific_file_service_id = file_service_id

    return GenerateSpecificSubtagsSearchableMapTableName( specific_file_service_id, tag_service_id )
def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
    """
    List the ( table name, column name ) pairs this module reports for the given content type.

    Only HC.CONTENT_TYPE_TAG produces rows. For every real tag service, crossed with
    every file service that has specific mapping caches plus the combined file
    service, two pairs are reported: the tags table with its 'tag_id' column and the
    subtags FTS4 table with its 'docid' column. Any other content type gives an
    empty list.
    """

    tables_and_columns = []

    if content_type != HC.CONTENT_TYPE_TAG:

        return tables_and_columns

    tag_service_ids = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )

    # the file services do not depend on the tag service, so build the list once rather than per tag service
    file_service_ids = list( self.modules_services.GetServiceIds( HC.FILE_SERVICES_WITH_SPECIFIC_MAPPING_CACHES ) )
    file_service_ids.append( self.modules_services.combined_file_service_id )

    for tag_service_id in tag_service_ids:

        for file_service_id in file_service_ids:

            tags_table_name = self.GetTagsTableName( file_service_id, tag_service_id )
            subtags_fts4_table_name = self.GetSubtagsFTS4TableName( file_service_id, tag_service_id )

            tables_and_columns.append( ( tags_table_name, 'tag_id' ) )
            tables_and_columns.append( ( subtags_fts4_table_name, 'docid' ) )

    return tables_and_columns
def GetTagAsNumSubtagIds( self, file_service_id, tag_service_id, operator, num ):
    """
    Return the subtag ids whose cached integer value satisfies '<integer_subtag> <operator> <num>'.

    The lookup runs against the integer subtags table for this file/tag service pair
    and the rows are collapsed through self._STS.
    """

    table_name = self.GetIntegerSubtagsTableName( file_service_id, tag_service_id )

    # operator and num are written into the SQL text as-is, they are not bound parameters
    query = 'SELECT subtag_id FROM {table} WHERE integer_subtag {op} {value};'.format( table = table_name, op = operator, value = num )

    matching_rows = self._Execute( query )

    return self._STS( matching_rows )
def GetTagCount( self, file_service_id, tag_service_id ):
    """Return the number of rows in the tags table for this file/tag service pair."""

    count_query = 'SELECT COUNT( * ) FROM {};'.format( self.GetTagsTableName( file_service_id, tag_service_id ) )

    row = self._Execute( count_query ).fetchone()

    ( num_tags, ) = row

    return num_tags
def GetTagIdsFromNamespaceIds( self, leaf: ClientDBServices.FileSearchContextLeaf, namespace_ids: typing.Collection[ int ], job_key = None ):
    """
    Return the set of tag ids in the leaf's tags table that have one of the given namespace ids.

    leaf supplies file_service_id and tag_service_id, which select the tags table.
    An empty namespace_ids gives an empty set without touching the database.
    If job_key is given, its IsCancelled is passed to the cursor reader as the
    cancel hook, and an empty set is returned when it reports cancellation after
    the read.
    """

    if len( namespace_ids ) == 0:

        return set()

    tags_table_name = self.GetTagsTableName( leaf.file_service_id, leaf.tag_service_id )

    cancelled_hook = None if job_key is None else job_key.IsCancelled

    def read_tag_ids( cursor ):

        tag_ids = self._STS( HydrusDB.ReadFromCancellableCursor( cursor, 128, cancelled_hook = cancelled_hook ) )

        # a cancelled read may be partial, so nothing is reported
        if job_key is not None and job_key.IsCancelled():

            return set()

        return set( tag_ids )

    if len( namespace_ids ) == 1:

        # a single namespace is a plain equality lookup, so no temporary table is built for it
        ( namespace_id, ) = namespace_ids

        cursor = self._Execute( 'SELECT tag_id FROM {} WHERE namespace_id = ?;'.format( tags_table_name ), ( namespace_id, ) )

        return read_tag_ids( cursor )

    with self._MakeTemporaryIntegerTable( namespace_ids, 'namespace_id' ) as temp_namespace_ids_table_name:

        # temp namespaces to tags
        cursor = self._Execute( 'SELECT tag_id FROM {} CROSS JOIN {} USING ( namespace_id );'.format( temp_namespace_ids_table_name, tags_table_name ) )

        # the cursor has to be drained while the temporary table is still in scope
        return read_tag_ids( cursor )
def GetTagIdsFromNamespaceIdsSubtagIds( self, file_service_id: int, tag_service_id: int, namespace_ids: typing.Collection[ int ], subtag_ids: typing.Collection[ int ], job_key = None ):
    """
    Return the tag ids that pair one of namespace_ids with one of subtag_ids.

    Both id collections are loaded into temporary integer tables (subtags first,
    then namespaces) and the work is handed to
    GetTagIdsFromNamespaceIdsSubtagIdsTables. If either collection is empty an
    empty set is returned and no temporary table is made.
    """

    if len( namespace_ids ) > 0 and len( subtag_ids ) > 0:

        with self._MakeTemporaryIntegerTable( subtag_ids, 'subtag_id' ) as subtag_ids_table_name, self._MakeTemporaryIntegerTable( namespace_ids, 'namespace_id' ) as namespace_ids_table_name:

            tag_ids = self.GetTagIdsFromNamespaceIdsSubtagIdsTables( file_service_id, tag_service_id, namespace_ids_table_name, subtag_ids_table_name, job_key = job_key )

        return tag_ids

    return set()
def GetTagIdsFromNamespaceIdsSubtagIdsTables( self, file_service_id: int, tag_service_id: int, namespace_ids_table_name: str, subtag_ids_table_name: str, job_key = None ):
    """
    Return the tag ids whose subtag is in subtag_ids_table_name and whose namespace is in namespace_ids_table_name.

    When tag_service_id is the combined tag service, the tags table of every real
    tag service is searched and the results are unioned; otherwise only the given
    service's table is searched. If job_key is given, its IsCancelled is used as the
    cursor reader's cancel hook, and an empty set is returned (dropping anything
    gathered so far) as soon as it reports cancellation after a read.
    """

    if tag_service_id == self.modules_services.combined_tag_service_id:

        services_to_search = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )

    else:

        services_to_search = ( tag_service_id, )

    can_cancel = job_key is not None

    cancelled_hook = job_key.IsCancelled if can_cancel else None

    # temp subtags to tags to temp namespaces
    query_template = 'SELECT tag_id FROM {} CROSS JOIN {} USING ( subtag_id ) CROSS JOIN {} USING ( namespace_id );'

    all_tag_ids = set()

    for search_tag_service_id in services_to_search:

        tags_table_name = self.GetTagsTableName( file_service_id, search_tag_service_id )

        cursor = self._Execute( query_template.format( subtag_ids_table_name, tags_table_name, namespace_ids_table_name ) )

        tag_ids = self._STS( HydrusDB.ReadFromCancellableCursor( cursor, 128, cancelled_hook = cancelled_hook ) )

        if can_cancel and job_key.IsCancelled():

            return set()

        all_tag_ids.update( tag_ids )

    return all_tag_ids
def GetTagIdsFromSubtagIds( self, file_service_id: int, tag_service_id: int, subtag_ids: typing.Collection[ int ], job_key = None ):
    """
    Return the tag ids that use any of the given subtag ids.

    The subtag ids are loaded into a temporary integer table and the work is handed
    to GetTagIdsFromSubtagIdsTable. An empty subtag_ids gives an empty set and no
    temporary table is made.
    """

    if len( subtag_ids ) > 0:

        with self._MakeTemporaryIntegerTable( subtag_ids, 'subtag_id' ) as subtag_ids_table_name:

            tag_ids = self.GetTagIdsFromSubtagIdsTable( file_service_id, tag_service_id, subtag_ids_table_name, job_key = job_key )

        return tag_ids

    return set()
def GetTagIdsFromSubtagIdsTable( self, file_service_id: int, tag_service_id: int, subtag_ids_table_name: str, job_key = None ):
    """
    Return the tag ids whose subtag id appears in subtag_ids_table_name.

    When tag_service_id is the combined tag service, the tags table of every real
    tag service is searched and the results are unioned; otherwise only the given
    service's table is searched. If job_key is given, its IsCancelled is used as the
    cursor reader's cancel hook, and an empty set is returned (dropping anything
    gathered so far) as soon as it reports cancellation after a read.
    """

    if tag_service_id == self.modules_services.combined_tag_service_id:

        services_to_search = self.modules_services.GetServiceIds( HC.REAL_TAG_SERVICES )

    else:

        services_to_search = ( tag_service_id, )

    can_cancel = job_key is not None

    cancelled_hook = job_key.IsCancelled if can_cancel else None

    # temp subtags to tags
    query_template = 'SELECT tag_id FROM {} CROSS JOIN {} USING ( subtag_id );'

    all_tag_ids = set()

    for search_tag_service_id in services_to_search:

        tags_table_name = self.GetTagsTableName( file_service_id, search_tag_service_id )

        cursor = self._Execute( query_template.format( subtag_ids_table_name, tags_table_name ) )

        tag_ids = self._STS( HydrusDB.ReadFromCancellableCursor( cursor, 128, cancelled_hook = cancelled_hook ) )

        if can_cancel and job_key.IsCancelled():

            return set()

        all_tag_ids.update( tag_ids )

    return all_tag_ids
def GetTagsTableName( self, file_service_id, tag_service_id ):
    """
    Return the name of the tags table for this file/tag service pair.

    The combined file service maps to the per-tag-service 'combined files' table.
    Any other file service maps to a 'specific' table; if
    FileServiceIsCoveredByAllLocalFiles says the service is covered, the table is
    looked up under the combined local file service id rather than its own id.
    """

    services = self.modules_services

    if file_service_id == services.combined_file_service_id:

        return GenerateCombinedFilesTagsTableName( tag_service_id )

    # covered services are pointed at the combined local file service's table
    if services.FileServiceIsCoveredByAllLocalFiles( file_service_id ):

        specific_file_service_id = services.combined_local_file_service_id

    else:

        specific_file_service_id = file_service_id

    return GenerateSpecificTagsTableName( specific_file_service_id, tag_service_id )
def HasTag( self, file_service_id, tag_service_id, tag_id ):
    """Return True if tag_id has a row in the tags table for this file/tag service pair, else False."""

    existence_query = 'SELECT 1 FROM {};'.replace( ';', ' WHERE tag_id = ?;' ).format( self.GetTagsTableName( file_service_id, tag_service_id ) )

    row = self._Execute( existence_query, ( tag_id, ) ).fetchone()

    if row is None:

        return False

    return True
def RegenerateSearchableSubtagMap( self, file_service_id, tag_service_id, status_hook = None ):
    """
    Empty and rebuild the searchable-subtag map for this file/tag service pair.

    Every docid in the subtags FTS4 table is read in chunks of BLOCK_SIZE. For each
    one the subtag text is fetched from the subtags table; ids with no row there are
    skipped. A map row ( subtag_id, searchable_subtag_id ) is stored only when the
    searchable form of the subtag differs from the subtag itself.

    After each chunk a progress string is pushed to
    HG.client_controller.frame_splash_status and, if given, to status_hook.
    """

    fts4_table = self.GetSubtagsFTS4TableName( file_service_id, tag_service_id )
    searchable_map_table = self.GetSubtagsSearchableMapTableName( file_service_id, tag_service_id )

    self._Execute( 'DELETE FROM {};'.format( searchable_map_table ) )

    select_all_subtag_ids = 'SELECT docid FROM {};'.format( fts4_table )
    insert_map_row = 'INSERT OR IGNORE INTO {} ( subtag_id, searchable_subtag_id ) VALUES ( ?, ? );'.format( searchable_map_table )

    BLOCK_SIZE = 10000

    for ( chunk_of_subtag_ids, num_done, num_to_do ) in HydrusDB.ReadLargeIdQueryInSeparateChunks( self._c, select_all_subtag_ids, BLOCK_SIZE ):

        for subtag_id in chunk_of_subtag_ids:

            row = self._Execute( 'SELECT subtag FROM subtags WHERE subtag_id = ?;', ( subtag_id, ) ).fetchone()

            if row is not None:

                ( subtag, ) = row

                searchable_subtag = ClientSearch.ConvertSubtagToSearchable( subtag )

                # subtags that are already their own searchable form need no map row
                if searchable_subtag != subtag:

                    searchable_subtag_id = self.modules_tags.GetSubtagId( searchable_subtag )

                    self._Execute( insert_map_row, ( subtag_id, searchable_subtag_id ) )

        progress_text = HydrusData.ConvertValueRangeToPrettyString( num_done, num_to_do )

        HG.client_controller.frame_splash_status.SetSubtext( progress_text )

        if status_hook is not None:

            status_hook( progress_text )
def RepopulateMissingSubtags( self, file_service_id, tag_service_id ):
    """
    Re-add subtags that the tags table references but the subtags FTS4 table lacks.

    The missing ids are the subtag_ids in the tags table EXCEPT the docids in the
    FTS4 table. For each one whose text can be found in the subtags table:

    - a searchable-map row is inserted if the searchable form differs from the subtag
    - the searchable form is inserted into the FTS4 table under the subtag id
    - if the subtag is all decimal digits and its integer value passes
      CanCacheInteger, an integer cache row is inserted

    If any ids were missing, a summary line is sent through HydrusData.ShowText.
    """

    tags_table = self.GetTagsTableName( file_service_id, tag_service_id )
    fts4_table = self.GetSubtagsFTS4TableName( file_service_id, tag_service_id )
    searchable_map_table = self.GetSubtagsSearchableMapTableName( file_service_id, tag_service_id )
    integer_table = self.GetIntegerSubtagsTableName( file_service_id, tag_service_id )

    find_missing = 'SELECT subtag_id FROM {} EXCEPT SELECT docid FROM {};'.format( tags_table, fts4_table )

    missing_subtag_ids = self._STS( self._Execute( find_missing ) )

    insert_map_row = 'INSERT OR IGNORE INTO {} ( subtag_id, searchable_subtag_id ) VALUES ( ?, ? );'.format( searchable_map_table )
    insert_fts4_row = 'INSERT OR IGNORE INTO {} ( docid, subtag ) VALUES ( ?, ? );'.format( fts4_table )
    insert_integer_row = 'INSERT OR IGNORE INTO {} ( subtag_id, integer_subtag ) VALUES ( ?, ? );'.format( integer_table )

    for subtag_id in missing_subtag_ids:

        row = self._Execute( 'SELECT subtag FROM subtags WHERE subtag_id = ?;', ( subtag_id, ) ).fetchone()

        if row is None:

            # no text is on record for this id, so there is nothing to index
            continue

        ( subtag, ) = row

        searchable_subtag = ClientSearch.ConvertSubtagToSearchable( subtag )

        if searchable_subtag != subtag:

            searchable_subtag_id = self.modules_tags.GetSubtagId( searchable_subtag )

            self._Execute( insert_map_row, ( subtag_id, searchable_subtag_id ) )

        #

        self._Execute( insert_fts4_row, ( subtag_id, searchable_subtag ) )

        if not subtag.isdecimal():

            continue

        try:

            integer_subtag = int( subtag )

            if CanCacheInteger( integer_subtag ):

                self._Execute( insert_integer_row, ( subtag_id, integer_subtag ) )

        except ValueError:

            # best-effort: a subtag that raises here simply gets no integer cache row
            pass

    num_missing = len( missing_subtag_ids )

    if num_missing > 0:

        HydrusData.ShowText( 'Repopulated {} missing subtags for {}_{}.'.format( HydrusData.ToHumanInt( num_missing ), file_service_id, tag_service_id ) )