# hydrus/hydrus/client/db/ClientDBURLMap.py
#
# 204 lines
# 7.9 KiB
# Python
# Raw Normal View History
#
# 2022-05-18 20:18:25 +00:00
import re
import sqlite3
import typing
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.client.db import ClientDBMaster
from hydrus.client.db import ClientDBModule
class ClientDBURLMap( ClientDBModule.ClientDBModule ):
    """
    Database module for the 'url_map' table, which pairs file hash_ids with url_ids.
    
    URL text and domain lookups are delegated to the 'modules_urls' object given at construction; this class
    only adds, removes and searches the ( hash_id, url_id ) rows.
    """
    
    # When the caller supplies a table of candidate hash_ids, we only drive the search from that table if
    # it holds fewer than this many files. At or above it, the search runs over the whole url store instead.
    _MAX_HASH_IDS_FOR_HASH_TABLE_SEARCH = 50000
    
    def __init__( self, cursor: sqlite3.Cursor, modules_urls: ClientDBMaster.ClientDBMasterURLs ):
        """
        :param cursor: cursor handed through to the ClientDBModule base class.
        :param modules_urls: url definitions module, used here for GetURLId and GetURLDomainAndSubdomainIds.
        """
        
        # set before the base __init__ call, same order as the original code
        self.modules_urls = modules_urls
        
        ClientDBModule.ClientDBModule.__init__( self, 'client urls mapping', cursor )
        
    
    def _GetInitialIndexGenerationDict( self ) -> dict:
        """
        Return the index definitions for this module's tables, keyed by table name.
        
        NOTE(review): each entry looks like ( columns, unique, version_added ) -- confirm against ClientDBModule.
        """
        
        index_generation_dict = {}
        
        index_generation_dict[ 'main.url_map' ] = [
            ( [ 'url_id' ], False, 485 )
        ]
        
        return index_generation_dict
        
    
    def _GetInitialTableGenerationDict( self ) -> dict:
        """
        Return the CREATE TABLE templates for this module's tables, keyed by table name.
        
        NOTE(review): the second tuple item (485) is presumably the version the table was added -- confirm.
        """
        
        return {
            'main.url_map' : ( 'CREATE TABLE IF NOT EXISTS {} ( hash_id INTEGER, url_id INTEGER, PRIMARY KEY ( hash_id, url_id ) );', 485 )
        }
        
    
    def AddMapping( self, hash_id: int, url: str ):
        """Associate the url with the file. An existing identical row is left alone (INSERT OR IGNORE)."""
        
        url_id = self.modules_urls.GetURLId( url )
        
        self._Execute( 'INSERT OR IGNORE INTO url_map ( hash_id, url_id ) VALUES ( ?, ? );', ( hash_id, url_id ) )
        
    
    def DeleteMapping( self, hash_id: int, url: str ):
        """Remove the association between the url and the file, if there is one."""
        
        url_id = self.modules_urls.GetURLId( url )
        
        self._Execute( 'DELETE FROM url_map WHERE hash_id = ? AND url_id = ?;', ( hash_id, url_id ) )
        
    
    def GetHashIds( self, search_url: str ):
        """Return the hash_ids of every file mapped to exactly this url text."""
        
        # NOTE(review): _STS presumably collapses the single-column rows into a set -- confirm in the base class
        hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM url_map NATURAL JOIN urls WHERE url = ?;', ( search_url, ) ) )
        
        return hash_ids
        
    
    def GetHashIdsFromURLRule( self, rule_type: str, rule, hash_ids = None, hash_ids_table_name: typing.Optional[ str ] = None ):
        """
        Return the hash_ids of files that have at least one url satisfying the given rule.
        
        :param rule_type: selects how 'rule' is interpreted:
            'exact_match' -- rule is the url text, compared with SQL equality.
            'url_class' or 'url_match' -- rule is an object providing GetDomain(), MatchesSubdomains() and Matches( url ).
            'domain' -- rule is a domain; urls on that domain or its subdomains match.
            anything else -- rule is treated as a regex and tested against each url with re.search.
        :param rule: the url, url class, domain or regex, as selected by rule_type.
        :param hash_ids: optional sized collection of candidate hash_ids. Only its length is read here.
        :param hash_ids_table_name: optional name of a table with a hash_id column holding the candidates.
        :return: the matching hash_ids.
        
        The results are only restricted to the candidates when both hash_ids and hash_ids_table_name are given
        and there are fewer than _MAX_HASH_IDS_FOR_HASH_TABLE_SEARCH candidates. Otherwise the search covers
        every file and the returned hash_ids may include files outside the candidate set.
        """
        
        # Every branch makes this same decision, so make it once up front.
        search_from_hash_ids_table = (
            hash_ids_table_name is not None
            and hash_ids is not None
            and len( hash_ids ) < self._MAX_HASH_IDS_FOR_HASH_TABLE_SEARCH
        )
        
        if rule_type == 'exact_match':
            
            url = rule
            
            table_name = 'url_map NATURAL JOIN urls'
            
            if search_from_hash_ids_table:
                
                table_name += ' NATURAL JOIN {}'.format( hash_ids_table_name )
                
            
            select = 'SELECT hash_id FROM {} WHERE url = ?;'.format( table_name )
            
            result_hash_ids = self._STS( self._Execute( select, ( url, ) ) )
            
            return result_hash_ids
            
        elif rule_type in ( 'url_class', 'url_match' ):
            
            url_class = rule
            
            domain = url_class.GetDomain()
            
            if url_class.MatchesSubdomains():
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
                
            else:
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain, only_www_subdomains = True )
                
            
            result_hash_ids = set()
            
            # the temp table only exists inside this 'with', so the rows have to be consumed in here too
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if search_from_hash_ids_table:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id );'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                for ( hash_id, url ) in self._Execute( select ):
                    
                    # this is actually insufficient, as more detailed url classes may match
                    # the membership test goes first so we skip Matches() for files we have already accepted
                    if hash_id not in result_hash_ids and url_class.Matches( url ):
                        
                        result_hash_ids.add( hash_id )
                        
                    
                
            
            return result_hash_ids
            
        elif rule_type == 'domain':
            
            # This was 'rule_type in "domain"', a substring test, so '' or 'main' were run as domain searches.
            # Equality sends such values to the regex fallback below, like any other unrecognised rule type.
            
            domain = rule
            
            # if we search for site.com, we also want artist.site.com or www.site.com or cdn2.site.com
            domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
            
            result_hash_ids = set()
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if search_from_hash_ids_table:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id )'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                result_hash_ids = self._STS( self._Execute( select ) )
                
            
            return result_hash_ids
            
        else:
            
            # fallback: any other rule type is handled as a regex search over url text
            regex = rule
            
            if search_from_hash_ids_table:
                
                # if we aren't gonk mode with the number of files, temp hashes to url map to urls
                # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on _TOTAL_ url count
                select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )
                
            else:
                
                select = 'SELECT hash_id, url FROM url_map NATURAL JOIN urls;'
                
            
            result_hash_ids = set()
            
            for ( hash_id, url ) in self._Execute( select ):
                
                # skip the regex entirely for files that already matched on an earlier url
                if hash_id not in result_hash_ids and re.search( regex, url ) is not None:
                    
                    result_hash_ids.add( hash_id )
                    
                
            
            return result_hash_ids
            
        
    
    def GetHashIdsToURLs( self, hash_ids_table_name = None ):
        """
        Return a mapping of hash_id to its urls for every file in the given table.
        
        :param hash_ids_table_name: name of a table with a hash_id column. If None, an empty dict is returned.
        """
        
        hash_ids_to_urls = {}
        
        if hash_ids_table_name is not None:
            
            # NOTE(review): going by its name, BuildKeyToSetDict groups the ( hash_id, url ) rows into hash_id -> set of urls -- confirm
            hash_ids_to_urls = HydrusData.BuildKeyToSetDict( self._Execute( 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name ) ) )
            
        
        return hash_ids_to_urls
        
    
    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        """
        Return the ( table_name, column_name ) pairs in this module that hold definition ids of the given content type.
        
        Only HC.CONTENT_TYPE_FILES produces an entry (the hash_id column of main.url_map). Any other content
        type returns an empty list.
        """
        
        # if content type is a domain, then give urls? bleh
        
        tables_and_columns = []
        
        if content_type == HC.CONTENT_TYPE_FILES:
            
            tables_and_columns.append( ( 'main.url_map', 'hash_id' ) )
            
        
        return tables_and_columns
        
    