264 lines
10 KiB
Python
264 lines
10 KiB
Python
import re
|
|
import sqlite3
|
|
import typing
|
|
|
|
from hydrus.core import HydrusConstants as HC
|
|
from hydrus.core import HydrusData
|
|
|
|
from hydrus.client.db import ClientDBMaster
|
|
from hydrus.client.db import ClientDBModule
|
|
from hydrus.client.search import ClientSearch
|
|
|
|
class ClientDBURLMap( ClientDBModule.ClientDBModule ):
    """
    Database module for the `url_map` table, which pairs file hash_ids with url_ids.
    
    URL text is not stored here; url -> url_id resolution is delegated to the
    `modules_urls` object passed to the constructor, and search queries join
    against the `urls` table.
    """
    
    # When the caller's candidate file set has this many entries or more, the search
    # methods stop joining against the caller-supplied hash id table and query the
    # whole of url_map instead.
    _MAX_HASH_IDS_FOR_TABLE_JOIN = 50000
    
    def __init__( self, cursor: sqlite3.Cursor, modules_urls: ClientDBMaster.ClientDBMasterURLs ):
        """
        :param cursor: sqlite cursor handed through to the ClientDBModule base class.
        :param modules_urls: master URL module used to resolve urls and domains to ids.
        """
        
        # set before the base __init__ runs, preserving the original ordering
        self.modules_urls = modules_urls
        
        ClientDBModule.ClientDBModule.__init__( self, 'client urls mapping', cursor )
        
    
    def _CanJoinHashIdsTable( self, hash_ids, hash_ids_table_name ) -> bool:
        """Return True if a hash id table was supplied and the candidate set is small enough to join on."""
        
        return (
            hash_ids_table_name is not None
            and hash_ids is not None
            and len( hash_ids ) < self._MAX_HASH_IDS_FOR_TABLE_JOIN
        )
        
    
    def _GetInitialIndexGenerationDict( self ) -> dict:
        """
        Return the index definitions for this module's tables, keyed by table name.
        
        Each entry is a ( columns, flag, number ) tuple whose meaning is defined by
        ClientDBModule -- presumably a uniqueness flag and the version that introduced
        the index; confirm against the base class.
        """
        
        return {
            'main.url_map' : [
                ( [ 'url_id' ], False, 485 )
            ]
        }
        
    
    def _GetInitialTableGenerationDict( self ) -> dict:
        """
        Return the CREATE TABLE templates for this module, keyed by table name.
        
        The `{}` placeholder is left for the base class to fill in; the trailing number
        is presumably the version that introduced the table -- confirm against ClientDBModule.
        """
        
        return {
            'main.url_map' : ( 'CREATE TABLE IF NOT EXISTS {} ( hash_id INTEGER, url_id INTEGER, PRIMARY KEY ( hash_id, url_id ) );', 485 )
        }
        
    
    def AddMapping( self, hash_id: int, url: str ):
        """Associate `url` with the file `hash_id`. An existing identical pair is left alone (INSERT OR IGNORE)."""
        
        url_id = self.modules_urls.GetURLId( url )
        
        self._Execute( 'INSERT OR IGNORE INTO url_map ( hash_id, url_id ) VALUES ( ?, ? );', ( hash_id, url_id ) )
        
    
    def DeleteMapping( self, hash_id: int, url: str ):
        """Remove the association between `url` and the file `hash_id`, if present."""
        
        url_id = self.modules_urls.GetURLId( url )
        
        self._Execute( 'DELETE FROM url_map WHERE hash_id = ? AND url_id = ?;', ( hash_id, url_id ) )
        
    
    def GetHashIds( self, search_url: str ):
        """Return the hash_ids of all files mapped to exactly `search_url`."""
        
        # _STS is a base-class helper; presumably it collapses single-column rows into a set
        hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM url_map NATURAL JOIN urls WHERE url = ?;', ( search_url, ) ) )
        
        return hash_ids
        
    
    def GetHashIdsFromCountTests( self, num_urls_tests: typing.List[ ClientSearch.NumberTest ], hash_ids: typing.Collection[ int ], hash_ids_table_name: str ):
        """
        Filter `hash_ids` down to the files whose number of mapped urls passes every test.
        
        :param num_urls_tests: number tests to apply to each file's url count.
        :param hash_ids: candidate files; the result is always a subset of these.
        :param hash_ids_table_name: name of a table holding the same candidates, joined
            against only when the candidate set is below the size threshold.
        :return: a new set of the hash_ids that pass.
        """
        
        # we'll have to natural join 'urls' or 'urls-class-map-cache' or whatever when we add a proper filter to this guy
        
        table_join = 'url_map'
        
        if len( hash_ids ) < self._MAX_HASH_IDS_FOR_TABLE_JOIN:
            
            table_join += ' NATURAL JOIN {}'.format( hash_ids_table_name )
            
        
        #
        
        result_hash_ids = set( hash_ids )
        
        # 'zero' and 'anything but zero' are answered by simple set operations below; only
        # the remaining tests need an actual per-file count
        specific_num_urls_tests = [ number_test for number_test in num_urls_tests if not ( number_test.IsZero() or number_test.IsAnythingButZero() ) ]
        
        megalambda = ClientSearch.NumberTest.STATICCreateMegaLambda( specific_num_urls_tests )
        
        is_zero = any( number_test.IsZero() for number_test in num_urls_tests )
        is_anything_but_zero = any( number_test.IsAnythingButZero() for number_test in num_urls_tests )
        wants_zero = any( number_test.WantsZero() for number_test in num_urls_tests )
        
        if is_zero or is_anything_but_zero or wants_zero:
            
            # files with no urls have no url_map rows, so we find the files that do have rows and work from there
            select = f'SELECT DISTINCT hash_id FROM {table_join};'
            
            nonzero_url_query_hash_ids = self._STS( self._Execute( select ) )
            
            if is_zero:
                
                result_hash_ids.difference_update( nonzero_url_query_hash_ids )
                
            
            if is_anything_but_zero:
                
                result_hash_ids.intersection_update( nonzero_url_query_hash_ids )
                
            
        
        if len( specific_num_urls_tests ) > 0:
            
            select = f'SELECT hash_id, COUNT( url_id ) FROM {table_join} GROUP BY hash_id;'
            
            good_url_count_hash_ids = { hash_id for ( hash_id, count ) in self._Execute( select ) if megalambda( count ) }
            
            if wants_zero:
                
                # the GROUP BY never produces a row for a file with no urls, so add those back in by hand.
                # nonzero_url_query_hash_ids is always bound here, since wants_zero also triggers the block above
                zero_hash_ids = result_hash_ids.difference( nonzero_url_query_hash_ids )
                
                good_url_count_hash_ids.update( zero_hash_ids )
                
            
            result_hash_ids.intersection_update( good_url_count_hash_ids )
            
        
        return result_hash_ids
        
    
    def GetHashIdsFromURLRule( self, rule_type, rule, hash_ids = None, hash_ids_table_name = None ):
        """
        Return the hash_ids of files that have at least one url satisfying a rule.
        
        :param rule_type: 'exact_match', 'url_class' (or its alias 'url_match'), or 'domain'.
            Any other value is handled as a regex rule.
        :param rule: the url string, url class object, domain string, or regex, respectively.
        :param hash_ids: optional candidate files.
        :param hash_ids_table_name: optional name of a table holding the same candidates.
        
        The query is restricted to the candidates only when both optional arguments are
        given and the candidate set is below the size threshold. Otherwise the whole
        url_map is searched and the result is not limited to `hash_ids`.
        """
        
        can_join_hash_ids_table = self._CanJoinHashIdsTable( hash_ids, hash_ids_table_name )
        
        if rule_type == 'exact_match':
            
            url = rule
            
            table_name = 'url_map NATURAL JOIN urls'
            
            if can_join_hash_ids_table:
                
                table_name += ' NATURAL JOIN {}'.format( hash_ids_table_name )
                
            
            select = 'SELECT hash_id FROM {} WHERE url = ?;'.format( table_name )
            
            result_hash_ids = self._STS( self._Execute( select, ( url, ) ) )
            
            return result_hash_ids
            
        elif rule_type in ( 'url_class', 'url_match' ):
            
            url_class = rule
            
            domain = url_class.GetDomain()
            
            if url_class.MatchesSubdomains():
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
                
            else:
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain, only_www_subdomains = True )
                
            
            result_hash_ids = set()
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if can_join_hash_ids_table:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id );'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                # the SQL only narrows by domain; the url class itself is tested in python
                for ( hash_id, url ) in self._Execute( select ):
                    
                    # this is actually insufficient, as more detailed url classes may match
                    # (the membership check comes first so already-matched files skip the Matches call)
                    if hash_id not in result_hash_ids and url_class.Matches( url ):
                        
                        result_hash_ids.add( hash_id )
                        
                    
                
            
            return result_hash_ids
            
        elif rule_type == 'domain':
            
            # this used to be "rule_type in 'domain'", a substring test that also sent '', 'd', 'main' etc. down this branch
            
            domain = rule
            
            # if we search for site.com, we also want artist.site.com or www.site.com or cdn2.site.com
            domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if can_join_hash_ids_table:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id )'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                result_hash_ids = self._STS( self._Execute( select ) )
                
            
            return result_hash_ids
            
        else:
            
            # fallback: treat the rule as a regex and test every candidate url in python
            regex = rule
            
            if can_join_hash_ids_table:
                
                # if we aren't gonk mode with the number of files, temp hashes to url map to urls
                # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on _TOTAL_ url count
                select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )
                
            else:
                
                select = 'SELECT hash_id, url FROM url_map NATURAL JOIN urls;'
                
            
            result_hash_ids = set()
            
            for ( hash_id, url ) in self._Execute( select ):
                
                # membership check first, so a file that already matched does not pay for another regex search
                if hash_id not in result_hash_ids and re.search( regex, url ) is not None:
                    
                    result_hash_ids.add( hash_id )
                    
                
            
            return result_hash_ids
            
        
    
    def GetHashIdsToURLs( self, hash_ids_table_name = None ):
        """
        Return a mapping of hash_id -> urls for every file in the given table.
        
        :param hash_ids_table_name: name of a table of hash_ids to look up. When None,
            no query is run and an empty dict is returned.
        """
        
        hash_ids_to_urls = {}
        
        if hash_ids_table_name is not None:
            
            # BuildKeyToSetDict groups the ( hash_id, url ) rows by hash_id -- presumably into sets of urls, going by its name
            hash_ids_to_urls = HydrusData.BuildKeyToSetDict( self._Execute( 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name ) ) )
            
        
        return hash_ids_to_urls
        
    
    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        """
        Return the ( table name, column name ) pairs in this module that reference definitions of `content_type`.
        
        Only HC.CONTENT_TYPE_FILES is recognised, for which url_map.hash_id is reported;
        every other content type gets an empty list.
        """
        
        # if content type is a domain, then give urls? bleh
        
        tables_and_columns = []
        
        if content_type == HC.CONTENT_TYPE_FILES:
            
            tables_and_columns.append( ( 'main.url_map', 'hash_id' ) )
            
        
        return tables_and_columns
|
|
|
|
|