hydrus/hydrus/core/HydrusTags.py

866 lines
23 KiB
Python
Raw Normal View History

2013-07-17 20:56:13 +00:00
import collections
import os
2015-03-25 22:04:19 +00:00
import re
2021-04-07 21:26:45 +00:00
import threading
2020-07-29 20:52:44 +00:00
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusExceptions
2021-04-07 21:26:45 +00:00
from hydrus.core import HydrusSerialisable
2020-04-22 21:00:35 +00:00
from hydrus.core import HydrusText
2017-04-05 21:16:40 +00:00
2016-04-06 19:52:45 +00:00
def CensorshipMatch( tag, censorships ):
    """
    Report whether any of the given censorship rules covers the tag.

    Each rule is a string of one of these shapes:
        ''                  - covers every unnamespaced tag
        ':'                 - covers every namespaced tag
        'series:'           - covers every tag in that namespace
        'series:evangelion' - covers exactly that tag
        'table'             - covers that subtag, namespaced or not

    Returns True as soon as one rule matches, otherwise False.
    """
    for rule in censorships:
        if rule == '':
            matched = SplitTag( tag )[0] == ''
        elif rule == ':':
            matched = SplitTag( tag )[0] != ''
        elif ':' not in rule:
            # plain subtag rule, the namespace of the tag is irrelevant
            matched = SplitTag( tag )[1] == rule
        elif rule.endswith( ':' ):
            # whole-namespace rule, compare against the rule minus its trailing colon
            matched = SplitTag( tag )[0] == rule[:-1]
        else:
            # fully specified namespace:subtag rule, exact match only
            matched = tag == rule
        if matched:
            return True
    return False
2018-02-28 22:30:36 +00:00
def CollapseMultipleSortedNumericTagsToMinMax( tags ):
    """
    Collapse a sorted run of numeric tags to just its first and last members.

    The caller wants to present something like 1, 2, 3, 4, 5 as 1-5. If there
    are two or fewer tags, or any tag is not numeric, the input is returned
    untouched. Otherwise a new two-item list [ first, last ] is returned.

    A tag counts as numeric when it is non-empty and starts with a decimal
    digit. ConvertTagToSortable returns a tuple for every tag, so its return
    type cannot be used to tell numeric tags from non-numeric ones.
    """
    if len( tags ) <= 2:
        return tags

    includes_non_numeric_tag = any( len( tag ) == 0 or not tag[0].isdecimal() for tag in tags )

    if includes_non_numeric_tag:
        return tags

    # sets, tuples and the like need to become indexable before we take the ends
    if not isinstance( tags, list ):
        tags = list( tags )

    return [ tags[0], tags[-1] ]
2018-09-12 21:36:26 +00:00
def ConvertTagToSortable( tag ):
    """
    Build a sort key that orders tags in 'human' order.

    The lowercased tag is split into alternating text and digit runs. Digit
    runs are compared by integer value and text runs lexicographically, so
    that 0 < 0a < 0b < 1 and 2 < 22.

    Returns a tuple of ( text, number ) pairs, one per run.
    """
    # this copies the human sort in hydrusdata

    def chunk_to_pair( chunk ):
        if chunk.isdecimal():
            return ( '', int( chunk ) )
        return ( chunk, 0 )

    # the capturing group makes re.split keep the digit runs in the output
    chunks = re.split( '([0-9]+)', tag.lower() )

    return tuple( chunk_to_pair( chunk ) for chunk in chunks )
2015-02-03 20:40:21 +00:00
2014-11-12 23:33:13 +00:00
def FilterNamespaces( tags, namespaces ):
    """
    Return the set of tags whose namespace is one of the requested namespaces.

    tags is an iterable of tag strings. namespaces is an iterable of namespace
    strings; None is accepted and treated as the unnamespaced bucket, the same
    as ''.
    """
    namespaces_to_tags = collections.defaultdict( set )

    for tag in tags:
        ( namespace, subtag ) = SplitTag( tag )
        namespaces_to_tags[ namespace ].add( tag )

    result = set()

    for namespace in namespaces:
        # unnamespaced tags are bucketed under '', so map None onto that
        key = '' if namespace is None else namespace
        # .get avoids inserting empty buckets for namespaces nothing matched
        result.update( namespaces_to_tags.get( key, () ) )

    return result
2016-04-27 19:20:37 +00:00
def SortNumericTags( tags ):
    """
    Return a new list of the given tags sorted by their ConvertTagToSortable key.

    The input iterable is not modified.
    """
    return sorted( tags, key = ConvertTagToSortable )
2015-03-25 22:04:19 +00:00
def CheckTagNotEmpty( tag ):
    """
    Raise HydrusExceptions.TagSizeException if the subtag part of the tag is empty.

    Only the subtag is examined, so a tag that is just a namespace and a colon fails too.
    Returns None when the tag passes.
    """
    subtag = SplitTag( tag )[1]

    if subtag != '':
        return

    raise HydrusExceptions.TagSizeException( 'Received a zero-length tag!' )
2015-03-25 22:04:19 +00:00
2017-04-05 21:16:40 +00:00
2015-03-25 22:04:19 +00:00
def CleanTag( tag ):
    """
    Normalise a raw tag string and return the cleaned tag.

    The tag is truncated to 1024 characters and lowercased, a leading single
    colon is doubled, and the text is run through StripTextOfGumpf. When the
    tag contains a colon, namespace and subtag are also stripped individually
    and then recombined.

    Raises a plain Exception, carrying the offending tag and the underlying
    error text, if the tag is None or anything goes wrong while cleaning.
    """
    try:
        if tag is None:
            raise Exception()

        tag = tag[:1024]
        tag = tag.lower()

        # Convert anything starting with one colon to start with two i.e. :D -> ::D
        tag = HydrusText.re_leading_single_colon.sub( '::', tag )

        # decide on the colon before stripping, then strip the whole thing either way
        # for colon tags this whole-tag pass is needed to catch 'system:' stuff
        had_colon = ':' in tag

        tag = StripTextOfGumpf( tag )

        if had_colon:
            ( namespace, subtag ) = SplitTag( tag )
            tag = CombineTag( StripTextOfGumpf( namespace ), StripTextOfGumpf( subtag ) )

    except Exception as e:
        message = 'Was unable to parse the tag: ' + str( tag ) + os.linesep * 2 + str( e )

        raise Exception( message )

    return tag
def CleanTags( tags ):
    """
    Clean every tag in the iterable and return the results as a set.

    None entries are skipped, as are tags that CheckTagNotEmpty rejects after
    cleaning. Any exception CleanTag raises propagates to the caller.
    """
    clean_tags = set()

    for raw_tag in tags:
        if raw_tag is None:
            continue

        cleaned_tag = CleanTag( raw_tag )

        try:
            CheckTagNotEmpty( cleaned_tag )
        except HydrusExceptions.TagSizeException:
            # cleaned down to nothing, so drop it silently
            continue
        else:
            clean_tags.add( cleaned_tag )

    return clean_tags
2017-02-08 22:27:00 +00:00
def CombineTag( namespace, subtag ):
    """
    Join a namespace and a subtag back into a single tag string.

    With a namespace the result is 'namespace:subtag'. Without one, the subtag
    is returned as is, except that a subtag matching
    HydrusText.re_leading_single_colon gets one extra colon prepended.
    """
    if namespace != '':
        return namespace + ':' + subtag

    if HydrusText.re_leading_single_colon.search( subtag ) is None:
        return subtag

    return ':' + subtag
2015-12-09 23:16:41 +00:00
2021-04-07 21:26:45 +00:00
def ConvertTagSliceToString( tag_slice ):
    """
    Return a human-readable description of a tag slice.

    '' and ':' get fixed descriptions, a slice whose only colon is its last
    character is shown as a quoted namespace, and anything else is returned
    unchanged.
    """
    if tag_slice == '':
        return 'unnamespaced tags'

    if tag_slice == ':':
        return 'namespaced tags'

    is_single_namespace = tag_slice.endswith( ':' ) and tag_slice.count( ':' ) == 1

    if is_single_namespace:
        return "'" + tag_slice[ : -1 ] + "' tags"

    return tag_slice
2021-11-24 21:59:58 +00:00
def IsUnnamespaced( tag ):
    """Return True when SplitTag reports an empty namespace for the tag."""
    ( namespace, subtag ) = SplitTag( tag )

    return namespace == ''
2017-02-08 22:27:00 +00:00
def SplitTag( tag ):
    """
    Split a tag into a ( namespace, subtag ) tuple at its first colon.

    A tag with no colon comes back as ( '', tag ). Any further colons stay in
    the subtag.
    """
    ( namespace, separator, subtag ) = tag.partition( ':' )

    if separator == '':
        # no colon at all, so the whole thing is the subtag
        return ( '', tag )

    return ( namespace, subtag )
2017-02-08 22:27:00 +00:00
2020-12-02 22:04:38 +00:00
# stripped out of all tag text by StripTextOfGumpf
NULL_CHARACTER = '\x00'

def StripTextOfGumpf( t ):
    """
    Strip unwanted characters from a piece of tag text and return the result.

    In order: newlines are removed, runs of spaces become one space, the ends
    are stripped of whitespace, anything matching
    HydrusText.re_leading_space_or_garbage is removed, and null characters
    are deleted.
    """
    without_newlines = HydrusText.re_newlines.sub( '', t )

    single_spaced = HydrusText.re_multiple_spaces.sub( ' ', without_newlines )

    trimmed = HydrusText.re_leading_space_or_garbage.sub( '', single_spaced.strip() )

    # replace is a no-op when there are no null characters present
    return trimmed.replace( NULL_CHARACTER, '' )
2020-03-04 22:12:53 +00:00
def TagOK( t ):
    """
    Return True if the text cleans to a non-empty tag, otherwise False.

    Any ordinary exception from CleanTag or CheckTagNotEmpty counts as 'not ok'.
    """
    try:
        CheckTagNotEmpty( CleanTag( t ) )

        return True

    except Exception:
        # deliberately not a bare except: KeyboardInterrupt and SystemExit should
        # propagate rather than be reported as a bad tag
        return False
2021-04-07 21:26:45 +00:00
class TagFilter( HydrusSerialisable.SerialisableBase ):
    """
    A serialisable collection of whitelist/blacklist rules keyed on tag slices.

    A tag slice is one of:
        ''        - all unnamespaced tags
        ':'       - all namespaced tags
        'series:' - every tag in one namespace (exactly one colon, at the end)
        otherwise - one specific tag

    A tag is permitted unless a blacklist rule covers it and no whitelist rule
    does. Public methods take self._lock around their work; the
    underscore-prefixed helpers do not take it themselves.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_TAG_FILTER
    SERIALISABLE_NAME = 'Tag Filter Rules'
    SERIALISABLE_VERSION = 1

    def __init__( self ):
        HydrusSerialisable.SerialisableBase.__init__( self )

        self._lock = threading.Lock()

        # the authoritative rules: tag slice -> HC.FILTER_WHITELIST or HC.FILTER_BLACKLIST
        self._tag_slices_to_rules = {}

        # everything below is a lookup cache that _UpdateRuleCache rebuilds from the rules
        self._all_unnamespaced_whitelisted = False
        self._all_namespaced_whitelisted = False
        self._namespaces_whitelist = set()
        self._tags_whitelist = set()

        self._all_unnamespaced_blacklisted = False
        self._all_namespaced_blacklisted = False
        self._namespaces_blacklist = set()
        self._tags_blacklist = set()

        # flags that let _TagOK skip whole classes of test when no rule of that kind exists
        self._namespaced_interesting = False
        self._tags_interesting = False

    def __eq__( self, other ):
        # two filters are equal when they hold the same rules; the caches are derived data
        # note that defining __eq__ without __hash__ leaves instances unhashable
        if isinstance( other, TagFilter ):
            return self._tag_slices_to_rules == other._tag_slices_to_rules

        return NotImplemented

    def _IterateTagSlices( self, tag, apply_unnamespaced_rules_to_namespaced_tags ):
        """
        Yield every tag slice whose rule could apply to the tag, most specific first.

        This belongs to the older iteration-based check; _TagOK now hardcodes
        the same tests and does not call it.
        """
        # this guy gets called a lot, so we are making it an iterator

        yield tag

        ( namespace, subtag ) = SplitTag( tag )

        if tag != subtag and apply_unnamespaced_rules_to_namespaced_tags:
            yield subtag

        if namespace != '':
            yield '{}:'.format( namespace )
            yield ':'
        else:
            yield ''

    def _GetSerialisableInfo( self ):
        # the rules as a list of ( tag_slice, rule ) pairs
        return list( self._tag_slices_to_rules.items() )

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        self._tag_slices_to_rules = dict( serialisable_info )

        self._UpdateRuleCache()

    def _GetSortedBlacklistAndWhitelist( self ):
        """
        Return ( blacklist, whitelist ), each a sorted list of tag slices.

        Does not take the lock, so call it with the lock held.
        """
        blacklist = sorted( tag_slice for ( tag_slice, rule ) in self._tag_slices_to_rules.items() if rule == HC.FILTER_BLACKLIST )
        whitelist = sorted( tag_slice for ( tag_slice, rule ) in self._tag_slices_to_rules.items() if rule == HC.FILTER_WHITELIST )

        return ( blacklist, whitelist )

    @staticmethod
    def _SlicesToString( tag_slices ):
        # comma-separated human-readable descriptions of the given slices
        return ', '.join( ConvertTagSliceToString( tag_slice ) for tag_slice in tag_slices )

    def _TagOK( self, tag, apply_unnamespaced_rules_to_namespaced_tags = False ):
        """
        Return True if the rules permit the tag. The caller should hold the lock.

        Any whitelist rule covering the tag permits it immediately. Failing
        that, any blacklist rule covering it forbids it. A tag that no rule
        covers is permitted.

        When apply_unnamespaced_rules_to_namespaced_tags is set, specific-tag
        rules are also tested against the bare subtag of a namespaced tag.
        """
        # since this is called a whole bunch and overhead piles up, we are now splaying the logic out to hardcoded tests
        # this is intended to give the same answer as walking _IterateTagSlices and looking each slice up in the rules

        blacklist_encountered = False

        if self._tags_interesting:
            if tag in self._tags_whitelist:
                return True

            if tag in self._tags_blacklist:
                blacklist_encountered = True

        if self._namespaced_interesting or apply_unnamespaced_rules_to_namespaced_tags:
            ( namespace, subtag ) = SplitTag( tag )

            if apply_unnamespaced_rules_to_namespaced_tags and self._tags_interesting and subtag != tag:
                if subtag in self._tags_whitelist:
                    return True

                if subtag in self._tags_blacklist:
                    blacklist_encountered = True

            if self._namespaced_interesting:
                if namespace == '':
                    if self._all_unnamespaced_whitelisted:
                        return True

                    if self._all_unnamespaced_blacklisted:
                        blacklist_encountered = True

                else:
                    if self._all_namespaced_whitelisted or namespace in self._namespaces_whitelist:
                        return True

                    if self._all_namespaced_blacklisted or namespace in self._namespaces_blacklist:
                        blacklist_encountered = True

        # a rule against and no exceptions means no; nothing for or against means yes
        return not blacklist_encountered

    def _UpdateRuleCache( self ):
        """
        Rebuild the lookup caches from self._tag_slices_to_rules.

        Any rule value other than HC.FILTER_WHITELIST is cached as a blacklist rule.
        """
        self._all_unnamespaced_whitelisted = False
        self._all_namespaced_whitelisted = False
        self._namespaces_whitelist = set()
        self._tags_whitelist = set()

        self._all_unnamespaced_blacklisted = False
        self._all_namespaced_blacklisted = False
        self._namespaces_blacklist = set()
        self._tags_blacklist = set()

        self._namespaced_interesting = False
        self._tags_interesting = False

        for ( tag_slice, rule ) in self._tag_slices_to_rules.items():
            is_whitelist = rule == HC.FILTER_WHITELIST

            if tag_slice == '':
                if is_whitelist:
                    self._all_unnamespaced_whitelisted = True
                else:
                    self._all_unnamespaced_blacklisted = True

                self._namespaced_interesting = True

            elif tag_slice == ':':
                if is_whitelist:
                    self._all_namespaced_whitelisted = True
                else:
                    self._all_namespaced_blacklisted = True

                self._namespaced_interesting = True

            elif tag_slice.count( ':' ) == 1 and tag_slice.endswith( ':' ):
                # a single-namespace slice, cached without its trailing colon
                if is_whitelist:
                    self._namespaces_whitelist.add( tag_slice[:-1] )
                else:
                    self._namespaces_blacklist.add( tag_slice[:-1] )

                self._namespaced_interesting = True

            else:
                if is_whitelist:
                    self._tags_whitelist.add( tag_slice )
                else:
                    self._tags_blacklist.add( tag_slice )

                self._tags_interesting = True

    def AllowsEverything( self ):
        """Return True when no rule is a blacklist rule."""
        with self._lock:
            return not any( rule == HC.FILTER_BLACKLIST for rule in self._tag_slices_to_rules.values() )

    def CleanRules( self ):
        """
        Re-key every rule on the CleanTag-normalised form of its tag slice.

        '' and ':' are kept as they are. Slices that CleanTag cannot parse are
        dropped. If two slices clean to the same text, the one iterated later
        wins.
        """
        # this swaps out the rule dict, so it takes the lock like SetRule does
        with self._lock:
            new_tag_slices_to_rules = {}

            for ( tag_slice, rule ) in self._tag_slices_to_rules.items():
                if tag_slice in ( '', ':' ):
                    pass

                elif tag_slice.count( ':' ) == 1 and tag_slice.endswith( ':' ):
                    # a bare namespace is cleaned by attaching a dummy subtag and then cutting it back off
                    try:
                        clean_example_tag = CleanTag( tag_slice + 'example' )
                    except Exception:
                        continue

                    tag_slice = clean_example_tag[ : -len( 'example' ) ]

                else:
                    try:
                        tag_slice = CleanTag( tag_slice )
                    except Exception:
                        continue

                new_tag_slices_to_rules[ tag_slice ] = rule

            self._tag_slices_to_rules = new_tag_slices_to_rules

            self._UpdateRuleCache()

    def Filter( self, tags, apply_unnamespaced_rules_to_namespaced_tags = False ):
        """Return the set of the given tags that the rules permit."""
        with self._lock:
            return { tag for tag in tags if self._TagOK( tag, apply_unnamespaced_rules_to_namespaced_tags = apply_unnamespaced_rules_to_namespaced_tags ) }

    def GetTagSlicesToRules( self ):
        """Return a shallow copy of the tag slice -> rule dict."""
        with self._lock:
            return dict( self._tag_slices_to_rules )

    def SetRule( self, tag_slice, rule ):
        """Set or replace the rule for one tag slice and rebuild the caches."""
        with self._lock:
            self._tag_slices_to_rules[ tag_slice ] = rule

            self._UpdateRuleCache()

    def TagOK( self, tag, apply_unnamespaced_rules_to_namespaced_tags = False ):
        """Return True if the rules permit the tag. See _TagOK for the precedence."""
        with self._lock:
            return self._TagOK( tag, apply_unnamespaced_rules_to_namespaced_tags = apply_unnamespaced_rules_to_namespaced_tags )

    def ToBlacklistString( self ):
        """Describe the filter in terms of what it blacklists."""
        with self._lock:
            ( blacklist, whitelist ) = self._GetSortedBlacklistAndWhitelist()

            if len( blacklist ) == 0:
                return 'no blacklist set'

            if set( blacklist ) == { '', ':' }:
                text = 'blacklisting on any tags'
            else:
                text = 'blacklisting on ' + self._SlicesToString( blacklist )

            # whitelist exceptions are appended whichever blacklist phrasing was used
            if len( whitelist ) > 0:
                text += ' except ' + self._SlicesToString( whitelist )

            return text

    def ToCensoredString( self ):
        """Describe the filter in terms of which tags are allowed through."""
        with self._lock:
            ( blacklist, whitelist ) = self._GetSortedBlacklistAndWhitelist()

            if len( blacklist ) == 0:
                return 'all tags allowed'

            if set( blacklist ) == { '', ':' }:
                text = 'no tags allowed'
            else:
                text = 'all but ' + self._SlicesToString( blacklist ) + ' allowed'

            # whitelist exceptions are appended whichever blacklist phrasing was used
            if len( whitelist ) > 0:
                text += ' except ' + self._SlicesToString( whitelist )

            return text

    def ToPermittedString( self ):
        """Describe the filter in terms of the set of tags it permits."""
        with self._lock:
            ( blacklist, whitelist ) = self._GetSortedBlacklistAndWhitelist()

            if len( blacklist ) == 0:
                return 'all tags'

            blacklist_set = set( blacklist )
            have_whitelist = len( whitelist ) > 0

            if blacklist_set == { '', ':' }:
                # everything is blacklisted, so only the whitelist gets through
                if have_whitelist:
                    text = 'only ' + self._SlicesToString( whitelist )
                else:
                    text = 'no tags'

            elif blacklist_set == { '' }:
                text = 'all namespaced tags'

                if have_whitelist:
                    text += ' and ' + self._SlicesToString( whitelist )

            elif blacklist_set == { ':' }:
                text = 'all unnamespaced tags'

                if have_whitelist:
                    text += ' and ' + self._SlicesToString( whitelist )

            else:
                text = 'all tags except ' + self._SlicesToString( blacklist )

                if have_whitelist:
                    text += ' (except ' + self._SlicesToString( whitelist ) + ')'

            return text
# register TagFilter under its serialisable type id in HydrusSerialisable's type -> class mapping
# presumably this is what lets stored objects of this type be rebuilt as TagFilter -- verify in HydrusSerialisable
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_TAG_FILTER ] = TagFilter