hydrus/hydrus/core/HydrusTags.py

314 lines
6.8 KiB
Python

import collections
import os
import re
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusText
def CensorshipMatch( tag, censorships ):
    """Return True if `tag` is matched by at least one rule in `censorships`.

    Each rule is a string in one of these forms:

        ''                  - matches every tag that has no namespace
        ':'                 - matches every tag that has a namespace
        'series:'           - matches every tag in the 'series' namespace
        'series:evangelion' - matches exactly that tag
        'table'             - matches the subtag 'table', namespaced or not

    Returns False when no rule matches (including when `censorships` is empty).
    """
    # The split depends only on the tag, so do it once rather than once per rule.
    ( namespace, subtag ) = SplitTag( tag )
    for censorship in censorships:
        if censorship == '':
            if namespace == '':
                return True
        elif censorship == ':':
            if namespace != '':
                return True
        elif censorship.endswith( ':' ):
            # 'series:' - compare against the rule with its trailing colon removed
            if namespace == censorship[:-1]:
                return True
        elif ':' in censorship:
            # 'series:evangelion' - the whole tag must be identical
            if tag == censorship:
                return True
        elif subtag == censorship:
            # 'table' - plain subtag rule, ignores whatever namespace the tag has
            return True
    return False
def CollapseMultipleSortedNumericTagsToMinMax( tags ):
    """Collapse an already-sorted run of numeric tags to its first and last item.

    The caller wants to present something like 1, 2, 3, 4, 5 as 1-5, so only
    the two endpoints are kept.

    Returns `tags` itself, unchanged, when it holds two or fewer items or when
    any tag is not numeric. Otherwise returns a new two-item list
    [ first, last ].
    """
    if len( tags ) <= 2:
        return tags
    # A tag counts as numeric when it starts with a decimal digit.
    # The earlier test here was isinstance( ConvertTagToSortable( tag ), tuple ),
    # but that function returns a tuple for every input, so the test never
    # rejected anything and non-numeric lists were collapsed as well.
    if any( not tag[:1].isdecimal() for tag in tags ):
        return tags
    # this list of tags is entirely numeric, so cut out everything but the ends
    if not isinstance( tags, list ):
        tags = list( tags )
    return [ tags[0], tags[-1] ]
def ConvertTagToSortable( tag ):
    """Build a sort key that orders tags the way a human would.

    The lower-cased tag is cut into alternating non-digit and ASCII-digit
    chunks. Decimal chunks become ( '', int ) so they compare by value and
    every other chunk becomes ( text, 0 ). This keeps both of:

        0 < 0a < 0b < 1   ( lexicographic comparison )
        2 < 22            ( value comparison )

    Returns a tuple of ( str, int ) pairs.
    """
    # this copies the human sort in hydrusdata
    sortable = []
    for chunk in re.split( '([0-9]+)', tag.lower() ):
        if chunk.isdecimal():
            sortable.append( ( '', int( chunk ) ) )
        else:
            sortable.append( ( chunk, 0 ) )
    return tuple( sortable )
def FilterNamespaces( tags, namespaces ):
    """Return the set of `tags` whose namespace appears in `namespaces`.

    A namespace of None selects the tags that have no namespace, the same as
    asking for ''. Namespaces that no tag uses simply contribute nothing.
    """
    tags_by_namespace = collections.defaultdict( set )
    for tag in tags:
        namespace = SplitTag( tag )[0]
        tags_by_namespace[ namespace ].add( tag )
    result = set()
    for namespace in namespaces:
        wanted = '' if namespace is None else namespace
        # .get so that asking for an unused namespace does not insert an empty bucket
        result.update( tags_by_namespace.get( wanted, () ) )
    return result
def SortNumericTags( tags ):
    """Return a new list of `tags` sorted by their ConvertTagToSortable key.

    The input iterable is not modified.
    """
    return sorted( tags, key = ConvertTagToSortable )
def CheckTagNotEmpty( tag ):
    """Raise if the subtag part of `tag` is the empty string.

    Raises:
        HydrusExceptions.TagSizeException: when the subtag is ''.
    """
    subtag = SplitTag( tag )[1]
    if subtag != '':
        return
    raise HydrusExceptions.TagSizeException( 'Received a zero-length tag!' )
def CleanTag( tag ):
    """Normalise a raw tag string and return the cleaned tag.

    The tag is truncated to 1024 characters and lower-cased, a leading single
    colon is doubled, and the text is passed through StripTextOfGumpf. A tag
    that contains a colon is split into namespace and subtag, each part is
    stripped separately, and the two are recombined.

    Raises:
        Exception: if `tag` is None or any step fails. The message names the
            tag and the underlying error, and the underlying error is chained
            as the cause so its traceback is not lost.
    """
    try:
        if tag is None:
            raise Exception()
        tag = tag[:1024]
        tag = tag.lower()
        # Convert anything starting with one colon to start with two i.e. :D -> ::D
        tag = HydrusText.re_leading_single_colon.sub( '::', tag )
        if ':' in tag:
            tag = StripTextOfGumpf( tag ) # need to repeat here to catch 'system:' stuff
            ( namespace, subtag ) = SplitTag( tag )
            namespace = StripTextOfGumpf( namespace )
            subtag = StripTextOfGumpf( subtag )
            tag = CombineTag( namespace, subtag )
        else:
            tag = StripTextOfGumpf( tag )
    except Exception as e:
        text = 'Was unable to parse the tag: ' + str( tag )
        text += os.linesep * 2
        text += str( e )
        # chain the original error instead of discarding it
        raise Exception( text ) from e
    return tag
def CleanTags( tags ):
    """Clean every tag in `tags` and return the non-empty results as a set.

    None entries are skipped, and so is any tag that CheckTagNotEmpty rejects
    after cleaning. Errors raised by CleanTag itself are not caught here.
    """
    cleaned = set()
    for raw_tag in tags:
        if raw_tag is None:
            continue
        candidate = CleanTag( raw_tag )
        try:
            CheckTagNotEmpty( candidate )
        except HydrusExceptions.TagSizeException:
            # cleaned down to nothing, so leave it out
            pass
        else:
            cleaned.add( candidate )
    return cleaned
def CombineTag( namespace, subtag ):
    """Join a namespace and a subtag back into a single tag string.

    With a non-empty namespace the result is 'namespace:subtag'. With an
    empty namespace the subtag is returned as is, unless it matches
    HydrusText.re_leading_single_colon, in which case one extra ':' is
    prepended.
    """
    if namespace != '':
        return namespace + ':' + subtag
    has_leading_colon = HydrusText.re_leading_single_colon.search( subtag ) is not None
    return ':' + subtag if has_leading_colon else subtag
def SplitTag( tag ):
    """Split a tag into a ( namespace, subtag ) tuple at its first colon.

    A tag without any colon has the empty namespace ''. Only the first colon
    separates, so later colons stay in the subtag.
    """
    if ':' not in tag:
        return ( '', tag )
    ( namespace, subtag ) = tag.split( ':', 1 )
    return ( namespace, subtag )
# The NUL character ( code point 0 ); StripTextOfGumpf removes every occurrence of it from text.
NULL_CHARACTER = '\x00'
def StripTextOfGumpf( t ):
    """Return `t` with newlines, repeated spaces, surrounding whitespace and null characters removed.

    In order: HydrusText.re_newlines matches are deleted, each
    HydrusText.re_multiple_spaces match becomes a single space, the text is
    stripped, HydrusText.re_leading_space_or_garbage matches are deleted, and
    finally every NULL_CHARACTER is removed.
    """
    without_newlines = HydrusText.re_newlines.sub( '', t )
    single_spaced = HydrusText.re_multiple_spaces.sub( ' ', without_newlines )
    trimmed = HydrusText.re_leading_space_or_garbage.sub( '', single_spaced.strip() )
    # replace is a no-op when there is nothing to remove, so no membership test is needed
    return trimmed.replace( NULL_CHARACTER, '' )
def TagOK( t ):
    """Return True if `t` cleans to a non-empty tag, False otherwise.

    Any error raised while cleaning or checking the tag counts as "not ok".
    """
    try:
        CheckTagNotEmpty( CleanTag( t ) )
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a bad tag
        return False
    return True