hydrus/include/HydrusTags.py

256 lines
5.7 KiB
Python
Raw Normal View History

2013-07-17 20:56:13 +00:00
import collections
import HydrusConstants as HC
import itertools
import os
import threading
import time
import traceback
2015-03-25 22:04:19 +00:00
import HydrusData
import HydrusExceptions
import re
2017-05-10 21:33:58 +00:00
import HydrusGlobals as HG
2013-07-17 20:56:13 +00:00
2017-04-05 21:16:40 +00:00
# Precompiled patterns used by the tag cleaning functions in this module.

# A single carriage return or line feed character.
re_newlines = re.compile( '[\r\n]', re.UNICODE )
# A run of one or more whitespace characters.
re_multiple_spaces = re.compile( '\\s+', re.UNICODE )
# One whitespace character at the end of the text.
re_trailing_space = re.compile( '\\s$', re.UNICODE )
# At the start of the text: one whitespace character, one hyphen, or the literal 'system:'.
re_leading_space_or_garbage = re.compile( '^(\\s|-|system:)', re.UNICODE )
# A colon at the start of the text that is not followed by a second colon.
re_leading_single_colon = re.compile( '^:(?!:)', re.UNICODE )
2016-04-06 19:52:45 +00:00
def CensorshipMatch( tag, censorships ):
    """Return True if tag is matched by at least one rule in censorships.

    Each rule is a string, interpreted by its shape:
      ''           - matches every tag that has no namespace
      ':'          - matches every tag that has a namespace
      'series:'    - matches every tag in the 'series' namespace
      'series:eva' - matches only that exact tag
      'table'      - matches 'table' and any namespaced tag whose subtag is 'table'

    Returns False when no rule matches (including when censorships is empty).
    """
    
    for rule in censorships:
        
        if rule == '':
            
            # '' - all non namespaced tags
            ( namespace, subtag ) = SplitTag( tag )
            
            matched = namespace == ''
            
        elif rule == ':':
            
            # ':' - all namespaced tags
            ( namespace, subtag ) = SplitTag( tag )
            
            matched = namespace != ''
            
        elif ':' not in rule:
            
            # 'table' - normal tag, or namespaced version of same
            ( namespace, subtag ) = SplitTag( tag )
            
            matched = subtag == rule
            
        elif rule.endswith( ':' ):
            
            # 'series:' - every tag in that namespace
            ( namespace, subtag ) = SplitTag( tag )
            
            matched = namespace == rule[:-1]
            
        else:
            
            # 'series:evangelion' - exact match with namespace
            matched = tag == rule
            
        
        if matched:
            
            return True
            
        
    
    return False
2015-02-03 20:40:21 +00:00
def ConvertTagToSortable( t ):
    """Return a sort key for the tag text t.

    We want to maintain that:
      0 < 0a < 0b < 1 ( lexicographic comparison )
      -and-
      2 < 22 ( value comparison )

    So, if t begins with decimal digits, the key is ( int, rest ), where int is
    the value of the leading digit run and rest is everything after it.
    Otherwise the key is t itself, unchanged.
    """
    
    # Count how many leading characters are decimal digits
    digit_count = 0
    
    while digit_count < len( t ) and t[ digit_count ].isdecimal():
        
        digit_count += 1
        
    
    if digit_count == 0:
        
        return t
        
    
    return ( int( t[ :digit_count ] ), t[ digit_count: ] )
2015-02-03 20:40:21 +00:00
2014-11-12 23:33:13 +00:00
def FilterNamespaces( tags, namespaces ):
    """Return the set of tags whose namespace is one of the given namespaces.

    tags is an iterable of tag strings; each is split with SplitTag to find its
    namespace. namespaces is an iterable of namespace strings. A namespace of
    None selects the tags that have no namespace, the same as ''.
    """
    
    # Group the full tags by their namespace so each requested namespace is one lookup
    tags_by_namespace = collections.defaultdict( set )
    
    for tag in tags:
        
        ( namespace, subtag ) = SplitTag( tag )
        
        tags_by_namespace[ namespace ].add( tag )
        
    
    result = set()
    
    for namespace in namespaces:
        
        # None is an alias for the empty namespace; identity test, not equality
        key = '' if namespace is None else namespace
        
        # .get rather than [] so an unseen namespace does not insert an empty set
        result.update( tags_by_namespace.get( key, () ) )
        
    
    return result
2016-04-27 19:20:37 +00:00
def SortNumericTags( tags ):
    """Return a new list of tags ordered by their ConvertTagToSortable key.

    The input iterable is not modified.
    """
    
    return sorted( tags, key = ConvertTagToSortable )
2015-03-25 22:04:19 +00:00
def CheckTagNotEmpty( tag ):
    """Raise HydrusExceptions.SizeException if the subtag part of tag is empty.

    Returns None when the subtag is not empty.
    """
    
    # Only the subtag matters here; a namespace with nothing after it is still empty
    subtag = SplitTag( tag )[1]
    
    if subtag == '':
        
        raise HydrusExceptions.SizeException( 'Received a zero-length tag!' )
2015-03-25 22:04:19 +00:00
2017-04-05 21:16:40 +00:00
2015-03-25 22:04:19 +00:00
def CleanTag( tag ):
    """Return a normalised version of tag.

    The tag is cut to its first 1024 characters, lowercased, passed through
    HydrusData.ToUnicode and then stripped with StripTextOfGumpf. A tag that
    starts with a single colon has that colon doubled. A tag with a colon
    elsewhere is split into namespace and subtag, each part is stripped again,
    and the two are rejoined with CombineTag.

    Raises Exception, with a message naming the tag and the underlying error,
    if any step fails.
    """
    
    try:
        
        # These stay as separate assignments so the error text below reports
        # the tag exactly as it stood when a step failed
        tag = tag[:1024]
        tag = tag.lower()
        tag = HydrusData.ToUnicode( tag )
        
        starts_with_colon = tag.startswith( ':' )
        
        if starts_with_colon:
            
            # Convert anything starting with one colon to start with two i.e. :D -> ::D
            tag = re_leading_single_colon.sub( '::', tag )
            
        
        # Decided before stripping: a colon somewhere other than the very start
        has_namespace = not starts_with_colon and ':' in tag
        
        # Every kind of tag gets this pass; it also removes a leading 'system:'
        tag = StripTextOfGumpf( tag )
        
        if has_namespace:
            
            ( namespace, subtag ) = SplitTag( tag )
            
            tag = CombineTag( StripTextOfGumpf( namespace ), StripTextOfGumpf( subtag ) )
            
        
    except Exception as e:
        
        text = 'Was unable to parse the tag: ' + HydrusData.ToUnicode( tag ) + os.linesep * 2 + HydrusData.ToUnicode( e )
        
        raise Exception( text )
        
    
    return tag
def CleanTags( tags ):
    """Return the set of cleaned tags, leaving out any that are empty after cleaning.

    Each tag is passed through CleanTag; a result that CheckTagNotEmpty rejects
    with HydrusExceptions.SizeException is skipped.
    """
    
    cleaned = set()
    
    for raw_tag in tags:
        
        candidate = CleanTag( raw_tag )
        
        try:
            
            CheckTagNotEmpty( candidate )
            
        except HydrusExceptions.SizeException:
            
            # Nothing left of this tag, so leave it out of the result
            pass
            
        else:
            
            cleaned.add( candidate )
            
        
    
    return cleaned
2017-02-08 22:27:00 +00:00
def CombineTag( namespace, subtag ):
    """Join namespace and subtag into a single tag string.

    With a non-empty namespace the result is 'namespace:subtag'. With an empty
    namespace the result is the subtag alone, except that a subtag starting
    with a colon gets one more colon in front of it.
    """
    
    if namespace != '':
        
        return namespace + ':' + subtag
        
    
    if subtag.startswith( ':' ):
        
        # Extra colon so the leading colon is not taken as a namespace separator
        return ':' + subtag
        
    
    return subtag
2015-12-09 23:16:41 +00:00
2017-02-08 22:27:00 +00:00
def SplitTag( tag ):
    """Split tag at its first colon and return ( namespace, subtag ).

    A tag with no colon has an empty namespace, so the result is ( '', tag ).
    The result is always a 2-tuple, whether or not the tag contains a colon.
    """
    
    ( namespace, separator, subtag ) = tag.partition( ':' )
    
    if separator == '':
        
        # No colon found: partition put the whole tag in the first slot
        return ( '', tag )
        
    
    return ( namespace, subtag )
2017-04-05 21:16:40 +00:00
def StripTextOfGumpf( t ):
    """Return t with newlines, surplus whitespace and leading junk removed.

    In order: carriage returns and line feeds are deleted, each run of
    whitespace becomes a single space, one trailing whitespace character is
    dropped, and then leading whitespace, hyphens and 'system:' prefixes are
    removed repeatedly until none remain at the start.
    """
    
    text = re_newlines.sub( '', t )
    text = re_multiple_spaces.sub( ' ', text )
    text = re_trailing_space.sub( '', text )
    
    # Removing one leading item can expose another, so repeat until the start is clean
    while True:
        
        if re_leading_space_or_garbage.search( text ) is None:
            
            return text
            
        
        text = re_leading_space_or_garbage.sub( '', text )