hydrus/hydrus/core/files/HydrusArchiveHandling.py

311 lines
8.8 KiB
Python
Raw Normal View History

2023-11-29 22:27:53 +00:00
import collections
import re
2023-07-19 20:38:06 +00:00
import zipfile
from hydrus.core import HydrusConstants as HC
2023-11-29 22:27:53 +00:00
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
2023-07-19 20:38:06 +00:00
def ExtractSingleFileFromZip( path_to_zip, filename_to_extract, extract_into_file_path ):
    """Copy one named member of a zip archive out to a standalone file on disk."""
    
    with zipfile.ZipFile( path_to_zip ) as zip_handle:
        
        with zip_handle.open( filename_to_extract ) as reader, open( extract_into_file_path, 'wb' ) as writer:
            
            writer.write( reader.read() )
            
        
    
2023-10-18 20:31:50 +00:00
2023-11-29 22:27:53 +00:00
def ExtractCoverPage( path_to_zip, extract_path ):
    """Locate the cover image inside the archive and write its bytes to extract_path."""
    
    with zipfile.ZipFile( path_to_zip ) as zip_handle:
        
        cover_path = GetCoverPagePath( zip_handle )
        
        with zip_handle.open( cover_path ) as reader, open( extract_path, 'wb' ) as writer:
            
            writer.write( reader.read() )
            
        
    
2023-11-29 22:27:53 +00:00
2023-12-13 22:29:24 +00:00
def GetCoverPagePath( zip_handle: zipfile.ZipFile ):
    """Return the archive path of the first image member, in human sort order, to serve as the cover page.
    
    Raises HydrusExceptions.DamagedOrUnusualFileException if no image member is found.
    """
    
    # this probably depth-first fails with a crazy multiple-nested-subdirectory structure, but we'll cross that bridge when we come to it
    candidate_paths = [ info.filename for info in zip_handle.infolist() if not info.is_dir() ]
    
    HydrusData.HumanTextSort( candidate_paths )
    
    for candidate_path in candidate_paths:
        
        # skip macOS resource-fork junk
        if candidate_path.startswith( '__MACOSX/' ):
            
            continue
            
        
        ( stem, dot, ext ) = candidate_path.rpartition( '.' )
        
        if dot == '.' and '.' + ext in HC.IMAGE_FILE_EXTS:
            
            return candidate_path
            
        
    
    raise HydrusExceptions.DamagedOrUnusualFileException( 'Sorry, could not find an image file in there!' )
    
2023-10-18 20:31:50 +00:00
def GetSingleFileFromZipBytes( path_to_zip, path_in_zip ):
    """Return the raw bytes of a single member inside the given zip."""
    
    zip_path_object = GetZipAsPath( path_to_zip, path_in_zip = path_in_zip )
    
    return zip_path_object.read_bytes()
    
2023-09-27 21:12:55 +00:00
2023-09-27 21:12:55 +00:00
def GetZipAsPath( path_to_zip, path_in_zip = "" ):
    """Wrap a zip on disk in a pathlib-like zipfile.Path, optionally rooted at a member path."""
    
    return zipfile.Path( path_to_zip, at = path_in_zip )
    
2023-09-27 21:12:55 +00:00
2024-01-03 21:21:53 +00:00
def IsEncryptedZip( path_to_zip ):
    """Return True if any member of the zip has its encryption flag set.
    
    Raises HydrusExceptions.DamagedOrUnusualFileException if the file cannot be
    opened as a zip at all.
    """
    
    # bit 0 of a zip member's general purpose flag marks encryption (PKWARE APPNOTE 4.4.4)
    ENCRYPTED_FLAG = 0x1
    
    try:
        
        with zipfile.ZipFile( path_to_zip ) as zip_handle:
            
            for zip_info in zip_handle.infolist():
                
                if zip_info.flag_bits & ENCRYPTED_FLAG:
                    
                    return True
                    
                
            
            return False
            
        
    except Exception as e:
        
        # narrowed from a bare 'except:', which would also have converted
        # KeyboardInterrupt/SystemExit into a file error; chain the cause so it is not lost
        raise HydrusExceptions.DamagedOrUnusualFileException( 'Could not open this zip at all!' ) from e
        
    
def filename_has_image_ext( filename: str ):
    """Return True if the filename's final extension is in HC.IMAGE_FILE_EXTS.
    
    NOTE(review): the comparison is case-sensitive; callers in this file lowercase
    filenames before calling.
    """
    
    ( stem, dot, ext ) = filename.rpartition( '.' )
    
    return dot == '.' and '.' + ext in HC.IMAGE_FILE_EXTS
    
def filename_has_video_ext( filename: str ):
    """Return True if the filename's final extension is in HC.VIDEO_FILE_EXTS.
    
    NOTE(review): the comparison is case-sensitive; callers in this file lowercase
    filenames before calling.
    """
    
    ( stem, dot, ext ) = filename.rpartition( '.' )
    
    return dot == '.' and '.' + ext in HC.VIDEO_FILE_EXTS
    
2023-12-06 22:13:50 +00:00
2023-11-29 22:27:53 +00:00
def ZipLooksLikeCBZ( path_to_zip ):
    """Guess whether the given zip is a Comic Book Archive (cbz).
    
    The cbz 'format' is ad-hoc, not rigorous, so be forgiving: it is a list of
    numbered images, flat in the base or in one or more subfolders, possibly
    accompanied by a few metadata files (md5sum, .sfv/.SFV, .nfo, comicbook.xml,
    metadata.txt). It _cannot_ contain .exe, .zip, .unitypackage, .psd, or other
    gubbins. Returns True/False.
    """
    # TODO: we should probably wangle this away from 'zip' and towards 'archive', but it is fine as a first step
    
    directories_to_image_filenames = collections.defaultdict( set )
    
    directories_with_stuff_in = set()
    
    num_weird_files = 0
    num_images = 0
    num_images_with_numbers = 0
    
    num_weird_files_allowed_per_directory = 5
    num_images_needed_per_directory = 1
    
    totally_ok_weird_filenames = { 'md5sum', 'comicbook.xml', 'metadata.txt', 'info.txt' }
    weird_filename_extension_whitelist = { '.sfv', '.nfo', '.txt', '.xml', '.json' }
    
    with zipfile.ZipFile( path_to_zip ) as zip_handle:
        
        for zip_info in zip_handle.infolist():
            
            if zip_info.is_dir():
                
                continue
                
            
            filename = zip_info.filename
            
            # ignore macOS resource-fork junk
            if filename.startswith( '__MACOSX/' ):
                
                continue
                
            
            if '/' in filename:
                
                directory_path = '/'.join( filename.split( '/' )[:-1] )
                
                filename = filename.split( '/' )[-1]
                
            else:
                
                directory_path = ''
                
            
            directories_with_stuff_in.add( directory_path )
            
            filename = filename.lower()
            
            if filename in totally_ok_weird_filenames:
                
                continue
                
            
            if filename_has_image_ext( filename ):
                
                num_images += 1
                
                if re.search( r'\d', filename ) is not None:
                    
                    num_images_with_numbers += 1
                    
                
                directories_to_image_filenames[ directory_path ].add( filename )
                
                continue
                
            else:
                
                if '.' in filename:
                    
                    ext_with_dot = '.' + filename.split( '.' )[-1]
                    
                    if ext_with_dot in weird_filename_extension_whitelist:
                        
                        num_weird_files += 1
                        
                    else:
                        
                        # we got ourselves a .mp4 or .unitypackage or whatever. not a cbz!
                        
                        return False
                        
                    
                else:
                    
                    # we out here with a 'gonk' file. not a cbz!
                    
                    return False
                    
                
            
        
        # although we want to broadly check there are files with numbers, we don't want tempt ourselves into searching for 0 or 1. some 'chapters' start at page 55
        # a two-paged comic is the minimum permissible
        if num_images_with_numbers <= 1:
            
            return False
            
        
        # make sure we can actually find and read a cover page out of this thing
        try:
            
            path = GetCoverPagePath( zip_handle )
            
            with zip_handle.open( path ) as reader:
                
                reader.read()
                
            
        except:
            
            return False
            
        
        if len( directories_to_image_filenames ) > 0:
            
            directories_to_looks_good_scores = {}
            
            for ( directory_path, image_filenames ) in directories_to_image_filenames.items():
                
                # ok, so a zip that has fifteen different filename styles is not a cbz
                # one that is all "Coolguy Adventures-c4-p001.jpg" however is!
                # so let's take all the numbers and figure out how commonly the image filenames are templated
                
                unique_numberless_filenames = { re.sub( r'\d', '', filename ) for filename in image_filenames }
                
                magical_uniqueness_percentage = ( len( unique_numberless_filenames ) - 1 ) / len( image_filenames )
                
                directories_to_looks_good_scores[ directory_path ] = magical_uniqueness_percentage
                
            
            all_percentages = list( directories_to_looks_good_scores.values() )
            
            average_directory_good = sum( all_percentages ) / len( all_percentages )
            
            # experimentally, I haven't seen this go above 0.103 on a legit cbz
            if average_directory_good > 0.2:
                
                return False
                
            
        
        # BUGFIX: these thresholds are per-directory, so the allowance must scale
        # with the directory count; previously the counts themselves were multiplied
        # by len( directories_with_stuff_in ) and compared against the bare
        # per-directory constant, which made the weird-file check absurdly strict
        # for multi-directory zips and the minimum-image check a no-op
        if num_weird_files > num_weird_files_allowed_per_directory * len( directories_with_stuff_in ):
            
            return False
            
        
        if num_images < num_images_needed_per_directory * len( directories_with_stuff_in ):
            
            return False
            
        
        return True
        
    
def MimeFromOpenDocument( path ):
    """Return the HC mime enum for an OpenDocument zip, or None if it is not one.
    
    OpenDocument files are zips with a 'mimetype' member whose text names the
    document type; we only report mimes listed in HC.OPEN_DOCUMENT_ZIPS.
    """
    
    try:
        
        mimetype_data = GetZipAsPath( path, 'mimetype' ).read_text()
        
        filetype = HC.mime_enum_lookup.get( mimetype_data, None )
        
        return filetype if filetype in HC.OPEN_DOCUMENT_ZIPS else None
        
    except Exception:
        
        # deliberate best-effort: any failure to open/read just means 'not an open document'
        # (narrowed from a bare 'except:' so KeyboardInterrupt and co. still propagate)
        return None
        
    
2023-09-27 21:12:55 +00:00
2024-03-13 20:33:53 +00:00