2023-11-29 22:27:53 +00:00
import collections
import re
2023-07-19 20:38:06 +00:00
import zipfile
2023-09-23 19:21:26 +00:00
from hydrus . core import HydrusConstants as HC
2023-11-29 22:27:53 +00:00
from hydrus . core import HydrusData
from hydrus . core import HydrusExceptions
2023-09-23 19:21:26 +00:00
2023-07-19 20:38:06 +00:00
def ExtractSingleFileFromZip(path_to_zip, filename_to_extract, extract_into_file_path):
    """Extract one member of a zip archive to a file on disk.

    Args:
        path_to_zip: Path of the zip archive to read.
        filename_to_extract: Name of the member inside the archive.
        extract_into_file_path: Destination path; it is created or overwritten.

    Raises:
        zipfile.BadZipFile: If ``path_to_zip`` cannot be parsed as a zip.
        KeyError: If the archive has no member named ``filename_to_extract``.
    """
    copy_chunk_size = 262144

    with zipfile.ZipFile(path_to_zip) as zip_handle:
        # open the member before the destination, so a missing member does not leave an empty output file behind
        with zip_handle.open(filename_to_extract) as reader:
            with open(extract_into_file_path, 'wb') as writer:
                # copy in bounded chunks rather than holding the whole decompressed member in memory
                while True:
                    chunk = reader.read(copy_chunk_size)
                    if not chunk:
                        break
                    writer.write(chunk)
2023-10-18 20:31:50 +00:00
2023-11-29 22:27:53 +00:00
def ExtractCoverPage(path_to_zip, extract_path):
    """Write the cover image of a zip archive out to ``extract_path``.

    The cover is whichever member GetCoverPagePath selects for this archive.

    Args:
        path_to_zip: Path of the zip archive to read.
        extract_path: Destination path; it is created or overwritten.

    Raises:
        HydrusExceptions.DamagedOrUnusualFileException: Propagated from
            GetCoverPagePath when the archive holds no image file.
    """
    copy_chunk_size = 262144

    with zipfile.ZipFile(path_to_zip) as zip_handle:
        path = GetCoverPagePath(zip_handle)

        with zip_handle.open(path) as reader:
            with open(extract_path, 'wb') as writer:
                # copy in bounded chunks rather than holding the whole decompressed image in memory
                while True:
                    chunk = reader.read(copy_chunk_size)
                    if not chunk:
                        break
                    writer.write(chunk)
2023-11-29 22:27:53 +00:00
2023-12-13 22:29:24 +00:00
def GetCoverPagePath(zip_handle: zipfile.ZipFile):
    """Return the in-archive path of the first image file, in human sort order.

    Args:
        zip_handle: An open zip archive.

    Returns:
        The member filename (as stored in the archive) of the first file whose
        extension is in HC.IMAGE_FILE_EXTS.

    Raises:
        HydrusExceptions.DamagedOrUnusualFileException: If no member has an
            image extension.
    """
    # this probably depth-first fails with a crazy multiple-nested-subdirectory structure, but we'll cross that bridge when we come to it
    all_file_paths = [zip_info.filename for zip_info in zip_handle.infolist() if not zip_info.is_dir()]

    # sorts the list in place; the return value is not used
    HydrusData.HumanTextSort(all_file_paths)

    for path in all_file_paths:
        # skip the __MACOSX/ tree, the same way ZipLooksLikeCBZ does (presumably macOS archiver metadata, not real pages)
        if path.startswith('__MACOSX/'):
            continue

        if '.' not in path:
            continue

        # compare case-insensitively: ZipLooksLikeCBZ lower-cases names before the same lookup, so 'PAGE01.JPG' must count as an image here too
        # NOTE(review): assumes HC.IMAGE_FILE_EXTS holds lower-case extensions - confirm
        ext_with_dot = '.' + path.rsplit('.', 1)[-1].lower()

        if ext_with_dot in HC.IMAGE_FILE_EXTS:
            return path

    raise HydrusExceptions.DamagedOrUnusualFileException('Sorry, could not find an image file in there!')
2023-10-18 20:31:50 +00:00
def GetSingleFileFromZipBytes(path_to_zip, path_in_zip):
    """Return the raw bytes of one member of a zip archive.

    Args:
        path_to_zip: Path of the zip archive to read.
        path_in_zip: Name of the member inside the archive.

    Returns:
        The member's decompressed content as ``bytes``.
    """
    # own the ZipFile here so the archive handle is closed as soon as the bytes are read,
    # rather than whenever a path-constructed zipfile.Path gets garbage collected
    with zipfile.ZipFile(path_to_zip) as zip_handle:
        return zipfile.Path(zip_handle, at=path_in_zip).read_bytes()
2023-09-27 21:12:55 +00:00
2023-08-26 19:05:45 +00:00
2023-09-27 21:12:55 +00:00
def GetZipAsPath(path_to_zip, path_in_zip=""):
    """Return a ``zipfile.Path`` pointing into a zip archive.

    Args:
        path_to_zip: Path of the zip archive.
        path_in_zip: Location inside the archive. The default, the empty
            string, addresses the archive root.

    Returns:
        A ``zipfile.Path`` for ``path_in_zip`` within the archive.
    """
    # the default must be exactly '' - any padding would address a (non-existent) member rather than the root
    return zipfile.Path(path_to_zip, at=path_in_zip)
2023-09-27 21:12:55 +00:00
2023-09-23 19:21:26 +00:00
2024-01-03 21:21:53 +00:00
def IsEncryptedZip(path_to_zip):
    """Report whether any entry in a zip archive is flagged as encrypted.

    Args:
        path_to_zip: Path of the zip archive to inspect.

    Returns:
        True if at least one entry has the encrypted bit set in its flag
        bits, otherwise False.

    Raises:
        HydrusExceptions.DamagedOrUnusualFileException: If the archive cannot
            be opened or its entry list cannot be read.
    """
    # bit 0 of an entry's flag bits marks it as encrypted
    ENCRYPTED_FLAG = 0x1

    try:
        with zipfile.ZipFile(path_to_zip) as zip_handle:
            return any(zip_info.flag_bits & ENCRYPTED_FLAG for zip_info in zip_handle.infolist())

    except Exception as e:
        # 'except Exception', not a bare except, so KeyboardInterrupt/SystemExit still propagate untouched
        raise HydrusExceptions.DamagedOrUnusualFileException('Could not open this zip at all!') from e
def filename_has_image_ext(filename: str) -> bool:
    """Return True if ``filename`` ends in an extension listed in HC.IMAGE_FILE_EXTS.

    The extension is everything after the last '.', compared with the dot
    prepended and without any case folding. A name with no '.' has no
    extension and gives False.
    """
    if '.' not in filename:
        return False

    ext_with_dot = '.' + filename.rsplit('.', 1)[-1]

    return ext_with_dot in HC.IMAGE_FILE_EXTS
def filename_has_video_ext(filename: str) -> bool:
    """Return True if ``filename`` ends in an extension listed in HC.VIDEO_FILE_EXTS.

    The extension is everything after the last '.', compared with the dot
    prepended and without any case folding. A name with no '.' has no
    extension and gives False.
    """
    if '.' not in filename:
        return False

    ext_with_dot = '.' + filename.rsplit('.', 1)[-1]

    return ext_with_dot in HC.VIDEO_FILE_EXTS
2023-12-06 22:13:50 +00:00
2023-11-29 22:27:53 +00:00
def ZipLooksLikeCBZ(path_to_zip):
    """Heuristically decide whether a zip archive is a Comic Book Archive.

    Args:
        path_to_zip: Path of the zip archive to inspect.

    Returns:
        True only if all of the following hold, otherwise False:
          * every file is an image, a known metadata filename, or has a
            whitelisted metadata extension (anything under '__MACOSX/' and
            directory entries are ignored);
          * at least two image filenames contain a digit;
          * the cover page (see GetCoverPagePath) can be found and fully read;
          * image filenames within each directory mostly share one template
            once digits are removed;
          * on average there are no more than 5 whitelisted metadata files
            and at least 1 image per directory that has any files in it.
    """
    # TODO: we should probably wangle this away from 'zip' and towards 'archive', but it is fine as a first step

    # what does a Comic Book Archive look like? it is ad-hoc, not rigorous, so be forgiving
    # it is a list of images
    # they may be flat in the base, or they may be in one or more subfolders
    # they may be accompanied by extra metadata files like: md5sum, .sfv/.SFV, .nfo, comicbook.xml, metadata.txt
    # they _cannot_ be accompanied by .exe, .zip, .unitypackage, .psd, or other gubbins
    # nothing else

    directories_to_image_filenames = collections.defaultdict(set)
    directories_with_stuff_in = set()

    num_weird_files = 0
    num_images = 0
    num_images_with_numbers = 0

    num_weird_files_allowed_per_directory = 5
    num_images_needed_per_directory = 1

    # compared against lower-cased filenames
    totally_ok_weird_filenames = {'md5sum', 'comicbook.xml', 'metadata.txt', 'info.txt'}
    weird_filename_extension_whitelist = {'.sfv', '.nfo', '.txt', '.xml', '.json'}

    with zipfile.ZipFile(path_to_zip) as zip_handle:
        for zip_info in zip_handle.infolist():
            if zip_info.is_dir():
                continue

            filename = zip_info.filename

            if filename.startswith('__MACOSX/'):
                continue

            if '/' in filename:
                (directory_path, filename) = filename.rsplit('/', 1)
            else:
                directory_path = ''

            directories_with_stuff_in.add(directory_path)

            filename = filename.lower()

            if filename in totally_ok_weird_filenames:
                continue

            if filename_has_image_ext(filename):
                num_images += 1

                if re.search(r'\d', filename) is not None:
                    num_images_with_numbers += 1

                directories_to_image_filenames[directory_path].add(filename)

                continue

            if '.' not in filename:
                # we out here with a 'gonk' file. not a cbz!
                return False

            ext_with_dot = '.' + filename.rsplit('.', 1)[-1]

            if ext_with_dot not in weird_filename_extension_whitelist:
                # we got ourselves a .mp4 or .unitypackage or whatever. not a cbz!
                return False

            num_weird_files += 1

        # although we want to broadly check there are files with numbers, we don't want to tempt ourselves into searching for 0 or 1. some 'chapters' start at page 55
        # a two-paged comic is the minimum permissible
        if num_images_with_numbers <= 1:
            return False

        # the cover must be locatable and fully readable while the archive is still open
        try:
            path = GetCoverPagePath(zip_handle)

            with zip_handle.open(path) as reader:
                reader.read()

        except Exception:
            # 'except Exception', not a bare except, so KeyboardInterrupt/SystemExit still propagate
            return False

    if len(directories_to_image_filenames) > 0:
        all_percentages = []

        for image_filenames in directories_to_image_filenames.values():
            # ok, so a zip that has fifteen different filename styles is not a cbz
            # one that is all "Coolguy Adventures-c4-p001.jpg" however is!
            # so let's take all the numbers and figure out how commonly the image filenames are templated
            unique_numberless_filenames = {re.sub(r'\d', '', filename) for filename in image_filenames}

            # 0.0 when every filename collapses to a single template, approaching 1.0 when they are all different
            magical_uniqueness_percentage = (len(unique_numberless_filenames) - 1) / len(image_filenames)

            all_percentages.append(magical_uniqueness_percentage)

        average_directory_good = sum(all_percentages) / len(all_percentages)

        # experimentally, I haven't seen this go above 0.103 on a legit cbz
        if average_directory_good > 0.2:
            return False

    # the limits are 'per directory', so scale the limit by the directory count rather than the observed count.
    # scaling the observed count instead made extra directories tighten the weird-file allowance and
    # meant the image minimum could only trip with zero images
    num_directories = len(directories_with_stuff_in)

    if num_weird_files > num_weird_files_allowed_per_directory * num_directories:
        return False

    if num_images < num_images_needed_per_directory * num_directories:
        return False

    return True
2023-09-23 19:21:26 +00:00
def MimeFromOpenDocument(path):
    """Identify an OpenDocument-style zip from its 'mimetype' member.

    Args:
        path: Path of the zip archive to inspect.

    Returns:
        The value HC.mime_enum_lookup maps the 'mimetype' member's text to,
        if that value is in HC.OPEN_DOCUMENT_ZIPS. None in every other case,
        including when the archive or the member cannot be read.
    """
    try:
        # own the ZipFile here so the archive handle is closed straight after the read
        with zipfile.ZipFile(path) as zip_handle:
            mimetype_data = zipfile.Path(zip_handle, at='mimetype').read_text()

        filetype = HC.mime_enum_lookup.get(mimetype_data, None)

        return filetype if filetype in HC.OPEN_DOCUMENT_ZIPS else None

    except Exception:
        # deliberately best-effort: any failure means 'not an OpenDocument file'.
        # 'except Exception', not a bare except, so KeyboardInterrupt/SystemExit still propagate
        return None
2023-09-27 21:12:55 +00:00
2024-03-13 20:33:53 +00:00