hydrus/hydrus/core/files/HydrusArchiveHandling.py

311 lines
8.8 KiB
Python
Raw Normal View History

2023-11-29 22:27:53 +00:00
import collections
import re
2023-07-19 20:38:06 +00:00
import zipfile
from hydrus.core import HydrusConstants as HC
2023-11-29 22:27:53 +00:00
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
2023-07-19 20:38:06 +00:00
def ExtractSingleFileFromZip( path_to_zip, filename_to_extract, extract_into_file_path ):
    """Copy one named member of a zip archive out to a standalone file on disk."""
    
    with zipfile.ZipFile( path_to_zip ) as zip_handle:
        
        with zip_handle.open( filename_to_extract ) as reader, open( extract_into_file_path, 'wb' ) as writer:
            
            writer.write( reader.read() )
            
        
    
2023-10-18 20:31:50 +00:00
2023-11-29 22:27:53 +00:00
def ExtractCoverPage( path_to_zip, extract_path ):
    """Locate the cover image inside the archive and write its bytes to extract_path."""
    
    with zipfile.ZipFile( path_to_zip ) as zip_handle:
        
        cover_path = GetCoverPagePath( zip_handle )
        
        with zip_handle.open( cover_path ) as reader, open( extract_path, 'wb' ) as writer:
            
            writer.write( reader.read() )
            
        
    
2023-11-29 22:27:53 +00:00
2023-12-13 22:29:24 +00:00
def GetCoverPagePath( zip_handle: zipfile.ZipFile ):
    """Return the archive path of the first image member, in human sort order, to serve as the cover page.
    
    Raises HydrusExceptions.DamagedOrUnusualFileException if no image member is found.
    """
    
    # this probably depth-first fails with a crazy multiple-nested-subdirectory structure, but we'll cross that bridge when we come to it
    candidate_paths = [ info.filename for info in zip_handle.infolist() if not info.is_dir() ]
    
    HydrusData.HumanTextSort( candidate_paths )
    
    for candidate_path in candidate_paths:
        
        # skip macOS resource-fork junk
        if candidate_path.startswith( '__MACOSX/' ):
            
            continue
            
        
        ( stem, dot, ext ) = candidate_path.rpartition( '.' )
        
        if dot == '.' and '.' + ext in HC.IMAGE_FILE_EXTS:
            
            return candidate_path
            
        
    
    raise HydrusExceptions.DamagedOrUnusualFileException( 'Sorry, could not find an image file in there!' )
    
2023-10-18 20:31:50 +00:00
def GetSingleFileFromZipBytes( path_to_zip, path_in_zip ):
    """Return the raw bytes of a single member inside the given zip."""
    
    zip_path_object = GetZipAsPath( path_to_zip, path_in_zip = path_in_zip )
    
    return zip_path_object.read_bytes()
    
2023-09-27 21:12:55 +00:00
2023-09-27 21:12:55 +00:00
def GetZipAsPath( path_to_zip, path_in_zip = "" ):
    """Wrap a zip on disk in a pathlib-like zipfile.Path, optionally rooted at a member path."""
    
    return zipfile.Path( path_to_zip, at = path_in_zip )
    
2023-09-27 21:12:55 +00:00
2024-01-03 21:21:53 +00:00
def IsEncryptedZip( path_to_zip ):
    """Return True if any member of the zip has its encryption flag set.
    
    Raises HydrusExceptions.DamagedOrUnusualFileException if the file cannot be
    opened as a zip at all.
    """
    
    # bit 0 of a zip member's general purpose flag marks encryption (PKWARE APPNOTE 4.4.4)
    ENCRYPTED_FLAG = 0x1
    
    try:
        
        with zipfile.ZipFile( path_to_zip ) as zip_handle:
            
            for zip_info in zip_handle.infolist():
                
                if zip_info.flag_bits & ENCRYPTED_FLAG:
                    
                    return True
                    
                
            
            return False
            
        
    except Exception as e:
        
        # narrowed from a bare 'except:', which would also have converted
        # KeyboardInterrupt/SystemExit into a file error; chain the cause so it is not lost
        raise HydrusExceptions.DamagedOrUnusualFileException( 'Could not open this zip at all!' ) from e
        
    
def filename_has_image_ext( filename: str ):
    """Return True if the filename's final extension is in HC.IMAGE_FILE_EXTS.
    
    NOTE(review): the comparison is case-sensitive; callers in this file lowercase
    filenames before calling.
    """
    
    ( stem, dot, ext ) = filename.rpartition( '.' )
    
    return dot == '.' and '.' + ext in HC.IMAGE_FILE_EXTS
    
def filename_has_video_ext( filename: str ):
    """Return True if the filename's final extension is in HC.VIDEO_FILE_EXTS.
    
    NOTE(review): the comparison is case-sensitive; callers in this file lowercase
    filenames before calling.
    """
    
    ( stem, dot, ext ) = filename.rpartition( '.' )
    
    return dot == '.' and '.' + ext in HC.VIDEO_FILE_EXTS
    
2023-12-06 22:13:50 +00:00
2023-11-29 22:27:53 +00:00
def ZipLooksLikeCBZ( path_to_zip ):
    """Guess whether the given zip is a Comic Book Archive (cbz).
    
    The cbz 'format' is ad-hoc, not rigorous, so be forgiving: it is a list of
    numbered images, flat in the base or in one or more subfolders, possibly
    accompanied by a few metadata files (md5sum, .sfv/.SFV, .nfo, comicbook.xml,
    metadata.txt). It _cannot_ contain .exe, .zip, .unitypackage, .psd, or other
    gubbins. Returns True/False.
    """
    # TODO: we should probably wangle this away from 'zip' and towards 'archive', but it is fine as a first step
    
    directories_to_image_filenames = collections.defaultdict( set )
    
    directories_with_stuff_in = set()
    
    num_weird_files = 0
    num_images = 0
    num_images_with_numbers = 0
    
    num_weird_files_allowed_per_directory = 5
    num_images_needed_per_directory = 1
    
    totally_ok_weird_filenames = { 'md5sum', 'comicbook.xml', 'metadata.txt', 'info.txt' }
    weird_filename_extension_whitelist = { '.sfv', '.nfo', '.txt', '.xml', '.json' }
    
    with zipfile.ZipFile( path_to_zip ) as zip_handle:
        
        for zip_info in zip_handle.infolist():
            
            if zip_info.is_dir():
                
                continue
                
            
            filename = zip_info.filename
            
            # ignore macOS resource-fork junk
            if filename.startswith( '__MACOSX/' ):
                
                continue
                
            
            if '/' in filename:
                
                directory_path = '/'.join( filename.split( '/' )[:-1] )
                
                filename = filename.split( '/' )[-1]
                
            else:
                
                directory_path = ''
                
            
            directories_with_stuff_in.add( directory_path )
            
            filename = filename.lower()
            
            if filename in totally_ok_weird_filenames:
                
                continue
                
            
            if filename_has_image_ext( filename ):
                
                num_images += 1
                
                if re.search( r'\d', filename ) is not None:
                    
                    num_images_with_numbers += 1
                    
                
                directories_to_image_filenames[ directory_path ].add( filename )
                
                continue
                
            else:
                
                if '.' in filename:
                    
                    ext_with_dot = '.' + filename.split( '.' )[-1]
                    
                    if ext_with_dot in weird_filename_extension_whitelist:
                        
                        num_weird_files += 1
                        
                    else:
                        
                        # we got ourselves a .mp4 or .unitypackage or whatever. not a cbz!
                        
                        return False
                        
                    
                else:
                    
                    # we out here with a 'gonk' file. not a cbz!
                    
                    return False
                    
                
            
        
        # although we want to broadly check there are files with numbers, we don't want tempt ourselves into searching for 0 or 1. some 'chapters' start at page 55
        # a two-paged comic is the minimum permissible
        if num_images_with_numbers <= 1:
            
            return False
            
        
        # make sure we can actually find and read a cover page out of this thing
        try:
            
            path = GetCoverPagePath( zip_handle )
            
            with zip_handle.open( path ) as reader:
                
                reader.read()
                
            
        except:
            
            return False
            
        
        if len( directories_to_image_filenames ) > 0:
            
            directories_to_looks_good_scores = {}
            
            for ( directory_path, image_filenames ) in directories_to_image_filenames.items():
                
                # ok, so a zip that has fifteen different filename styles is not a cbz
                # one that is all "Coolguy Adventures-c4-p001.jpg" however is!
                # so let's take all the numbers and figure out how commonly the image filenames are templated
                
                unique_numberless_filenames = { re.sub( r'\d', '', filename ) for filename in image_filenames }
                
                magical_uniqueness_percentage = ( len( unique_numberless_filenames ) - 1 ) / len( image_filenames )
                
                directories_to_looks_good_scores[ directory_path ] = magical_uniqueness_percentage
                
            
            all_percentages = list( directories_to_looks_good_scores.values() )
            
            average_directory_good = sum( all_percentages ) / len( all_percentages )
            
            # experimentally, I haven't seen this go above 0.103 on a legit cbz
            if average_directory_good > 0.2:
                
                return False
                
            
        
        # BUGFIX: these thresholds are per-directory, so the allowance must scale
        # with the directory count; previously the counts themselves were multiplied
        # by len( directories_with_stuff_in ) and compared against the bare
        # per-directory constant, which made the weird-file check absurdly strict
        # for multi-directory zips and the minimum-image check a no-op
        if num_weird_files > num_weird_files_allowed_per_directory * len( directories_with_stuff_in ):
            
            return False
            
        
        if num_images < num_images_needed_per_directory * len( directories_with_stuff_in ):
            
            return False
            
        
        return True
        
    
def MimeFromOpenDocument( path ):
    """Return the HC mime enum for an OpenDocument zip, or None if it is not one.
    
    OpenDocument files are zips with a 'mimetype' member whose text names the
    document type; we only report mimes listed in HC.OPEN_DOCUMENT_ZIPS.
    """
    
    try:
        
        mimetype_data = GetZipAsPath( path, 'mimetype' ).read_text()
        
        filetype = HC.mime_enum_lookup.get( mimetype_data, None )
        
        return filetype if filetype in HC.OPEN_DOCUMENT_ZIPS else None
        
    except Exception:
        
        # deliberate best-effort: any failure to open/read just means 'not an open document'
        # (narrowed from a bare 'except:' so KeyboardInterrupt and co. still propagate)
        return None
        
    
2023-09-27 21:12:55 +00:00
2024-03-13 20:33:53 +00:00