Improvements to MS Office filetype support (#1528)

* Update icons for MS Office filetypes

* Remove unused old PDF word count code

* Add more strict office file detection

* Remove old HydrusDocumentHandling import

* Add word count for docx and pptx and thumbnails for pptx

* Update filetypes doc

* Update filetype docs with pptx dpi

* Remove prints and clean up formatting

* Center default thumbnails inside target resolution
This commit is contained in:
Paul Friederichsen 2024-03-16 15:35:32 -05:00 committed by GitHub
parent 31211945d9
commit 77ab38e50e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 231 additions and 144 deletions

View File

@ -70,16 +70,16 @@ The filetype for a file can be overridden with `manage -> force filetype` in the
## Applications
| Filetype | Extension | MIME type | Thumbnails | Viewable in Hydrus | Notes |
|----------|-----------|-----------------------------------------------------------------------------| :--------: | :----------------: | ----------------------------------------------------------------- |
| flash | `.swf` | `application/x-shockwave-flash` | ✅ | ❌ | |
| pdf | `.pdf` | `application/pdf` | ✅ | ❌ | 300 DPI assumed for resolution. No thumbnails for encrypted PDFs. |
| epub | `.epub` | `application/epub+zip` | ❌ | ❌ | |
| djvu | `.djvu` | `image/vnd.djvu` | ❌ | ❌ | |
| docx | `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` | ❌ | ❌ | |
| xlsx | `.xlsx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` | ❌ | ❌ | |
| docx | `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` | ❌ | ❌ | |
| rtf | `.rtf` | `application/rtf` | ❌ | ❌ | |
| Filetype | Extension | MIME type | Thumbnails | Viewable in Hydrus | Notes |
| -------- | --------- | --------------------------------------------------------------------------- | :--------: | :----------------: | -------------------------------------------------------------------------- |
| flash | `.swf` | `application/x-shockwave-flash` | ✅ | ❌ | |
| pdf | `.pdf` | `application/pdf` | ✅ | ❌ | 300 DPI assumed for resolution. No thumbnails for encrypted PDFs. |
| epub | `.epub` | `application/epub+zip` | ❌ | ❌ | |
| djvu | `.djvu` | `image/vnd.djvu` | ❌ | ❌ | |
| docx | `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` | ❌ | ❌ | |
| xlsx | `.xlsx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` | ❌ | ❌ | |
| pptx | `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` | ✅ | ❌ | 300 DPI assumed for resolution. Thumbnail only if embedded in the document |
| rtf | `.rtf` | `application/rtf` | ❌ | ❌ | |
## Image Project Files

View File

@ -741,9 +741,9 @@ APPLICATION_DJVU = 72
APPLICATION_CBZ = 73
ANIMATION_UGOIRA = 74
APPLICATION_RTF = 75
APPLICATION_MICROSOFT_OPEN_XML_DOCX = 76
APPLICATION_MICROSOFT_OPEN_XML_XLSX = 77
APPLICATION_MICROSOFT_OPEN_XML_PPTX = 78
APPLICATION_DOCX = 76
APPLICATION_XLSX = 77
APPLICATION_PPTX = 78
APPLICATION_OCTET_STREAM = 100
APPLICATION_UNKNOWN = 101
@ -794,9 +794,9 @@ SEARCHABLE_MIMES = {
APPLICATION_XCF,
APPLICATION_PROCREATE,
APPLICATION_PDF,
APPLICATION_MICROSOFT_OPEN_XML_DOCX,
APPLICATION_MICROSOFT_OPEN_XML_XLSX,
APPLICATION_MICROSOFT_OPEN_XML_PPTX,
APPLICATION_DOCX,
APPLICATION_XLSX,
APPLICATION_PPTX,
APPLICATION_EPUB,
APPLICATION_DJVU,
APPLICATION_RTF,
@ -895,9 +895,9 @@ APPLICATIONS = [
APPLICATION_PDF,
APPLICATION_EPUB,
APPLICATION_DJVU,
APPLICATION_MICROSOFT_OPEN_XML_DOCX,
APPLICATION_MICROSOFT_OPEN_XML_XLSX,
APPLICATION_MICROSOFT_OPEN_XML_PPTX,
APPLICATION_DOCX,
APPLICATION_XLSX,
APPLICATION_PPTX,
APPLICATION_RTF
]
@ -925,7 +925,7 @@ VIEWABLE_IMAGE_PROJECT_FILES = { APPLICATION_PSD, APPLICATION_KRITA }
OPEN_DOCUMENT_ZIPS = { APPLICATION_KRITA, APPLICATION_EPUB }
# zip files that have a `[Content_Types].xml` file inside
MICROSOFT_OPEN_XML_DOCUMENT_ZIPS = { APPLICATION_MICROSOFT_OPEN_XML_DOCX, APPLICATION_MICROSOFT_OPEN_XML_XLSX, APPLICATION_MICROSOFT_OPEN_XML_PPTX }
MICROSOFT_OPEN_XML_DOCUMENT_ZIPS = { APPLICATION_DOCX, APPLICATION_XLSX, APPLICATION_PPTX }
general_mimetypes_to_mime_groups = {
GENERAL_APPLICATION : APPLICATIONS,
@ -988,7 +988,7 @@ MIMES_THAT_MAY_THEORETICALLY_HAVE_TRANSPARENCY = MIMES_THAT_WE_CAN_CHECK_FOR_TRA
ANIMATION_APNG
} )
APPLICATIONS_WITH_THUMBNAILS = { IMAGE_SVG, APPLICATION_PDF, APPLICATION_FLASH, APPLICATION_CLIP, APPLICATION_PROCREATE }.union( VIEWABLE_IMAGE_PROJECT_FILES ).union( { APPLICATION_CBZ } )
APPLICATIONS_WITH_THUMBNAILS = { IMAGE_SVG, APPLICATION_PDF, APPLICATION_FLASH, APPLICATION_CLIP, APPLICATION_PROCREATE, APPLICATION_CBZ, APPLICATION_PPTX }.union( VIEWABLE_IMAGE_PROJECT_FILES )
MIMES_WITH_THUMBNAILS = set( IMAGES ).union( ANIMATIONS ).union( VIDEO ).union( APPLICATIONS_WITH_THUMBNAILS )
@ -1041,9 +1041,9 @@ mime_enum_lookup = {
'application/x-yaml' : APPLICATION_YAML,
'PDF document' : APPLICATION_PDF,
'application/pdf' : APPLICATION_PDF,
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' : APPLICATION_MICROSOFT_OPEN_XML_DOCX,
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' : APPLICATION_MICROSOFT_OPEN_XML_XLSX,
'application/vnd.openxmlformats-officedocument.presentationml.presentation' : APPLICATION_MICROSOFT_OPEN_XML_PPTX,
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' : APPLICATION_DOCX,
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' : APPLICATION_XLSX,
'application/vnd.openxmlformats-officedocument.presentationml.presentation' : APPLICATION_PPTX,
'application/epub+zip' : APPLICATION_EPUB,
'image/vnd.djvu' : APPLICATION_DJVU,
'image/vnd.djvu+multipage' : APPLICATION_DJVU,
@ -1116,9 +1116,9 @@ mime_string_lookup = {
APPLICATION_JSON : 'json',
APPLICATION_CBOR : 'cbor',
APPLICATION_PDF : 'pdf',
APPLICATION_MICROSOFT_OPEN_XML_DOCX : 'docx',
APPLICATION_MICROSOFT_OPEN_XML_XLSX : 'xlsx',
APPLICATION_MICROSOFT_OPEN_XML_PPTX : 'pptx',
APPLICATION_DOCX : 'docx',
APPLICATION_XLSX : 'xlsx',
APPLICATION_PPTX : 'pptx',
APPLICATION_EPUB : 'epub',
APPLICATION_DJVU : 'djvu',
APPLICATION_RTF : 'rtf',
@ -1201,9 +1201,9 @@ mime_mimetype_string_lookup = {
APPLICATION_JSON : 'application/json',
APPLICATION_CBOR : 'application/cbor',
APPLICATION_PDF : 'application/pdf',
APPLICATION_MICROSOFT_OPEN_XML_DOCX : 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
APPLICATION_MICROSOFT_OPEN_XML_XLSX : 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
APPLICATION_MICROSOFT_OPEN_XML_PPTX : 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
APPLICATION_DOCX : 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
APPLICATION_XLSX : 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
APPLICATION_PPTX : 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
APPLICATION_EPUB : 'application/epub+zip',
APPLICATION_DJVU : 'image/vnd.djvu',
APPLICATION_RTF: 'application/rtf',
@ -1284,9 +1284,9 @@ mime_ext_lookup = {
APPLICATION_YAML : '.yaml',
APPLICATION_JSON : '.json',
APPLICATION_PDF : '.pdf',
APPLICATION_MICROSOFT_OPEN_XML_DOCX : '.docx',
APPLICATION_MICROSOFT_OPEN_XML_XLSX : '.xlsx',
APPLICATION_MICROSOFT_OPEN_XML_PPTX : '.pptx',
APPLICATION_DOCX : '.docx',
APPLICATION_XLSX : '.xlsx',
APPLICATION_PPTX : '.pptx',
APPLICATION_EPUB : '.epub',
APPLICATION_DJVU : '.djvu',
APPLICATION_RTF : '.rtf',

View File

@ -306,31 +306,5 @@ def MimeFromOpenDocument( path ):
def MimeFromMicrosoftOpenXMLDocument( path ):
    """
    Guess which Office Open XML filetype the zip at `path` holds.
    
    The text of the zip's `[Content_Types].xml` member is searched for the
    main-part content type string of docx, xlsx and pptx, in that order.
    
    Returns the matching HC.APPLICATION_MICROSOFT_OPEN_XML_* constant, or None
    when no string matches or the member cannot be read.
    """
    
    try:
        
        content_types = GetZipAsPath( path, '[Content_Types].xml' ).read_text()
        
        if 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml' in content_types:
            
            return HC.APPLICATION_MICROSOFT_OPEN_XML_DOCX
            
        elif 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml' in content_types:
            
            return HC.APPLICATION_MICROSOFT_OPEN_XML_XLSX
            
        elif 'application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml' in content_types:
            
            return HC.APPLICATION_MICROSOFT_OPEN_XML_PPTX
            
        else:
            
            return None
            
        
    except Exception:
        
        # detection is best-effort, so any read failure means 'not an office file';
        # Exception (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate
        return None
        
    

View File

@ -1,43 +0,0 @@
#import PyPDF2
import re
def GetNumWordsFromString( s ):
    """
    Count the whitespace-separated words in `s`.
    
    Runs of any whitespace count as a single separator, and leading or trailing
    whitespace does not produce phantom words, so an empty or all-whitespace
    string has zero words.
    """
    
    # str.split() with no argument splits on whitespace runs and drops empty tokens,
    # which split( ' ' ) did not: '' used to count as 1 word and ' hello ' as 3
    return len( s.split() )
    
def GetPDFNumWords( path ):
    """
    Placeholder for a PDF word count; always returns None.
    
    `path` is accepted for interface compatibility but is not read.
    """
    
    # I discovered a pdf that pulled this into an infinite loop due to malformed header.
    # This gives bunk data anyway, so let's just cut it out until we have a better solution here all around
    
    # the old PyPDF2 page-count estimate that used to follow this return was unreachable, so it has been removed
    return None
    

View File

@ -14,7 +14,6 @@ from hydrus.core import HydrusTime
from hydrus.core.files import HydrusAnimationHandling
from hydrus.core.files import HydrusArchiveHandling
from hydrus.core.files import HydrusClipHandling
from hydrus.core.files import HydrusDocumentHandling
from hydrus.core.files import HydrusFlashHandling
from hydrus.core.files import HydrusKritaHandling
from hydrus.core.files import HydrusPDFHandling
@ -23,6 +22,7 @@ from hydrus.core.files import HydrusPSDHandling
from hydrus.core.files import HydrusSVGHandling
from hydrus.core.files import HydrusUgoiraHandling
from hydrus.core.files import HydrusVideoHandling
from hydrus.core.files import HydrusOfficeOpenXMLHandling
from hydrus.core.files.images import HydrusImageHandling
from hydrus.core.networking import HydrusNetwork
@ -75,9 +75,9 @@ for mime in HC.IMAGES:
mimes_to_default_thumbnail_paths[ HC.APPLICATION_UNKNOWN ] = os.path.join( HC.STATIC_DIR, 'hydrus.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_PDF ] = os.path.join( HC.STATIC_DIR, 'pdf.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_MICROSOFT_OPEN_XML_DOCX ] = os.path.join( HC.STATIC_DIR, 'docx.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_MICROSOFT_OPEN_XML_XLSX ] = os.path.join( HC.STATIC_DIR, 'xlsx.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_MICROSOFT_OPEN_XML_PPTX ] = os.path.join( HC.STATIC_DIR, 'pptx.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_DOCX ] = os.path.join( HC.STATIC_DIR, 'docx.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_XLSX ] = os.path.join( HC.STATIC_DIR, 'xlsx.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_PPTX ] = os.path.join( HC.STATIC_DIR, 'pptx.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_EPUB ] = os.path.join( HC.STATIC_DIR, 'epub.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_DJVU ] = os.path.join( HC.STATIC_DIR, 'djvu.png' )
mimes_to_default_thumbnail_paths[ HC.APPLICATION_PSD ] = os.path.join( HC.STATIC_DIR, 'psd.png' )
@ -90,6 +90,13 @@ mimes_to_default_thumbnail_paths[ HC.APPLICATION_PROCREATE ] = os.path.join( HC.
mimes_to_default_thumbnail_paths[ HC.APPLICATION_RTF ] = os.path.join( HC.STATIC_DIR, 'rtf.png' )
mimes_to_default_thumbnail_paths[ HC.IMAGE_SVG ] = os.path.join( HC.STATIC_DIR, 'svg.png' )
def GenerateDefaultThumbnail( mime: int, target_resolution: typing.Tuple[ int, int ] ):
    """
    Build the stock thumbnail for `mime` at `target_resolution`.
    
    The static image path is looked up in mimes_to_default_thumbnail_paths and
    handed to HydrusImageHandling.GenerateDefaultThumbnailNumPyFromPath, whose
    result is returned unchanged.
    """
    
    return HydrusImageHandling.GenerateDefaultThumbnailNumPyFromPath( mimes_to_default_thumbnail_paths[mime], target_resolution )
    
def GenerateThumbnailBytes( path, target_resolution, mime, duration, num_frames, percentage_in = 35 ):
thumbnail_numpy = GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames, percentage_in = percentage_in )
@ -130,9 +137,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'zip.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
finally:
@ -153,9 +158,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'clip.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
finally:
@ -171,10 +174,8 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
except Exception as e:
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'krita.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
elif mime == HC.APPLICATION_PROCREATE:
@ -191,9 +192,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'procreate.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
finally:
@ -224,9 +223,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Secondary problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'psd.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
finally:
@ -244,9 +241,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'svg.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
elif mime == HC.APPLICATION_PDF:
@ -259,10 +254,20 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'pdf.png' )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
elif mime == HC.APPLICATION_PPTX:
try:
thumbnail_numpy = HydrusOfficeOpenXMLHandling.GenerateThumbnailNumPyFromOfficePath( path, target_resolution )
except Exception as e:
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
elif mime == HC.APPLICATION_FLASH:
@ -278,9 +283,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'flash.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
finally:
@ -299,9 +302,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'hydrus.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
elif mime == HC.ANIMATION_UGOIRA:
@ -322,9 +323,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
PrintMoreThumbErrorInfo( e, f'Problem generating thumbnail for "{path}".', extra_description = extra_description )
thumb_path = os.path.join( HC.STATIC_DIR, 'zip.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
finally:
@ -379,9 +378,7 @@ def GenerateThumbnailNumPy( path, target_resolution, mime, duration, num_frames,
if numpy_image is None:
thumb_path = os.path.join( HC.STATIC_DIR, 'hydrus.png' )
thumbnail_numpy = HydrusImageHandling.GenerateThumbnailNumPyFromStaticImagePath( thumb_path, target_resolution, HC.IMAGE_PNG )
thumbnail_numpy = GenerateDefaultThumbnail(mime, target_resolution)
else:
@ -539,14 +536,33 @@ def GetFileInfo( path, mime = None, ok_to_look_for_hydrus_updates = False ):
pass
elif mime == HC.APPLICATION_PPTX:
try:
( num_words, ( width, height ) ) = HydrusOfficeOpenXMLHandling.GetPPTXInfo( path )
except HydrusExceptions.LimitedSupportFileException:
pass
elif mime == HC.APPLICATION_DOCX:
try:
( num_words ) = HydrusOfficeOpenXMLHandling.GetDOCXInfo( path )
except HydrusExceptions.LimitedSupportFileException:
pass
elif mime == HC.APPLICATION_FLASH:
( ( width, height ), duration, num_frames ) = HydrusFlashHandling.GetFlashProperties( path )
elif mime == HC.APPLICATION_PDF:
num_words = HydrusDocumentHandling.GetPDFNumWords( path ) # this now give None until a better solution can be found
elif mime == HC.APPLICATION_PSD:
try:
@ -775,7 +791,7 @@ def GetMime( path, ok_to_look_for_hydrus_updates = False ):
return opendoc_mime
microsoft_mime = HydrusArchiveHandling.MimeFromMicrosoftOpenXMLDocument( path )
microsoft_mime = HydrusOfficeOpenXMLHandling.MimeFromMicrosoftOpenXMLDocument( path )
if microsoft_mime is not None:

View File

@ -0,0 +1,130 @@
import typing
from hydrus.core import HydrusConstants as HC
from hydrus.core.files.HydrusArchiveHandling import GetZipAsPath
from hydrus.core.files.images import HydrusImageHandling
import xml.etree.ElementTree as ET
from PIL import Image as PILImage
# ElementTree XPath queries run against a zip's `[Content_Types].xml`.
# Each one matches an `Override` element, in any XML namespace ( `{*}` ), whose PartName is the
# format's main part and whose ContentType is that format's main-part content type.
DOCX_XPATH = ".//{*}Override[@PartName='/word/document.xml'][@ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml']"
XLSX_XPATH = ".//{*}Override[@PartName='/xl/workbook.xml'][@ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml']"
PPTX_XPATH = ".//{*}Override[@PartName='/ppt/presentation.xml'][@ContentType='application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml']"
def MimeFromMicrosoftOpenXMLDocument(path: str):
    """
    Work out which Office Open XML filetype (docx, xlsx or pptx) the zip at `path` is.
    
    The zip's `[Content_Types].xml` member is parsed as XML and queried with
    DOCX_XPATH, XLSX_XPATH and PPTX_XPATH, in that order.
    
    Returns HC.APPLICATION_DOCX, HC.APPLICATION_XLSX or HC.APPLICATION_PPTX for
    the first query that matches, or None when none match or the member cannot
    be opened or parsed.
    """
    
    try:
        
        # the with block closes the zip member handle as soon as parsing is done
        with GetZipAsPath( path, '[Content_Types].xml' ).open( 'rb' ) as file:
            
            root = ET.parse( file )
            
        
        if root.find(DOCX_XPATH) is not None:
            
            return HC.APPLICATION_DOCX
            
        elif root.find(XLSX_XPATH) is not None:
            
            return HC.APPLICATION_XLSX
            
        elif root.find(PPTX_XPATH) is not None:
            
            return HC.APPLICATION_PPTX
            
        else:
            
            return None
            
        
    except Exception:
        
        # detection is best-effort, so any failure to open or parse means 'not an office file';
        # Exception (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate
        return None
        
    
def GenerateThumbnailNumPyFromOfficePath( path: str, target_resolution: typing.Tuple[ int, int ] ):
    """
    Make a thumbnail from the preview image embedded in an office document.
    
    Opens the `docProps/thumbnail.jpeg` member of the zip at `path`, resizes it
    to exactly `target_resolution` with LANCZOS resampling, and returns the
    result of HydrusImageHandling.GenerateNumPyImageFromPILImage.
    
    The previous `-> bytes` return annotation was dropped: the function returns
    the converted NumPy image, not encoded bytes.
    
    No exceptions are caught here, so a missing or undecodable member raises to the caller.
    """
    
    with GetZipAsPath( path, 'docProps/thumbnail.jpeg' ).open( 'rb' ) as zip_path_file_obj:
        
        pil_image = HydrusImageHandling.GeneratePILImage( zip_path_file_obj )
        
        # resize inside the with block: PIL may still be reading pixel data from the open handle up to this point
        thumbnail_pil_image = pil_image.resize( target_resolution, PILImage.LANCZOS )
        
    
    numpy_image = HydrusImageHandling.GenerateNumPyImageFromPILImage( thumbnail_pil_image )
    
    return numpy_image
    
# DPI assumed when converting slide dimensions into a pixel resolution
PPTX_ASSUMED_DPI = 300
# https://startbigthinksmall.wordpress.com/2010/01/04/points-inches-and-emus-measuring-units-in-office-open-xml/
# PowerPoint uses English Metric Unit (EMU) for vector coordinates
# 1 inch = 914400 EMU
# so pixels per EMU = ( pixels per inch ) / ( EMU per inch )
PPTX_PIXEL_PER_EMU = PPTX_ASSUMED_DPI / 914400
def PowerPointResolution( path: str ):
    """
    Compute a pixel resolution for the pptx at `path` from its slide size.
    
    Reads the `cx` and `cy` attributes (in EMU) of the `p:sldSz` element in
    `ppt/presentation.xml` and scales them by PPTX_PIXEL_PER_EMU.
    
    Returns ( width, height ) as rounded ints.
    
    No exceptions are caught here, so a missing member, element or attribute raises to the caller.
    """
    
    # the with block closes the zip member handle as soon as parsing is done
    with GetZipAsPath( path, 'ppt/presentation.xml' ).open( 'rb' ) as file:
        
        root = ET.parse( file )
        
    
    sldSz = root.find('./p:sldSz', {'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'})
    
    x_emu = int(sldSz.get('cx'))
    y_emu = int(sldSz.get('cy'))
    
    width = round(x_emu * PPTX_PIXEL_PER_EMU)
    height = round(y_emu * PPTX_PIXEL_PER_EMU)
    
    return ( width, height)
    
def OfficeDocumentWordCount( path: str ):
    """
    Read the word count an office document records about itself.
    
    Parses `docProps/app.xml` inside the zip at `path` and returns the text of
    its extended-properties `Words` element as an int.
    
    No exceptions are caught here, so a missing member or element, or
    non-numeric text, raises to the caller.
    """
    
    # the with block closes the zip member handle as soon as parsing is done
    with GetZipAsPath( path, 'docProps/app.xml' ).open( 'rb' ) as file:
        
        root = ET.parse( file )
        
    
    words = root.findtext('./ep:Words', namespaces = {'ep' : 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties'})
    
    num_words = int(words)
    
    return num_words
    
def GetPPTXInfo( path: str ):
    """
    Gather the metadata available for the pptx at `path`.
    
    Returns ( num_words, ( width, height ) ). Resolution and word count are
    fetched independently; whichever lookup fails is reported as None
    (or ( None, None ) for the resolution) without affecting the other.
    """
    
    try:
        
        ( width, height ) = PowerPointResolution( path )
        
    except Exception:
        
        # best-effort: a presentation without a readable slide size just has no resolution
        ( width, height ) = ( None, None )
        
    
    try:
        
        num_words = OfficeDocumentWordCount( path )
        
    except Exception:
        
        # best-effort: a presentation without a readable word count just has none
        num_words = None
        
    
    return ( num_words, ( width, height ) )
    
def GetDOCXInfo( path:str ):
    """
    Gather the metadata available for the docx at `path`.
    
    Returns the word count from OfficeDocumentWordCount, or None if it cannot
    be determined. The return value is a bare value, not a 1-tuple: the
    parentheses around it carry no meaning.
    """
    
    try:
        
        num_words = OfficeDocumentWordCount( path )
        
    except Exception:
        
        # best-effort: a document without a readable word count just has none
        num_words = None
        
    
    return ( num_words )
    

View File

@ -9,6 +9,7 @@ import warnings
from PIL import ImageFile as PILImageFile
from PIL import Image as PILImage
from PIL import ImageOps as PILImageOps
try:
@ -663,3 +664,12 @@ def ResizeNumPyImage( numpy_image: numpy.array, target_resolution, forced_interp
return cv2.resize( numpy_image, ( target_width, target_height ), interpolation = interpolation )
def GenerateDefaultThumbnailNumPyFromPath( path: str, target_resolution: typing.Tuple[ int, int ] ):
    """
    Load the static image at `path` and fit it inside `target_resolution`.
    
    The image goes through PILImageOps.pad with LANCZOS resampling, and the
    result is converted with GenerateNumPyImageFromPILImage, with
    strip_useless_alpha left off.
    """
    
    padded_pil_image = PILImageOps.pad( GeneratePILImage( path ), target_resolution, PILImage.Resampling.LANCZOS )
    
    return GenerateNumPyImageFromPILImage( padded_pil_image, strip_useless_alpha = False )
    

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.7 KiB

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.3 KiB

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.5 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB