hydrus/hydrus/core/files/HydrusOfficeOpenXMLHandling.py

141 lines
3.7 KiB
Python
Raw Normal View History

import typing
from hydrus.core import HydrusConstants as HC
2024-03-20 21:10:16 +00:00
from hydrus.core import HydrusExceptions
from hydrus.core.files.HydrusArchiveHandling import GetZipAsPath
from hydrus.core.files.images import HydrusImageHandling
import xml.etree.ElementTree as ET
from PIL import Image as PILImage
# XPath queries evaluated against the parsed '[Content_Types].xml' of an Office Open XML package.
# Each one matches an <Override> element, in any namespace ( the {*} wildcard ), whose PartName and
# ContentType identify the main part of one specific format. MimeFromMicrosoftOpenXMLDocument tries
# them in the order docx, xlsx, pptx.
DOCX_XPATH = ".//{*}Override[@PartName='/word/document.xml'][@ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml']"
XLSX_XPATH = ".//{*}Override[@PartName='/xl/workbook.xml'][@ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml']"
PPTX_XPATH = ".//{*}Override[@PartName='/ppt/presentation.xml'][@ContentType='application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml']"
def MimeFromMicrosoftOpenXMLDocument(path: str):
    """
    Work out which Office Open XML format, if any, the zip file at `path` contains.

    The package's '[Content_Types].xml' is parsed and searched, in order, for the Override
    entry of a Word document, an Excel workbook and a PowerPoint presentation.

    :param path: filesystem path of the candidate file.
    :return: HC.APPLICATION_DOCX, HC.APPLICATION_XLSX or HC.APPLICATION_PPTX for the first
        entry found; None when nothing matches or when the file cannot be opened or parsed.
    """

    try:

        # the context manager closes the zip member stream as soon as the XML is parsed,
        # rather than leaving it open until garbage collection
        with GetZipAsPath( path, '[Content_Types].xml' ).open( 'rb' ) as content_types_file:

            root = ET.parse( content_types_file )

        if root.find( DOCX_XPATH ) is not None:

            return HC.APPLICATION_DOCX

        elif root.find( XLSX_XPATH ) is not None:

            return HC.APPLICATION_XLSX

        elif root.find( PPTX_XPATH ) is not None:

            return HC.APPLICATION_PPTX

        else:

            return None

    except Exception:

        # best-effort sniffing: any failure just means 'not a recognised office document'.
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
        return None
2024-03-20 21:10:16 +00:00
def GenerateThumbnailNumPyFromOfficePath( path: str, target_resolution: typing.Tuple[ int, int ] ) -> bytes:
    """
    Build a thumbnail from the 'docProps/thumbnail.jpeg' embedded in an office document.

    :param path: filesystem path of the office ( zip ) file.
    :param target_resolution: size handed straight to PIL's resize, so the embedded image is
        scaled to exactly this resolution.
    :return: the result of HydrusImageHandling.GenerateNumPyImageFromPILImage on the resized image.
    :raises HydrusExceptions.NoThumbnailFileException: if opening the embedded thumbnail raises
        FileNotFoundError.
    """
    # NOTE(review): the '-> bytes' annotation does not look right for a value produced by
    # GenerateNumPyImageFromPILImage ( presumably a numpy array ) -- confirm and correct.

    try:

        zip_path_file_obj = GetZipAsPath( path, 'docProps/thumbnail.jpeg' ).open( 'rb' )

    except FileNotFoundError:

        raise HydrusExceptions.NoThumbnailFileException( 'No thumbnail.jpeg file!' )

    # keep the stream open for the whole decode/resize/convert, since PIL may read lazily,
    # then make sure it gets closed even if one of those steps raises
    with zip_path_file_obj:

        pil_image = HydrusImageHandling.GeneratePILImage( zip_path_file_obj )

        thumbnail_pil_image = pil_image.resize( target_resolution, PILImage.LANCZOS )

        numpy_image = HydrusImageHandling.GenerateNumPyImageFromPILImage( thumbnail_pil_image )

    return numpy_image
# Slide dimensions are read in EMU, a physical length, so a dots-per-inch figure has to be
# assumed in order to express them as pixels.
PPTX_ASSUMED_DPI = 300

# https://startbigthinksmall.wordpress.com/2010/01/04/points-inches-and-emus-measuring-units-in-office-open-xml/
# PowerPoint uses English Metric Units (EMU) for vector coordinates.
# 1 inch = 914400 EMU
# Multiplying a length in EMU by this factor gives its length in pixels at PPTX_ASSUMED_DPI.
PPTX_PIXEL_PER_EMU = PPTX_ASSUMED_DPI / 914400
def PowerPointResolution( path: str ):
    """
    Read the slide size of a PowerPoint file and convert it to pixels.

    The 'cx' and 'cy' attributes of the <p:sldSz> element in 'ppt/presentation.xml' are read
    as EMU and scaled by PPTX_PIXEL_PER_EMU, rounding to the nearest integer.

    :param path: filesystem path of the pptx ( zip ) file.
    :return: ( width, height ) in pixels.
    :raises: nothing is caught here. Opening or parsing errors propagate, a missing sldSz
        element surfaces as AttributeError, and a missing or non-numeric cx/cy as
        TypeError/ValueError.
    """

    # close the zip member stream as soon as the XML is parsed instead of leaking it
    with GetZipAsPath( path, 'ppt/presentation.xml' ).open( 'rb' ) as presentation_file:

        root = ET.parse( presentation_file )

    sldSz = root.find( './p:sldSz', { 'p' : 'http://schemas.openxmlformats.org/presentationml/2006/main' } )

    x_emu = int( sldSz.get( 'cx' ) )
    y_emu = int( sldSz.get( 'cy' ) )

    width = round( x_emu * PPTX_PIXEL_PER_EMU )
    height = round( y_emu * PPTX_PIXEL_PER_EMU )

    return ( width, height )
def OfficeDocumentWordCount( path: str ):
    """
    Return the word count recorded in an office document's 'docProps/app.xml'.

    The text of the <Words> element ( extended-properties namespace ) is converted to an int.

    :param path: filesystem path of the office ( zip ) file.
    :return: the word count as an int.
    :raises: nothing is caught here. Opening or parsing errors propagate, a missing <Words>
        element surfaces as TypeError ( int( None ) ), and non-numeric text as ValueError.
    """

    # close the zip member stream as soon as the XML is parsed instead of leaking it
    with GetZipAsPath( path, 'docProps/app.xml' ).open( 'rb' ) as app_properties_file:

        root = ET.parse( app_properties_file )

    words = root.findtext( './ep:Words', namespaces = { 'ep' : 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties' } )

    num_words = int( words )

    return num_words
def GetPPTXInfo( path: str ):
    """
    Gather best-effort metadata for a PowerPoint file.

    :param path: filesystem path of the pptx file.
    :return: ( num_words, ( width, height ) ). num_words is None if the word count could not
        be read; width and height are both None if the slide size could not be read.
    """

    # each lookup is independent and optional, so a failure in one must not lose the other.
    # Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
    try:

        ( width, height ) = PowerPointResolution( path )

    except Exception:

        ( width, height ) = ( None, None )

    try:

        num_words = OfficeDocumentWordCount( path )

    except Exception:

        num_words = None

    return ( num_words, ( width, height ) )
def GetDOCXInfo( path:str ):
    """
    Gather best-effort metadata for a Word document.

    :param path: filesystem path of the docx file.
    :return: the word count from OfficeDocumentWordCount, or None if it could not be read.
    """

    try:

        num_words = OfficeDocumentWordCount( path )

    except Exception:

        # the word count is optional metadata. Exception rather than a bare except, so
        # KeyboardInterrupt/SystemExit still propagate
        num_words = None

    return num_words