hydrus/hydrus/core/files/HydrusOfficeOpenXMLHandling.py

141 lines
3.7 KiB
Python

import typing
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusExceptions
from hydrus.core.files.HydrusArchiveHandling import GetZipAsPath
from hydrus.core.files.images import HydrusImageHandling
import xml.etree.ElementTree as ET
from PIL import Image as PILImage
# ElementTree XPath expressions matched against '[Content_Types].xml'.
# Each one selects the <Override> element (in any namespace) that declares the
# main part of one Office Open XML document type.
DOCX_XPATH = (
    ".//{*}Override"
    "[@PartName='/word/document.xml']"
    "[@ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml']"
)
XLSX_XPATH = (
    ".//{*}Override"
    "[@PartName='/xl/workbook.xml']"
    "[@ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml']"
)
PPTX_XPATH = (
    ".//{*}Override"
    "[@PartName='/ppt/presentation.xml']"
    "[@ContentType='application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml']"
)
def MimeFromMicrosoftOpenXMLDocument(path: str):
    """Work out which Office Open XML document type the archive at `path` is.

    Parses '[Content_Types].xml' from the archive and checks it against
    DOCX_XPATH, XLSX_XPATH and PPTX_XPATH in that order.

    Returns HC.APPLICATION_DOCX, HC.APPLICATION_XLSX or HC.APPLICATION_PPTX for
    the first expression that matches, or None when none match or when
    anything goes wrong while reading or parsing the archive.
    """
    try:
        # Close the archive member as soon as it has been parsed; the parsed
        # tree does not need the stream afterwards.
        with GetZipAsPath( path, '[Content_Types].xml' ).open( 'rb' ) as file:
            root = ET.parse( file )

        if root.find( DOCX_XPATH ) is not None:
            return HC.APPLICATION_DOCX
        elif root.find( XLSX_XPATH ) is not None:
            return HC.APPLICATION_XLSX
        elif root.find( PPTX_XPATH ) is not None:
            return HC.APPLICATION_PPTX
        else:
            return None
    except Exception:
        # Best-effort sniffing: any ordinary failure means "not recognised".
        # KeyboardInterrupt/SystemExit are deliberately allowed to propagate.
        return None
def GenerateThumbnailNumPyFromOfficePath( path: str, target_resolution: typing.Tuple[ int, int ] ) -> bytes:
    """Build a thumbnail from the preview image embedded in an Office document.

    Opens 'docProps/thumbnail.jpeg' inside the archive at `path`, resizes it to
    `target_resolution` with LANCZOS resampling and returns the result of
    HydrusImageHandling.GenerateNumPyImageFromPILImage.

    Raises HydrusExceptions.NoThumbnailFileException when the archive has no
    'docProps/thumbnail.jpeg' entry.
    """
    # NOTE(review): the '-> bytes' annotation looks inaccurate; the value
    # returned is whatever GenerateNumPyImageFromPILImage produces (presumably
    # a numpy array). Left unchanged here; confirm and correct.
    try:
        zip_path_file_obj = GetZipAsPath( path, 'docProps/thumbnail.jpeg' ).open( 'rb' )
    except FileNotFoundError:
        raise HydrusExceptions.NoThumbnailFileException( 'No thumbnail.jpeg file!' )

    # Keep the stream open until the resized copy exists (PIL may read pixel
    # data lazily), then make sure it is closed even if decoding fails.
    with zip_path_file_obj:
        pil_image = HydrusImageHandling.GeneratePILImage( zip_path_file_obj )
        thumbnail_pil_image = pil_image.resize( target_resolution, PILImage.LANCZOS )
        numpy_image = HydrusImageHandling.GenerateNumPyImageFromPILImage( thumbnail_pil_image )

    return numpy_image
# PowerPoint uses English Metric Units (EMU) for vector coordinates, and
# 1 inch = 914400 EMU. Background reading:
# https://startbigthinksmall.wordpress.com/2010/01/04/points-inches-and-emus-measuring-units-in-office-open-xml/
# Converting EMU to pixels needs a DPI; this module assumes a fixed one.
PPTX_ASSUMED_DPI = 300
PPTX_PIXEL_PER_EMU = PPTX_ASSUMED_DPI / 914_400
def PowerPointResolution( path: str ):
    """Return the slide size of the PPTX at `path` as ( width, height ) in pixels.

    Reads the 'cx' and 'cy' attributes (EMU) of the <p:sldSz> element in
    'ppt/presentation.xml' and scales them by PPTX_PIXEL_PER_EMU, rounding to
    the nearest integer.

    Raises if the archive member, the <p:sldSz> element or its attributes are
    missing or malformed; callers are expected to handle that.
    """
    # Close the archive member once parsed; the tree is self-contained.
    with GetZipAsPath( path, 'ppt/presentation.xml' ).open( 'rb' ) as file:
        root = ET.parse( file )

    sldSz = root.find( './p:sldSz', { 'p' : 'http://schemas.openxmlformats.org/presentationml/2006/main' } )

    x_emu = int( sldSz.get( 'cx' ) )
    y_emu = int( sldSz.get( 'cy' ) )

    width = round( x_emu * PPTX_PIXEL_PER_EMU )
    height = round( y_emu * PPTX_PIXEL_PER_EMU )

    return ( width, height )
def OfficeDocumentWordCount( path: str ):
    """Return the word count recorded in the Office document at `path`.

    Reads the text of the <Words> element (extended-properties namespace) from
    'docProps/app.xml' and converts it to an int.

    Raises if the archive member is missing, the XML cannot be parsed, or the
    <Words> element is absent or not an integer; callers are expected to
    handle that.
    """
    # Close the archive member once parsed; the tree is self-contained.
    with GetZipAsPath( path, 'docProps/app.xml' ).open( 'rb' ) as file:
        root = ET.parse( file )

    words = root.findtext( './ep:Words', namespaces = { 'ep' : 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties' } )

    # findtext gives None when there is no <Words> element, so int() raises.
    num_words = int( words )

    return num_words
def GetPPTXInfo( path: str ):
    """Collect best-effort metadata for the PPTX at `path`.

    Returns ( num_words, ( width, height ) ). The two lookups are independent:
    if PowerPointResolution fails, width and height are both None; if
    OfficeDocumentWordCount fails, num_words is None.
    """
    # 'except Exception' keeps the best-effort fallback for ordinary errors
    # while letting KeyboardInterrupt/SystemExit propagate.
    try:
        ( width, height ) = PowerPointResolution( path )
    except Exception:
        ( width, height ) = ( None, None )

    try:
        num_words = OfficeDocumentWordCount( path )
    except Exception:
        num_words = None

    return ( num_words, ( width, height ) )
def GetDOCXInfo( path:str ):
    """Return the recorded word count of the DOCX at `path`, or None.

    Delegates to OfficeDocumentWordCount and falls back to None when that
    lookup fails for any ordinary reason.
    """
    # 'except Exception' keeps the best-effort fallback for ordinary errors
    # while letting KeyboardInterrupt/SystemExit propagate.
    try:
        num_words = OfficeDocumentWordCount( path )
    except Exception:
        num_words = None

    return num_words