141 lines
3.7 KiB
Python
141 lines
3.7 KiB
Python
import typing
|
|
|
|
from hydrus.core import HydrusConstants as HC
|
|
from hydrus.core import HydrusExceptions
|
|
from hydrus.core.files.HydrusArchiveHandling import GetZipAsPath
|
|
from hydrus.core.files.images import HydrusImageHandling
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
from PIL import Image as PILImage
|
|
|
|
DOCX_XPATH = ".//{*}Override[@PartName='/word/document.xml'][@ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml']"
|
|
XLSX_XPATH = ".//{*}Override[@PartName='/xl/workbook.xml'][@ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml']"
|
|
PPTX_XPATH = ".//{*}Override[@PartName='/ppt/presentation.xml'][@ContentType='application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml']"
|
|
|
|
def MimeFromMicrosoftOpenXMLDocument(path: str):
|
|
|
|
try:
|
|
|
|
file = GetZipAsPath( path, '[Content_Types].xml' ).open( 'rb' )
|
|
|
|
root = ET.parse( file )
|
|
|
|
if root.find(DOCX_XPATH) is not None:
|
|
|
|
return HC.APPLICATION_DOCX
|
|
|
|
elif root.find(XLSX_XPATH) is not None:
|
|
|
|
return HC.APPLICATION_XLSX
|
|
|
|
elif root.find(PPTX_XPATH) is not None:
|
|
|
|
return HC.APPLICATION_PPTX
|
|
|
|
else:
|
|
|
|
return None
|
|
|
|
|
|
except:
|
|
|
|
return None
|
|
|
|
|
|
|
|
def GenerateThumbnailNumPyFromOfficePath( path: str, target_resolution: typing.Tuple[ int, int ] ) -> bytes:
|
|
|
|
try:
|
|
|
|
zip_path_file_obj = GetZipAsPath( path, 'docProps/thumbnail.jpeg' ).open( 'rb' )
|
|
|
|
except FileNotFoundError:
|
|
|
|
raise HydrusExceptions.NoThumbnailFileException( 'No thumbnail.jpeg file!' )
|
|
|
|
|
|
pil_image = HydrusImageHandling.GeneratePILImage( zip_path_file_obj )
|
|
|
|
thumbnail_pil_image = pil_image.resize( target_resolution, PILImage.LANCZOS )
|
|
|
|
numpy_image = HydrusImageHandling.GenerateNumPyImageFromPILImage( thumbnail_pil_image )
|
|
|
|
return numpy_image
|
|
|
|
|
|
PPTX_ASSUMED_DPI = 300
|
|
|
|
# https://startbigthinksmall.wordpress.com/2010/01/04/points-inches-and-emus-measuring-units-in-office-open-xml/
|
|
# PowerPoint uses English Metric Unit (EMU) for vector coordinates
|
|
# 1 inch = 914400 EMU
|
|
|
|
PPTX_PIXEL_PER_EMU = PPTX_ASSUMED_DPI / 914400
|
|
|
|
def PowerPointResolution( path: str ):
|
|
|
|
file = GetZipAsPath( path, 'ppt/presentation.xml' ).open( 'rb' )
|
|
|
|
root = ET.parse( file )
|
|
|
|
sldSz = root.find('./p:sldSz', {'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'})
|
|
|
|
x_emu = int(sldSz.get('cx'))
|
|
|
|
y_emu = int(sldSz.get('cy'))
|
|
|
|
width = round(x_emu * PPTX_PIXEL_PER_EMU)
|
|
|
|
height = round(y_emu * PPTX_PIXEL_PER_EMU)
|
|
|
|
return ( width, height)
|
|
|
|
|
|
def OfficeDocumentWordCount( path: str ):
|
|
|
|
file = GetZipAsPath( path, 'docProps/app.xml' ).open( 'rb' )
|
|
|
|
root = ET.parse( file )
|
|
|
|
words = root.findtext('./ep:Words', namespaces = {'ep' : 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties'})
|
|
|
|
num_words = int(words)
|
|
|
|
return num_words
|
|
|
|
|
|
def GetPPTXInfo( path: str ):
|
|
|
|
try:
|
|
|
|
( width, height ) = PowerPointResolution( path )
|
|
|
|
except:
|
|
|
|
( width, height ) = ( None, None )
|
|
|
|
try:
|
|
|
|
num_words = OfficeDocumentWordCount( path )
|
|
|
|
except:
|
|
|
|
num_words = None
|
|
|
|
return ( num_words, ( width, height ) )
|
|
|
|
|
|
def GetDOCXInfo( path:str ):
|
|
|
|
try:
|
|
|
|
num_words = OfficeDocumentWordCount( path )
|
|
|
|
except:
|
|
|
|
num_words = None
|
|
|
|
|
|
return num_words
|
|
|