import typing from hydrus.core import HydrusConstants as HC from hydrus.core.files.HydrusArchiveHandling import GetZipAsPath from hydrus.core.files.images import HydrusImageHandling import xml.etree.ElementTree as ET from PIL import Image as PILImage DOCX_XPATH = ".//{*}Override[@PartName='/word/document.xml'][@ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml']" XLSX_XPATH = ".//{*}Override[@PartName='/xl/workbook.xml'][@ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml']" PPTX_XPATH = ".//{*}Override[@PartName='/ppt/presentation.xml'][@ContentType='application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml']" def MimeFromMicrosoftOpenXMLDocument(path: str): try: file = GetZipAsPath( path, '[Content_Types].xml' ).open( 'rb' ) root = ET.parse( file ) if root.find(DOCX_XPATH) is not None: return HC.APPLICATION_DOCX elif root.find(XLSX_XPATH) is not None: return HC.APPLICATION_XLSX elif root.find(PPTX_XPATH) is not None: return HC.APPLICATION_PPTX else: return None except: return None def GenerateThumbnailNumPyFromOfficePath( path: str, target_resolution: typing.Tuple[ int, int ] ) -> bytes: zip_path_file_obj = GetZipAsPath( path, 'docProps/thumbnail.jpeg' ).open( 'rb' ) pil_image = HydrusImageHandling.GeneratePILImage( zip_path_file_obj ) thumbnail_pil_image = pil_image.resize( target_resolution, PILImage.LANCZOS ) numpy_image = HydrusImageHandling.GenerateNumPyImageFromPILImage( thumbnail_pil_image ) return numpy_image PPTX_ASSUMED_DPI = 300 # https://startbigthinksmall.wordpress.com/2010/01/04/points-inches-and-emus-measuring-units-in-office-open-xml/ # PowerPoint uses English Metric Unit (EMU) for vector coordinates # 1 inch = 914400 EMU PPTX_PIXEL_PER_EMU = PPTX_ASSUMED_DPI / 914400 def PowerPointResolution( path: str ): file = GetZipAsPath( path, 'ppt/presentation.xml' ).open( 'rb' ) root = ET.parse( file ) sldSz = root.find('./p:sldSz', {'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'}) x_emu = int(sldSz.get('cx')) y_emu = int(sldSz.get('cy')) width = round(x_emu * PPTX_PIXEL_PER_EMU) height = round(y_emu * PPTX_PIXEL_PER_EMU) return ( width, height) def OfficeDocumentWordCount( path: str ): file = GetZipAsPath( path, 'docProps/app.xml' ).open( 'rb' ) root = ET.parse( file ) words = root.findtext('./ep:Words', namespaces = {'ep' : 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties'}) num_words = int(words) return num_words def GetPPTXInfo( path: str ): try: ( width, height ) = PowerPointResolution( path ) except: ( width, height ) = ( None, None ) try: num_words = OfficeDocumentWordCount( path ) except: num_words = None return ( num_words, ( width, height ) ) def GetDOCXInfo( path:str ): try: num_words = OfficeDocumentWordCount( path ) except: num_words = None return ( num_words )