hydrus/hydrus/client/ClientPDFHandling.py

242 lines
6.1 KiB
Python

import re
import traceback
import typing
# the QtPdf import can fail, so record whether it worked and, if it did not, the traceback explaining why
try:
    
    from qtpy import QtPdf
    
except Exception:
    
    pdf_failed_reason = traceback.format_exc()
    PDF_OK = False
    
else:
    
    pdf_failed_reason = 'QtPdf seems ok!'
    PDF_OK = True
    
from qtpy import QtGui as QG
from qtpy import QtCore as QC
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
from hydrus.core.files import HydrusPDFHandling
from hydrus.core.files.images import HydrusImageHandling
from hydrus.client.gui import ClientGUIFunctions
def LoadPDF( path: str ):
    """
    Load the PDF at path into a QtPdf.QPdfDocument and return it once its status is Ready.
    
    The other functions in this file call close() on the returned document when they are done with it.
    
    Raises:
        HydrusExceptions.LimitedSupportFileException: QtPdf could not be imported (PDF_OK is False).
        HydrusExceptions.EncryptedFileException: the document reports IncorrectPassword or UnsupportedSecurityScheme.
        HydrusExceptions.DamagedOrUnusualFileException: creating/loading the document raised, or it ended in any other non-Ready state.
    """
    
    if not PDF_OK:
        
        raise HydrusExceptions.LimitedSupportFileException( 'Sorry, no QtPDF support!' )
        
    
    try:
        
        # it wants an Object in PyQt6, but giving it None is better since we are outside the Qt thread here
        document = QtPdf.QPdfDocument( None )
        
        document.load( path )
        
    except Exception as e:
        
        # not a bare except: KeyboardInterrupt and SystemExit must not be rebranded as a damaged file
        raise HydrusExceptions.DamagedOrUnusualFileException( 'Could not load PDF file.' ) from e
        
    
    status = document.status()
    
    if status is QtPdf.QPdfDocument.Status.Ready:
        
        return document
        
    
    if status is not QtPdf.QPdfDocument.Status.Error:
        
        # some other non-Ready state; report the status itself
        raise HydrusExceptions.DamagedOrUnusualFileException( f'PDF document status: {status}!' )
        
    
    error = document.error()
    
    if error is QtPdf.QPdfDocument.Error.IncorrectPassword:
        
        raise HydrusExceptions.EncryptedFileException( 'PDF is password protected!' )
        
    
    if error is QtPdf.QPdfDocument.Error.UnsupportedSecurityScheme:
        
        raise HydrusExceptions.EncryptedFileException( 'PDF uses an unsupported security scheme' )
        
    
    raise HydrusExceptions.DamagedOrUnusualFileException( f'PDF document error: {error}!' )
    
def GenerateThumbnailNumPyFromPDFPath( path: str, target_resolution: typing.Tuple[int, int] ) -> bytes:
    """
    Render page 0 of the PDF at path at target_resolution and return it as a numpy image.
    
    target_resolution is a ( width, height ) tuple.
    
    NOTE(review): the return annotation says bytes, but the value returned is whatever
    ClientGUIFunctions.ConvertQtImageToNumPy produces -- presumably a numpy array; confirm and fix the annotation.
    
    Raises:
        HydrusExceptions.NoThumbnailFileException: anything went wrong. The reason is printed via HydrusData.Print.
    """
    
    try:
        
        document = LoadPDF( path )
        
        try:
            
            ( target_width, target_height ) = target_resolution
            
            resolution = QC.QSize( target_width, target_height )
            
            qt_image = document.render( 0, resolution )
            
            # ClientGUIFunctions.ConvertQtImageToNumPy doesn't handle other formats well
            # convertToFormat returns a converted copy and leaves the original untouched, so the result must be kept
            qt_image = qt_image.convertToFormat( QG.QImage.Format_RGBA8888 )
            
            numpy_image = ClientGUIFunctions.ConvertQtImageToNumPy( qt_image )
            
        finally:
            
            # release the document even when rendering or conversion fails
            document.close()
            
        
        return numpy_image
        
    except Exception as e:
        
        message = f'PDF at {path} failed to make a thumbnail: {e}'
        
        HydrusData.Print( message )
        
        raise HydrusExceptions.NoThumbnailFileException() from e
        
    
# hand the Qt-backed thumbnailer defined above to the core HydrusPDFHandling module
HydrusPDFHandling.GenerateThumbnailNumPyFromPDFPath = GenerateThumbnailNumPyFromPDFPath
# dots per inch assumed when GetPDFResolutionFromDocument turns a page size in points (1/72 inch) into a resolution
PDF_ASSUMED_DPI = 300
def GetHumanReadableEmbeddedMetadata( path ) -> str:
    """
    Return the Title, Author, Subject and Keywords metadata of the PDF at path as text.
    
    Each non-empty field becomes one 'Prefix: value' line; lines are joined with newlines.
    The result is an empty string when none of the fields has any text.
    
    Raises:
        HydrusExceptions.LimitedSupportFileException: the document could not be loaded, for any reason.
    """
    
    try:
        
        document = LoadPDF( path )
        
    except Exception as e:
        
        raise HydrusExceptions.LimitedSupportFileException() from e
        
    
    jobs = [
        ( 'Title', QtPdf.QPdfDocument.MetaDataField.Title ),
        ( 'Author', QtPdf.QPdfDocument.MetaDataField.Author ),
        ( 'Subject', QtPdf.QPdfDocument.MetaDataField.Subject ),
        ( 'Keywords', QtPdf.QPdfDocument.MetaDataField.Keywords )
    ]
    
    result_components = []
    
    try:
        
        for ( prefix, key ) in jobs:
            
            text = document.metaData( key )
            
            if len( text ) > 0:
                
                result_components.append( f'{prefix}: {text}' )
                
            
        
    finally:
        
        # close the document like the other loaders in this file do, instead of leaving it open
        document.close()
        
    
    return '\n'.join( result_components )
    
def HasHumanReadableEmbeddedMetadata( path ) -> bool:
    """
    Report whether GetHumanReadableEmbeddedMetadata finds any text for the PDF at path.
    
    A LimitedSupportFileException from that call is treated as 'no metadata' and gives False.
    """
    
    try:
        
        metadata_text = GetHumanReadableEmbeddedMetadata( path )
        
    except HydrusExceptions.LimitedSupportFileException:
        
        return False
        
    else:
        
        return len( metadata_text ) != 0
        
    
def GetPDFInfo( path: str ):
    """
    Count the words in the PDF at path and work out its resolution.
    
    Returns:
        ( num_words, ( width, height ) ). width and height come from GetPDFResolutionFromDocument and
        are both None if that call fails.
    
    Raises:
        HydrusExceptions.LimitedSupportFileException: the document could not be loaded, for any reason.
    """
    
    try:
        
        document = LoadPDF( path )
        
    except Exception as e:
        
        raise HydrusExceptions.LimitedSupportFileException() from e
        
    
    try:
        
        try:
            
            ( width, height ) = GetPDFResolutionFromDocument( document )
            
        except Exception:
            
            # best effort: an unknown resolution should not stop the word count
            ( width, height ) = ( None, None )
            
        
        num_words = 0
        
        for page_index in range( document.pageCount() ):
            
            text = document.getAllText( page_index ).text()
            
            # punctuation becomes whitespace, so 'one,two' is two words and a lone '-' is none
            depunctuated_text = re.sub( r'[^\w\s]', ' ', text )
            
            # split() with no argument splits on any run of whitespace and ignores leading/trailing whitespace,
            # so a page ending in punctuation is not overcounted and newline-separated words are not merged
            num_words += len( depunctuated_text.split() )
            
        
    finally:
        
        # release the document even if text extraction raises
        document.close()
        
    
    return ( num_words, ( width, height ) )
    
def GetPDFModifiedTimestampMS( path ):
    """
    Return the ModificationDate metadata of the PDF at path as milliseconds since the epoch.
    
    Raises:
        HydrusExceptions.LimitedSupportFileException: the document could not be loaded, for any reason.
    """
    
    # TODO: do something with this
    # I thought about replacing the disk modified time, but it seemed like a minefield
    # I think instead we'll have support for more non-web-domain timestamps and add a 'pdf' domain or similar and add hooks for it in normal local file import and timestamp regen code
    
    try:
        
        document = LoadPDF( path )
        
    except Exception as e:
        
        raise HydrusExceptions.LimitedSupportFileException() from e
        
    
    try:
        
        q_modified_date = document.metaData( QtPdf.QPdfDocument.MetaDataField.ModificationDate )
        
    finally:
        
        # close the document like the other loaders in this file do, instead of leaving it open
        document.close()
        
    
    modified_timestamp_ms = q_modified_date.toMSecsSinceEpoch()
    
    return modified_timestamp_ms
    
def GetPDFResolutionFromDocument( document ):
    """
    Return ( width, height ) for page 0 of document, as rounded ints.
    
    The page size is reported in points, which are 1/72 of an inch; it is scaled up as if
    the page were rendered at PDF_ASSUMED_DPI dots per inch.
    """
    
    first_page_size = document.pagePointSize( 0 )
    
    # how many output units one point becomes at the assumed dpi
    units_per_point = PDF_ASSUMED_DPI / 72
    
    scaled_width = first_page_size.width() * units_per_point
    scaled_height = first_page_size.height() * units_per_point
    
    return ( round( scaled_width ), round( scaled_height ) )
    
# hand the Qt-backed GetPDFInfo defined above to the core HydrusPDFHandling module
HydrusPDFHandling.GetPDFInfo = GetPDFInfo