hydrus/include/HydrusDocumentHandling.py

35 lines
846 B
Python
Raw Normal View History

2013-03-15 02:38:12 +00:00
import HydrusConstants as HC
import PyPDF2
import re
import time
import traceback
import wx
def GetNumWordsFromString( s ):
    """Return the number of whitespace-separated words in s.

    Any run of whitespace (matched with re.UNICODE, so Unicode whitespace
    counts too) is treated as a single separator. Leading and trailing
    whitespace is ignored, so an empty or whitespace-only string has zero
    words.
    """

    # Counting runs of non-whitespace avoids the phantom empty "words" that
    # splitting on a single space yields for inputs like '', ' a' or 'a '.
    words = re.findall( r'\S+', s, flags = re.UNICODE )

    return len( words )
2013-08-07 22:25:18 +00:00
def GetPDFNumWords( path ):
    """Return an estimated word count for the PDF file at path.

    The estimate is the document's page count multiplied by 350. If anything
    goes wrong while opening or reading the file, 0 is returned instead of
    raising.
    """

    try:

        # NOTE(review): HC.o is presumably the project's file-open helper -- confirm
        with HC.o( path, 'rb' ) as f:

            pdf_object = PyPDF2.PdfFileReader( f, strict = False )

            # page.extractText() gives kooky and unreliable results, so summing
            # GetNumWordsFromString over the extracted page text is not trustworthy;
            # estimate from the page count instead
            return pdf_object.numPages * 350

    except Exception:

        # Best-effort: an unreadable or malformed file just counts as zero words.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return 0