2019-01-09 22:59:03 +00:00
|
|
|
from . import HydrusConstants as HC
|
2018-11-07 23:09:40 +00:00
|
|
|
#import PyPDF2
|
2013-03-15 02:38:12 +00:00
|
|
|
import re
|
|
|
|
import time
|
|
|
|
import traceback
|
|
|
|
|
|
|
|
def GetNumWordsFromString( s ):
    """Return the number of whitespace-separated words in s.

    Any run of whitespace (spaces, tabs, newlines, ...) counts as a single
    separator. Leading and trailing whitespace is ignored, so an empty or
    whitespace-only string contains zero words.
    """
    
    # str.split() with no separator collapses whitespace runs and drops the
    # empty leading/trailing fields. Splitting on a literal ' ' kept those
    # fields, which reported '' as one word and ' hello ' as three.
    words = s.split()
    
    return len( words )
|
|
|
|
|
2013-08-07 22:25:18 +00:00
|
|
|
def GetPDFNumWords( path ):
    """Return the word count of the PDF at path, or None when it is unknown.

    Word counting for PDFs is currently disabled, so this always returns
    None and never opens the file.

    A pdf with a malformed header was discovered that pulled the old parsing
    code into an infinite loop. That code gave bunk data anyway, so it is
    cut out until there is a better solution here all around.
    """
    
    # The former implementation opened the file with PyPDF2 and estimated
    # numPages * 350 words, because extractText() gave kooky and unreliable
    # results. It sat unreachable below an unconditional return, behind a
    # bare except, and has been removed.
    
    return None
|
2018-03-14 21:01:02 +00:00
|
|
|
|