hydrus/hydrus/core/HydrusDocumentHandling.py

42 lines
1.0 KiB
Python
Raw Normal View History

2018-11-07 23:09:40 +00:00
#import PyPDF2
2013-03-15 02:38:12 +00:00
import re
def GetNumWordsFromString( s ):
    """Return the number of whitespace-separated words in the string s.

    Runs of any whitespace (spaces, tabs, newlines) count as a single
    separator, and leading/trailing whitespace is ignored, so an empty or
    whitespace-only string contains zero words.
    """
    
    # str.split() with no separator splits on runs of whitespace and drops
    # empty tokens. Splitting on a literal ' ' instead reports one word for
    # '' and counts phantom empty words around padded text.
    words = s.split()
    
    return len( words )
2013-08-07 22:25:18 +00:00
def GetPDFNumWords( path ):
    """Return the word count of the PDF at path, or None when unavailable.

    PDF word counting is currently disabled, so this always returns None and
    never opens the file; path is accepted only to keep the call signature
    stable for existing callers.
    """
    
    # I discovered a pdf that pulled this into an infinite loop due to malformed header.
    # This gives bunk data anyway, so let's just cut it out until we have a better solution here all around
    #
    # The disabled implementation opened the file with
    # PyPDF2.PdfFileReader( f, strict = False ) and estimated the count as
    # pdf_object.numPages * 350, because extractText() gave kooky and
    # unreliable results.
    return None
2018-03-14 21:01:02 +00:00