hydrus/hydrus/core/HydrusDocumentHandling.py

#import PyPDF2
import re

def GetNumWordsFromString( s ):
    """Return the number of whitespace-separated words in the string s.
    
    Runs of any whitespace (spaces, tabs, newlines) count as a single
    separator, and leading/trailing whitespace is ignored, so an empty or
    whitespace-only string yields 0.
    """
    
    # split() with no separator collapses runs of whitespace and never produces
    # empty entries, unlike split( ' ' ), which counted '' as one word and
    # counted phantom words for leading/trailing whitespace
    words = s.split()
    
    return len( words )
    
def GetPDFNumWords( path ):
    """Return the word count for the PDF at path, or None if it is not available.
    
    PDF word counting is currently disabled, so this always returns None and
    path is not read.
    """
    
    # I discovered a pdf that pulled this into an infinite loop due to malformed header.
    # This gives bunk data anyway, so let's just cut it out until we have a better solution here all around
    
    # For reference, the disabled implementation opened the file with
    # PyPDF2.PdfFileReader( f, strict = False ) and estimated numPages * 350,
    # because extractText() gave kooky and unreliable results. On any error it
    # fell back to 0.
    
    return None
Version 329 2018-11-07 23:09:40 +00:00			`#import PyPDF2`
update to version 61 2013-03-15 02:38:12 +00:00			`import re`

			`def GetNumWordsFromString( s ):`

Version 363 2019-08-07 22:59:53 +00:00			`s = re.sub( r'[\s]+', ' ', s ) # turns multiple spaces into single spaces`
update to version 61 2013-03-15 02:38:12 +00:00
			`num_words = len( s.split( ' ' ) )`

			`return num_words`

Version 80 2013-08-07 22:25:18 +00:00			`def GetPDFNumWords( path ):`
update to version 61 2013-03-15 02:38:12 +00:00
Version 329 2018-11-07 23:09:40 +00:00			`# I discovered a pdf that pulled this into an infinite loop due to malformed header.`
			`# This gives bunk data anyway, so let's just cut it out until we have a better solution here all around`

			`return None`

update to version 61 2013-03-15 02:38:12 +00:00			`try:`

Version 329 2018-11-07 23:09:40 +00:00			`pass`
			`'''`
Version 81 2013-08-14 20:21:49 +00:00			`with open( path, 'rb' ) as f:`
Version 80 2013-08-07 22:25:18 +00:00
			`pdf_object = PyPDF2.PdfFileReader( f, strict = False )`

			`# get.extractText() gives kooky and unreliable results`
			`# num_words = sum( [ GetNumWordsFromString( page.extractText() ) for page in pdf_object.pages ] )`

			`# so let's just estimate`

			`return pdf_object.numPages * 350`

Version 329 2018-11-07 23:09:40 +00:00			`'''`
Version 298 2018-03-14 21:01:02 +00:00			`except:`

			`num_words = 0`

update to version 61 2013-03-15 02:38:12 +00:00
			`return num_words`
Version 298 2018-03-14 21:01:02 +00:00