hydrus/hydrus/core/HydrusDocumentHandling.py

#import PyPDF2
import re

def GetNumWordsFromString( s ):

    """
    Return the number of whitespace-separated words in s.

    Any run of whitespace (spaces, tabs, newlines, ...) counts as a single separator, and
    leading/trailing whitespace is ignored, so empty or whitespace-only text has zero words.
    """

    # split() with no separator collapses whitespace runs and never yields empty tokens.
    # Splitting on a literal ' ' counted '' as one word and counted phantom empty words
    # for any leading or trailing whitespace.
    words = s.split()

    return len( words )

def GetPDFNumWords( path ):

    """
    Return the number of words in the PDF at path, or None when no count is available.

    PDF word counting is currently disabled: this always returns None and never opens path.
    """

    # I discovered a pdf that pulled the old PyPDF2-based parse into an infinite loop due to a malformed header.
    # That parse gave bunk data anyway (extractText was kooky and unreliable, so the count was only ever
    # estimated as numPages * 350), so let's just cut it out until we have a better solution here all around.
    # The unreachable try/except that used to follow this return has been removed.

    return None