hydrus/include/ClientParsers.py

import bs4
import HydrusConstants as HC # assumed import for the HC.ShowText and HC.u calls used below
import lxml
import traceback
import urlparse

def Parse4chanPostScreen( html ):
    
    soup = bs4.BeautifulSoup( html )
    
    title_tag = soup.find( 'title' )
    
    # the page title tells us the overall result before we dig into the body
    if title_tag.string == 'Post successful!': return ( 'success', None )
    elif title_tag.string == '4chan - Banned':
        
        print( repr( soup ) )
        
        message = 'You are banned from this board! html written to log.'
        
        HC.ShowText( message )
        
        return ( 'big error', message )
        
    else:
        
        try:
            
            # 4chan reports specific problems in an element with id 'errmsg'
            problem_tag = soup.find( id = 'errmsg' )
            
            if problem_tag is None:
                
                try: print( repr( soup ) )
                except: pass
                
                message = 'Unknown problem; html written to log.'
                
                HC.ShowText( message )
                
                return ( 'error', message )
                
            
            problem = HC.u( problem_tag )
            
            if 'CAPTCHA' in problem: return ( 'captcha', None )
            elif 'seconds' in problem: return ( 'too quick', None )
            elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
            else: return ( 'error', problem )
            
        except: return ( 'error', 'unknown error' )
    
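
# A minimal usage sketch (not part of the original file) showing how a caller
# might branch on the ( status, info ) tuple returned above. The response_html
# name is a placeholder for whatever the 4chan post request returned.
#
#   ( status, info ) = Parse4chanPostScreen( response_html )
#   
#   if status == 'success': pass # post accepted
#   elif status == 'captcha': pass # fetch a new captcha and retry
#   elif status == 'too quick': pass # flood limit hit; wait before retrying
#   else: HC.ShowText( info ) # 'error' / 'big error' carry a readable message
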
def ParsePageForURLs( html, starting_url ):
    
    soup = bs4.BeautifulSoup( html )
    
    all_links = soup.find_all( 'a' )
    
    # keep only anchors that wrap at least one <img>, i.e. thumbnail-style links
    links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
    
    # resolve relative hrefs against the page they were found on
    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]
    
    # an older version also included the src of images that were not wrapped in a link
    
    return urls
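
# A minimal usage sketch (not part of the original file): fetching a gallery
# page and pulling out the linked image URLs. The urllib2 fetch is an assumption
# about how the caller obtains the html; only ParsePageForURLs comes from here.
#
#   import urllib2
#   
#   starting_url = 'http://example.com/gallery/page/1'
#   html = urllib2.urlopen( starting_url ).read()
#   
#   for url in ParsePageForURLs( html, starting_url ): print( url )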