hydrus/include/ClientParsers.py

import bs4
import HydrusConstants as HC # assumed import for the HC.ShowText and HC.u calls used below
import lxml
import traceback
import urlparse

def Parse4chanPostScreen( html ):
    
    soup = bs4.BeautifulSoup( html )
    
    title_tag = soup.find( 'title' )
    
    # the page title tells us the overall result before we dig into the body
    if title_tag.string == 'Post successful!': return ( 'success', None )
    elif title_tag.string == '4chan - Banned':
        
        print( repr( soup ) )
        
        message = 'You are banned from this board! html written to log.'
        
        HC.ShowText( message )
        
        return ( 'big error', message )
        
    else:
        
        try:
            
            # 4chan reports specific problems in an element with id 'errmsg'
            problem_tag = soup.find( id = 'errmsg' )
            
            if problem_tag is None:
                
                try: print( repr( soup ) )
                except: pass
                
                message = 'Unknown problem; html written to log.'
                
                HC.ShowText( message )
                
                return ( 'error', message )
                
            
            problem = HC.u( problem_tag )
            
            if 'CAPTCHA' in problem: return ( 'captcha', None )
            elif 'seconds' in problem: return ( 'too quick', None )
            elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
            else: return ( 'error', problem )
            
        except: return ( 'error', 'unknown error' )
    
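
# A minimal usage sketch (not part of the original file) showing how a caller
# might branch on the ( status, info ) tuple returned above. The response_html
# name is a placeholder for whatever the 4chan post request returned.
#
#   ( status, info ) = Parse4chanPostScreen( response_html )
#   
#   if status == 'success': pass # post accepted
#   elif status == 'captcha': pass # fetch a new captcha and retry
#   elif status == 'too quick': pass # flood limit hit; wait before retrying
#   else: HC.ShowText( info ) # 'error' / 'big error' carry a readable message
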
def ParsePageForURLs( html, starting_url ):
    
    soup = bs4.BeautifulSoup( html )
    
    all_links = soup.find_all( 'a' )
    
    # keep only anchors that wrap at least one <img>, i.e. thumbnail-style links
    links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
    
    # resolve relative hrefs against the page they were found on
    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]
    
    # an older version also included the src of images that were not wrapped in a link
    
    return urls
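
# A minimal usage sketch (not part of the original file): fetching a gallery
# page and pulling out the linked image URLs. The urllib2 fetch is an assumption
# about how the caller obtains the html; only ParsePageForURLs comes from here.
#
#   import urllib2
#   
#   starting_url = 'http://example.com/gallery/page/1'
#   html = urllib2.urlopen( starting_url ).read()
#   
#   for url in ParsePageForURLs( html, starting_url ): print( url )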