164 lines
5.0 KiB
Python
164 lines
5.0 KiB
Python
|
# Authors: John Dennis <jdennis@redhat.com>
|
||
|
#
|
||
|
# Copyright (C) 2007 Red Hat, Inc.
|
||
|
#
|
||
|
# This program is free software; you can redistribute it and/or modify
|
||
|
# it under the terms of the GNU General Public License as published by
|
||
|
# the Free Software Foundation; either version 2 of the License, or
|
||
|
# (at your option) any later version.
|
||
|
#
|
||
|
# This program is distributed in the hope that it will be useful,
|
||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
# GNU General Public License for more details.
|
||
|
#
|
||
|
# You should have received a copy of the GNU General Public License
|
||
|
# along with this program; if not, write to the Free Software
|
||
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||
|
#
|
||
|
|
||
|
|
||
|
# Names exported by a star-import of this module.
__all__ = [
    'escape_html',
    'unescape_html',
    'html_to_text',

    'html_document',
]
|
||
|
|
||
|
import htmllib
|
||
|
import formatter as Formatter
|
||
|
import string
|
||
|
from types import *
|
||
|
import StringIO
|
||
|
|
||
|
#------------------------------------------------------------------------------
|
||
|
|
||
|
class TextWriter(Formatter.DumbWriter):
    '''DumbWriter variant that writes flowing text behind a left margin.

    The margin is ``indent_level * indent_width`` spaces wide.  The level
    starts at 0 and is changed through new_margin().
    '''

    def __init__(self, file=None, maxcol=80, indent_width=4):
        '''Create the writer.

        file         -- object with a write() method, handed to DumbWriter.
        maxcol       -- column at which flowing text is wrapped.
        indent_width -- number of spaces added per indent level.
        '''
        Formatter.DumbWriter.__init__(self, file, maxcol)
        self.indent_width = indent_width
        self.indent_level = 0
        self._set_indent()

    def _set_indent(self):
        '''Recompute the cached margin width and margin string.'''
        width = self.indent_level * self.indent_width
        self.indent_col = width
        self.indent = ' ' * width

    def new_margin(self, margin, level):
        '''Adopt ``level`` as the new indent level; ``margin`` is ignored.'''
        self.indent_level = level
        self._set_indent()

    def send_label_data(self, data):
        '''Write a label followed by one space, right-aligned in the margin.

        A label that is wider than the margin is written unpadded.
        '''
        label = data + ' '
        # rjust() leaves the string untouched when it is already at least
        # indent_col characters long, which covers the "too wide" case.
        self.send_literal_data(label.rjust(self.indent_col))

    def send_flowing_data(self, data):
        '''Write ``data`` word by word, wrapping and indenting as needed.

        The margin is written whenever output starts at column 0 and after
        every line break inserted here.  A line is broken before a word
        that would reach or pass ``maxcol``.
        '''
        if not data:
            return

        whitespace = string.whitespace
        # A separator is due if the previous chunk ended on whitespace or
        # this chunk starts with whitespace.
        need_separator = self.atbreak or data[0] in whitespace
        emit = self.file.write
        limit = self.maxcol
        margin = self.indent
        margin_col = self.indent_col

        column = self.col
        if column == 0:
            emit(margin)
            column = margin_col

        for word in data.split():
            width = len(word)
            if need_separator:
                if column + width >= limit:
                    emit('\n' + margin)
                    column = margin_col
                else:
                    emit(' ')
                    column += 1
            emit(word)
            column += width
            # Every word after the first one is preceded by a separator.
            need_separator = True

        self.col = column
        self.atbreak = data[-1] in whitespace
|
||
|
|
||
|
class HTMLParserAnchor(htmllib.HTMLParser):
|
||
|
|
||
|
def __init__(self, formatter, verbose=0):
|
||
|
htmllib.HTMLParser.__init__(self, formatter, verbose)
|
||
|
|
||
|
def anchor_bgn(self, href, name, type):
|
||
|
self.anchor = href
|
||
|
|
||
|
def anchor_end(self):
|
||
|
if self.anchor:
|
||
|
self.handle_data(' (%s) ' % self.anchor)
|
||
|
self.anchor = None
|
||
|
|
||
|
#------------------------------------------------------------------------------
|
||
|
|
||
|
def escape_html(s):
    '''Return ``s`` with the HTML special characters replaced by entities.

    The characters & < > ' " become &amp; &lt; &gt; &apos; &quot;.
    None is passed through unchanged so callers need not special-case it.
    '''
    if s is None:
        return None
    # The ampersand must be handled first; otherwise the '&' introduced by
    # the entities below would itself be escaped again.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("'", "&apos;")
    s = s.replace('"', "&quot;")
    return s
|
||
|
|
||
|
|
||
|
def unescape_html(s):
    '''Return ``s`` with the entities written by escape_html() decoded.

    The entities &lt; &gt; &apos; &quot; &amp; become < > ' " &.
    None is passed through unchanged.
    '''
    if s is None:
        return None
    if '&' not in s:
        # No entity can be present; skip the replacement passes.
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    # The ampersand must be handled last; decoding it earlier would turn an
    # escaped entity such as "&amp;lt;" into "&lt;" and then wrongly into "<".
    s = s.replace("&amp;", "&")
    return s
|
||
|
|
||
|
def html_to_text(html, maxcol=80):
    '''Render an HTML string as plain text.

    html   -- the HTML markup to convert.
    maxcol -- column at which the TextWriter wraps flowing text.

    Returns the rendered text, or None if the conversion raised an
    exception (the failure is logged, not propagated).
    '''
    # log_program, used here originally, is neither defined nor imported in
    # this module, so the error path would itself have raised NameError.
    # The standard logging module is used instead.
    import logging

    out = StringIO.StringIO()
    try:
        formatter = Formatter.AbstractFormatter(TextWriter(out, maxcol))
        parser = HTMLParserAnchor(formatter)
        parser.feed(html)
        parser.close()
        return out.getvalue()
    except Exception as e:
        # Conversion is best-effort: report the problem and signal failure
        # with None instead of letting the exception escape.
        logging.getLogger(__name__).error('cannot convert html to text: %s', e)
        return None
    finally:
        # Release the buffer on the success path and the failure path alike.
        out.close()
|
||
|
|
||
|
def html_document(*body_components):
    '''Wrap the body components in an HTML document structure with a valid header.

    Accepts a variable number of arguments, each of which can be:
      * a string
      * a sequence of strings (tuple or list, including subclasses)
      * a callable object taking no parameters and returning a string
        or a sequence of strings

    Returns the complete document as a single string.
    '''
    head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n'
    tail = '\n </body>\n</html>'

    pieces = [head]

    for component in body_components:
        is_sequence = isinstance(component, (tuple, list))
        if not is_sequence and callable(component):
            # Resolve the callable first; its result is then handled like a
            # directly supplied string or sequence.
            component = component()
            is_sequence = isinstance(component, (tuple, list))

        if is_sequence:
            pieces.extend(component)
        else:
            # Plain strings end up here.  No separate string test is needed
            # because strings are neither tuples/lists nor callable.
            pieces.append(component)

    pieces.append(tail)
    return ''.join(pieces)
|