1 # -*- coding: utf-8 -*-
4 # Entities to be converted
6 # ISO-8859-1 (most common)
22 # Rarely used entities
41 def strip_readable(html):
42 # Newlines are rendered as whitespace in html
43 html = html.replace('\n', ' ')
45 # Multiple whitespaces are rendered as a single one
46 html = sub('\s\s+', ' ', html)
48 # Replace <br> by newlines
49 html = sub('<br(\s+/)?>', '\n', html)
51 # Replace <p>, <ul>, <ol> and end of these tags by newline
52 html = sub('</?(p|ul|ol)(\s+.*?)?>', '\n', html)
54 # Replace <li> by - and </li> by newline
55 html = sub('<li(\s+.*?)?>', '-', html)
56 html = html.replace('</li>', '\n')
58 # And 'normal' stripping
63 html = sub('<(.*?)>', '', html)
65 # Convert html entities
66 for escaped, unescaped in entities:
67 html = html.replace(escaped, unescaped)
69 # Return result with leading/trailing whitespaces removed