code.vuplus.com Git - vuplus_dvbapp-plugin/blob - emailclient/src/TagStrip.py

   1 # -*- coding: utf-8 -*-
   2 from re import sub
   3
   4 # Entities to be converted
   5 entities = (
   6         # ISO-8895-1 (most common)
   7         ("&#228;", u"ä"),
   8         ("&auml;", u"ä"),
   9         ("&#252;", u"ü"),
  10         ("&uuml;", u"ü"),
  11         ("&#246;", u"ö"),
  12         ("&ouml;", u"ö"),
  13         ("&#196;", u"Ä"),
  14         ("&Auml;", u"Ä"),
  15         ("&#220;", u"Ü"),
  16         ("&Uuml;", u"Ü"),
  17         ("&#214;", u"Ö"),
  18         ("&Ouml;", u"Ö"),
  19         ("&#223;", u"ß"),
  20         ("&szlig;", u"ß"),
  21
  22         # Rarely used entities
  23         ("&#8230;", u"..."),
  24         ("&#8211;", u"-"),
  25         ("&#160;", u" "),
  26         ("&#34;", u"\""),
  27         ("&#38;", u"&"),
  28         ("&#39;", u"'"),
  29         ("&#60;", u"<"),
  30         ("&#62;", u">"),
  31
  32         # Common entities
  33         ("&lt;", u"<"),
  34         ("&gt;", u">"),
  35         ("&nbsp;", u" "),
  36         ("&amp;", u"&"),
  37         ("&quot;", u"\""),
  38         ("&apos;", u"'"),
  39 )
  40
  41 def strip_readable(html):
  42         # Newlines are rendered as whitespace in html
  43         html = html.replace('\n', ' ')
  44
  45         # Multiple whitespaces are rendered as a single one
  46         html = sub('\s\s+', ' ', html)
  47
  48         # Replace <br> by newlines
  49         html = sub('<br(\s+/)?>', '\n', html)
  50
  51         # Replace <p>, <ul>, <ol> and end of these tags by newline
  52         html = sub('</?(p|ul|ol)(\s+.*?)?>', '\n', html)
  53
  54         # Replace <li> by - and </li> by newline
  55         html = sub('<li(\s+.*?)?>', '-', html)
  56         html = html.replace('</li>', '\n')
  57
  58         # And 'normal' stripping
  59         return strip(html)
  60
  61 def strip(html):
  62         # Strip enclosed tags
  63         html = sub('<(.*?)>', '', html)
  64
  65         # Convert html entities
  66         for escaped, unescaped in entities:
  67                 html = html.replace(escaped, unescaped)
  68
  69         # Return result with leading/trailing whitespaces removed
  70         return html.strip()
  71