Section: extra
Priority: optional
Maintainer: Moritz Venn <moritz.venn@freaque.net>
-Depends: enigma2(>=2.6git20090615), twisted-web, python-codecs, python-xml
+Depends: enigma2(>=2.6git20090615), twisted-web, python-codecs, python-xml, python-html
Source: http://schwerkraft.elitedvb.net/scm/?group_id=11
# -*- coding: utf-8 -*-
-from re import sub
+from re import sub, finditer
-# Entities to be converted
-entities = (
- # ISO-8895-1 (most common)
- ("ä", u"ä"),
- ("ä", u"ä"),
- ("ü", u"ü"),
- ("ü", u"ü"),
- ("ö", u"ö"),
- ("ö", u"ö"),
- ("Ä", u"Ä"),
- ("Ä", u"Ä"),
- ("Ü", u"Ü"),
- ("Ü", u"Ü"),
- ("Ö", u"Ö"),
- ("Ö", u"Ö"),
- ("ß", u"ß"),
- ("ß", u"ß"),
-
- # Rarely used entities
- ("…", u"..."),
- ("–", u"-"),
- (" ", u" "),
- (""", u"\""),
- ("&", u"&"),
- ("'", u"'"),
- ("<", u"<"),
- (">", u">"),
-
- # Common entities
- ("<", u"<"),
- (">", u">"),
- (" ", u" "),
- ("&", u"&"),
- (""", u"\""),
- ("'", u"'"),
-)
+import htmlentitydefs
def strip_readable(html):
# Newlines are rendered as whitespace in html
try:
    # Python 2 location of the HTML entity table
    from htmlentitydefs import name2codepoint as _NAME2CODEPOINT
except ImportError:
    # Python 3 moved the same table to html.entities
    from html.entities import name2codepoint as _NAME2CODEPOINT

try:
    _unichr = unichr
except NameError:
    # Python 3: chr() already returns a unicode character
    _unichr = chr

# One pattern for all three reference forms so the text is scanned once:
#   group 1: hexadecimal value of "&#x...;"
#   group 2: decimal value of "&#...;"
#   group 3: entity name of "&name;"
_ENTITY_PATTERN = r'&(?:#[xX]([0-9A-Fa-f]+)|#([0-9]+)|([A-Za-z][A-Za-z0-9]*));'


def _decode_entity(match):
    """Return the character for one matched reference, or its text unchanged."""
    hex_value, dec_value, name = match.groups()
    try:
        if hex_value is not None:
            codepoint = int(hex_value, 16)
        elif dec_value is not None:
            codepoint = int(dec_value)
        else:
            codepoint = _NAME2CODEPOINT[name]
        return _unichr(codepoint)
    except (KeyError, ValueError, OverflowError):
        # Unknown entity name or code point that cannot be represented:
        # keep the original text instead of failing the whole conversion.
        return match.group(0)


def strip(html):
    """Remove tags from *html* and decode its character references.

    Tags are removed first, so a reference that decodes to "<" or ">"
    is kept as literal text.  Named ("&auml;"), decimal ("&#228;") and
    hexadecimal ("&#xe4;") references are decoded in a single pass, which
    means the output of one replacement is never decoded a second time
    ("&amp;lt;" becomes "&lt;", not "<").  References that are unknown or
    out of range are left as they are.

    Parameters:
        html: markup to clean up.

    Returns:
        The text with leading and trailing whitespace removed.
    """
    # Strip enclosed tags
    html = sub('<.*?>', '', html)

    # Convert html entities
    html = sub(_ENTITY_PATTERN, _decode_entity, html)

    # Return result with leading/trailing whitespace removed
    return html.strip()