IMDb: improve entity converter (match hex entities with a separate regex)

author Andreas Frisch <andreas.frisch@multimedia-labs.de>

Tue, 21 Apr 2009 15:16:04 +0000 (15:16 +0000)

committer Andreas Frisch <andreas.frisch@multimedia-labs.de>

Tue, 21 Apr 2009 15:16:04 +0000 (15:16 +0000)
author Andreas Frisch <andreas.frisch@multimedia-labs.de>
Tue, 21 Apr 2009 15:16:04 +0000 (15:16 +0000)
committer Andreas Frisch <andreas.frisch@multimedia-labs.de>
Tue, 21 Apr 2009 15:16:04 +0000 (15:16 +0000)
diff --git a/imdb/src/plugin.py b/imdb/src/plugin.py

index 7a58c3c..f5255da 100755 (executable)
--- a/imdb/src/plugin.py
+++ b/imdb/src/plugin.py
@@ -335,23 +335,27 @@ class IMDB(Screen):
  
         def html2utf8(self,in_html):
                 htmlentitynumbermask = re.compile('(&#(\d{1,5}?);)')
-               htmlentitynamemask = re.compile('(&(\D{1,5}?);)')
-               entities = htmlentitynamemask.finditer(in_html)
+               htmlentityhexmask = re.compile('(&#x([0-9A-Fa-f]{2,2}?);)')
+               htmlentitynamemask = re.compile('(&([^#]\D{1,5}?);)')
                 entitydict = {}
+               entityhexdict = {}
+               entities = htmlentitynamemask.finditer(in_html)
  
                 for x in entities:
                         entitydict[x.group(1)] = x.group(2)
                 for key, name in entitydict.items():
-                       if key[0:3] == "&#x":
-                               try:
-                                       entitydict[key] = "%d" % int(key[3:5], 16)
-                               except:
-                                       print "[IMDb] html2utf8 entity hex->dec conversion error"
-                       else:
-                               entitydict[key] = htmlentitydefs.name2codepoint[name]
-               entities = htmlentitynumbermask.finditer(in_html)
+                       entitydict[key] = htmlentitydefs.name2codepoint[name]
+               entities = htmlentityhexmask.finditer(in_html)
  
                 for x in entities:
+                       entityhexdict[x.group(1)] = x.group(2)
+
+               for key, name in entityhexdict.items():
+                       entitydict[key] = "%d" % int(key[3:5], 16)
+                       print "key:", key, "before:", name, "after:", entitydict[key]
+               
+               entities = htmlentitynumbermask.finditer(in_html)
+               for x in entities:
                         entitydict[x.group(1)] = x.group(2)
                 for key, codepoint in entitydict.items():
                         in_html = in_html.replace(key, (unichr(int(codepoint)).encode('latin-1')))
author	Andreas Frisch <andreas.frisch@multimedia-labs.de>
	Tue, 21 Apr 2009 15:16:04 +0000 (15:16 +0000)
committer	Andreas Frisch <andreas.frisch@multimedia-labs.de>
	Tue, 21 Apr 2009 15:16:04 +0000 (15:16 +0000)