CLEANUP: html umlaut handling
[vuplus_dvbapp-plugin] / fritzcall / src / nrzuname.py
1 #!/usr/bin/python
2 # -*- coding: UTF-8 -*-
3 # $Id$
4 # $Author$
5 # $Revision$
6 # $Date$
7
8 import re, sys, os
9 from xml.dom.minidom import parse
10 from twisted.web.client import getPage #@UnresolvedImport
11 from twisted.internet import reactor #@UnresolvedImport
12
# Module-wide switch consulted by debug(): messages are printed while it is true.
debugSetting = True
def setDebug(what):
        """Set the module-wide debug switch.

        what -- new value of debugSetting; debug() prints only while it is true.
        """
        global debugSetting
        debugSetting = what
17
def debug(str):
        """Print a debug message to stdout if debugging is switched on.

        str -- the message to print; nothing happens while the module-wide
               debugSetting flag is false.
        """
        if not debugSetting:
                return
        # Parenthesised single-argument form prints exactly like "print str".
        print(str)
21
22 import htmlentitydefs
# Matches one HTML character reference: hexadecimal (&#xE4;), decimal (&#228;)
# or named (&auml;). Group 1 is the text between '&' and ';'.
_htmlEntityMask = re.compile(r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')

def _htmlEntity2unichr(match):
        """Return the character for one matched entity, or the entity unchanged."""
        entity = match.group(0)
        body = match.group(1)
        try:
                if body[:2] in ('#x', '#X'):
                        codepoint = int(body[2:], 16)
                elif body[0] == '#':
                        codepoint = int(body[1:])
                else:
                        codepoint = htmlentitydefs.name2codepoint[body]
                character = unichr(codepoint)
        except (KeyError, ValueError, OverflowError):
                # Unknown entity name or a code point unichr() cannot represent:
                # keep the original text instead of aborting the whole conversion.
                debug("[Callhtml2utf8] cannot convert " + repr(entity))
                return entity
        # repr() keeps the message ASCII, so printing it can never raise a
        # UnicodeEncodeError on a stdout that cannot show the character.
        debug("[Callhtml2utf8] replace %s with %s" %(repr(entity), repr(character)))
        return character

def html2unicode(in_html):
        """Replace HTML character references in in_html by the characters they name.

        in_html -- text to convert. A byte string is decoded as ISO-8859-1
                   first; a unicode object is used as it is.

        Returns a unicode object in which hexadecimal (&#xE4;), decimal
        (&#228;) and named (&auml;) references are replaced by the
        corresponding character. References that cannot be resolved (unknown
        name, code point out of range) are left untouched.

        All references are replaced in one pass over the text, so the result of
        one replacement is never interpreted as part of another reference
        (e.g. "&amp;auml;" becomes "&auml;", not an umlaut).
        """
        if isinstance(in_html, str):
                # ISO-8859-1 maps every byte to a character, so this cannot fail.
                in_html = in_html.decode('iso-8859-1')
                debug("[Callhtml2utf8] Converted from latin1")
        return _htmlEntityMask.sub(_htmlEntity2unichr, in_html)
69
def out(number, caller):
        """Split a caller description into its parts and print them on one line.

        number -- the phone number; only used for the debug message.
        caller -- text of the form "<name>, <street>, <zipcode> <city>".
                  Nothing is printed if it is empty.

        Prints "NA: ..;VN: ..;STR: ..;HNR: ..;PLZ: ..;ORT: .." (name, first
        name, street, house number, zip code, city). Parts missing from
        caller are printed as empty strings.
        """
        debug("[out] %s: %s" %(number, caller))
        if not caller:
                return
        name = vorname = strasse = hnr = plz = ort = ""
        lines = caller.split(', ')

        # First part: the first word is the name, the rest the first name.
        found = re.match(r"(.+?)\s+(.+)", lines[0])
        if found:
                name = found.group(1)
                vorname = found.group(2)
        else:
                name = lines[0]

        # Second part: street with the house number either trailing or leading.
        if len(lines) > 1:
                found = re.match(r"^(.+) ([-\d]+)$", lines[1], re.S)
                if found:
                        strasse = found.group(1)
                        hnr = found.group(2)
                else:
                        found = re.match(r"^(\d+) (.+)$", lines[1], re.S)
                        if found:
                                strasse = found.group(2)
                                hnr = found.group(1)
                        else:
                                strasse = lines[1]

        # Remaining parts: the first one looking like "<zipcode> <city>" wins;
        # if none does, the first remaining part is taken as the city.
        remaining = lines[2:]
        for line in remaining:
                found = re.match(r"(\S+)\s+(.+)", line, re.S)
                if found:
                        plz = found.group(1)
                        ort = found.group(2)
                        break
        else:
                if remaining:
                        ort = remaining[0].strip()
        print("NA: %s;VN: %s;STR: %s;HNR: %s;PLZ: %s;ORT: %s" %( name,vorname,strasse,hnr,plz,ort ))
105
def simpleout(number, caller):
        """Print the caller text as it is.

        number -- accepted so the signature matches out(); not used.
        caller -- the text to print.
        """
        print(caller)
108
# Path of the XML file with the reverse lookup website definitions: resolved
# through Tools.Directories when that module can be imported, otherwise the
# relative file name "reverselookup.xml".
try:
        from Tools.Directories import resolveFilename, SCOPE_PLUGINS
        reverseLookupFileName = resolveFilename(SCOPE_PLUGINS, "Extensions/FritzCall/reverselookup.xml")
except ImportError:
        reverseLookupFileName = "reverselookup.xml"

# Cache filled by ReverseLookupAndNotifier: country code (e.g. "0049") ->
# list of <website> elements of that country.
countries = { }
# Modification time of the XML file when it was last parsed; 0 = not parsed yet.
reverselookupMtime = 0
117
class ReverseLookupAndNotifier:
        """Look up the name and address for a phone number on reverse lookup websites.

        The websites of each country and the regular expressions used to pick
        name, street, city and zip code out of their result pages are read from
        the XML file named by reverseLookupFileName. The websites of the
        number's country are queried one after the other with
        twisted.web.client.getPage until one of them yields a complete entry.
        The outcome is handed to outputFunction(number, caller), where caller
        is "<name>, <street>, <zipcode> <city>" or "" if nothing was found.
        """

        def __init__(self, number, outputFunction=out, charset="cp1252", countrycode = "0049"):
                """Start the reverse lookup for number.

                number         -- phone number as a string of digits; it must
                                  start with "0", otherwise the lookup is
                                  reported as unsuccessful right away.
                outputFunction -- callable(number, caller) receiving the result.
                charset        -- encoding applied to the caller text before it
                                  is passed to outputFunction.
                countrycode    -- country code (with leading "00") assumed for
                                  numbers that do not start with "00".
                """
                global reverselookupMtime

                debug("[ReverseLookupAndNotifier] reverse Lookup for %s!" %number)
                self.number = number
                self.outputFunction = outputFunction
                self.caller = ""
                self.currentWebsite = None
                self.nextWebsiteNo = 0
                self.websites = []
                # Taking the charset from sys.getdefaultencoding() was tried and did
                # not work, so it always has to be passed in.
                self.charset = charset
                self.countrycode = countrycode

                # (Re-)read the website definitions if they have not been read yet
                # or the file changed since; index 8 of the stat result is st_mtime.
                reverselookupMtimeAct = os.stat(reverseLookupFileName)[8]
                if not countries or reverselookupMtimeAct > reverselookupMtime:
                        debug("[ReverseLookupAndNotifier] (Re-)Reading %s\n" %reverseLookupFileName)
                        reverselookupMtime = reverselookupMtimeAct
                        dom = parse(reverseLookupFileName)
                        for top in dom.getElementsByTagName("reverselookup"):
                                for country in top.getElementsByTagName("country"):
                                        code = country.getAttribute("code").replace("+","00")
                                        countries[code] = country.getElementsByTagName("website")

                # An empty number is treated like any other number without a leading "0".
                if not number or number[0] != "0":
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return

                if self.number[:2] == "00":
                        # International number: the country code is "00" followed by
                        # one to three digits (e.g. "001" for the USA).
                        for length in (3, 4, 5):
                                if self.number[:length] in countries:
                                        self.countrycode = self.number[:length]
                                        break
                        else:
                                debug("[ReverseLookupAndNotifier] Country cannot be reverse handled")
                                # self.caller = _("UNKNOWN")
                                self.notifyAndReset()
                                return

                # A country without any website is handled like an unknown country.
                websites = countries.get(self.countrycode)
                if websites:
                        debug("[ReverseLookupAndNotifier] Found website for reverse lookup")
                        self.websites = websites
                        self.nextWebsiteNo = 1
                        self.handleWebsite(self.websites[0])
                else:
                        debug("[ReverseLookupAndNotifier] Country cannot be reverse handled")
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return

        def handleWebsite(self, website):
                """Build the query URL for website and request the page.

                website -- <website> element with the attributes "name", "url" and
                           "prefix" and optionally "areacode"/"pfxareacode".

                The placeholders $NUMBER, $AREACODE and $PFXAREACODE in the url are
                replaced by (parts of) the number. The page is delivered to
                _gotPage, failures to _gotError.
                """
                debug("[ReverseLookupAndNotifier] handleWebsite: " + website.getAttribute("name"))
                if self.number[:2] == "00":
                        # Strip only the leading country code; the same digits may
                        # occur again further back in the number.
                        number = website.getAttribute("prefix") + self.number.replace(self.countrycode, "", 1)
                else:
                        number = self.number

                # The placeholders are substituted with str.replace, not with
                # %-formatting, so a literal "%" in the url (e.g. "%20") is harmless.
                url = website.getAttribute("url")
                if "$AREACODE" in url and website.hasAttribute("areacode"):
                        areaCodeLen = int(website.getAttribute("areacode"))
                        url = url.replace("$AREACODE", number[:areaCodeLen]).replace("$NUMBER", number[areaCodeLen:])
                elif "$PFXAREACODE" in url and website.hasAttribute("pfxareacode"):
                        areaCodeLen = int(website.getAttribute("pfxareacode"))
                        url = url.replace("$PFXAREACODE", number[:areaCodeLen]).replace("$NUMBER", number[areaCodeLen:])
                elif "$AREACODE" in url or "$PFXAREACODE" in url:
                        # The length of the area code is unknown, so the placeholder
                        # cannot be filled in. Move on to the next website rather
                        # than requesting a url with an unexpanded placeholder.
                        debug("[ReverseLookupAndNotifier] handleWebsite: (PFX)ARECODE cannot be handled")
                        self._gotError("[ReverseLookupAndNotifier] handleWebsite: skipping %s" %website.getAttribute("name"))
                        return
                elif "$NUMBER" in url:
                        url = url.replace("$NUMBER", number)
                else:
                        debug("[ReverseLookupAndNotifier] handleWebsite: cannot handle websites with no $NUMBER in url")
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return
                debug("[ReverseLookupAndNotifier] Url to query: " + url)
                url = url.encode("UTF-8", "replace")
                self.currentWebsite = website
                getPage(url,
                        agent="Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5"
                        ).addCallback(self._gotPage).addErrback(self._gotError)

        def _gotPage(self, page):
                """Extract name and address from the page of the current website.

                page -- the raw page as delivered by getPage.

                Each <entry> of the current website supplies four patterns (name,
                street, city, zipcode); the first entry whose patterns all match
                provides the result, which is reported through notifyAndReset.
                Returns True in that case. If no entry matches, _gotError is called
                to try the next website.
                """
                debug("[ReverseLookupAndNotifier] _gotPage")
                found = re.match('.*content=".*?charset=([^"]+)"',page,re.S)
                if found:
                        charset = found.group(1)
                        debug("[ReverseLookupAndNotifier] Charset: " + charset)
                else:
                        charset = "ISO-8859-1"
                try:
                        page = page.decode(charset, "replace")
                except LookupError:
                        # The page announces a charset Python does not know.
                        page = page.decode("ISO-8859-1", "replace")
                # Replace no-break spaces only after decoding: in a multi-byte
                # encoding the byte 0xA0 may be part of another character.
                page = page.replace(u"\xa0", u" ")

                for entry in self.currentWebsite.getElementsByTagName("entry"):
                        # debug("[ReverseLookupAndNotifier] _gotPage: try entry")
                        details = []
                        for what in ["name", "street", "city", "zipcode"]:
                                pat = ".*?" + self.getPattern(entry, what)
                                debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( what, pat ))
                                found = re.match(pat, page, re.S|re.M)
                                if not found:
                                        break
                                debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( what, repr(found.group(1)) ))
                                # Commas are removed because ", " separates the parts
                                # of the caller text.
                                item = found.group(1).replace("&nbsp;"," ").replace("</b>","").replace(","," ")
                                item = html2unicode(item)
                                # Collapse runs of blanks into a single blank.
                                item = re.sub(" +", " ", item)
                                debug("[ReverseLookupAndNotifier] _gotPage: add to details: " + repr(item))
                                details.append(item.strip())

                        if len(details) != 4:
                                # This entry did not match completely, try the next one.
                                continue

                        name = details[0]
                        address =  details[1] + ", " + details[3] + " " + details[2]
                        debug("[ReverseLookupAndNotifier] _gotPage: Reverse lookup succeeded:\nName: %s\nAddress: %s" %(name, address))
                        self.caller = "%s, %s" %(name, address)
                        # if self.number != 0 and config.plugins.Call.addcallers.value and self.event == "RING":
                                # phonebook.add(self.number, self.caller)

                        self.notifyAndReset()
                        return True
                else:
                        self._gotError("[ReverseLookupAndNotifier] _gotPage: Nothing found at %s" %self.currentWebsite.getAttribute("name"))

        def _gotError(self, error = ""):
                """Try the next website, or report failure if none is left.

                error -- description of what went wrong; only used for the debug
                         message.
                """
                debug("[ReverseLookupAndNotifier] _gotError - Error: %s" %error)
                if self.nextWebsiteNo >= len(self.websites):
                        debug("[ReverseLookupAndNotifier] _gotError: I give up")
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return
                else:
                        debug("[ReverseLookupAndNotifier] _gotError: try next website")
                        self.nextWebsiteNo = self.nextWebsiteNo+1
                        self.handleWebsite(self.websites[self.nextWebsiteNo-1])

        def getPattern(self, website, which):
                """Return the text of the first <which> child element of website.

                website -- element searched for the pattern (_gotPage passes an
                           <entry> element).
                which   -- tag name of the pattern, e.g. "name" or "street".
                """
                pat1 = website.getElementsByTagName(which)
                if len(pat1) > 1:
                        debug("[ReverseLookupAndNotifier] getPattern: Something strange: more than one %s for website %s" %(which, website.getAttribute("name")))
                return pat1[0].childNodes[0].data

        def notifyAndReset(self):
                """Hand the result to the output function.

                The caller text is encoded in self.charset first; if that fails it
                is passed on unencoded. An unsuccessful lookup is reported with an
                empty caller. When the module runs as a script, the reactor is
                stopped afterwards.
                """
                debug("[ReverseLookupAndNotifier] notifyAndReset: Number: " + self.number + "; Caller: " + self.caller)
                # debug("1: " + repr(self.caller))
                if self.caller:
                        try:
                                # debug("2: " + repr(self.caller))
                                self.caller = self.caller.encode(self.charset)
                                # debug("3: " + repr(self.caller))
                        except (UnicodeError, LookupError, TypeError):
                                # Character not available in the charset, or the
                                # charset itself is unknown or not a string.
                                debug("[ReverseLookupAndNotifier] cannot encode?!?!")
                        # self.caller = unicode(self.caller)
                        # debug("4: " + repr(self.caller))
                        self.outputFunction(self.number, self.caller)
                else:
                        self.outputFunction(self.number, "")
                if __name__ == '__main__':
                        # This may run from the constructor, i.e. before reactor.run():
                        # callWhenRunning stops a running reactor immediately and a
                        # not yet started one as soon as it has started, whereas
                        # reactor.stop() fails if the reactor is not running.
                        reactor.callWhenRunning(reactor.stop) #@UndefinedVariable

296
# Command line use:
#   nrzuname.py <number>            verbose lookup, result printed field by field
#   nrzuname.py <number> <charset>  quiet lookup, result printed as plain text
if __name__ == '__main__':
        cwd = os.path.dirname(sys.argv[0])
        arguments = sys.argv[1:]
        if len(arguments) == 1:
                # only the number was given
                ReverseLookupAndNotifier(arguments[0])
                reactor.run() #@UndefinedVariable
        elif len(arguments) == 2:
                # number and charset were given: no debug output
                setDebug(False)
                ReverseLookupAndNotifier(arguments[0], simpleout, arguments[1])
                reactor.run() #@UndefinedVariable