FIX: problems with umlauts in reverse search
[vuplus_dvbapp-plugin] / fritzcall / src / nrzuname.py
1 #!/usr/bin/python
2 # -*- coding: UTF-8 -*-
3 '''
4 $Id$
5 $Author$
6 $Revision$
7 $Date$
8 '''
9
10 import re, sys, os
11 from xml.dom.minidom import parse
12 from twisted.web.client import getPage #@UnresolvedImport
13 from twisted.internet import reactor #@UnresolvedImport
14
# Inside the FritzCall package the plugin's own debug() is used and
# setDebug() is a no-op. When this file is run as a stand-alone script the
# relative import fails and a simple print based debug() takes over, which
# setDebug() can switch on and off.
try:
    from . import debug #@UnresolvedImport # pylint: disable-msg=W0613,F0401
    def setDebug(what): # pylint: disable-msg=W0613
        '''Do nothing; debugging is controlled by the enclosing package.'''
        pass
except (ValueError, ImportError):
    # ValueError: relative import attempted outside of a package;
    # ImportError: the package does not provide debug (newer Python versions
    # also report the non-package case this way).
    debugVal = True
    def setDebug(what):
        '''Enable (true value) or disable (false value) the fallback debug() output.'''
        global debugVal
        debugVal = what
    def debug(message):
        '''Print message to stdout while debugging is enabled.'''
        if debugVal:
            print(message)
27
28 import htmlentitydefs
def html2unicode(in_html, charset):
    '''
    Replace the HTML character references in in_html by the characters
    they stand for.

    Named (&auml;), decimal (&#228;) and hexadecimal (&#xE4;) references
    are handled in one pass. A reference that cannot be resolved is left
    untouched and reported through debug().

    in_html -- the text to convert
    charset -- only used if in_html is a byte string: the replacement
               characters are then encoded with this charset
    Returns the converted text; it has the same string type as in_html.
    '''
    # For unicode input the character itself has to be inserted. Inserting an
    # encoded byte string made Python 2 decode it as ASCII, which failed for
    # every umlaut and left the reference unconverted.
    asBytes = not isinstance(in_html, unicode)

    def resolve(match):
        reference = match.group(0)
        body = match.group(1)
        try:
            if body[:2] in ('#x', '#X'):
                codepoint = int(body[2:], 16)
            elif body[:1] == '#':
                codepoint = int(body[1:])
            else:
                codepoint = htmlentitydefs.name2codepoint[str(body)]
            char = unichr(codepoint)
            if asBytes:
                char = char.encode(charset)
        except KeyError:
            debug("[Callhtml2utf8] KeyError " + reference + "/" + body)
            return reference
        except (ValueError, OverflowError):
            # code point outside of the unichr() range or not representable
            # in charset (UnicodeEncodeError is a ValueError)
            debug("[nrzuname] html2utf8: ValueError " + reference + "/" + body)
            return reference
        debug("[nrzuname] html2utf8: replace %s with %s" %(repr(reference), str(codepoint)))
        return char

    return re.sub(r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);', resolve, in_html)
76
def normalizePhoneNumber(intNo):
    '''
    Reduce a phone number as written for humans to its digits.

    Whitespace, parentheses, slashes and hyphens are removed, a leading
    '+' becomes '00', and the first run of digits in the remaining text is
    returned.

    intNo -- the number as text, e.g. "+49 (30) 1234-567"
    Returns the digits as a string, or '0' if intNo contains no digit.
    '''
    # Remove the separators first: otherwise a '+' behind leading blanks or an
    # opening parenthesis is not recognised as the international prefix, and a
    # line break inside the number cuts off everything after it.
    intNo = re.sub(r'[\s()/-]', '', intNo)
    if intNo.startswith('+'):
        intNo = '00' + intNo[1:]
    found = re.search('[0-9]+', intNo)
    if found:
        return found.group(0)
    return '0'
87
def out(number, caller):
    '''
    Print the caller in readable form: "name first name, street no, zip city".

    number -- the number that was looked up; only used for the debug output
    caller -- string of the form
              "NA: ..;VN: ..;STR: ..;HNR: ..;PLZ: ..;ORT: .."
    Nothing is printed if caller does not have this form.
    '''
    debug("[nrzuname] out: %s: %s" %(number, caller))
    fields = re.match("NA: ([^;]*);VN: ([^;]*);STR: ([^;]*);HNR: ([^;]*);PLZ: ([^;]*);ORT: ([^;]*)", caller)
    if fields is None:
        return
    name, vorname, strasse, hnr, plz, ort = fields.groups()

    hasStreetPart = bool(strasse or hnr)
    hasTownPart = bool(plz or ort)

    pieces = [name]
    if vorname:
        pieces.append(' ' + vorname)
    if hasStreetPart or hasTownPart:
        # separates the name from the address
        pieces.append(', ')
    if strasse:
        pieces.append(strasse)
    if hnr:
        pieces.append(' ' + hnr)
    if hasStreetPart and hasTownPart:
        # separates the street from the town
        pieces.append(', ')
    if plz and ort:
        pieces.append(plz + ' ' + ort)
    else:
        # at most one of the two is set here
        pieces.append(plz or ort)

    print(''.join(pieces))
118
def simpleout(number, caller): #@UnusedVariable # pylint: disable-msg=W0613
    '''
    Print caller unchanged; number is ignored.

    Output function for ReverseLookupAndNotifier that shows the raw lookup
    result.
    '''
    # function call form, as in out(); the print statement is Python 2 only
    print(caller)
121
# Location of the reverse lookup definitions: below the plugin directory if
# Tools.Directories can be imported, otherwise a file in the current
# directory.
try:
    from Tools.Directories import resolveFilename, SCOPE_PLUGINS
    reverseLookupFileName = resolveFilename(
        SCOPE_PLUGINS,
        "Extensions/FritzCall/reverselookup.xml"
    )
except ImportError:
    reverseLookupFileName = "reverselookup.xml"

# modification time of the lookup file when it was last read
reverselookupMtime = 0
# country code (e.g. "0049") -> <website> elements configured for that country
countries = dict()
130
class ReverseLookupAndNotifier:
    '''
    Reverse lookup of one phone number.

    The constructor normalises the number, selects the websites listed for
    its country in the reverse lookup file and queries them one after the
    other. As soon as one website yields a name and a city - or when no
    website is left - outputFunction(number, caller) is called. caller is
    either empty or a string of the form
    "NA: name;VN: first name;STR: street;HNR: house number;PLZ: zip code;ORT: city"
    encoded with charset.
    '''

    def __init__(self, number, outputFunction=out, charset="cp1252", countrycode = "0049"):
        '''
        Start the reverse lookup.

        number         -- the number to look up, e.g. "030123456",
                          "004930123456" or "+4930123456"
        outputFunction -- called as outputFunction(number, caller) with the result
        charset        -- encoding of the caller string handed to outputFunction
        countrycode    -- country code of the local country, with leading "00"
        '''
        debug("[ReverseLookupAndNotifier] reverse Lookup for %s!" %number)
        self.number = number
        self.outputFunction = outputFunction
        self.caller = ""
        self.currentWebsite = None
        self.nextWebsiteNo = 0
        self.websites = []
        self.charset = charset
        self.countrycode = countrycode

        self._readCountries()

        if self.number[:1] == '+':
            self.number = '00' + self.number[1:]

        if self.number[:len(countrycode)] == countrycode:
            self.number = '0' + self.number[len(countrycode):]

        # Test the normalised number: testing the number as passed in rejected
        # every number written with a leading '+', and failed on an empty one.
        if self.number[:1] != "0":
            self.notifyAndReset()
            return

        if self.number[:2] == "00":
            # foreign number: the country code has three to five characters
            for length in (3, 4, 5):
                if self.number[:length] in countries:
                    self.countrycode = self.number[:length]
                    break
            else:
                debug("[ReverseLookupAndNotifier] Country cannot be reverse handled")
                self.notifyAndReset()
                return

        websites = countries.get(self.countrycode)
        if websites:
            debug("[ReverseLookupAndNotifier] Found website for reverse lookup")
            self.websites = websites
            self.nextWebsiteNo = 1
            self.handleWebsite(self.websites[0])
        else:
            # unknown country, or a country without any website
            debug("[ReverseLookupAndNotifier] Country cannot be reverse handled")
            self.notifyAndReset()

    def _readCountries(self):
        '''Read the reverse lookup file if it was not read yet or has changed since.'''
        global reverselookupMtime
        mtime = os.stat(reverseLookupFileName).st_mtime
        if countries and mtime <= reverselookupMtime:
            return

        debug("[ReverseLookupAndNotifier] (Re-)Reading %s\n" %reverseLookupFileName)
        dom = parse(reverseLookupFileName)
        fresh = {}
        for top in dom.getElementsByTagName("reverselookup"):
            for country in top.getElementsByTagName("country"):
                code = country.getAttribute("code").replace("+","00")
                fresh[code] = country.getElementsByTagName("website")
        # Replace the old content as a whole, so that countries removed from
        # the file disappear, and only after the file was parsed successfully.
        countries.clear()
        countries.update(fresh)
        reverselookupMtime = mtime

    def handleWebsite(self, website):
        '''
        Build the query url for website and request the page.

        website -- <website> element of the reverse lookup file
        The result is processed by _gotPage(), a failure by _gotError(). If
        no url can be built for the website the lookup ends without result.
        '''
        debug("[ReverseLookupAndNotifier] handleWebsite: " + website.getAttribute("name"))
        if self.number[:2] == "00":
            # Replace only the leading country code by the website's prefix;
            # the same digits may occur again later in the number.
            national = self.number
            if national.startswith(self.countrycode):
                national = national[len(self.countrycode):]
            number = website.getAttribute("prefix") + national
        else:
            number = self.number

        url = website.getAttribute("url")
        # Plain substring tests and replacements: in a regular expression '$'
        # is the end-of-text anchor, and %-formatting fails as soon as the
        # url contains another '%'.
        placeholder = None
        if "$AREACODE" in url:
            placeholder, attribute = "$AREACODE", "areacode"
        elif "$PFXAREACODE" in url:
            placeholder, attribute = "$PFXAREACODE", "pfxareacode"

        if placeholder:
            try:
                # getAttribute() returns "" for a missing attribute
                areaCodeLen = int(website.getAttribute(attribute))
            except ValueError:
                debug("[ReverseLookupAndNotifier] handleWebsite: (PFX)ARECODE cannot be handled")
                self.notifyAndReset()
                return
            url = url.replace(placeholder, number[:areaCodeLen]).replace("$NUMBER", number[areaCodeLen:])
        elif "$NUMBER" in url:
            url = url.replace("$NUMBER", number)
        else:
            debug("[ReverseLookupAndNotifier] handleWebsite: cannot handle websites with no $NUMBER in url")
            self.notifyAndReset()
            return

        debug("[ReverseLookupAndNotifier] Url to query: " + url)
        url = url.encode("UTF-8", "replace")
        self.currentWebsite = website
        getPage(url,
            agent="Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5"
            ).addCallback(self._gotPage).addErrback(self._gotError)

    def _cleanName(self, text):
        '''Return text without markup left-overs, entities and superfluous blanks.'''
        item = text.replace("%20"," ").replace("&nbsp;"," ").replace("</b>","").replace(","," ").replace('\n',' ').replace('\t',' ')
        item = html2unicode(item, self.charset)
        while "  " in item:
            item = item.replace("  ", " ")
        return item.strip()

    def _decodePage(self, page):
        '''Decode the received page with the charset it declares, ISO-8859-1 by default.'''
        charset = "ISO-8859-1"
        found = re.match(r'.*<meta http-equiv="Content-Type" content="(?:application/xhtml\+xml|text/html); charset=([^"]+)" />', page, re.S)
        if found:
            charset = found.group(1)
            debug("[ReverseLookupAndNotifier] Charset: " + charset)
        else:
            debug("[ReverseLookupAndNotifier] Default Charset: iso-8859-1")
        try:
            text = page.decode(charset, "replace")
        except LookupError:
            # the page declares a charset Python does not know
            debug("[ReverseLookupAndNotifier] Default Charset: iso-8859-1")
            text = page.decode("ISO-8859-1", "replace")
        # Replace the no-break space after decoding: in the raw bytes 0xA0 can
        # be part of a multi-byte character.
        return text.replace(u"\xa0", u" ")

    def _match(self, pat, which, page):
        '''Return the cleaned first group of pat found in page, or '' if there is none.'''
        if not pat:
            return ''
        pat = ".*?" + pat
        debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( which, pat ))
        found = re.match(pat, page, re.S|re.M)
        if not found or not found.groups() or not found.group(1):
            return ''
        debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( which, found.group(1)))
        return self._cleanName(found.group(1))

    def _extract(self, entry, which, page):
        '''Return what the pattern <which> of entry finds in page, or ''.'''
        return self._match(self.getPattern(entry, which), which, page)

    def _numberMatches(self, entry, page):
        '''
        Check the number shown on the page against the number looked up.

        Needed for websites delivering fuzzy matches. Returns False only if
        the entry has a number pattern, the pattern is found and the number
        found differs from the one looked up.
        '''
        pat = self.getPattern(entry, "number")
        if not pat:
            return True
        pat = ".*?" + pat
        debug("[ReverseLookupAndNotifier] _gotPage: look for number with '''%s'''" %( pat ))
        found = re.match(pat, page, re.S|re.M)
        if not found or not found.groups() or found.group(1) is None:
            return True
        if self.number[:2] == '00':
            # country codes have three to five characters, not always four
            number = '0' + self.number[len(self.countrycode):]
        else:
            number = self.number
        if number == normalizePhoneNumber(found.group(1)):
            return True
        debug("[ReverseLookupAndNotifier] _gotPage: got unequal number '''%s''' for '''%s'''" %(found.group(1), self.number))
        return False

    def _gotPage(self, page):
        '''
        Extract the caller from the page of the current website.

        page -- the page as received, not yet decoded
        Every <entry> of the website is tried in turn; the first one
        yielding a name and a city ends the lookup. If no entry does,
        _gotError() continues with the next website.
        Returns True if a caller was found, otherwise False.
        '''
        debug("[ReverseLookupAndNotifier] _gotPage")
        page = self._decodePage(page)

        for entry in self.currentWebsite.getElementsByTagName("entry"):
            if not self._numberMatches(entry, page):
                continue

            # use <lastname> and <firstname> if the entry has them, otherwise <name>
            firstname = ''
            lastnamePattern = self.getPattern(entry, "lastname")
            if lastnamePattern:
                name = self._match(lastnamePattern, "lastname", page)
                if name:
                    firstname = self._extract(entry, "firstname", page)
            else:
                name = self._extract(entry, "name", page)
                if not name:
                    debug("[ReverseLookupAndNotifier] _gotPage: no name found, skipping")
                    continue
                swap = entry.getElementsByTagName('name')[0].getAttribute('swapFirstAndLastName')
                if swap == 'true': # that means, the name is of the form "firstname lastname"
                    found = re.match(r'(.*?)\s+(.*)', name)
                    if found:
                        firstname = found.group(1)
                        name = found.group(2)

            if not name:
                continue

            city = self._extract(entry, "city", page)
            if not city:
                continue

            zipcode = self._extract(entry, "zipcode", page)
            street = self._extract(entry, "street", page)
            streetno = ''
            # split a trailing house number off the street
            found = re.match(r"^(.+) ([-\d]+)$", street, re.S)
            if found:
                street = found.group(1)
                streetno = found.group(2)

            self.caller = "NA: %s;VN: %s;STR: %s;HNR: %s;PLZ: %s;ORT: %s" % ( name, firstname, street, streetno, zipcode, city )
            debug("[ReverseLookupAndNotifier] _gotPage: Reverse lookup succeeded:\nName: %s" %(self.caller))

            self.notifyAndReset()
            return True

        self._gotError("[ReverseLookupAndNotifier] _gotPage: Nothing found at %s" %self.currentWebsite.getAttribute("name"))
        return False

    def _gotError(self, error = ""):
        '''
        Continue with the next website, or end the lookup without result if
        there is none left.

        error -- description of what went wrong; only used for the debug output
        '''
        debug("[ReverseLookupAndNotifier] _gotError - Error: %s" %error)
        if self.nextWebsiteNo >= len(self.websites):
            debug("[ReverseLookupAndNotifier] _gotError: I give up")
            self.notifyAndReset()
            return

        debug("[ReverseLookupAndNotifier] _gotError: try next website")
        self.nextWebsiteNo = self.nextWebsiteNo+1
        self.handleWebsite(self.websites[self.nextWebsiteNo-1])

    def getPattern(self, website, which):
        '''
        Return the text of the first element <which> below website.

        website -- the element to search in
        which   -- tag name of the pattern element, e.g. "name" or "city"
        Returns '' if there is no such element or if it is empty.
        '''
        pat1 = website.getElementsByTagName(which)
        if len(pat1) == 0:
            return ''
        if len(pat1) > 1:
            debug("[ReverseLookupAndNotifier] getPattern: Something strange: more than one %s for website %s" %(which, website.getAttribute("name")))
        children = pat1[0].childNodes
        if not children:
            return ''
        # only text and CDATA nodes carry data
        return getattr(children[0], "data", '')

    def notifyAndReset(self):
        '''
        Hand the result to the output function.

        The caller string is encoded with self.charset first. When this
        file runs as a script the reactor is stopped afterwards.
        '''
        debug("[ReverseLookupAndNotifier] notifyAndReset: Number: " + self.number + "; Caller: " + self.caller)
        if self.caller:
            try:
                debug("2: " + repr(self.caller))
                self.caller = self.caller.encode(self.charset, 'replace')
                debug("3: " + repr(self.caller))
            except (UnicodeError, LookupError):
                # caller is no text that can be encoded, or charset is unknown
                debug("[ReverseLookupAndNotifier] cannot encode?!?!")
            self.outputFunction(self.number, self.caller)
        else:
            self.outputFunction(self.number, "")
        if __name__ == '__main__':
            # The lookup may end inside the constructor, i.e. before
            # reactor.run() is called; stopping a reactor that is not running
            # is an error, so the stop is deferred until it runs.
            reactor.callWhenRunning(reactor.stop) #@UndefinedVariable # pylint: disable-msg=E1101
425
# Command line use: look up one number and print the result.
if __name__ == '__main__':
    if len(sys.argv) == 2:
        # nrzuname.py <number>: raw result, with debug output
        ReverseLookupAndNotifier(sys.argv[1], simpleout)
        reactor.run() #@UndefinedVariable # pylint: disable-msg=E1101
    elif len(sys.argv) == 3:
        # nrzuname.py <number> <charset>: formatted result only
        setDebug(False)
        ReverseLookupAndNotifier(sys.argv[1], out, sys.argv[2])
        reactor.run() #@UndefinedVariable # pylint: disable-msg=E1101
    else:
        # say what is expected instead of doing nothing silently
        sys.stderr.write("Usage: %s number [charset]\n" % sys.argv[0])