UPD: nrzuname results made more generic
[vuplus_dvbapp-plugin] / fritzcall / src / nrzuname.py
index e5a31ac..add59b9 100644 (file)
 #!/usr/bin/python
-# -*- coding: ISO-8859-1 -*-
+# -*- coding: UTF-8 -*-
 # $Id$
 # $Author$
 # $Revision$
 # $Date$
 
-import re, sys, os
+import re, sys, os, traceback
 from xml.dom.minidom import parse
 from twisted.web.client import getPage #@UnresolvedImport
 from twisted.internet import reactor #@UnresolvedImport
 
-debug = True
-def setDebug(what):
-       global debug
-       debug = what
+try: # prefer the debug function provided by the enclosing package
+       from . import debug #@UnresolvedImport
+       def setDebug(what): # no-op: the imported debug() is not switched from here
+               pass
+except ValueError: # NOTE(review): presumably raised by the relative import when run as a stand-alone script -- confirm
+       debugVal = True # debug output stays on until setDebug() stores a false value
+       def setDebug(what): # what: truth value stored in debugVal
+               global debugVal
+               debugVal = what
+       def debug(str): # print str only while debugVal is true
+               if debugVal:
+                       print str
 
-def myprint(str):
-       if debug:
-               print str
+import htmlentitydefs
+def html2unicode(in_html): # return in_html with &#xH..;, &name; and &#N..; entities replaced by unicode characters
+#===============================================================================
+#      # sanity checks
+#      try:
+#              in_html = in_html.decode('iso-8859-1')
+#              debug("[Callhtml2utf8] Converted from latin1")
+#      except:
+#              debug("[Callhtml2utf8] lost in translation from latin1")
+#              pass
+#      try:
+#              in_html = in_html.decode('utf-8')
+#              debug("[Callhtml2utf8] Converted from utf-8")
+#      except:
+#              debug("[Callhtml2utf8] lost in translation from utf-8")
+#              pass
+#===============================================================================
 
-def html2utf8(in_html):
-       try:
-               import htmlentitydefs
-       except ImportError:
-               try:
-                       return in_html.replace("&", "&").replace("ß", "ß\9f").replace("ä", "ä").replace("ö", "ö").replace("ü", "ü").replace("Ä", "Ä").replace("Ö", "Ö").replace("Ü", "Ü")
-               except UnicodeDecodeError:
-                       pass
-       else:
-               # first convert some WML codes; does not work?!?!
-               wmldefs = [
-                               ("ß", "ß"),
-                               ("ä", "ä"),
-                               ("ö", "ö"),
-                               ("ü", "ü"),
-                               ("Ä", "Ä"),
-                               ("Ö", "Ö"),
-                               ("Ü", "Ü")
-                               ]
-               for (a, b)in wmldefs:
-                       try:
-                               in_html = in_html.replace(a,b)
-                       except UnicodeError:
-                               pass
+       # first convert some WML codes from hex: e.g. &#xE4 -> &#228
+       htmlentityhexnumbermask = re.compile('(&#x([0-9a-fA-F]+);)') # hex digits only, any count: a two-character wildcard missed &#x20AC; and let int() fail on non-hex text
+       entities = htmlentityhexnumbermask.finditer(in_html)
+       for x in entities:
+               in_html = in_html.replace(x.group(1), '&#' + str(int(x.group(2),16)) + ';')
 
-               htmlentitynamemask = re.compile('(&(\D{1,5}?);)')
-               entitydict = {}
-               entities = htmlentitynamemask.finditer(in_html)
-               for x in entities:
-                       entitydict[x.group(1)] = x.group(2)
-               for key, name in entitydict.items():
-                       try:
-                               entitydict[key] = htmlentitydefs.name2codepoint[name]
-                       except KeyError:
-                               myprint("[Callhtml2utf8] KeyError " + key + "/" + name)
-                               pass
+       htmlentitynamemask = re.compile('(&(\D{1,5}?);)')
+       entitydict = {} # maps the entity text as found in in_html to its code point
+       entities = htmlentitynamemask.finditer(in_html)
+       for x in entities:
+               # debug("[Callhtml2utf8] mask: found %s" %repr(x.group(2)))
+               entitydict[x.group(1)] = x.group(2)
+       for key, name in entitydict.items():
+               try:
+                       entitydict[key] = htmlentitydefs.name2codepoint[str(name)]
+               except KeyError: # unknown name: the entry keeps the name and is skipped by the ValueError handler below
+                       debug("[Callhtml2utf8] KeyError " + key + "/" + name)
 
-               htmlentitynumbermask = re.compile('(&#(\d{1,5}?);)')
-               entities = htmlentitynumbermask.finditer(in_html)
-               for x in entities:
-                       entitydict[x.group(1)] = x.group(2)
-               for key, codepoint in entitydict.items():
-                       try:
-                               in_html = in_html.replace(key, (unichr(int(codepoint)).encode('utf8', "replace")))
-                       except ValueError:
-                               myprint("[Callhtml2utf8] ValueError " + key + "/" + str(codepoint))
-                               pass
+       htmlentitynumbermask = re.compile('(&#(\d{1,5}?);)')
+       entities = htmlentitynumbermask.finditer(in_html)
+       for x in entities:
+               # debug("[Callhtml2utf8] number: found %s" %x.group(1))
+               entitydict[x.group(1)] = x.group(2)
+       for key, codepoint in entitydict.items():
+               try:
+                       debug("[nrzuname] html2utf8: replace %s with %s" %(repr(key), str(codepoint)))
+                       in_html = in_html.replace(unicode(key), (unichr(int(codepoint))))
+               except ValueError: # codepoint is not a usable number, e.g. an unresolved entity name
+                       debug("[nrzuname] html2utf8: ValueError " + key + "/" + str(codepoint))
        return in_html
 
-def out(number, caller):
-       name = vorname = strasse = hnr = plz = ort = ""
-       lines = caller.split(', ')
-       found = re.match("(.+?)\s+(.+)", lines[0])
+def normalizePhoneNumber(intNo): # return the first run of digits of the cleaned-up intNo, or '0' if it has none
+       found = re.match('^\+(.*)', intNo) # leading '+' is rewritten to '00' below
        if found:
-               name = found.group(1)
-               vorname = found.group(2)
-       else:
-               name = lines[0]
-       aktuell = 1
-       found = re.match("^(.+) ([-\d]+)$", lines[1], re.S)
+               intNo = '00' + found.group(1)
+       intNo = intNo.replace('(', '').replace(')', '').replace(' ', '').replace('/', '').replace('-', '') # drop the separators ( ) blank / -
+       found = re.match('.*?([0-9]+)', intNo) # non-greedy prefix: group 1 is the first run of digits
        if found:
-               strasse = found.group(1)
-               hnr = found.group(2)
-               aktuell = 2
+               return found.group(1)
        else:
-               found = re.match("^(\d+) (.+)$", lines[1], re.S)
-               if found:
-                       strasse = found.group(2)
-                       hnr = found.group(1)
-               else:
-                       strasse = lines[1]
-               aktuell = 2
-       for i in range(aktuell, len(lines)):
-               found = re.match("(\S+)\s+(.+)", lines[i], re.S)
-               if found:
-                       plz = found.group(1)
-                       ort = found.group(2)
-                       break
-       else:
-               ort = lines[aktuell].strip()
-       print "NA: %s;VN: %s;STR: %s;HNR: %s;PLZ: %s;ORT: %s" %( name,vorname,strasse,hnr,plz,ort )
+               return '0' # no digit found at all
+
+def out(number, caller):
+       # Print caller, a "NA: ..;VN: ..;STR: ..;HNR: ..;PLZ: ..;ORT: .." record, as one
+       # readable line; print nothing when caller is not in that format.
+       debug("[nrzuname] out: %s: %s" %(number, caller))
+       record = re.match("NA: ([^;]*);VN: ([^;]*);STR: ([^;]*);HNR: ([^;]*);PLZ: ([^;]*);ORT: ([^;]*)", caller)
+       if record is None:
+               return
+       name, vorname, strasse, hnr, plz, ort = record.groups()
+
+       # street part: street name, followed by " <house number>" if there is one
+       anschrift = strasse
+       if hnr:
+               anschrift += ' ' + hnr
+       # city part: zip code and city, joined by a blank when both are present
+       wohnort = ' '.join([teil for teil in (plz, ort) if teil])
+
+       zeile = name
+       if vorname:
+               zeile += ' ' + vorname
+       # each non-empty address part is appended with a leading ", "
+       zeile += ''.join([', ' + teil for teil in (anschrift, wohnort) if teil])
+       print(zeile)
 
-def simpleout(number, caller):
+def simpleout(number, caller): #@UnusedVariable -- number is ignored, caller is printed as is
        print caller
 
 try:
@@ -114,19 +119,25 @@ countries = { }
 reverselookupMtime = 0
 
 class ReverseLookupAndNotifier:
-       def __init__(self, number, outputFunction=out, charset="ISO-8859-1", countrycode = "0049"):
-               myprint("[ReverseLookupAndNotifier] reverse Lookup for %s!" %number)
+       def __init__(self, number, outputFunction=out, charset="cp1252", countrycode = "0049"):
+               debug("[ReverseLookupAndNotifier] reverse Lookup for %s!" %number)
                self.number = number
                self.outputFunction = outputFunction
-               self.charset = charset
                self.caller = ""
                self.currentWebsite = None
                self.nextWebsiteNo = 0
+#===============================================================================
+# sorry does not work at all
+#              if not charset:
+#                      charset = sys.getdefaultencoding()
+#                      debug("[ReverseLookupAndNotifier] set charset from system: %s!" %charset)
+#===============================================================================
+               self.charset = charset
 
                global reverselookupMtime
                reverselookupMtimeAct = os.stat(reverseLookupFileName)[8]
                if not countries or reverselookupMtimeAct > reverselookupMtime:
-                       myprint("[ReverseLookupAndNotifier] (Re-)Reading %s\n" %reverseLookupFileName)
+                       debug("[ReverseLookupAndNotifier] (Re-)Reading %s\n" %reverseLookupFileName)
                        reverselookupMtime = reverselookupMtimeAct
                        dom = parse(reverseLookupFileName)
                        for top in dom.getElementsByTagName("reverselookup"):
@@ -136,6 +147,12 @@ class ReverseLookupAndNotifier:
 
                self.countrycode = countrycode
 
+               if re.match('^\+', self.number):
+                       self.number = '00' + self.number[1:]
+
+               if self.number[:len(countrycode)] == countrycode:
+                       self.number = '0' + self.number[len(countrycode):]
+
                if number[0] != "0":
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
@@ -149,24 +166,24 @@ class ReverseLookupAndNotifier:
                        elif countries.has_key(self.number[:5]):
                                self.countrycode = self.number[:5]
                        else:
-                               myprint("[ReverseLookupAndNotifier] Country cannot be reverse handled")
+                               debug("[ReverseLookupAndNotifier] Country cannot be reverse handled")
                                # self.caller = _("UNKNOWN")
                                self.notifyAndReset()
                                return
 
                if countries.has_key(self.countrycode):
-                       myprint("[ReverseLookupAndNotifier] Found website for reverse lookup")
+                       debug("[ReverseLookupAndNotifier] Found website for reverse lookup")
                        self.websites = countries[self.countrycode]
                        self.nextWebsiteNo = 1
                        self.handleWebsite(self.websites[0])
                else:
-                       myprint("[ReverseLookupAndNotifier] Country cannot be reverse handled")
+                       debug("[ReverseLookupAndNotifier] Country cannot be reverse handled")
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return
 
        def handleWebsite(self, website):
-               myprint("[ReverseLookupAndNotifier] handleWebsite: " + website.getAttribute("name"))
+               debug("[ReverseLookupAndNotifier] handleWebsite: " + website.getAttribute("name"))
                if self.number[:2] == "00":
                        number = website.getAttribute("prefix") + self.number.replace(self.countrycode,"")
                else:
@@ -174,7 +191,7 @@ class ReverseLookupAndNotifier:
 
                url = website.getAttribute("url")
                if re.search('$AREACODE',url) or re.search('$PFXAREACODE',url):
-                       myprint("[ReverseLookupAndNotifier] handleWebsite: (PFX)ARECODE cannot be handled")
+                       debug("[ReverseLookupAndNotifier] handleWebsite: (PFX)ARECODE cannot be handled")
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return
@@ -193,86 +210,202 @@ class ReverseLookupAndNotifier:
                elif re.search('\\$NUMBER',url): 
                        url = url.replace("$NUMBER","%s") %number
                else:
-                       myprint("[ReverseLookupAndNotifier] handleWebsite: cannot handle websites with no $NUMBER in url")
+                       debug("[ReverseLookupAndNotifier] handleWebsite: cannot handle websites with no $NUMBER in url")
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return
-               myprint("[ReverseLookupAndNotifier] Url to query: " + url)
+               debug("[ReverseLookupAndNotifier] Url to query: " + url)
                url = url.encode("UTF-8", "replace")
                self.currentWebsite = website
-               # I am not sure, whether setting the user-agent works this way
                getPage(url,
                        agent="Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5"
                        ).addCallback(self._gotPage).addErrback(self._gotError)
 
+
        def _gotPage(self, page):
-               myprint("[ReverseLookupAndNotifier] _gotPage")
-               found = re.match('.*content=".*?charset=([^"]+)"',page,re.S)
+               def cleanName(text):
+                       item = text.replace("%20"," ").replace("&nbsp;"," ").replace("</b>","").replace(","," ").replace('\n',' ').replace('\t',' ')
+
+                       item = html2unicode(item)
+                       try: # this works under Windows
+                               item = item.decode('iso-8859-1')
+                       except:
+                               try: # this works under Enigma2
+                                       item = item.decode('utf-8')
+                               except:
+                                       try: # fall back
+                                               item = item.decode(self.charset)
+                                       except:
+                                               # debug("[ReverseLookupAndNotifier] cleanName: " + traceback.format_exc())
+                                               debug("[ReverseLookupAndNotifier] cleanName: encoding problem")
+
+                       newitem = item.replace("  ", " ")
+                       while newitem != item:
+                               item = newitem
+                               newitem = item.replace("  ", " ")
+                       return newitem.strip()
+       
+               debug("[ReverseLookupAndNotifier] _gotPage")
+               found = re.match('.*<meta http-equiv="Content-Type" content="(?:application/xhtml\+xml|text/html); charset=([^"]+)" />',page, re.S)
                if found:
-                       myprint("[ReverseLookupAndNotifier] Charset: " + found.group(1))
+                       debug("[ReverseLookupAndNotifier] Charset: " + found.group(1))
                        page = page.replace("\xa0"," ").decode(found.group(1), "replace")
                else:
+                       debug("[ReverseLookupAndNotifier] Default Charset: iso-8859-1")
                        page = page.replace("\xa0"," ").decode("ISO-8859-1", "replace")
 
                for entry in self.currentWebsite.getElementsByTagName("entry"):
-                       # myprint("[ReverseLookupAndNotifier] _gotPage: try entry")
-                       details = []
-                       for what in ["name", "street", "city", "zipcode"]:
-                               # myprint("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( what, pat ))
-                               pat = ".*?" + self.getPattern(entry, what)
+                       #
+                       # for the sites delivering fuzzy matches, we check against the returned number
+                       #
+                       pat = self.getPattern(entry, "number")
+                       if pat:
+                               pat = ".*?" + pat
+                               debug("[ReverseLookupAndNotifier] _gotPage: look for number with '''%s'''" %( pat ))
+                               found = re.match(pat, page, re.S|re.M)
+                               if found:
+                                       if self.number[:2] == '00':
+                                               number = '0' + self.number[4:]
+                                       else:
+                                               number = self.number
+                                       if number != normalizePhoneNumber(found.group(1)):
+                                               debug("[ReverseLookupAndNotifier] _gotPage: got unequal number '''%s''' for '''%s'''" %(found.group(1),self.number))
+                                               continue
+                       
+                       # look for <firstname> and <lastname> match, if not there look for <name>, if not there break
+                       name = ''
+                       firstname = ''
+                       street = ''
+                       streetno = ''
+                       city = ''
+                       zipcode = ''
+                       pat = self.getPattern(entry, "lastname")
+                       if pat:
+                               pat = ".*?" + pat
+                               debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( "lastname", pat ))
                                found = re.match(pat, page, re.S|re.M)
                                if found:
-                                       # myprint("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( what, found.group(2) ))
-                                       myprint(found.group(1))
-                                       item = found.group(1).replace("&nbsp;"," ").replace("</b>","").replace(","," ")
-                                       item = html2utf8(item).decode("ISO-8859-1", "replace")
-                                       newitem = item.replace("  ", " ")
-                                       while newitem != item:
-                                               item = newitem
-                                               newitem = item.replace("  ", " ")
-                                       details.append(item.strip())
+                                       debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( "lastname", found.group(1)))
+                                       name = cleanName(found.group(1))
+
+                                       pat = self.getPattern(entry, "firstname")
+                                       if pat:
+                                               pat = ".*?" + pat
+                                               debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( "firstname", pat ))
+                                               found = re.match(pat, page, re.S|re.M)
+                                               if found:
+                                                       debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( "firstname", found.group(1)))
+                                               firstname = cleanName(found.group(1)).strip()
+
+                       else:
+                               pat = ".*?" + self.getPattern(entry, "name")
+                               debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( "name", pat ))
+                               found = re.match(pat, page, re.S|re.M)
+                               if found:
+                                       debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( "name", found.group(1)))
+                                       item = cleanName(found.group(1))
+                                       # debug("[ReverseLookupAndNotifier] _gotPage: name: " + item)
+                                       name = item.strip()
+                                       firstNameFirst = entry.getElementsByTagName('name')[0].getAttribute('swapFirstAndLastName')
+                                       # debug("[ReverseLookupAndNotifier] _gotPage: swapFirstAndLastName: " + firstNameFirst)
+                                       if firstNameFirst == 'true': # that means, the name is of the form "firstname lastname"
+                                               found = re.match('(.*?)\s+(.*)', name)
+                                               if found:
+                                                       firstname = found.group(1)
+                                                       name = found.group(2)
                                else:
-                                       break
+                                       debug("[ReverseLookupAndNotifier] _gotPage: no name found, skipping")
+                                       continue
 
-                       if len(details) != 4:
+                       if not name:
                                continue
-                       else:
-                               name = details[0]
-                               address =  details[1] + ", " + details[3] + " " + details[2]
-                               myprint("[ReverseLookupAndNotifier] _gotPage: Reverse lookup succeeded:\nName: %s\nAddress: %s" %(name, address))
-                               self.caller = "%s, %s" %(name, address)
-                               # if self.number != 0 and config.plugins.Call.addcallers.value and self.event == "RING":
-                                       # phonebook.add(self.number, self.caller)
-
-                               self.caller = self.caller.encode("UTF-8", "replace")
-                               self.notifyAndReset()
-                               return True
-                               break
+
+                       pat = ".*?" + self.getPattern(entry, "city")
+                       debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( "city", pat ))
+                       found = re.match(pat, page, re.S|re.M)
+                       if found:
+                               debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( "city", found.group(1)))
+                               item = cleanName(found.group(1))
+                               debug("[ReverseLookupAndNotifier] _gotPage: city: " + item)
+                               city = item.strip()
+
+                       if not city:
+                               continue
+
+                       pat = ".*?" + self.getPattern(entry, "zipcode")
+                       debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( "zipcode", pat ))
+                       found = re.match(pat, page, re.S|re.M)
+                       if found and found.group(1):
+                               debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( "zipcode", found.group(1)))
+                               item = cleanName(found.group(1))
+                               debug("[ReverseLookupAndNotifier] _gotPage: zipcode: " + item)
+                               zipcode = item.strip()
+
+                       pat = ".*?" + self.getPattern(entry, "street")
+                       debug("[ReverseLookupAndNotifier] _gotPage: look for '''%s''' with '''%s'''" %( "street", pat ))
+                       found = re.match(pat, page, re.S|re.M)
+                       if found and found.group(1):
+                               debug("[ReverseLookupAndNotifier] _gotPage: found for '''%s''': '''%s'''" %( "street", found.group(1)))
+                               item = cleanName(found.group(1))
+                               debug("[ReverseLookupAndNotifier] _gotPage: street: " + item)
+                               street = item.strip()
+                               streetno = ''
+                               found = re.match("^(.+) ([-\d]+)$", street, re.S)
+                               if found:
+                                       street = found.group(1)
+                                       streetno= found.group(2)
+                               #===============================================================
+                               # else:
+                               #       found = re.match("^(\d+) (.+)$", street, re.S)
+                               #       if found:
+                               #               street = found.group(2)
+                               #               streetno = found.group(1)
+                               #===============================================================
+
+                       self.caller = "NA: %s;VN: %s;STR: %s;HNR: %s;PLZ: %s;ORT: %s" %( name,firstname,street,streetno,zipcode,city )
+                       debug("[ReverseLookupAndNotifier] _gotPage: Reverse lookup succeeded:\nName: %s" %(self.caller))
+
+                       self.notifyAndReset()
+                       return True
                else:
                        self._gotError("[ReverseLookupAndNotifier] _gotPage: Nothing found at %s" %self.currentWebsite.getAttribute("name"))
+                       return False
                        
        def _gotError(self, error = ""):
-               myprint("[ReverseLookupAndNotifier] _gotError - Error: %s" %error)
+               debug("[ReverseLookupAndNotifier] _gotError - Error: %s" %error) # reached as getPage errback and when _gotPage found nothing
                if self.nextWebsiteNo >= len(self.websites):
-                       myprint("[ReverseLookupAndNotifier] _gotError: I give up")
+                       debug("[ReverseLookupAndNotifier] _gotError: I give up") # no untried website left in self.websites
                        # self.caller = _("UNKNOWN")
                        self.notifyAndReset()
                        return
                else:
-                       myprint("[ReverseLookupAndNotifier] _gotError: try next website")
+                       debug("[ReverseLookupAndNotifier] _gotError: try next website") # nextWebsiteNo is the index of the next untried website
                        self.nextWebsiteNo = self.nextWebsiteNo+1
                        self.handleWebsite(self.websites[self.nextWebsiteNo-1])
 
        def getPattern(self, website, which):
                pat1 = website.getElementsByTagName(which)
-               if len(pat1) > 1:
-                       myprint("Something strange: more than one %s for website %s" %(which, website.getAttribute("name")))
-               return pat1[0].childNodes[0].data
+               if len(pat1) == 0: # no <which> element below website: report that as an empty pattern
+                       return ''
+               else:
+                       if len(pat1) > 1: # only the first element is used; further ones are just reported
+                               debug("[ReverseLookupAndNotifier] getPattern: Something strange: more than one %s for website %s" %(which, website.getAttribute("name")))
+                       return pat1[0].childNodes[0].data # data of the first child node of the first <which> element
 
        def notifyAndReset(self):
-               myprint("[ReverseLookupAndNotifier] notifyAndReset: Number: " + self.number + "; Caller: " + self.caller)
+               debug("[ReverseLookupAndNotifier] notifyAndReset: Number: " + self.number + "; Caller: " + self.caller)
+               # debug("1: " + repr(self.caller))
                if self.caller:
-                       self.outputFunction(self.number, self.caller.decode("utf-8").encode(self.charset))
+                       try:
+                               # debug("2: " + repr(self.caller))
+                               self.caller = self.caller.encode(self.charset, 'replace')
+                               # debug("3: " + repr(self.caller))
+                       except:
+                               debug("[ReverseLookupAndNotifier] cannot encode?!?!")
+                               pass
+                       # self.caller = unicode(self.caller)
+                       # debug("4: " + repr(self.caller))
+                       self.outputFunction(self.number, self.caller)
                else:
                        self.outputFunction(self.number, "")
                if __name__ == '__main__':
@@ -282,10 +415,10 @@ if __name__ == '__main__':
        cwd = os.path.dirname(sys.argv[0])
        if (len(sys.argv) == 2):
                # nrzuname.py Nummer
-               ReverseLookupAndNotifier(sys.argv[1])
+               ReverseLookupAndNotifier(sys.argv[1], simpleout)
                reactor.run() #@UndefinedVariable
        elif (len(sys.argv) == 3):
-               # nrzuname.py Nummer SimpleOut
-               debug = False
-               ReverseLookupAndNotifier(sys.argv[1], simpleout)
+               # nrzuname.py Nummer Charset
+               setDebug(False)
+               ReverseLookupAndNotifier(sys.argv[1], out, sys.argv[2])
                reactor.run() #@UndefinedVariable