Bundle the deprecated html2text.unescape() logic in rss2email.py as
HTML2TextUnescape / h2t_unescape() and use it for the sender name.

NOTE(review): restored from a whitespace-collapsed copy of this patch; the
tab indentation of the context lines is assumed -- confirm against
rss2email-2.71-orig/rss2email.py before applying.

--- rss2email-2.71-orig/rss2email.py	2019-09-27 02:24:47.182023614 +0200
+++ rss2email-2.71/rss2email.py	2019-09-27 03:13:01.529137689 +0200
@@ -350,6 +350,117 @@
 	if type(s) is types.UnicodeType: return s.encode('utf-8')
 	else: return s
 
+### Extracted code from deprecated html2text.unescape(s) ###
+
+import htmlentitydefs
+import re
+
+class HTML2TextUnescape:
+    """Replace HTML entity and character references in a string.
+
+    Stand-alone copy of the deprecated html2text.unescape() logic.
+    References listed in UNIFIABLE are folded to ASCII approximations,
+    other known references become the matching unicode character and
+    unknown entity names are left in the text unchanged.
+    """
+
+    def __init__(self):
+        self.RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+        self.UNIFIABLE = {
+            'rsquo': "'",
+            'lsquo': "'",
+            'rdquo': '"',
+            'ldquo': '"',
+            'copy': '(C)',
+            'mdash': '--',
+            'nbsp': ' ',
+            'rarr': '->',
+            'larr': '<-',
+            'middot': '*',
+            'ndash': '-',
+            'oelig': 'oe',
+            'aelig': 'ae',
+            'agrave': 'a',
+            'aacute': 'a',
+            'acirc': 'a',
+            'atilde': 'a',
+            'auml': 'a',
+            'aring': 'a',
+            'egrave': 'e',
+            'eacute': 'e',
+            'ecirc': 'e',
+            'euml': 'e',
+            'igrave': 'i',
+            'iacute': 'i',
+            'icirc': 'i',
+            'iuml': 'i',
+            'ograve': 'o',
+            'oacute': 'o',
+            'ocirc': 'o',
+            'otilde': 'o',
+            'ouml': 'o',
+            'ugrave': 'u',
+            'uacute': 'u',
+            'ucirc': 'u',
+            'uuml': 'u',
+            'lrm': '',
+            'rlm': ''
+        }
+
+        # The same table keyed by code point, used for numeric references.
+        self.unifiable_n = {}
+        for k in self.UNIFIABLE:
+            self.unifiable_n[self.name2cp(k)] = self.UNIFIABLE[k]
+
+    def name2cp(self, k):
+        """Return the code point of entity name k; KeyError if unknown."""
+        if k == 'apos':
+            return ord("'")
+        return htmlentitydefs.name2codepoint[k]
+
+    def charref(self, name):
+        """Resolve the body of a numeric reference, e.g. '169' or 'xA9'."""
+        try:
+            if name[0] in ['x', 'X']:
+                c = int(name[1:], 16)
+            else:
+                c = int(name)
+        except ValueError:  # not a number, e.g. "&#x;": keep the text as is
+            return "&#" + name + ';'
+
+        if c in self.unifiable_n:
+            return self.unifiable_n[c]
+        try:
+            # unichr, not chr: chr() only accepts 0..255 on Python 2.
+            return unichr(c)
+        except (ValueError, OverflowError):  # invalid unicode
+            return ''
+
+    def entityref(self, c):
+        """Resolve a named reference, e.g. 'amp'; unknown names are kept."""
+        if c in self.UNIFIABLE:
+            return self.UNIFIABLE[c]
+        try:
+            return unichr(self.name2cp(c))
+        except KeyError:
+            return "&" + c + ';'
+
+    def replaceEntities(self, s):
+        """re.sub() callback: route a match to charref() or entityref()."""
+        s = s.group(1)
+        if s[0] == "#":
+            return self.charref(s[1:])
+        else:
+            return self.entityref(s)
+
+    def unescape(self, s):
+        """Return s with every '&...;' reference replaced."""
+        return self.RE_UNESCAPE.sub(self.replaceEntities, s)
+
+def h2t_unescape(s):
+    """Drop-in replacement for the deprecated html2text.unescape(s)."""
+    return HTML2TextUnescape().unescape(s)
+
 ### Parsing Utilities ###
 
 def getContent(entry, HTMLOK=0):
@@ -678,7 +789,7 @@
 
 					from_addr = getEmail(r, entry)
 
-					name = h2t.unescape(getName(r, entry))
+					name = h2t_unescape(getName(r, entry))
 					fromhdr = formataddr((name, from_addr,))
 					tohdr = (f.to or default_to)
 					subjecthdr = title