rss2email/0010-Extract-deprecated-html2text-unescape.patch
2019-09-27 03:22:22 +02:00

118 lines
3.2 KiB
Diff

--- rss2email-2.71-orig/rss2email.py 2019-09-27 02:24:47.182023614 +0200
+++ rss2email-2.71/rss2email.py 2019-09-27 03:13:01.529137689 +0200
@@ -350,6 +350,106 @@
if type(s) is types.UnicodeType: return s.encode('utf-8')
else: return s
+### Extracted code from deprecated html2text.unescape(s) ###
+
+import htmlentitydefs
+import re
+
+class HTML2TextUnescape:
+    """Replace HTML character and entity references in text (ex-html2text)."""
+
+    def __init__(self):
+        # Captures the text between '&' and ';' ("#38", "#x26", "amp", ...).
+        self.RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+        # Entity name -> plain-ASCII stand-in used instead of the real char.
+        self.UNIFIABLE = {
+            'rsquo': "'",
+            'lsquo': "'",
+            'rdquo': '"',
+            'ldquo': '"',
+            'copy': '(C)',
+            'mdash': '--',
+            'nbsp': ' ',
+            'rarr': '->',
+            'larr': '<-',
+            'middot': '*',
+            'ndash': '-',
+            'oelig': 'oe',
+            'aelig': 'ae',
+            'agrave': 'a',
+            'aacute': 'a',
+            'acirc': 'a',
+            'atilde': 'a',
+            'auml': 'a',
+            'aring': 'a',
+            'egrave': 'e',
+            'eacute': 'e',
+            'ecirc': 'e',
+            'euml': 'e',
+            'igrave': 'i',
+            'iacute': 'i',
+            'icirc': 'i',
+            'iuml': 'i',
+            'ograve': 'o',
+            'oacute': 'o',
+            'ocirc': 'o',
+            'otilde': 'o',
+            'ouml': 'o',
+            'ugrave': 'u',
+            'uacute': 'u',
+            'ucirc': 'u',
+            'uuml': 'u',
+            'lrm': '',
+            'rlm': '',
+        }
+
+        # Same stand-ins keyed by codepoint, for numeric references.
+        self.unifiable_n = {}
+        for k in self.UNIFIABLE:
+            self.unifiable_n[self.name2cp(k)] = self.UNIFIABLE[k]
+
+    def name2cp(self, k):
+        """Return the codepoint for entity name k; KeyError if unknown."""
+        if k == 'apos':
+            return ord("'")
+        return htmlentitydefs.name2codepoint[k]
+
+    def charref(self, name):
+        """Resolve the body of a numeric reference, e.g. '38' or 'x26'."""
+        try:
+            c = int(name[1:], 16) if name[0] in 'xX' else int(name)
+        except ValueError:  # e.g. "&#xyz;": not a number, keep the text
+            return "&#" + name + ';'
+        if c in self.unifiable_n:
+            return self.unifiable_n[c]
+        try:
+            return unichr(c)  # chr() only covers 0-255 on Python 2
+        except (ValueError, OverflowError):  # not a valid codepoint
+            return ''
+
+    def entityref(self, c):
+        """Resolve a named reference; unknown names are returned verbatim."""
+        if c in self.UNIFIABLE:
+            return self.UNIFIABLE[c]
+        try:
+            return unichr(self.name2cp(c))
+        except KeyError:
+            return "&" + c + ';'
+
+    def replaceEntities(self, s):
+        """re.sub callback: dispatch on numeric vs. named reference."""
+        s = s.group(1)
+        if s[0] == "#":
+            return self.charref(s[1:])
+        return self.entityref(s)
+
+    def unescape(self, s):
+        """Return s with every recognised &...; reference replaced."""
+        return self.RE_UNESCAPE.sub(self.replaceEntities, s)
+
+def h2t_unescape(s):  # stand-in for the deprecated html2text unescape(s)
+ return HTML2TextUnescape().unescape(s)  # builds a new unescaper per call
+
### Parsing Utilities ###
def getContent(entry, HTMLOK=0):
@@ -678,7 +778,7 @@
from_addr = getEmail(r, entry)
- name = h2t.unescape(getName(r, entry))
+ name = h2t_unescape(getName(r, entry))
fromhdr = formataddr((name, from_addr,))
tohdr = (f.to or default_to)
subjecthdr = title