2 年之前 · fafa639328
--- a/davtelepot/__init__.py
+++ b/davtelepot/__init__.py
@@ -11,7 +11,7 @@ __author__ = "Davide Testa"
 
															 __email__ = "davide@davte.it"
														
 
															 __credits__ = ["Marco Origlia", "Nick Lee @Nickoala"]
														
 
															 __license__ = "GNU General Public License v3.0"
														
 
															-__version__ = "2.8.9"
														
 
															+__version__ = "2.8.10"
														
 
															 __maintainer__ = "Davide Testa"
														
 
															 __contact__ = "t.me/davte"
														
--- a/davtelepot/administration_tools.py
+++ b/davtelepot/administration_tools.py
@@ -27,7 +27,7 @@ from davtelepot.messages import default_admin_messages, default_talk_messages
 
															 from davtelepot.bot import Bot
														
 
															 from davtelepot.utilities import (
														
 
															     async_wrapper, CachedPage, Confirmator, extract, get_cleaned_text,
														
 
															-    get_user, escape_html_chars, line_drawing_unordered_list, make_button,
														
 
															+    get_user, clean_html_string, line_drawing_unordered_list, make_button,
														
 
															     make_inline_keyboard, remove_html_tags, send_part_of_text_file,
														
 
															     send_csv_file, make_lines_of_buttons
														
 
															 )
														
@@ -130,7 +130,7 @@ def get_talk_panel(bot: Bot,
 
															                 'help_text',
														
 
															                 update=update,
														
 
															                 user_record=user_record,
														
 
															-                q=escape_html_chars(
														
 
															+                q=clean_html_string(
														
 
															                     remove_html_tags(text)
														
 
															                 )
														
 
															             )
														
@@ -155,7 +155,7 @@ def get_talk_panel(bot: Bot,
 
															                 'user_not_found',
														
 
															                 update=update,
														
 
															                 user_record=user_record,
														
 
															-                q=escape_html_chars(
														
 
															+                q=clean_html_string(
														
 
															                     remove_html_tags(text)
														
 
															                 )
														
 
															             )
														
--- a/davtelepot/api.py
+++ b/davtelepot/api.py
@@ -382,7 +382,7 @@ class TelegramBot:
 
															     @staticmethod
														
 
															     def adapt_parameters(parameters, exclude=None):
														
 
															-        """Build a aiohttp.FormData object from given `parameters`.
														
 
															+        """Build an aiohttp.FormData object from given `parameters`.
														
 
															         Exclude `self`, empty values and parameters in `exclude` list.
														
 
															         Cast integers to string to avoid TypeError during json serialization.
														
@@ -1058,7 +1058,7 @@ class TelegramBot:
 
															             unbanned first.
														
 
															         Note: In regular groups (non-supergroups), this method will only work
														
 
															             if the ‘All Members Are Admins’ setting is off in the target group.
														
 
															-            Otherwise members may only be removed by the group's creator or by
														
 
															+            Otherwise, members may only be removed by the group's creator or by
														
 
															             the member that added them.
														
 
															         See https://core.telegram.org/bots/api#kickchatmember for details.
														
 
															         """
														
@@ -1245,7 +1245,7 @@ class TelegramBot:
 
															         )
														
 
															     async def getChat(self, chat_id: Union[int, str]):
														
 
															-        """Get up to date information about the chat.
														
 
															+        """Get up-to-date information about the chat.
														
 
															         Return a Chat object on success.
														
 
															         See https://core.telegram.org/bots/api#getchat for details.
														
--- a/davtelepot/bot.py
+++ b/davtelepot/bot.py
@@ -54,7 +54,7 @@ from davtelepot.database import ObjectWithDatabase
 
															 from davtelepot.languages import MultiLanguageObject
														
 
															 from davtelepot.messages import davtelepot_messages
														
 
															 from davtelepot.utilities import (
														
 
															-    async_get, escape_html_chars, extract, get_secure_key,
														
 
															+    async_get, clean_html_string, extract, get_secure_key,
														
 
															     make_inline_query_answer, make_lines_of_buttons, remove_html_tags
														
 
															 )
														
@@ -69,7 +69,7 @@ logging.getLogger('chardet').setLevel(logging.WARNING)
 
															 class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
														
 
															     """Simple Bot object, providing methods corresponding to Telegram bot API.
														
 
															-    Multiple Bot() instances may be run together, along with a aiohttp web app.
														
 
															+    Multiple Bot() instances may be run together, along with an aiohttp web app.
														
 
															     """
														
 
															     bots = []
														
@@ -347,7 +347,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															     @property
														
 
															     def errors_file_path(self):
														
 
															-        """Return errors file path basing on self.path and `_errors_file_name`.
														
 
															+        """Return errors file path based on `self.path` and `_errors_file_name`.
														
 
															         Fallback to class file if set, otherwise return None.
														
 
															         """
														
@@ -417,7 +417,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															         """Maximum number of simultaneous HTTPS connections allowed.
														
 
															         Telegram will open as many connections as possible to boost bot’s
														
 
															-            throughput, lower values limit the load on bot‘s server.
														
 
															+            throughput; lower values limit the load on the bot's server.
														
 
															         """
														
 
															         return self._max_connections
														
@@ -477,7 +477,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															     def allowed_during_maintenance(self):
														
 
															         """Return the list of criteria to allow an update during maintenance.
														
 
															-        If any of this criteria returns True on an update, that update will be
														
 
															+        If any of these criteria returns True on an update, that update will be
														
 
															             handled even during maintenance.
														
 
															         """
														
 
															         return self._allowed_during_maintenance
														
@@ -858,7 +858,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															             elif 'chat' in update and update['chat']['id'] > 0:
														
 
															                 reply = dict(text=self.unknown_command_message)
														
 
															         else:  # Handle command aliases and text parsers
														
 
															-            # Aliases are case insensitive: text and alias are both .lower()
														
 
															+            # Aliases are case-insensitive: text and alias are both .lower()
														
 
															             for alias, function in self.command_aliases.items():
														
 
															                 if lowered_text.startswith(alias.lower()):
														
 
															                     replier = function
														
@@ -1222,7 +1222,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															         with proper code markdown.
														
 
															         """
														
 
															         if parse_mode == 'HTML':
														
 
															-            text = escape_html_chars(text)
														
 
															+            text = clean_html_string(text)
														
 
															         tags = (
														
 
															             ('`', '`')
														
 
															             if parse_mode == 'Markdown'
														
@@ -1591,7 +1591,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															                             photo.startswith(url_starter)
														
 
															                             for url_starter in ('http', 'www',)
														
 
															                         ]
														
 
															-                ):  # If `photo` is not a url but a local file path
														
 
															+                ):  # If `photo` is not a URL but a local file path
														
 
															                     try:
														
 
															                         with io.BytesIO() as buffered_picture:
														
 
															                             with open(
														
@@ -1716,7 +1716,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															                             audio.startswith(url_starter)
														
 
															                             for url_starter in ('http', 'www',)
														
 
															                         ]
														
 
															-                ):  # If `audio` is not a url but a local file path
														
 
															+                ):  # If `audio` is not a URL but a local file path
														
 
															                     try:
														
 
															                         with io.BytesIO() as buffered_picture:
														
 
															                             with open(
														
@@ -1841,7 +1841,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															                             voice.startswith(url_starter)
														
 
															                             for url_starter in ('http', 'www',)
														
 
															                         ]
														
 
															-                ):  # If `voice` is not a url but a local file path
														
 
															+                ):  # If `voice` is not a URL but a local file path
														
 
															                     try:
														
 
															                         with io.BytesIO() as buffered_picture:
														
 
															                             with open(
														
@@ -1977,7 +1977,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															                             document_path.startswith(url_starter)
														
 
															                             for url_starter in ('http', 'www',)
														
 
															                         ]
														
 
															-                ):  # If `document_path` is not a url but a local file path
														
 
															+                ):  # If `document_path` is not a URL but a local file path
														
 
															                     try:
														
 
															                         with open(
														
 
															                             document_path.format(
														
@@ -3162,7 +3162,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
															         allowed_updates : List(str)
														
 
															             List of update types to be retrieved.
														
 
															             Empty list to allow all updates.
														
 
															-            None to fallback to class default.
														
 
															+            None to fall back to class default.
														
 
															         """
														
 
															         # Return if token is invalid
														
 
															         await self.get_me()
														
--- a/davtelepot/utilities.py
+++ b/davtelepot/utilities.py
@@ -16,9 +16,9 @@ import string
 
															 import time
														
 
															 from difflib import SequenceMatcher
														
 
															+from typing import Tuple, Union
														
 
															 # Third party modules
														
 
															-from typing import Tuple, Union
														
 
															 import aiohttp
														
 
															 from bs4 import BeautifulSoup
														
@@ -1251,7 +1251,7 @@ def parse_datetime_interval_string(text):
 
															             result_text.pop()
														
 
															             if len(result_text) > 0 and result_text[-1].lower() in TIME_WORDS:
														
 
															                 result_text.pop()
														
 
															-    result_text = escape_html_chars(
														
 
															+    result_text = clean_html_string(
														
 
															         ' '.join(result_text)
														
 
															     )
														
 
															     parsers = list(
														
@@ -1330,6 +1330,22 @@ MONTH_NAMES_ITA[10] = "ottobre"
 
															 MONTH_NAMES_ITA[11] = "novembre"
														
 
															 MONTH_NAMES_ITA[12] = "dicembre"
														
 
															+allowed_html_tags = ['b', 'strong',
														
 
															+                     'i', 'em',
														
 
															+                     'u', 'ins',
														
 
															+                     's', 'strike', 'del',
														
 
															+                     'span', 'tg-spoiler',
														
 
															+                     'a',
														
 
															+                     'code', 'pre']
														
 
															+
														
 
															+HTML_SYMBOLS = collections.OrderedDict()
														
 
															+HTML_SYMBOLS["&"] = "&amp;"
														
 
															+HTML_SYMBOLS["<"] = "&lt;"
														
 
															+HTML_SYMBOLS[">"] = "&gt;"
														
 
															+HTML_SYMBOLS["\""] = "&quot;"
														
 
															+
														
 
															+html_numeric_code_regex = re.compile(r'&amp;(?P<code>#\d{2,3};)')
														
 
															+
														
 
															 def beautytd(td):
														
 
															     """Format properly timedeltas."""
														
@@ -1410,67 +1426,56 @@ def beautydt(dt):
 
															     return result
														
 
															-HTML_SYMBOLS = MyOD()
														
 
															-HTML_SYMBOLS["&"] = "&amp;"
														
 
															-HTML_SYMBOLS["<"] = "&lt;"
														
 
															-HTML_SYMBOLS[">"] = "&gt;"
														
 
															-HTML_SYMBOLS["\""] = "&quot;"
														
 
															-HTML_SYMBOLS["&lt;b&gt;"] = "<b>"
														
 
															-HTML_SYMBOLS["&lt;/b&gt;"] = "</b>"
														
 
															-HTML_SYMBOLS["&lt;i&gt;"] = "<i>"
														
 
															-HTML_SYMBOLS["&lt;/i&gt;"] = "</i>"
														
 
															-HTML_SYMBOLS["&lt;code&gt;"] = "<code>"
														
 
															-HTML_SYMBOLS["&lt;/code&gt;"] = "</code>"
														
 
															-HTML_SYMBOLS["&lt;pre&gt;"] = "<pre>"
														
 
															-HTML_SYMBOLS["&lt;/pre&gt;"] = "</pre>"
														
 
															-HTML_SYMBOLS["&lt;a href=&quot;"] = "<a href=\""
														
 
															-HTML_SYMBOLS["&quot;&gt;"] = "\">"
														
 
															-HTML_SYMBOLS["&lt;/a&gt;"] = "</a>"
														
 
															-
														
 
															-HTML_TAGS = [
														
 
															-    None, "<b>", "</b>",
														
 
															-    None, "<i>", "</i>",
														
 
															-    None, "<code>", "</code>",
														
 
															-    None, "<pre>", "</pre>",
														
 
															-    None, "<a href=\"", "\">", "</a>",
														
 
															-    None
														
 
															-]
														
 
															-
														
 
															+def clean_html_string(text: str) -> str:
														
 
															+    """Escape HTML symbols, unless part of a valid tag or numeric code character.
														
 
															-def remove_html_tags(text):
														
 
															-    """Remove HTML tags from `text`."""
														
 
															-    for tag in HTML_TAGS:
														
 
															-        if tag is None:
														
 
															-            continue
														
 
															-        text = text.replace(tag, '')
														
 
															+    Find valid HTML tags;
														
 
															+    if there are any, choose the first occurring and call the function
														
 
															+        recursively on what comes before the tag, inside the tag and after the
														
 
															+        tag, preserving the opening and closing tags as they are;
														
 
															+    if there aren't any, escape HTML symbols except for `&` in HTML numeric code
														
 
															+        characters (`&#` followed by 2 or 3 digits followed by `;`).
														
 
															+    """
														
 
															+    first_match = None
														
 
															+    for tag in allowed_html_tags:
														
 
															+        if tag in ('a', ):  # <a> must have href attribute
														
 
															+            attribute = r" href=\".*\""
														
 
															+        elif tag in ('span', ):  # <span> must have class attribute with "tg-spoiler" value
														
 
															+            attribute = r" class=\"tg-spoiler\""
														
 
															+        elif tag in ('code',):  # <code> may have a class with a programming language as value
														
 
															+            attribute = r"( class=\".*\")?"
														
 
															+        else:
														
 
															+            attribute = ""
														
 
															+        match = re.search(
														
 
															+            rf'(?P<opening><{tag}{attribute}>)'
														
 
															+            rf'(?P<body>.*?)'
														
 
															+            rf'(?P<close></{tag}>)',
														
 
															+            text
														
 
															+        )
														
 
															+        if match and (first_match is None or match.start() < first_match.start()):
														
 
															+            first_match = match
														
 
															+    if first_match is not None:
														
 
															+        groups = first_match.groupdict()
														
 
															+        text = (f"{clean_html_string(text[:first_match.start()])}"
														
 
															+                f"{groups['opening']}{clean_html_string(groups['body'])}{groups['close']}"
														
 
															+                f"{clean_html_string(text[first_match.end():])}")
														
 
															+    else:
														
 
															+        for key, value in HTML_SYMBOLS.items():
														
 
															+            text = text.replace(key, value)
														
 
															+        if re.search(html_numeric_code_regex, text):
														
 
															+            text = re.sub(html_numeric_code_regex, r'&\g<code>', text)
														
 
															     return text
														
 
															 def escape_html_chars(text):
														
 
															-    """Escape HTML chars if not part of a tag."""
														
 
															-    for s, r in HTML_SYMBOLS.items():
														
 
															-        text = text.replace(s, r)
														
 
															-    copy = text
														
 
															-    expected_tag = None
														
 
															-    while copy:
														
 
															-        min_ = min(
														
 
															-            (
														
 
															-                dict(
														
 
															-                    position=copy.find(tag) if tag in copy else len(copy),
														
 
															-                    tag=tag
														
 
															-                )
														
 
															-                for tag in HTML_TAGS
														
 
															-                if tag
														
 
															-            ),
														
 
															-            key=lambda x: x['position'],
														
 
															-            default=0
														
 
															-        )
														
 
															-        if min_['position'] == len(copy):
														
 
															-            break
														
 
															-        if expected_tag and min_['tag'] != expected_tag:
														
 
															-            return text.replace('<', '_').replace('>', '_')
														
 
															-        expected_tag = HTML_TAGS[HTML_TAGS.index(min_['tag'])+1]
														
 
															-        copy = extract(copy, min_['tag'])
														
 
															+    logging.error("`escape_html_chars` function deprecated, use `clean_html_string` instead.")
														
 
															+    return clean_html_string(text)
														
 
															+
														
 
															+
														
 
															+def remove_html_tags(text):
														
 
															+    """Remove HTML tags from `text`."""
														
 
															+    for tag in allowed_html_tags:
														
 
															+        text = re.sub(rf'</?{tag}( (href|class)=\".*\")?>', '', text)
														
 
															     return text