Queer European MD passionate about IT
Browse Source

HTML parse_mode check upgraded.

All supported tags permitted, malformed tags replaced with escaped characters
Davte 1 year ago
parent
commit
fafa639328
5 changed files with 82 additions and 77 deletions
  1. 1 1
      davtelepot/__init__.py
  2. 3 3
      davtelepot/administration_tools.py
  3. 3 3
      davtelepot/api.py
  4. 12 12
      davtelepot/bot.py
  5. 63 58
      davtelepot/utilities.py

+ 1 - 1
davtelepot/__init__.py

@@ -11,7 +11,7 @@ __author__ = "Davide Testa"
 __email__ = "davide@davte.it"
 __credits__ = ["Marco Origlia", "Nick Lee @Nickoala"]
 __license__ = "GNU General Public License v3.0"
-__version__ = "2.8.9"
+__version__ = "2.8.10"
 __maintainer__ = "Davide Testa"
 __contact__ = "t.me/davte"
 

+ 3 - 3
davtelepot/administration_tools.py

@@ -27,7 +27,7 @@ from davtelepot.messages import default_admin_messages, default_talk_messages
 from davtelepot.bot import Bot
 from davtelepot.utilities import (
     async_wrapper, CachedPage, Confirmator, extract, get_cleaned_text,
-    get_user, escape_html_chars, line_drawing_unordered_list, make_button,
+    get_user, clean_html_string, line_drawing_unordered_list, make_button,
     make_inline_keyboard, remove_html_tags, send_part_of_text_file,
     send_csv_file, make_lines_of_buttons
 )
@@ -130,7 +130,7 @@ def get_talk_panel(bot: Bot,
                 'help_text',
                 update=update,
                 user_record=user_record,
-                q=escape_html_chars(
+                q=clean_html_string(
                     remove_html_tags(text)
                 )
             )
@@ -155,7 +155,7 @@ def get_talk_panel(bot: Bot,
                 'user_not_found',
                 update=update,
                 user_record=user_record,
-                q=escape_html_chars(
+                q=clean_html_string(
                     remove_html_tags(text)
                 )
             )

+ 3 - 3
davtelepot/api.py

@@ -382,7 +382,7 @@ class TelegramBot:
 
     @staticmethod
     def adapt_parameters(parameters, exclude=None):
-        """Build a aiohttp.FormData object from given `parameters`.
+        """Build an aiohttp.FormData object from given `parameters`.
 
         Exclude `self`, empty values and parameters in `exclude` list.
         Cast integers to string to avoid TypeError during json serialization.
@@ -1058,7 +1058,7 @@ class TelegramBot:
             unbanned first.
         Note: In regular groups (non-supergroups), this method will only work
             if the ‘All Members Are Admins’ setting is off in the target group.
-            Otherwise members may only be removed by the group's creator or by
+            Otherwise, members may only be removed by the group's creator or by
             the member that added them.
         See https://core.telegram.org/bots/api#kickchatmember for details.
         """
@@ -1245,7 +1245,7 @@ class TelegramBot:
         )
 
     async def getChat(self, chat_id: Union[int, str]):
-        """Get up to date information about the chat.
+        """Get up-to-date information about the chat.
 
         Return a Chat object on success.
         See https://core.telegram.org/bots/api#getchat for details.

+ 12 - 12
davtelepot/bot.py

@@ -54,7 +54,7 @@ from davtelepot.database import ObjectWithDatabase
 from davtelepot.languages import MultiLanguageObject
 from davtelepot.messages import davtelepot_messages
 from davtelepot.utilities import (
-    async_get, escape_html_chars, extract, get_secure_key,
+    async_get, clean_html_string, extract, get_secure_key,
     make_inline_query_answer, make_lines_of_buttons, remove_html_tags
 )
 
@@ -69,7 +69,7 @@ logging.getLogger('chardet').setLevel(logging.WARNING)
 class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
     """Simple Bot object, providing methods corresponding to Telegram bot API.
 
-    Multiple Bot() instances may be run together, along with a aiohttp web app.
+    Multiple Bot() instances may be run together, along with an aiohttp web app.
     """
 
     bots = []
@@ -347,7 +347,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
 
     @property
     def errors_file_path(self):
-        """Return errors file path basing on self.path and `_errors_file_name`.
+        """Return errors file path basing on `self.path` and `_errors_file_name`.
 
         Fallback to class file if set, otherwise return None.
         """
@@ -417,7 +417,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
         """Maximum number of simultaneous HTTPS connections allowed.
 
         Telegram will open as many connections as possible to boost bot’s
-            throughput, lower values limit the load on bots server.
+            throughput, lower values limit the load on bot's server.
         """
         return self._max_connections
 
@@ -477,7 +477,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
     def allowed_during_maintenance(self):
         """Return the list of criteria to allow an update during maintenance.
 
-        If any of this criteria returns True on an update, that update will be
+        If any of these criteria returns True on an update, that update will be
             handled even during maintenance.
         """
         return self._allowed_during_maintenance
@@ -858,7 +858,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
             elif 'chat' in update and update['chat']['id'] > 0:
                 reply = dict(text=self.unknown_command_message)
         else:  # Handle command aliases and text parsers
-            # Aliases are case insensitive: text and alias are both .lower()
+            # Aliases are case-insensitive: text and alias are both .lower()
             for alias, function in self.command_aliases.items():
                 if lowered_text.startswith(alias.lower()):
                     replier = function
@@ -1222,7 +1222,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
         with proper code markdown.
         """
         if parse_mode == 'HTML':
-            text = escape_html_chars(text)
+            text = clean_html_string(text)
         tags = (
             ('`', '`')
             if parse_mode == 'Markdown'
@@ -1591,7 +1591,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
                             photo.startswith(url_starter)
                             for url_starter in ('http', 'www',)
                         ]
-                ):  # If `photo` is not a url but a local file path
+                ):  # If `photo` is not a URL but a local file path
                     try:
                         with io.BytesIO() as buffered_picture:
                             with open(
@@ -1716,7 +1716,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
                             audio.startswith(url_starter)
                             for url_starter in ('http', 'www',)
                         ]
-                ):  # If `audio` is not a url but a local file path
+                ):  # If `audio` is not a URL but a local file path
                     try:
                         with io.BytesIO() as buffered_picture:
                             with open(
@@ -1841,7 +1841,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
                             voice.startswith(url_starter)
                             for url_starter in ('http', 'www',)
                         ]
-                ):  # If `voice` is not a url but a local file path
+                ):  # If `voice` is not a URL but a local file path
                     try:
                         with io.BytesIO() as buffered_picture:
                             with open(
@@ -1977,7 +1977,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
                             document_path.startswith(url_starter)
                             for url_starter in ('http', 'www',)
                         ]
-                ):  # If `document_path` is not a url but a local file path
+                ):  # If `document_path` is not a URL but a local file path
                     try:
                         with open(
                             document_path.format(
@@ -3162,7 +3162,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
         allowed_updates : List(str)
             List of update types to be retrieved.
             Empty list to allow all updates.
-            None to fallback to class default.
+            None to fall back to class default.
         """
         # Return if token is invalid
         await self.get_me()

+ 63 - 58
davtelepot/utilities.py

@@ -16,9 +16,9 @@ import string
 import time
 
 from difflib import SequenceMatcher
+from typing import Tuple, Union
 
 # Third party modules
-from typing import Tuple, Union
 
 import aiohttp
 from bs4 import BeautifulSoup
@@ -1251,7 +1251,7 @@ def parse_datetime_interval_string(text):
             result_text.pop()
             if len(result_text) > 0 and result_text[-1].lower() in TIME_WORDS:
                 result_text.pop()
-    result_text = escape_html_chars(
+    result_text = clean_html_string(
         ' '.join(result_text)
     )
     parsers = list(
@@ -1330,6 +1330,22 @@ MONTH_NAMES_ITA[10] = "ottobre"
 MONTH_NAMES_ITA[11] = "novembre"
 MONTH_NAMES_ITA[12] = "dicembre"
 
+allowed_html_tags = ['b', 'strong',
+                     'i', 'em',
+                     'u', 'ins',
+                     's', 'strike', 'del',
+                     'span', 'tg-spoiler',
+                     'a',
+                     'code', 'pre']
+
+HTML_SYMBOLS = collections.OrderedDict()
+HTML_SYMBOLS["&"] = "&amp;"
+HTML_SYMBOLS["<"] = "&lt;"
+HTML_SYMBOLS[">"] = "&gt;"
+HTML_SYMBOLS["\""] = "&quot;"
+
+html_numeric_code_regex = re.compile(r'&amp;(?P<code>#\d{2,3};)')
+
 
 def beautytd(td):
     """Format properly timedeltas."""
@@ -1410,67 +1426,56 @@ def beautydt(dt):
     return result
 
 
-HTML_SYMBOLS = MyOD()
-HTML_SYMBOLS["&"] = "&amp;"
-HTML_SYMBOLS["<"] = "&lt;"
-HTML_SYMBOLS[">"] = "&gt;"
-HTML_SYMBOLS["\""] = "&quot;"
-HTML_SYMBOLS["&lt;b&gt;"] = "<b>"
-HTML_SYMBOLS["&lt;/b&gt;"] = "</b>"
-HTML_SYMBOLS["&lt;i&gt;"] = "<i>"
-HTML_SYMBOLS["&lt;/i&gt;"] = "</i>"
-HTML_SYMBOLS["&lt;code&gt;"] = "<code>"
-HTML_SYMBOLS["&lt;/code&gt;"] = "</code>"
-HTML_SYMBOLS["&lt;pre&gt;"] = "<pre>"
-HTML_SYMBOLS["&lt;/pre&gt;"] = "</pre>"
-HTML_SYMBOLS["&lt;a href=&quot;"] = "<a href=\""
-HTML_SYMBOLS["&quot;&gt;"] = "\">"
-HTML_SYMBOLS["&lt;/a&gt;"] = "</a>"
-
-HTML_TAGS = [
-    None, "<b>", "</b>",
-    None, "<i>", "</i>",
-    None, "<code>", "</code>",
-    None, "<pre>", "</pre>",
-    None, "<a href=\"", "\">", "</a>",
-    None
-]
-
+def clean_html_string(text: str) -> str:
+    """Escape HTML symbols, unless part of a valid tag or numeric code character.
 
-def remove_html_tags(text):
-    """Remove HTML tags from `text`."""
-    for tag in HTML_TAGS:
-        if tag is None:
-            continue
-        text = text.replace(tag, '')
+    Find valid HTML tags;
+    if there are any, choose the first occurring and call the function
+        recursively on what comes before the tag, inside the tag and after the
+        tag, preserving the tag opening and close as they are;
+    if there aren't any, escape HTML symbols except for `&` in HTML numeric code
+        characters (`&#` followed by 2 or 3 digits followed by `;`).
+    """
+    first_match = None
+    for tag in allowed_html_tags:
+        if tag in ('a', ):  # <a> must have href attribute
+            attribute = r" href=\".*\""
+        elif tag in ('span', ):  # <span> must have class attribute with "tg-spoiler" value
+            attribute = r" class=\"tg-spoiler\""
+        elif tag in ('code',):  # <code> may have a class with a programming language as value
+            attribute = r"( class=\".*\")?"
+        else:
+            attribute = ""
+        match = re.search(
+            rf'(?P<opening><{tag}{attribute}>)'
+            rf'(?P<body>.*?)'
+            rf'(?P<close></{tag}>)',
+            text
+        )
+        if match and (first_match is None or match.start() < first_match.start()):
+            first_match = match
+    if first_match is not None:
+        groups = first_match.groupdict()
+        text = (f"{clean_html_string(text[:first_match.start()])}"
+                f"{groups['opening']}{clean_html_string(groups['body'])}{groups['close']}"
+                f"{clean_html_string(text[first_match.end():])}")
+    else:
+        for key, value in HTML_SYMBOLS.items():
+            text = text.replace(key, value)
+        if re.search(html_numeric_code_regex, text):
+            text = re.sub(html_numeric_code_regex, r'&\g<code>', text)
     return text
 
 
 def escape_html_chars(text):
-    """Escape HTML chars if not part of a tag."""
-    for s, r in HTML_SYMBOLS.items():
-        text = text.replace(s, r)
-    copy = text
-    expected_tag = None
-    while copy:
-        min_ = min(
-            (
-                dict(
-                    position=copy.find(tag) if tag in copy else len(copy),
-                    tag=tag
-                )
-                for tag in HTML_TAGS
-                if tag
-            ),
-            key=lambda x: x['position'],
-            default=0
-        )
-        if min_['position'] == len(copy):
-            break
-        if expected_tag and min_['tag'] != expected_tag:
-            return text.replace('<', '_').replace('>', '_')
-        expected_tag = HTML_TAGS[HTML_TAGS.index(min_['tag'])+1]
-        copy = extract(copy, min_['tag'])
+    logging.error("`escape_html_chars` function deprecated, use `clean_html_string` instead.")
+    return clean_html_string(text)
+
+
+def remove_html_tags(text):
+    """Remove HTML tags from `text`."""
+    for tag in allowed_html_tags:
+        text = re.sub(rf'</?{tag}( (href|class)=\".*\")?>', '', text)
     return text