#  Pyrofork - Telegram MTProto API Client Library for Python
#  Copyright (C) 2017-present Dan
#  Copyright (C) 2022-present Mayuri-Chan
#
#  This file is part of Pyrofork.
#
#  Pyrofork is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  Pyrofork is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with Pyrofork.  If not, see <http://www.gnu.org/licenses/>.

import html
import logging
import re
from typing import Optional

import pyrogram
from pyrogram.enums import MessageEntityType
from . import utils
from .html import HTML

BOLD_DELIM = "**"
ITALIC_DELIM = "__"
UNDERLINE_DELIM = "--"
STRIKE_DELIM = "~~"
SPOILER_DELIM = "||"
CODE_DELIM = "`"
PRE_DELIM = "```"
BLOCKQUOTE_DELIM = ">"
BLOCKQUOTE_EXPANDABLE_DELIM = "**>"

# One alternation of every inline delimiter, each character backslash-escaped.
# PRE_DELIM is listed before CODE_DELIM so "```" is not read as three "`".
MARKDOWN_RE = re.compile(r"({d})".format(
    d="|".join(
        ["".join(i) for i in [
            [rf"\{j}" for j in i]
            for i in [
                PRE_DELIM,
                CODE_DELIM,
                STRIKE_DELIM,
                UNDERLINE_DELIM,
                ITALIC_DELIM,
                BOLD_DELIM,
                SPOILER_DELIM
            ]
        ]]
    )))
# Matches "[text](url)" and "![emoji](url)"; group 1 is the optional "!".
URL_RE = re.compile(r"(!?)\[(.+?)\]\((.+?)\)")

OPENING_TAG = "<{}>"
CLOSING_TAG = "</{}>"
URL_MARKUP = '<a href="{}">{}</a>'
EMOJI_MARKUP = '<emoji id={}>{}</emoji>'
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]
CODE_TAG_RE = re.compile(r"<code>.*?</code>")

# HTML tag name emitted for each inline markdown delimiter.
_TAG_BY_DELIM = {
    BOLD_DELIM: "b",
    ITALIC_DELIM: "i",
    UNDERLINE_DELIM: "u",
    STRIKE_DELIM: "s",
    CODE_DELIM: "code",
    PRE_DELIM: "pre",
    SPOILER_DELIM: "spoiler",
}


class Markdown:
    """Translate markdown-like text to HTML and hand it to the HTML parser."""

    def __init__(self, client: Optional["pyrogram.Client"]):
        """
        :param client: client instance forwarded to the underlying HTML parser
            (may be None).
        """
        self.html = HTML(client)

    def blockquote_parser(self, text):
        """
        Rewrite ">"- and "**>"-prefixed lines as <blockquote> HTML.

        Consecutive prefixed lines form one quote: the first line receives the
        opening tag, following lines only lose their prefix, and the closing
        tag is put in front of the first unprefixed line (or appended to the
        last line when the text ends inside a quote).

        :param text: text to scan, optionally already HTML-escaped.
        :return: the text with quote prefixes replaced by blockquote tags.
        """
        # html.escape() turns a leading ">" into "&gt;"; restore it at every
        # line start, including after the "**" of the expandable prefix.
        text = re.sub(r"^(\*\*|)&gt;", r"\1>", text, flags=re.MULTILINE)

        result = []
        in_blockquote = False

        for line in text.split("\n"):
            if line.startswith(BLOCKQUOTE_DELIM):
                delim, tag = BLOCKQUOTE_DELIM, "blockquote"
            elif line.startswith(BLOCKQUOTE_EXPANDABLE_DELIM):
                delim, tag = BLOCKQUOTE_EXPANDABLE_DELIM, "blockquote expandable"
            else:
                if in_blockquote:
                    line = CLOSING_TAG.format("blockquote") + line
                    in_blockquote = False
                result.append(line)
                continue

            content = line[len(delim):]
            if in_blockquote:
                result.append(content.strip())
                continue

            # Opening line: a single space after the prefix belongs to the
            # prefix, any further leading whitespace is kept.
            if content.startswith(" "):
                content = content[1:]
            result.append((OPENING_TAG.format(tag) + content).strip())
            in_blockquote = True

        if in_blockquote:
            result[-1] += CLOSING_TAG.format("blockquote")

        return "\n".join(result)

    async def parse(self, text: str, strict: bool = False):
        """
        Convert markdown-like text to HTML and parse it with the HTML parser.

        :param text: the markdown-like text to parse.
        :param strict: when True the text is HTML-escaped first, so that any
            HTML it already contains is not interpreted as markup.
        :return: whatever ``self.html.parse`` returns for the generated HTML.
        """
        if strict:
            text = html.escape(text)

        text = self.blockquote_parser(text)

        delims = set()
        is_fixed_width = False

        # Shield existing <code>...</code> sections from delimiter and link
        # substitution; they are restored untouched at the end.
        placeholders = {}
        for i, code_section in enumerate(CODE_TAG_RE.findall(text)):
            placeholder = f"{{CODE_SECTION_{i}}}"
            placeholders[placeholder] = code_section
            text = text.replace(code_section, placeholder, 1)

        # finditer() walks the string as it was when the loop started; `start`
        # is only the lower bound from which replace_once() is asked to work.
        for match in re.finditer(MARKDOWN_RE, text):
            start = match.start()
            delim = match.group(1)

            if delim in FIXED_WIDTH_DELIMS:
                is_fixed_width = not is_fixed_width

            # Inside code/pre every other delimiter is literal text.
            if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:
                continue

            tag = _TAG_BY_DELIM.get(delim)
            if tag is None:
                continue

            # First occurrence opens the style, the next one closes it.
            if delim not in delims:
                delims.add(delim)
                tag = OPENING_TAG.format(tag)
            else:
                delims.remove(delim)
                tag = CLOSING_TAG.format(tag)

            if delim == PRE_DELIM and delim in delims:
                # An opening "```" may carry a language up to the line end.
                delim_and_language = text[text.find(PRE_DELIM):].split("\n")[0]
                language = delim_and_language[len(PRE_DELIM):]
                text = utils.replace_once(text, delim_and_language, f'<pre language="{language}">', start)
                continue

            text = utils.replace_once(text, delim, tag, start)

        emoji_url_prefix = "tg://emoji?id="
        for match in re.finditer(URL_RE, text):
            start = match.start()
            is_emoji, text_url, url = match.groups()
            full = match.group(0)

            if not is_emoji and text_url:
                text = utils.replace_once(text, full, URL_MARKUP.format(url, text_url), start)
                continue

            if is_emoji:
                emoji = text_url
                # Remove the exact prefix; str.lstrip() would strip a
                # character set instead.
                if url.startswith(emoji_url_prefix):
                    emoji_id = url[len(emoji_url_prefix):]
                else:
                    emoji_id = url
                text = utils.replace_once(text, full, EMOJI_MARKUP.format(emoji_id, emoji), start)
                continue

        for placeholder, code_section in placeholders.items():
            text = text.replace(placeholder, code_section)

        return await self.html.parse(text)

    @staticmethod
    def unparse(text: str, entities: list):
        """
        Performs the reverse operation to .parse(), effectively returning
        markdown-like syntax given a normal text and its MessageEntity's.

        Inline styles are wrapped in their delimiter on both sides. Block
        quotes are written as a prefix on every line they cover and get no
        closing delimiter, which is the form .parse() reads back. Entity types
        without a markdown representation are left as plain text.

        :param text: the text to be reconverted into markdown.
        :param entities: list of MessageEntity's applied to the text.
        :return: a markdown-like text representing the combination of both inputs.
        """
        # Styles that wrap the entity text: delimiter before and after.
        delimiters = {
            MessageEntityType.BOLD: BOLD_DELIM,
            MessageEntityType.ITALIC: ITALIC_DELIM,
            MessageEntityType.UNDERLINE: UNDERLINE_DELIM,
            MessageEntityType.STRIKETHROUGH: STRIKE_DELIM,
            MessageEntityType.CODE: CODE_DELIM,
            MessageEntityType.PRE: PRE_DELIM,
            MessageEntityType.SPOILER: SPOILER_DELIM
        }
        # Styles that are line prefixes: repeated per line, never closed.
        line_prefixes = {
            MessageEntityType.BLOCKQUOTE: BLOCKQUOTE_DELIM,
            MessageEntityType.EXPANDABLE_BLOCKQUOTE: BLOCKQUOTE_EXPANDABLE_DELIM
        }

        # Offsets and lengths are applied to the surrogate-expanded text.
        text = utils.add_surrogates(text)

        # Rank below every closing rank (-i), so that a quote prefix ends up
        # left of anything else inserted at the same position.
        prefix_rank = -len(entities)

        insert_at = []
        for i, entity in enumerate(entities):
            s = entity.offset
            e = entity.offset + entity.length

            prefix = line_prefixes.get(entity.type, None)
            if prefix:
                insert_at.append((s, prefix_rank, prefix))
                newline = text.find("\n", s, e)
                while newline != -1:
                    # A newline that ends the entity starts no quoted line.
                    if newline + 1 < e:
                        insert_at.append((newline + 1, prefix_rank, prefix))
                    newline = text.find("\n", newline + 1, e)
                continue

            delimiter = delimiters.get(entity.type, None)
            if delimiter:
                open_delimiter = delimiter
                close_delimiter = delimiter
                if entity.type == MessageEntityType.PRE:
                    # The language, if any, follows the opening fence.
                    if entity.language:
                        open_delimiter += entity.language + '\n'
                    else:
                        open_delimiter += '\n'
                insert_at.append((s, i, open_delimiter))
                insert_at.append((e, -i, close_delimiter))
            else:
                url = None
                if entity.type == MessageEntityType.TEXT_LINK:
                    url = entity.url
                elif entity.type == MessageEntityType.TEXT_MENTION:
                    url = 'tg://user?id={}'.format(entity.user.id)
                if url:
                    insert_at.append((s, i, '['))
                    insert_at.append((e, -i, ']({})'.format(url)))

        # Insert from the end of the text backwards so that earlier positions
        # stay valid while the text grows.
        insert_at.sort(key=lambda t: (t[0], t[1]))
        while insert_at:
            at, _, what = insert_at.pop()

            # If we are in the middle of a surrogate pair nudge the position
            # forward by one. Otherwise we would end up with malformed text
            # and fail to encode.
            # For example of bad input: "Hi \ud83d\ude1c"
            # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
            while utils.within_surrogate(text, at):
                at += 1

            text = text[:at] + what + text[at:]

        return utils.remove_surrogates(text)