pyrofork/pyrogram/parser/markdown.py
wulan17 9fd572d8a2
Some checks failed
Build-docs / build (push) Has been cancelled
Pyrofork / build (macos-latest, 3.10) (push) Has been cancelled
Pyrofork / build (macos-latest, 3.11) (push) Has been cancelled
Pyrofork / build (macos-latest, 3.12) (push) Has been cancelled
Pyrofork / build (macos-latest, 3.13) (push) Has been cancelled
Pyrofork / build (macos-latest, 3.9) (push) Has been cancelled
Pyrofork / build (ubuntu-latest, 3.10) (push) Has been cancelled
Pyrofork / build (ubuntu-latest, 3.11) (push) Has been cancelled
Pyrofork / build (ubuntu-latest, 3.12) (push) Has been cancelled
Pyrofork / build (ubuntu-latest, 3.13) (push) Has been cancelled
Pyrofork / build (ubuntu-latest, 3.9) (push) Has been cancelled
pyrofork: Cleanup codes
Signed-off-by: wulan17 <wulan17@nusantararom.org>
2025-03-27 02:26:24 +07:00

290 lines
11 KiB
Python

# Pyrofork - Telegram MTProto API Client Library for Python
# Copyright (C) 2017-present Dan <https://github.com/delivrance>
# Copyright (C) 2022-present Mayuri-Chan <https://github.com/Mayuri-Chan>
#
# This file is part of Pyrofork.
#
# Pyrofork is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pyrofork is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrofork. If not, see <http://www.gnu.org/licenses/>.
import html
import re
from typing import Optional
import pyrogram
from pyrogram.enums import MessageEntityType
from . import utils
from .html import HTML
# Inline markdown delimiters understood by the parser.
BOLD_DELIM = "**"
ITALIC_DELIM = "__"
UNDERLINE_DELIM = "--"
STRIKE_DELIM = "~~"
SPOILER_DELIM = "||"
CODE_DELIM = "`"
PRE_DELIM = "```"

# Line-prefix markers for (expandable) blockquotes.
BLOCKQUOTE_DELIM = ">"
BLOCKQUOTE_EXPANDABLE_DELIM = "**>"

# One capturing group that alternates over every inline delimiter. Each
# character of a delimiter is backslash-escaped individually, and PRE_DELIM is
# listed before CODE_DELIM so that "```" wins over a single "`".
MARKDOWN_RE = re.compile(
    "({})".format(
        "|".join(
            "".join("\\" + char for char in delim)
            for delim in (
                PRE_DELIM,
                CODE_DELIM,
                STRIKE_DELIM,
                UNDERLINE_DELIM,
                ITALIC_DELIM,
                BOLD_DELIM,
                SPOILER_DELIM,
            )
        )
    )
)

# "[label](target)" links; an optional leading "!" is captured as group 1.
URL_RE = re.compile(r"(!?)\[(.+?)\]\((.+?)\)")

# Templates for the HTML-like markup produced from markdown.
OPENING_TAG = "<{}>"
CLOSING_TAG = "</{}>"
URL_MARKUP = '<a href="{}">{}</a>'
EMOJI_MARKUP = '<emoji id={}>{}</emoji>'

# Delimiters that open/close fixed-width (code) spans.
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]

# Literal <code>...</code> sections on a single line (non-greedy).
CODE_TAG_RE = re.compile(r"<code>.*?</code>")
class Markdown:
    """Markdown-flavoured parser layered on top of the HTML parser.

    ``parse`` rewrites markdown delimiters into HTML-like tags and passes the
    result to the wrapped ``HTML`` parser; ``unparse`` rebuilds markdown-like
    syntax from a text and its entities.
    """

    def __init__(self, client: Optional["pyrogram.Client"]):
        # The converted markup is handed to this parser at the end of parse().
        self.html = HTML(client)

    @staticmethod
    def _emoji_id_from_url(url: str) -> str:
        """Return the custom emoji id carried by a ``tg://emoji?id=<id>`` URL.

        The prefix is removed as a whole string. ``str.lstrip`` is not
        suitable here: it strips any leading characters contained in its
        argument, so it would also eat the beginning of an id made of those
        characters. A URL without the prefix is returned unchanged.
        """
        prefix = "tg://emoji?id="
        if url.startswith(prefix):
            return url[len(prefix):]
        return url

    def blockquote_parser(self, text):
        """Convert ``>`` / ``**>`` prefixed lines into blockquote tags.

        Consecutive quoted lines are merged into a single
        ``<blockquote>`` (or ``<blockquote expandable>``) element; the kind is
        decided by the first line of the run.

        :param text: the (possibly HTML-escaped) text to scan line by line.
        :return: the text with blockquote markers replaced by tags.
        """
        # html.escape() turns the quote marker into "&gt;". Restore it at line
        # starts for both the plain and the expandable ("**>") marker so that
        # quotes are still recognised in strict mode.
        text = re.sub(r'^((?:\*\*)?)&gt;', r'\1>', text, flags=re.MULTILINE)

        result = []
        in_blockquote = False

        for line in text.split('\n'):
            if line.startswith(BLOCKQUOTE_DELIM):
                marker, tag_name = BLOCKQUOTE_DELIM, "blockquote"
            elif line.startswith(BLOCKQUOTE_EXPANDABLE_DELIM):
                marker, tag_name = BLOCKQUOTE_EXPANDABLE_DELIM, "blockquote expandable"
            else:
                # First non-quoted line after a quote run: close the element.
                if in_blockquote:
                    line = CLOSING_TAG.format("blockquote") + line
                    in_blockquote = False
                result.append(line)
                continue

            content = line[len(marker):]

            if in_blockquote:
                # Continuation line: only the marker is dropped.
                result.append(content.strip())
                continue

            # Opening line: drop the marker plus one optional following space.
            if content.startswith(" "):
                content = content[1:]
            result.append((OPENING_TAG.format(tag_name) + content).strip())
            in_blockquote = True

        # The text ended while still inside a quote: close it on the last line.
        if in_blockquote:
            result[-1] += CLOSING_TAG.format("blockquote")

        return '\n'.join(result)

    async def parse(self, text: str, strict: bool = False):
        """Convert markdown in *text* to HTML-like markup and parse it.

        :param text: the markdown-formatted text.
        :param strict: when true, the text is HTML-escaped first so that
            literal markup in the input is not interpreted as tags.
        :return: the result of ``HTML.parse`` on the converted text.
        """
        if strict:
            text = html.escape(text)

        text = self.blockquote_parser(text)

        # Shield literal <code>...</code> sections from delimiter processing by
        # swapping them for placeholders; they are restored before HTML parsing.
        placeholders = {}
        for i, code_section in enumerate(CODE_TAG_RE.findall(text)):
            placeholder = f"{{CODE_SECTION_{i}}}"
            placeholders[placeholder] = code_section
            text = text.replace(code_section, placeholder, 1)

        tag_names = {
            BOLD_DELIM: "b",
            ITALIC_DELIM: "i",
            UNDERLINE_DELIM: "u",
            STRIKE_DELIM: "s",
            CODE_DELIM: "code",
            PRE_DELIM: "pre",
            SPOILER_DELIM: "spoiler",
        }

        delims = set()  # delimiters currently open
        is_fixed_width = False

        # Matches are computed on the text as it is now; `text` is rebound
        # while iterating, and each match start is passed to
        # utils.replace_once as the position hint.
        # NOTE(review): presumably replace_once replaces the first occurrence
        # at or after that position - confirm against utils.
        for match in MARKDOWN_RE.finditer(text):
            start = match.start()
            delim = match.group(1)

            if delim in FIXED_WIDTH_DELIMS:
                is_fixed_width = not is_fixed_width

            # Other delimiters are left literal inside code / pre spans.
            if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:
                continue

            tag = tag_names.get(delim)
            if tag is None:
                continue

            if delim not in delims:
                delims.add(delim)
                tag = OPENING_TAG.format(tag)
            else:
                delims.remove(delim)
                tag = CLOSING_TAG.format(tag)

            if delim == PRE_DELIM and delim in delims:
                # Opening fence: the rest of its line is taken as the language.
                delim_and_language = text[text.find(PRE_DELIM):].split("\n")[0]
                language = delim_and_language[len(PRE_DELIM):]
                text = utils.replace_once(
                    text, delim_and_language, f'<pre language="{language}">', start
                )
                continue

            text = utils.replace_once(text, delim, tag, start)

        for match in URL_RE.finditer(text):
            start = match.start()
            is_emoji, text_url, url = match.groups()
            full = match.group(0)

            if is_emoji:
                # "![emoji](tg://emoji?id=<id>)" -> custom emoji markup.
                markup = EMOJI_MARKUP.format(self._emoji_id_from_url(url), text_url)
            elif text_url:
                markup = URL_MARKUP.format(url, text_url)
            else:
                continue

            text = utils.replace_once(text, full, markup, start)

        for placeholder, code_section in placeholders.items():
            text = text.replace(placeholder, code_section)

        return await self.html.parse(text)

    @staticmethod
    def unparse(text: str, entities: list):
        """
        Performs the reverse operation to .parse(), effectively returning
        markdown-like syntax given a normal text and its MessageEntity's.

        :param text: the text to be reconverted into markdown.
        :param entities: list of MessageEntity's applied to the text.
        :return: a markdown-like text representing the combination of both inputs.
        """
        delimiters = {
            MessageEntityType.BOLD: BOLD_DELIM,
            MessageEntityType.ITALIC: ITALIC_DELIM,
            MessageEntityType.UNDERLINE: UNDERLINE_DELIM,
            MessageEntityType.STRIKETHROUGH: STRIKE_DELIM,
            MessageEntityType.CODE: CODE_DELIM,
            MessageEntityType.PRE: PRE_DELIM,
            MessageEntityType.BLOCKQUOTE: BLOCKQUOTE_DELIM,
            MessageEntityType.EXPANDABLE_BLOCKQUOTE: BLOCKQUOTE_EXPANDABLE_DELIM,
            MessageEntityType.SPOILER: SPOILER_DELIM
        }
        quote_types = (
            MessageEntityType.BLOCKQUOTE,
            MessageEntityType.EXPANDABLE_BLOCKQUOTE,
        )

        text = utils.add_surrogates(text)

        # (position, priority, string) triples, applied back-to-front below.
        insert_at = []

        for i, entity in enumerate(entities):
            s = entity.offset
            e = entity.offset + entity.length
            delimiter = delimiters.get(entity.type, None)

            if not delimiter:
                # Entities rendered as "[text](url)" / "![emoji](url)".
                url = None
                is_emoji = False

                if entity.type == MessageEntityType.TEXT_LINK:
                    url = entity.url
                elif entity.type == MessageEntityType.TEXT_MENTION:
                    url = f'tg://user?id={entity.user.id}'
                elif entity.type == MessageEntityType.CUSTOM_EMOJI:
                    url = f"tg://emoji?id={entity.custom_emoji_id}"
                    is_emoji = True

                if url:
                    insert_at.append((s, i, '![' if is_emoji else '['))
                    insert_at.append((e, -i, f']({url})'))
                continue

            if entity.type in quote_types:
                # Multiline blockquotes: prefix every line of the quoted span.
                # Blockquotes have no closing delimiter.
                line_start = s
                for quoted_line in text[s:e].splitlines():
                    if entity.collapsed:
                        insert_at.append((line_start, i, BLOCKQUOTE_EXPANDABLE_DELIM))
                    else:
                        insert_at.append((line_start, i, BLOCKQUOTE_DELIM))
                    # +1 accounts for the line separator that follows.
                    line_start += len(quoted_line) + 1
                continue

            open_delimiter = delimiter
            close_delimiter = delimiter

            if entity.type == MessageEntityType.PRE:
                def encloses(blk_entity):
                    # True when the PRE span lies entirely within blk_entity.
                    blk_start = blk_entity.offset
                    blk_end = blk_entity.offset + blk_entity.length
                    return blk_start <= s < blk_end and blk_start < e <= blk_end

                inside_blockquote = any(
                    encloses(blk_entity)
                    for blk_entity in entities
                    if blk_entity.type == MessageEntityType.BLOCKQUOTE
                )
                is_expandable = any(
                    encloses(blk_entity) and blk_entity.collapsed
                    for blk_entity in entities
                    if blk_entity.type == MessageEntityType.BLOCKQUOTE
                )

                if inside_blockquote:
                    # Keep the code body quoted: the quote marker is re-emitted
                    # after the opening fence and before the closing one.
                    quote_marker = (
                        BLOCKQUOTE_EXPANDABLE_DELIM if is_expandable else BLOCKQUOTE_DELIM
                    )
                    language = entity.language or ""
                    open_delimiter = f"{delimiter}{language}\n{quote_marker}"
                    close_delimiter = f"\n{quote_marker}{delimiter}"

            insert_at.append((s, i, open_delimiter))
            insert_at.append((e, -i, close_delimiter))

        insert_at.sort(key=lambda t: (t[0], t[1]))

        # Pop from the end so insertions run from the highest position down and
        # earlier offsets stay valid.
        while insert_at:
            at, _, what = insert_at.pop()

            # If we are in the middle of a surrogate pair, nudge the position
            # forward by +1 until we are past it. Otherwise we would end up
            # with malformed text and fail to encode.
            # For example of bad input: "Hi \ud83d\ude1c"
            # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
            while utils.within_surrogate(text, at):
                at += 1

            text = text[:at] + what + text[at:]

        return utils.remove_surrogates(text)