mirror of
https://github.com/Mayuri-Chan/pyrofork.git
synced 2025-12-29 12:04:51 +00:00
pyrofork: Adapt markdown unparser from telethon
* The problem with the current implementation is that when markdown is nested inside a URL, the order of the delimiters gets mixed up. For example, a link with bold text is unparsed like this: [**github](https://github.com**). Signed-off-by: wulan17 <wulan17@nusantararom.org>
This commit is contained in:
parent
5c9470fd4f
commit
bec31032cc
2 changed files with 67 additions and 67 deletions
|
|
@ -182,78 +182,65 @@ class Markdown:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def unparse(text: str, entities: list):
|
def unparse(text: str, entities: list):
|
||||||
|
"""
|
||||||
|
Performs the reverse operation to .parse(), effectively returning
|
||||||
|
markdown-like syntax given a normal text and its MessageEntity's.
|
||||||
|
|
||||||
|
:param text: the text to be reconverted into markdown.
|
||||||
|
:param entities: list of MessageEntity's applied to the text.
|
||||||
|
:return: a markdown-like text representing the combination of both inputs.
|
||||||
|
"""
|
||||||
|
delimiters = {
|
||||||
|
MessageEntityType.BOLD: BOLD_DELIM,
|
||||||
|
MessageEntityType.ITALIC: ITALIC_DELIM,
|
||||||
|
MessageEntityType.UNDERLINE: UNDERLINE_DELIM,
|
||||||
|
MessageEntityType.STRIKETHROUGH: STRIKE_DELIM,
|
||||||
|
MessageEntityType.CODE: CODE_DELIM,
|
||||||
|
MessageEntityType.PRE: PRE_DELIM,
|
||||||
|
MessageEntityType.BLOCKQUOTE: BLOCKQUOTE_DELIM,
|
||||||
|
MessageEntityType.EXPANDABLE_BLOCKQUOTE: BLOCKQUOTE_EXPANDABLE_DELIM,
|
||||||
|
MessageEntityType.SPOILER: SPOILER_DELIM
|
||||||
|
}
|
||||||
|
|
||||||
text = utils.add_surrogates(text)
|
text = utils.add_surrogates(text)
|
||||||
|
|
||||||
entities_offsets = []
|
insert_at = []
|
||||||
|
for i, entity in enumerate(entities):
|
||||||
for entity in entities:
|
s = entity.offset
|
||||||
entity_type = entity.type
|
e = entity.offset + entity.length
|
||||||
start = entity.offset
|
delimiter = delimiters.get(entity.type, None)
|
||||||
end = start + entity.length
|
if delimiter:
|
||||||
|
open_delimiter = delimiter
|
||||||
if entity_type == MessageEntityType.BOLD:
|
close_delimiter = delimiter
|
||||||
start_tag = end_tag = BOLD_DELIM
|
if entity.type == MessageEntityType.PRE:
|
||||||
elif entity_type == MessageEntityType.ITALIC:
|
close_delimiter = '\n' + delimiter
|
||||||
start_tag = end_tag = ITALIC_DELIM
|
if entity.language:
|
||||||
elif entity_type == MessageEntityType.UNDERLINE:
|
open_delimiter += entity.language + '\n'
|
||||||
start_tag = end_tag = UNDERLINE_DELIM
|
|
||||||
elif entity_type == MessageEntityType.STRIKETHROUGH:
|
|
||||||
start_tag = end_tag = STRIKE_DELIM
|
|
||||||
elif entity_type == MessageEntityType.CODE:
|
|
||||||
start_tag = end_tag = CODE_DELIM
|
|
||||||
elif entity_type == MessageEntityType.PRE:
|
|
||||||
language = getattr(entity, "language", "") or ""
|
|
||||||
start_tag = f"{PRE_DELIM}{language}\n"
|
|
||||||
end_tag = f"\n{PRE_DELIM}"
|
|
||||||
elif entity_type == MessageEntityType.BLOCKQUOTE:
|
|
||||||
if entity.collapsed:
|
|
||||||
start_tag = BLOCKQUOTE_EXPANDABLE_DELIM + " "
|
|
||||||
else:
|
else:
|
||||||
start_tag = BLOCKQUOTE_DELIM + " "
|
open_delimiter += '\n'
|
||||||
end_tag = ""
|
insert_at.append((s, i, open_delimiter))
|
||||||
blockquote_text = text[start:end]
|
insert_at.append((e, -i, close_delimiter))
|
||||||
lines = blockquote_text.split("\n")
|
else:
|
||||||
last_length = 0
|
url = None
|
||||||
for line in lines:
|
if entity.type == MessageEntityType.TEXT_LINK:
|
||||||
if len(line) == 0 and last_length == end:
|
|
||||||
continue
|
|
||||||
start_offset = start+last_length
|
|
||||||
last_length = last_length+len(line)
|
|
||||||
end_offset = start_offset+last_length
|
|
||||||
entities_offsets.append((start_tag, start_offset,))
|
|
||||||
entities_offsets.append((end_tag, end_offset,))
|
|
||||||
last_length = last_length+1
|
|
||||||
continue
|
|
||||||
elif entity_type == MessageEntityType.SPOILER:
|
|
||||||
start_tag = end_tag = SPOILER_DELIM
|
|
||||||
elif entity_type == MessageEntityType.TEXT_LINK:
|
|
||||||
url = entity.url
|
url = entity.url
|
||||||
start_tag = "["
|
elif entity.type == MessageEntityType.TEXT_MENTION:
|
||||||
end_tag = f"]({url})"
|
url = f'tg://user?id={entity.user.id}'
|
||||||
elif entity_type == MessageEntityType.TEXT_MENTION:
|
if url:
|
||||||
user = entity.user
|
insert_at.append((s, i, '['))
|
||||||
start_tag = "["
|
insert_at.append((e, -i, f']({url})'))
|
||||||
end_tag = f"](tg://user?id={user.id})"
|
|
||||||
elif entity_type == MessageEntityType.CUSTOM_EMOJI:
|
|
||||||
emoji_id = entity.custom_emoji_id
|
|
||||||
start_tag = ""
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
entities_offsets.append((start_tag, start,))
|
insert_at.sort(key=lambda t: (t[0], t[1]))
|
||||||
entities_offsets.append((end_tag, end,))
|
while insert_at:
|
||||||
|
at, _, what = insert_at.pop()
|
||||||
|
|
||||||
entities_offsets = map(
|
# If we are in the middle of a surrogate, nudge the position forward by 1.
|
||||||
lambda x: x[1],
|
# Otherwise we would end up with malformed text and fail to encode.
|
||||||
sorted(
|
# For example of bad input: "Hi \ud83d\ude1c"
|
||||||
enumerate(entities_offsets),
|
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
|
||||||
key=lambda x: (x[1][1], x[0]),
|
while utils.within_surrogate(text, at):
|
||||||
reverse=True
|
at += 1
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
for entity, offset in entities_offsets:
|
text = text[:at] + what + text[at:]
|
||||||
text = text[:offset] + entity + text[offset:]
|
|
||||||
|
|
||||||
return utils.remove_surrogates(text)
|
return utils.remove_surrogates(text)
|
||||||
|
|
|
||||||
|
|
@ -40,3 +40,16 @@ def remove_surrogates(text):
|
||||||
|
|
||||||
def replace_once(source: str, old: str, new: str, start: int):
|
def replace_once(source: str, old: str, new: str, start: int):
|
||||||
return source[:start] + source[start:].replace(old, new, 1)
|
return source[:start] + source[start:].replace(old, new, 1)
|
||||||
|
|
||||||
|
def within_surrogate(text, index, *, length=None):
    """
    Return ``True`` if ``index`` lies inside a UTF-16 surrogate pair.

    "Inside" means strictly between two code units: the unit just before
    ``index`` is a high surrogate (``\\ud800``-``\\udbff``) and the unit at
    ``index`` is itself a surrogate (``\\ud800``-``\\udfff``). Inserting text
    at such a position would split the pair.

    :param text: string that may contain surrogate code units.
    :param index: position in ``text`` to test.
    :param length: optional precomputed length of ``text``. Values larger
        than ``len(text)`` are clamped so indexing can never go out of range.
    :return: ``True`` when ``index`` splits a surrogate pair, else ``False``.
    """
    text_length = len(text)
    if length is None or length > text_length:
        length = text_length

    return (
        # Index 0 has no previous unit; index 1 is the first position that
        # can sit between the two halves of a pair, so the bound is 0, not 1.
        0 < index < length and
        '\ud800' <= text[index - 1] <= '\udbff' and  # previous is a high surrogate
        '\ud800' <= text[index] <= '\udfff'  # current is a surrogate
    )
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue