mirror of
https://github.com/glitch-soc/mastodon.git
synced 2025-03-03 06:51:41 -05:00
Use a tree‐based approach for adv. text formatting
Sanitizing HTML/Markdown means parsing the content into an HTML tree under‐the‐hood anyway, and it is more accurate to do mention/hashtag replacement on the text nodes in that tree than it is to try to hack it in with regexes et cetera. This undoes the overrides of `#entities` and `#rewrite` on `AdvancedTextFormatter` but also stops using them, instead keeping track of the parsed Nokogiri tree itself and using that in the `#to_s` method. Internally, this tree uses `<mastodon-entity>` nodes to keep track of hashtags, links, and mentions. Sanitization is moved to the beginning, so it should be known that these do not appear in the input.
This commit is contained in:
parent
b2a25d446a
commit
09cbd6c65e
@ -33,67 +33,89 @@ class AdvancedTextFormatter < TextFormatter
|
||||
@text = format_markdown(text) if content_type == 'text/markdown'
|
||||
end
|
||||
|
||||
# Differs from TextFormatter by not messing with newline after parsing
|
||||
# Differs from TextFormatter by operating on the parsed HTML tree ;)
|
||||
#
|
||||
# See +#tree+
|
||||
def to_s
|
||||
return ''.html_safe if text.blank?
|
||||
|
||||
html = rewrite do |entity|
|
||||
if entity[:url]
|
||||
link_to_url(entity)
|
||||
elsif entity[:hashtag]
|
||||
link_to_hashtag(entity)
|
||||
elsif entity[:screen_name]
|
||||
link_to_mention(entity)
|
||||
result = tree.dup
|
||||
result.css('mastodon-entity').each do |entity|
|
||||
case entity['kind']
|
||||
when 'hashtag'
|
||||
entity.replace(link_to_hashtag({ hashtag: entity['value'] }))
|
||||
when 'link'
|
||||
entity.replace(link_to_url({ url: entity['value'] }))
|
||||
when 'mention'
|
||||
entity.replace(link_to_mention({ screen_name: entity['value'] }))
|
||||
end
|
||||
end
|
||||
|
||||
html.html_safe # rubocop:disable Rails/OutputSafety
|
||||
result.to_html
|
||||
end
|
||||
|
||||
# Differs from `TextFormatter` by skipping HTML tags and entities
|
||||
def entities
|
||||
@entities ||= begin
|
||||
gaps = []
|
||||
total_offset = 0
|
||||
##
|
||||
# Process the status into a Nokogiri document fragment, with entities
|
||||
# replaced with +<mastodon-entity>+s.
|
||||
#
|
||||
# Since +<mastodon-entity>+ is not allowed by the sanitizer, any such
|
||||
# elements in the output *must* have been produced by this algorithm.
|
||||
#
|
||||
# These elements will need to be replaced prior to serialization (see
|
||||
# +#to_s+).
|
||||
def tree
|
||||
if @tree.nil?
|
||||
src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '')
|
||||
@tree = Nokogiri::HTML5.fragment(src)
|
||||
Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING)
|
||||
document = @tree.document
|
||||
|
||||
escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match|
|
||||
total_offset += match.length - 1
|
||||
end_offset = Regexp.last_match.end(0)
|
||||
gaps << [end_offset - total_offset, total_offset]
|
||||
' '
|
||||
end
|
||||
|
||||
Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity|
|
||||
start_pos, end_pos = entity[:indices]
|
||||
offset_idx = gaps.rindex { |gap| gap.first <= start_pos }
|
||||
offset = offset_idx.nil? ? 0 : gaps[offset_idx].last
|
||||
entity.merge(indices: [start_pos + offset, end_pos + offset])
|
||||
@tree.xpath('.//text()[not(ancestor::a)]').each do |text_node|
|
||||
# Iterate over text elements and build up their replacements.
|
||||
content = text_node.content
|
||||
replacement = Nokogiri::XML::NodeSet.new(document)
|
||||
processed_index = 0
|
||||
Extractor.extract_entities_with_indices(
|
||||
content,
|
||||
extract_url_without_protocol: false
|
||||
) do |entity|
|
||||
# Iterate over entities in this text node.
|
||||
advance = entity[:indices].first - processed_index
|
||||
if advance.positive?
|
||||
# Text node for content which precedes entity.
|
||||
replacement << Nokogiri::XML::Text.new(
|
||||
content[processed_index, advance],
|
||||
@tree.document
|
||||
)
|
||||
end
|
||||
elt = Nokogiri::XML::Element.new('mastodon-entity', document)
|
||||
if entity[:url]
|
||||
elt['kind'] = 'link'
|
||||
elt['value'] = entity[:url]
|
||||
elsif entity[:hashtag]
|
||||
elt['kind'] = 'hashtag'
|
||||
elt['value'] = entity[:hashtag]
|
||||
elsif entity[:screen_name]
|
||||
elt['kind'] = 'mention'
|
||||
elt['value'] = entity[:screen_name]
|
||||
end
|
||||
replacement << elt
|
||||
processed_index = entity[:indices].last
|
||||
end
|
||||
if processed_index < content.size
|
||||
# Text node for remaining content.
|
||||
replacement << Nokogiri::XML::Text.new(
|
||||
content[processed_index, content.size - processed_index],
|
||||
document
|
||||
)
|
||||
end
|
||||
text_node.replace(replacement)
|
||||
end
|
||||
end
|
||||
@tree
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
# Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe
|
||||
def rewrite
|
||||
entities.sort_by! do |entity|
|
||||
entity[:indices].first
|
||||
end
|
||||
|
||||
result = ''.dup
|
||||
|
||||
last_index = entities.reduce(0) do |index, entity|
|
||||
indices = entity[:indices]
|
||||
result << text[index...indices.first]
|
||||
result << yield(entity)
|
||||
indices.last
|
||||
end
|
||||
|
||||
result << text[last_index..-1]
|
||||
|
||||
Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING)
|
||||
end
|
||||
|
||||
def format_markdown(html)
|
||||
html = markdown_formatter.render(html)
|
||||
html.delete("\r").delete("\n")
|
||||
|
Loading…
x
Reference in New Issue
Block a user