Improvements for rich-text rendering and plain-text link embedding

This commit is contained in:
Dylan Knutson
2025-03-03 01:10:54 +00:00
parent 720a2ab1b8
commit c0ddef96f0
8 changed files with 396 additions and 162 deletions

View File

@@ -56,6 +56,17 @@ module Domain::DescriptionsHelper
end
end
WEAK_URL_MATCHER_REGEX =
%r{(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)}
ALLOWED_INFERRED_URL_DOMAINS =
T.let(
%w[furaffinity.net inkbunny.net e621.net]
.flat_map { |domain| [domain, "www.#{domain}"] }
.freeze,
T::Array[String],
)
sig { params(model: HasDescriptionHtmlForView).returns(T.nilable(String)) }
def sanitize_description_html(model)
html = model.description_html_for_view
@@ -75,107 +86,154 @@ module Domain::DescriptionsHelper
replacements = {}
# Transform bare URLs found in text nodes (not already inside an anchor tag) into anchor tags
text_link_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
return if env[:is_allowlisted]
next unless node.text?
next unless node.ancestors("a").empty?
next unless (node_text = T.cast(node.text, T.nilable(String)))
next unless (match = node_text.match(WEAK_URL_MATCHER_REGEX))
next unless (url_text = match[0])
unless (
uri =
try_parse_uri(model.description_html_base_domain, url_text)
)
next
end
unless ALLOWED_PLAIN_TEXT_URL_DOMAINS.any? { |domain|
url_matches_domain?(domain, uri.host)
}
next
end
before, after = node.text.split(url_text, 2)
new_node = "#{before}<a href=\"#{url_text}\">#{url_text}</a>#{after}"
node.replace(new_node)
end
tag_class_and_style_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
return if env[:is_allowlisted] || !node.element?
# Convert bbcode_center class to text-align: center style
# and remove all other styling
add_node_styles = []
if node["class"]&.include?("bbcode_center")
add_node_styles << "text-align: center"
end
node.name = "div" if node_name == "code"
node.remove_attribute("class")
# add to original styles
node["style"] = (node["style"] || "")
.split(";")
.map(&:strip)
.concat(add_node_styles)
.map { |s| s + ";" }
.join(" ")
end
link_to_model_link_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
next if env[:is_allowlisted] || !node.element?
# Allow only recognized links: known post/user models, or allowlisted external domains
if node_name == "a"
href_str = node["href"]&.downcase || ""
url = try_parse_uri(model.description_html_base_domain, href_str)
next { node_whitelist: [] } if url.nil?
found_link = link_for_source(url.to_s)
if found_link.present? && (found_model = found_link.model)
partial, locals =
case found_model
when Domain::Post
[
"domain/has_description_html/inline_link_domain_post",
{
post: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
when Domain::User
[
"domain/has_description_html/inline_link_domain_user",
{
user: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
else
raise "Unknown model type: #{found_link.model.class}"
end
replacements[node] = Nokogiri::HTML5.fragment(
render(partial:, locals:),
)
next { node_whitelist: [node] }
else
if ALLOWED_EXTERNAL_LINK_DOMAINS.any? { |domain|
url_matches_domain?(domain, url.host)
}
if node.text.blank? || text_same_as_url?(node.text, url.to_s)
title = title_for_url(url.to_s)
else
title = node.text
end
replacements[node] = Nokogiri::HTML5.fragment(
render(
partial: "domain/has_description_html/inline_link_external",
locals: {
url: url.to_s,
title:,
icon_path: icon_path_for_domain(url.host),
},
),
)
next { node_whitelist: [node] }
end
end
end
end
disallowed_link_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
return if env[:is_allowlisted] || !node.element?
if node_name == "a"
# by the time we're here, we know this is not a valid link node,
# and it should be replaced with its text
node.replace(node.inner_html)
end
end
sanitizer =
Sanitize.new(
elements: %w[code div br img b i span strong hr p],
elements: %w[a code div br img b i span strong hr p],
attributes: {
all: %w[class style],
"a" => %w[href class],
:all => %w[class style],
},
css: {
properties: %w[font-size color text-align class],
},
transformers: [
lambda do |env|
node = env[:node]
node_name = env[:node_name]
return if env[:is_allowlisted] || !node.element?
# Convert bbcode_center class to text-align: center style
# and remove all other styling
add_node_styles = []
if node["class"]&.include?("bbcode_center")
add_node_styles << "text-align: center"
end
node.name = "div" if node_name == "code"
node.remove_attribute("class")
# add to original styles
node["style"] = (node["style"] || "")
.split(";")
.map(&:strip)
.concat(add_node_styles)
.map { |s| s + ";" }
.join(" ")
# Allow only recognized links: known post/user models, or allowlisted external domains
if node_name == "a"
href_str = node["href"]&.downcase || ""
url = try_parse_uri(model.description_html_base_domain, href_str)
next { node_whitelist: [] } if url.nil?
found_link = link_for_source(url.to_s)
if found_link.nil?
if ALLOWED_EXTERNAL_LINK_DOMAINS.any? { |domain|
url_matches_domain?(domain, url.host)
}
if node.text.blank? || text_same_as_url?(node.text, url.to_s)
title = title_for_url(url.to_s)
else
title = node.text
end
replacements[node] = Nokogiri::HTML5.fragment(
render(
partial:
"domain/has_description_html/inline_link_external",
locals: {
url: url.to_s,
title:,
icon_path: icon_path_for_domain(url.host),
},
),
)
next { node_whitelist: [node] }
else
next { node_whitelist: [] }
end
end
found_model = found_link.model
partial, locals =
case found_model
when Domain::Post
[
"domain/has_description_html/inline_link_domain_post",
{
post: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
when Domain::User
[
"domain/has_description_html/inline_link_domain_user",
{
user: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
else
raise "Unknown model type: #{found_link.model.class}"
end
replacements[node] = Nokogiri::HTML5.fragment(
render(partial:, locals:),
)
next { node_whitelist: [node] }
end
# Else, don't allow any other nodes
{ node_whitelist: [] }
end,
text_link_transformer,
tag_class_and_style_transformer,
link_to_model_link_transformer,
disallowed_link_transformer,
],
)

View File

@@ -5,63 +5,81 @@ module Domain::DomainsHelper
include HelpersInterface
abstract!
ALLOWED_EXTERNAL_LINK_DOMAINS = %w[
youtube.com
x.com
weasyl.com
vimeo.com
twitter.com
twitch.tv
tumblr.com
t.me
spreadshirt.de
spreadshirt.com
redbubble.com
pixiv.net
pinterest.com
patreon.com
mstdn.social
livejournal.com
ko-fi.com
instagram.com
facebook.com
dribbble.com
discord.gg
deviantart.com
bsky.app
behance.net
gumroad.com
bigcartel.com
# If a URL is detected in plain text and is one of these domains,
# it will be converted to an anchor tag.
ALLOWED_PLAIN_TEXT_URL_DOMAINS = %w[
e621.net
furaffinity.net
boosty.to
hipolink.me
archiveofourown.org
inkbunny.net
].freeze
# If an anchor tag's href points at one of these domains,
# it is kept and rendered as an external link (via the inline_link_external partial).
ALLOWED_EXTERNAL_LINK_DOMAINS =
T.let(
(
%w[
archiveofourown.org
behance.net
bigcartel.com
boosty.to
bsky.app
deviantart.com
discord.gg
dribbble.com
e621.net
facebook.com
furaffinity.net
gumroad.com
hipolink.me
inkbunny.net
instagram.com
ko-fi.com
livejournal.com
mstdn.social
patreon.com
pinterest.com
pixiv.net
redbubble.com
spreadshirt.com
spreadshirt.de
t.me
tumblr.com
twitch.tv
twitter.com
vimeo.com
weasyl.com
x.com
youtube.com
] + ALLOWED_PLAIN_TEXT_URL_DOMAINS
).freeze,
T::Array[String],
)
DOMAIN_TO_ICON_PATH =
T.let(
{
"x.com" => "x-twitter.png",
"wixmp.com" => "deviantart.png",
"weasyl.com" => "weasyl.png",
"twitter.com" => "x-twitter.png",
"t.me" => "telegram.png",
"pixiv.net" => "pixiv.png",
"patreon.com" => "patreon.png",
"newgrounds.com" => "newgrounds.png",
"itaku.ee" => "itaku.png",
"inkbunny.net" => "inkbunny.png",
"ib.metapix.net" => "inkbunny.png",
"furaffinity.net" => "fa.png",
"e621.net" => "e621.png",
"deviantart.com" => "deviantart.png",
"bsky.app" => "bsky.png",
"redbubble.com" => "redbubble.png",
"spreadshirt.de" => "spreadshirt.png",
"spreadshirt.com" => "spreadshirt.png",
"boosty.to" => "boosty.png",
"tumblr.com" => "tumblr.png",
"bsky.app" => "bsky.png",
"deviantart.com" => "deviantart.png",
"e621.net" => "e621.png",
"furaffinity.net" => "fa.png",
"ib.metapix.net" => "inkbunny.png",
"inkbunny.net" => "inkbunny.png",
"itaku.ee" => "itaku.png",
"ko-fi.com" => "ko-fi.png",
"newgrounds.com" => "newgrounds.png",
"patreon.com" => "patreon.png",
"pixiv.net" => "pixiv.png",
"redbubble.com" => "redbubble.png",
"spreadshirt.com" => "spreadshirt.png",
"spreadshirt.de" => "spreadshirt.png",
"t.me" => "telegram.png",
"tumblr.com" => "tumblr.png",
"twitter.com" => "x-twitter.png",
"weasyl.com" => "weasyl.png",
"wixmp.com" => "deviantart.png",
"x.com" => "x-twitter.png",
}.freeze,
T::Hash[String, String],
)

View File

@@ -103,6 +103,8 @@ module Domain::PostsHelper
"Microsoft Word Document"
when %r{application/vnd\.openxmlformats-officedocument\.wordprocessingml\.document}
"Microsoft Word Document (OpenXML)"
when %r{application/vnd\.oasis\.opendocument\.text}
"OpenDocument Text"
when %r{application/rtf}
"Rich Text Document"
when %r{image/jpeg}
@@ -224,6 +226,7 @@ module Domain::PostsHelper
FA_CDN_HOSTS = %w[d.furaffinity.net *.facdn.net facdn.net]
IB_HOSTS = %w[*.inkbunny.net inkbunny.net]
IB_CDN_HOSTS = %w[*.ib.metapix.net ib.metapix.net]
E621_HOSTS = %w[www.e621.net e621.net]
URL_SUFFIX_QUERY = T.let(<<-SQL.strip.chomp.freeze, String)
lower(json_attributes->>'url_str') = lower(?)
@@ -235,7 +238,10 @@ module Domain::PostsHelper
# Furaffinity posts
SourceMatcher.new(
hosts: FA_HOSTS,
patterns: [%r{/view/(\d+)/?}],
patterns: [
%r{/view/(\d+)/?},
%r{/controls/submissions/changeinfo/(\d+)/?},
],
find_proc: ->(helper, match, _) do
if post = Domain::Post::FaPost.find_by(fa_id: match[1])
SourceResult.new(
@@ -248,7 +254,7 @@ module Domain::PostsHelper
# Furaffinity posts via direct file URL
SourceMatcher.new(
hosts: FA_CDN_HOSTS,
patterns: [//],
patterns: [/.+/],
find_proc: ->(helper, _, url) do
url = Addressable::URI.parse(url)
@@ -329,6 +335,32 @@ module Domain::PostsHelper
end
end,
),
# E621 posts
SourceMatcher.new(
hosts: E621_HOSTS,
patterns: [%r{/posts/(\d+)/?}],
find_proc: ->(helper, match, _) do
if post = Domain::Post::E621Post.find_by(e621_id: match[1])
SourceResult.new(
model: post,
title: helper.title_for_post_model(post),
)
end
end,
),
# E621 users
SourceMatcher.new(
hosts: E621_HOSTS,
patterns: [%r{/users/(\d+)/?}],
find_proc: ->(helper, match, _) do
if user = Domain::User::E621User.find_by(e621_id: match[1])
SourceResult.new(
model: user,
title: user.name_for_view || "unknown",
)
end
end,
),
],
T::Array[SourceMatcher],
)

View File

@@ -70,6 +70,8 @@ module LogEntriesHelper
"doc"
when %r{application/vnd\.openxmlformats-officedocument\.wordprocessingml\.document}
"docx"
when %r{application/vnd\.oasis\.opendocument\.text}
"odt"
else
nil
end
@@ -94,6 +96,7 @@ module LogEntriesHelper
application/rtf
application/msword
text/plain
application/vnd.oasis.opendocument.text
application/vnd.openxmlformats-officedocument.wordprocessingml.document
].any? { |ct| content_type.starts_with?(ct) }
end
@@ -248,7 +251,13 @@ module LogEntriesHelper
end
return nil if document_html.blank?
sanitize_rich_text_document_html(document_html, is_plain_text)
end
sig do
params(document_html: String, is_plain_text: T::Boolean).returns(String)
end
def sanitize_rich_text_document_html(document_html, is_plain_text)
quote_transformer =
Kernel.lambda do |env|
node = env[:node]
@@ -265,6 +274,7 @@ module LogEntriesHelper
clean_plain_text_node =
Kernel.lambda do |node|
node = T.cast(node, Nokogiri::XML::Node)
if node.text?
node_text = node.text.strip
if node_text.empty?
@@ -275,7 +285,7 @@ module LogEntriesHelper
# and the collected nodes
current_node = node
inline_elements = []
while (next_sibling = current_node&.next_sibling) &&
while (next_sibling = current_node.next_sibling) &&
(next_sibling.name != "br") && (next_sibling.name != "p")
inline_elements << next_sibling
current_node = next_sibling
@@ -294,29 +304,33 @@ module LogEntriesHelper
plain_text_transformer =
Kernel.lambda do |env|
# within a div, wrap bare text nodes in a <p>
node = env[:node]
node_name = env[:node_name]
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
if node_name == "div"
node.children.each { |child| clean_plain_text_node.call(child) }
current_child = T.unsafe(node.children.first)
while current_child.present?
clean_plain_text_node.call(current_child)
current_child = current_child.next_sibling
end
elsif node.text? && node.parent&.name == "#document-fragment"
clean_plain_text_node.call(node)
end
{ node_allowlist: [node] }
end
remove_empty_newline_transformer =
Kernel.lambda do |env|
node = env[:node]
node.unlink if node.text? && node.text.strip.chomp.blank?
end
# remove_empty_newline_transformer =
# Kernel.lambda do |env|
# node = env[:node]
# node.unlink if node.text? && node.text.strip.chomp.blank?
# end
remove_multiple_br_transformer =
Kernel.lambda do |env|
node = env[:node]
if node.name == "br"
node.unlink if node.previous_sibling&.name == "br"
end
end
# remove_multiple_br_transformer =
# Kernel.lambda do |env|
# node = env[:node]
# if node.name == "br"
# node.unlink if node.previous_sibling&.name == "br"
# end
# end
sanitizer =
Sanitize.new(

View File

@@ -168,7 +168,8 @@ class Domain::Post::FaPost < Domain::Post
begin
contents = guess_last_submission_log_entry&.response_bytes
if contents
parser = Domain::Fa::Parser::Page.new(contents)
parser =
Domain::Fa::Parser::Page.new(contents, require_logged_in: false)
parser.submission.posted_date if parser.probably_submission?
end
end

View File

@@ -0,0 +1,17 @@
# typed: strict
class Nokogiri::XML::Node
sig { returns(Nokogiri::XML::NodeSet) }
def children
end
sig { returns(T.nilable(Nokogiri::XML::Node)) }
def next_sibling
end
end
class Nokogiri::XML::NodeSet
sig { params(n: T.nilable(Integer)).returns(T.nilable(Nokogiri::XML::Node)) }
def first(n = nil)
end
end

View File

@@ -67,6 +67,82 @@ RSpec.describe Domain::DescriptionsHelper, type: :helper do
end
end
describe "handling links that are bare text" do
it "does not replace links that are already wrapped in an anchor tag" do
html =
'<a href="https://e621.net/posts/5406941" title="https://e621.net/posts/5406941" class="auto_link">https://e621.net/posts/5406941</a><br />'
sanitized = sanitize_description_html(html)
# should render the external link partial because the e621 post model is not found
expect(sanitized).to include("_blank noopener noreferrer nofollow")
expect(sanitized).to include("https://e621.net/posts/5406941")
expect(sanitized).to include("<a")
end
it "does not make a link if the domain is not in the allowlist" do
html =
'before <a href="https://www.example.com/foo/" title="https://www.example.com/foo/" class="auto_link">https://www.example.com/foo/</a> after'
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include("https://www.example.com/foo/")
expect(sanitized).not_to include("<a")
end
it "does not make a link if anchor does not have href" do
html = "<a>No href</a>"
sanitized = sanitize_description_html(html)
expect(sanitized).to eq("No href")
end
it "does not allow nested links" do
html =
'<a href="https://www.example.com/foo">Link <a href="https://www.example.com/bar/">Nested Link</a></a>'
sanitized = sanitize_description_html(html)
expect(sanitized).to eq("Link Nested Link")
end
it "handles models that are not recorded in the database as external links" do
html = "before http://www.furaffinity.net/view/123456/ after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include("http://www.furaffinity.net/view/123456/")
# renders the inline_link_external partial because the FA post model is not found
expect(sanitized).to include("_blank noopener noreferrer nofollow")
end
it "handles models that are recorded in the database as inline hoverable links" do
post = create(:domain_post_e621_post, e621_id: 5_406_941)
html = "before https://e621.net/posts/5406941 after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include(
/PostHoverPreviewWrapper.+#{post.to_param}/,
)
end
it "handles text links that do not have a scheme" do
html = "before e621.net/posts/5406941 after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include("e621.net/posts/5406941")
expect(sanitized).to include("_blank noopener noreferrer nofollow")
end
it "deep links text links that do not have a scheme" do
post = create(:domain_post_e621_post, e621_id: 5_406_941)
html = "before e621.net/posts/5406941 after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include(
/PostHoverPreviewWrapper.+#{post.to_param}/,
)
end
end
describe "FA post link handling" do
%w[
https://www.furaffinity.net/view/123456/
@@ -257,7 +333,7 @@ RSpec.describe Domain::DescriptionsHelper, type: :helper do
expect(sanitized).not_to include("FA User")
expect(sanitized).to include("Google")
expect(sanitized).to include("No href")
expect(sanitized.scan(/<a/).length).to eq(0)
expect(sanitized).not_to include("<a")
end
end

View File

@@ -80,4 +80,22 @@ RSpec.describe LogEntriesHelper, type: :helper do
expect(rendered).not_to be_nil
end
end
describe "#sanitize_rich_text_document_html" do
it "puts sequences of inline elements into a <p>" do
html =
"
<div>
Does that
<em>really</em>
make you...
</div>
"
rendered = helper.sanitize_rich_text_document_html(html, true)
expect(rendered).to eq_html(
"<div><p>Does that <em>really</em> make you...</p></div>",
)
end
end
end