fix bsky user profile link sanitizer

This commit is contained in:
Dylan Knutson
2025-08-17 18:51:55 +00:00
parent 6bf64cf8c6
commit 15ea73a350
6 changed files with 126 additions and 31 deletions

View File

@@ -57,7 +57,12 @@ module Domain::DescriptionsHelper
end
WEAK_URL_MATCHER_REGEX =
%r{(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)}
%r{(http(s)?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)}
sig { params(str: String).returns(T.nilable(String)) }
def extract_weak_url(str)
  # Returns the first substring of +str+ that matches WEAK_URL_MATCHER_REGEX,
  # or nil when nothing matches. String#[] with a Regexp argument yields the
  # whole match (capture group 0).
  str[WEAK_URL_MATCHER_REGEX]
end
ALLOWED_INFERRED_URL_DOMAINS =
T.let(
@@ -72,6 +77,16 @@ module Domain::DescriptionsHelper
html = model.description_html_for_view
return nil if html.blank?
is_bsky_description = model.is_a?(Domain::User::BlueskyUser)
visual_style =
(
if model.is_a?(Domain::User::BlueskyUser)
"description-section-link-light"
else
"description-section-link"
end
)
case model
when Domain::Post::E621Post
dtext_result = DText.parse(html)
@@ -95,19 +110,25 @@ module Domain::DescriptionsHelper
next unless node.text?
next unless node.ancestors("a").empty?
next unless (node_text = T.cast(node.text, T.nilable(String)))
next unless (match = node_text.match(WEAK_URL_MATCHER_REGEX))
next unless (url_text = match[0])
next unless (url_text = extract_weak_url(node_text))
next if url_text.blank?
unless (
uri =
try_parse_uri(model.description_html_base_domain, url_text)
)
next
end
unless ALLOWED_PLAIN_TEXT_URL_DOMAINS.any? { |domain|
if is_bsky_description
unless ALLOWED_EXTERNAL_LINK_DOMAINS.any? { |domain|
url_matches_domain?(domain, uri.host)
}
next
end
elsif ALLOWED_PLAIN_TEXT_URL_DOMAINS.none? do |domain|
url_matches_domain?(domain, uri.host)
end
next
end
before, after = node.text.split(url_text, 2)
new_node = "#{before}<a href=\"#{url_text}\">#{url_text}</a>#{after}"
@@ -157,20 +178,12 @@ module Domain::DescriptionsHelper
when Domain::Post
[
"domain/has_description_html/inline_link_domain_post",
{
post: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
{ post: found_model, link_text: node.text, visual_style: },
]
when Domain::User
[
"domain/has_description_html/inline_link_domain_user",
{
user: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
{ user: found_model, link_text: node.text, visual_style: },
]
else
raise "Unknown model type: #{found_link.model.class}"
@@ -191,6 +204,15 @@ module Domain::DescriptionsHelper
end
replacements[node] = Nokogiri::HTML5.fragment(
if is_bsky_description
render(
partial: "domain/has_description_html/external_link",
locals: {
link_text: node.text,
url: url.to_s,
},
)
else
render(
partial: "domain/has_description_html/inline_link_external",
locals: {
@@ -198,7 +220,8 @@ module Domain::DescriptionsHelper
title:,
icon_path: icon_path_for_domain(url.host),
},
),
)
end,
)
next { node_whitelist: [node] }
end

View File

@@ -11,6 +11,7 @@ module Domain::DomainsHelper
e621.net
furaffinity.net
inkbunny.net
bsky.app
].freeze
# If a link is detected in an anchor tag and is one of these domains,

View File

@@ -242,7 +242,7 @@ class Domain::Bluesky::Job::ScanPostsJob < Domain::Bluesky::Job::Base
# Walks post.files and calls defer_job with Domain::StaticFileJob for every
# file whose state_pending? is true.
#
# NOTE(review): two defer_job calls appear back to back below. The hunk header
# for this method (-242,7 +242,7) reports the same line count on both sides,
# so these are presumably the before/after of a one-line change (adding the
# { queue: "bluesky" } argument) rather than two real calls -- confirm
# against the actual file before relying on this listing.
def enqueue_pending_files_job(post)
post.files.each do |post_file|
if post_file.state_pending?
defer_job(Domain::StaticFileJob, { post_file: })
defer_job(Domain::StaticFileJob, { post_file: }, { queue: "bluesky" })
end
end
end

View File

@@ -69,7 +69,11 @@ class Bluesky::ProcessPostHelper
set_alt_text(post_file, image_data["alt"])
post_file.save!
@deferred_job_sink.defer_job(Domain::StaticFileJob, { post_file: })
@deferred_job_sink.defer_job(
Domain::StaticFileJob,
{ post_file: },
{ queue: "bluesky" },
)
logger.debug(
format_tags(
@@ -93,7 +97,11 @@ class Bluesky::ProcessPostHelper
set_aspect_ratio(post_file, embed_data["aspectRatio"])
set_alt_text(post_file, embed_data["alt"])
post_file.save!
@deferred_job_sink.defer_job(Domain::StaticFileJob, { post_file: })
@deferred_job_sink.defer_job(
Domain::StaticFileJob,
{ post_file: },
{ queue: "bluesky" },
)
logger.debug(
format_tags(

View File

@@ -97,6 +97,28 @@ namespace :bluesky do
puts "added #{user.did} / #{user.handle} to monitor"
added += 1
end
if added % 25 == 0
loop do
queue_size =
GoodJob::Job
.where("queue_name IN ('bluesky', 'static_file')")
.where(finished_at: nil, performed_at: nil, error: nil)
.where(
[
"(serialized_params->'exception_executions' = '{}')",
"(serialized_params->'exception_executions' is null)",
].join(" OR "),
)
.count
puts "queue size: #{queue_size}"
if queue_size > 150
sleep 10
else
break
end
end
end
else
puts "user not found: #{line}"
end

View File

@@ -20,10 +20,22 @@ class ModelWithDescriptionHtml
end
RSpec.describe Domain::DescriptionsHelper, type: :helper do
def sanitize_description_html(html, base_domain = "www.furaffinity.net")
helper.sanitize_description_html(
ModelWithDescriptionHtml.new(base_domain, html),
def sanitize_description_html(
html,
base_domain = "www.furaffinity.net",
bsky_model: false
)
model =
if bsky_model
create(
:domain_user_bluesky_user,
handle: "lost_spots",
description: html,
)
else
ModelWithDescriptionHtml.new(base_domain, html)
end
helper.sanitize_description_html(model)
end
# Mock the policy for posts to avoid Devise authentication errors
@@ -225,6 +237,35 @@ RSpec.describe Domain::DescriptionsHelper, type: :helper do
end
end
describe "#extract_weak_url" do
  # Every URL listed here must come back unchanged, both when it is the whole
  # input and when it is surrounded by other text.
  %w[http://twitter.com/foo http://t.me/foo].each do |url|
    it "works for #{url}" do
      embedded = "before #{url} after"

      expect(helper.extract_weak_url(url)).to eq(url)
      expect(helper.extract_weak_url(embedded)).to eq(url)
    end
  end
end
describe "bare links" do
  # A description consisting only of a bare URL, sanitized with
  # bsky_model: true, must contain the URL text and an href pointing at the
  # full https URL.
  %w[t.me twitter.com].each do |host|
    it "works for #{host}" do
      bare_path = "#{host}/lost_spots"
      sanitized =
        sanitize_description_html("https://#{bare_path}", bsky_model: true)

      expect(sanitized).to include(bare_path)
      expect(sanitized).to include("href=\"https://#{bare_path}\"")
    end
  end
end
describe "relative links" do
it "works for posts" do
post1 =