normalize fa cdn hosts to avoid redownloading files

This commit is contained in:
Dylan Knutson
2025-07-24 16:19:41 +00:00
parent 19fc98e4ef
commit 430247a3ad
2 changed files with 113 additions and 4 deletions

View File

@@ -117,11 +117,25 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
uri = Addressable::URI.parse(submission.full_res_img)
uri.scheme = "https" if uri.scheme.blank?
if (file = post.file) && (file.url_str != uri.to_s)
file = post.files.build(url_str: uri.to_s)
else
file = post.file || post.build_file(url_str: uri.to_s)
# resolve the existing file and check if the URL has changed.
# sometimes, the domain will change from `d.facdn.net` to
# `d.furaffinity.net`, and we want to ignore the change in that case
file = post.file
if file && (old_url_str = file.url_str) && (old_url_str != uri.to_s)
if self.class.uri_same_with_normalized_facdn_host?(old_url_str, uri.to_s)
logger.info(
format_tags(
make_tag("old_url_str", old_url_str),
make_tag("new_url_str", uri.to_s),
"file url has changed, but is the same domain",
),
)
else
file = post.files.build(url_str: uri.to_s)
end
end
file ||= post.build_file(url_str: uri.to_s)
if file.url_str_changed?
file.enqueue_job_after_save(
Domain::Fa::Job::ScanFileJob,
@@ -142,4 +156,22 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
post.posted_at = submission.posted_date&.in_time_zone("UTC")
post.scanned_at = Time.now
end
# Hostnames that are treated as interchangeable when comparing file URLs.
FA_CDN_HOSTS = %w[d.facdn.net d.furaffinity.net].freeze

# Reports whether two URL strings should be considered the same file.
#
# When BOTH URLs point at a host listed in FA_CDN_HOSTS, the scheme and the
# host are ignored and only the remaining URI components are compared.
# In every other case the two original strings are compared verbatim, so a
# differing scheme (or any other textual difference) makes them unequal.
#
# @param url_str [String] the previously stored URL
# @param new_url_str [String] the newly observed URL
# @return [Boolean] true when the URLs are equivalent under the rules above
sig { params(url_str: String, new_url_str: String).returns(T::Boolean) }
def self.uri_same_with_normalized_facdn_host?(url_str, new_url_str)
  old_uri = Addressable::URI.parse(url_str)
  new_uri = Addressable::URI.parse(new_url_str)
  old_uri.scheme = nil
  new_uri.scheme = nil

  both_on_cdn =
    [old_uri, new_uri].all? do |candidate|
      FA_CDN_HOSTS.include?(candidate.host)
    end
  # at least one URL is not on a known CDN host: fall back to exact equality
  # of the original strings (scheme included)
  return url_str == new_url_str unless both_on_cdn

  # both URIs have an facdn host, so compare them but ignore the host
  old_uri.host = nil
  new_uri.host = nil
  old_uri == new_uri
end
end

View File

@@ -219,4 +219,81 @@ describe Domain::Fa::Job::ScanPostJob do
expect(post.state).to eq("removed")
end
end
# Specs for the class-level URL comparison helper. Every host/path scenario
# in the table at the bottom is exercised with three scheme combinations.
describe ".uri_same_with_normalized_facdn_host?" do
  let(:client_mock_config) { [] }

  # Expects `host1`, `host2`, `path1` and `path2` to be defined by the
  # including context. Each entry maps an example label to the prefixes put
  # in front of the first and second URL.
  shared_examples "has result" do |result|
    {
      "both have scheme" => %w[https:// https://],
      "both missing scheme" => %w[// //],
      "one has scheme" => %w[https:// //],
    }.each do |label, (prefix1, prefix2)|
      it "is #{result}, #{label}" do
        url1 = "#{prefix1}#{host1}#{path1}"
        url2 = "#{prefix2}#{host2}#{path2}"
        expect(
          described_class.uri_same_with_normalized_facdn_host?(url1, url2),
        ).to eq(result)
      end
    end
  end

  shared_context "host: different cdn hosts" do
    let(:host1) { "d.facdn.net" }
    let(:host2) { "d.furaffinity.net" }
  end

  shared_context "host: both hosts are d.facdn.net" do
    let(:host1) { "d.facdn.net" }
    let(:host2) { "d.facdn.net" }
  end

  shared_context "host: both hosts are d.furaffinity.net" do
    let(:host1) { "d.furaffinity.net" }
    let(:host2) { "d.furaffinity.net" }
  end

  shared_context "host: one domain is not a cdn" do
    let(:host1) { "d.facdn.net" }
    let(:host2) { "example.com" }
  end

  shared_context "paths: are the same" do
    let(:path1) { "/art/user/1234567890/image.jpg" }
    let(:path2) { "/art/user/1234567890/image.jpg" }
  end

  shared_context "paths: are different" do
    let(:path1) { "/art/user/1234567890/image.jpg" }
    let(:path2) { "/art/user/1234567890/some_other_image.jpg" }
  end

  # [host scenario, path scenario, expected result]
  [
    ["host: different cdn hosts", "paths: are the same", true],
    ["host: both hosts are d.facdn.net", "paths: are the same", true],
    ["host: both hosts are d.furaffinity.net", "paths: are the same", true],
    ["host: one domain is not a cdn", "paths: are the same", false],
    ["host: different cdn hosts", "paths: are different", false],
    ["host: both hosts are d.facdn.net", "paths: are different", false],
    ["host: both hosts are d.furaffinity.net", "paths: are different", false],
    ["host: one domain is not a cdn", "paths: are different", false],
  ].each do |host_context, path_context, result|
    context "#{host_context} and #{path_context}" do
      include_context host_context
      include_context path_context
      include_examples "has result", result
    end
  end
end
end