backup tor archive scraping

This commit is contained in:
Dylan Knutson
2025-07-25 00:25:12 +00:00
parent d86612ee2e
commit dffdef51cd
26 changed files with 614 additions and 187 deletions

View File

@@ -13,6 +13,7 @@ RSpec.describe FaUriHelper do
original_file_posted: 1_740_700_581,
latest_file_posted: 1_740_700_581,
filename: "zzreg_stippling-crop.jpg",
filename_with_ts: "1740700581.zzreg_stippling-crop.jpg",
),
)
expect(parsed.original_file_posted_at).to eq(Time.at(1_740_700_581))
@@ -29,10 +30,27 @@ RSpec.describe FaUriHelper do
original_file_posted: 1_740_700_581,
latest_file_posted: 1_753_374_875,
filename: "zzreg_stippling-crop.jpg",
filename_with_ts: "1740700581.zzreg_stippling-crop.jpg",
),
)
expect(parsed.original_file_posted_at).to eq(Time.at(1_740_700_581))
expect(parsed.latest_file_posted_at).to eq(Time.at(1_753_374_875))
end
# Story uploads: the media URL carries an extra "stories/" path segment and
# the filename keeps its "thumbnail." prefix. Both timestamps are expected to
# be equal for this URL.
it "parses story uris" do
  story_url =
    "https://d.furaffinity.net/art/irontankris/stories/1753207806/1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg"
  actual_info = described_class.parse_fa_media_url(story_url)

  expected_info =
    FaUriHelper::FaMediaUrlInfo.new(
      url_name: "irontankris",
      original_file_posted: 1_753_207_806,
      latest_file_posted: 1_753_207_806,
      filename: "thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
      filename_with_ts:
        "1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
    )

  expect(actual_info).to eq(expected_info)
end
end
end

View File

@@ -14,70 +14,19 @@ describe Domain::Fa::Job::ScanFuzzysearchJob do
end
# Response data for fuzzysearch id 51015903.
# NOTE(review): this block shows BOTH an inline array literal and a fixture
# read. As written, the literal is evaluated and discarded, and the block's
# value is the parsed JSON fixture (string keys, not the symbol keys used in
# the literal). Presumably the literal is the pre-commit side of the diff and
# only the JSON.parse line remains in the real file — confirm.
let(:fuzzysearch_response_51015903) do
[
{
id: 51_015_903,
file_id: 1_676_417_528,
artist: "crimetxt",
# NOTE(review): this integer differs from hash_str below in its low digits
# (...250_600 vs ...250660) — looks like precision loss; verify which is right.
hash: 2_367_248_181_756_250_600,
hash_str: "2367248181756250660",
url:
"https://d.furaffinity.net/art/crimetxt/1676417528/1676417528.crimetxt_2023-02-15_00_18_48.png",
filename: "1676417528.crimetxt_2023-02-15_00_18_48.png",
rating: "adult",
posted_at: "2023-02-14T23:32:00Z",
file_size: 2_188_273,
sha256:
"d488dabd8eb22398a228fb662eb520bb4daaac3a9ab0dc9be8b8c5e1b9522efb",
updated_at: nil,
deleted: false,
tags: %w[some_tag another_tag],
},
]
# The path is relative, so it resolves against the process working directory.
JSON.parse(File.read("test/fixtures/files/fuzzysearch/51015903.json"))
end
# Response data for fuzzysearch id 21275696. The inline entry has
# `deleted: true` and every other field nil except id, updated_at and tags.
# NOTE(review): this block shows BOTH an inline array literal and a fixture
# read. As written, the literal is evaluated and discarded, and the block's
# value is the parsed JSON fixture (string keys). Presumably the literal is
# the pre-commit side of the diff — confirm against the real file.
let(:fuzzysearch_response_21275696) do
[
{
id: 21_275_696,
file_id: nil,
artist: nil,
hash: nil,
hash_str: nil,
url: nil,
filename: nil,
rating: nil,
posted_at: nil,
file_size: nil,
sha256: nil,
updated_at: "2023-08-16T08:02:27.746944Z",
deleted: true,
tags: [],
},
]
# The path is relative, so it resolves against the process working directory.
JSON.parse(File.read("test/fixtures/files/fuzzysearch/21275696.json"))
end
# Response data for fuzzysearch id 53068507.
# NOTE(review): this block shows BOTH an inline array literal and a fixture
# read. As written, the literal is evaluated and discarded, and the block's
# value is the parsed JSON fixture (string keys). Presumably the literal is
# the pre-commit side of the diff — confirm against the real file.
let(:fuzzysearch_response_53068507) do
[
{
id: 53_068_507,
file_id: 1_690_504_099,
artist: "Meesh",
# NOTE(review): this integer differs from hash_str below in its low digits
# (...650_000 vs ...649812) — looks like precision loss; verify which is right.
hash: -5_278_701_664_616_650_000,
hash_str: "-5278701664616649812",
# The directory timestamp (1690504135) differs from the filename timestamp
# (1690504099) in this URL.
url:
"https://d.furaffinity.net/art/meesh/1690504135/1690504099.meesh_pamperingjack3_crop.png",
filename: "1690504099.meesh_pamperingjack3_crop.png",
rating: "adult",
posted_at: "2023-07-28T00:28:00Z",
file_size: 555_926,
sha256:
"f47e974ef7e72c53fce5a52b28b7c34576eb26af04299155d1cc3912379b0dea",
updated_at: nil,
deleted: false,
tags: %w[advertisement patreon female preview_limited beastars juno],
},
]
# The path is relative, so it resolves against the process working directory.
JSON.parse(File.read("test/fixtures/files/fuzzysearch/53068507.json"))
end
# Parsed JSON fixture for fuzzysearch id 61665194. The path is relative, so it
# resolves against the process working directory.
let(:fuzzysearch_response_61665194) do
  fixture_path = "test/fixtures/files/fuzzysearch/61665194.json"
  JSON.parse(File.read(fixture_path))
end
describe "post was marked removed" do
@@ -209,5 +158,41 @@ describe Domain::Fa::Job::ScanFuzzysearchJob do
expect(post.creator.full_name).to eq("Meesh")
end
end
# Scenario: the fuzzysearch lookup for this post is mocked with the 61665194
# fixture. `post` and `perform_now` come from the enclosing spec.
context "and the post has a story url" do
  let(:fa_id) { 61_665_194 }

  # One mocked HTTP exchange: the search endpoint answers 200 with the fixture
  # re-serialized as JSON.
  let(:client_mock_config) do
    search_uri =
      "https://api-next.fuzzysearch.net/v1/file/furaffinity?search=#{fa_id}"

    [
      {
        uri: search_uri,
        status_code: 200,
        content_type: "application/json",
        contents: fuzzysearch_response_61665194.to_json,
      },
    ]
  end

  it "does not change the post state" do
    perform_now({ post: post })
    post.reload

    expect(post.state).to eq("removed")
  end

  it "sets the artist" do
    perform_now({ post: post })
    post.reload

    expect(post.creator).to be_present
    expect(post.creator.url_name).to eq("irontankris")
  end

  it "updates keywords", quiet: false do
    # Start from an empty keyword list so any keywords present afterwards were
    # written by the job.
    post.keywords = []
    post.save!

    perform_now({ post: post })
    post.reload

    expect(post.keywords).to include("female", "mlp", "little", "anthro")
  end
end
end
end

View File

@@ -78,7 +78,7 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
it "downloads the file from fur archiver" do
expect do
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
end.to change { post.file&.log_entry }.from(old_log_entry).to(
have_attributes(uri: have_attributes(to_s: fur_archiver_url_str)),
@@ -87,7 +87,7 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
it "updates the post_file blob" do
expect do
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
end.to change { post.file&.blob }.from(old_log_entry.response).to(
@log_entries[0].response,
@@ -96,17 +96,24 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
# Expects the post's file status code to go from 404 to 200 once the job runs.
it "sets the last status code" do
expect do
# NOTE(review): both a `post_file:` and a `post:` invocation appear here.
# Presumably these are the old and new sides of the diff and only the `post:`
# call remains in the real file — confirm.
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
# `&.` tolerates post.file being nil.
end.to change { post.file&.last_status_code }.from(404).to(200)
end
# Expects the post's file state to go from "terminal_error" to "ok" once the
# job runs.
it "sets the post_file state to ok" do
expect do
# NOTE(review): both a `post_file:` and a `post:` invocation appear here.
# Presumably these are the old and new sides of the diff and only the `post:`
# call remains in the real file — confirm.
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
# `&.` tolerates post.file being nil.
end.to change { post.file&.state }.from("terminal_error").to("ok")
end
# Runs the job twice back to back and checks that the post still has exactly
# two files afterwards.
it "does not perform the request twice" do
  2.times { perform_now({ post: post }) }
  post.reload

  expect(post.files.length).to eq(2)
end
end
context "with a d.facdn.net url" do
@@ -122,5 +129,50 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
end
include_examples "correct behavior"
end
# Scenario: the fur archiver request is mocked to return 404 and a second
# mocked request to an .onion host returns the image.
context "when furarchiver returns 404" do
  # The .onion URL used both in the mock config and in the assertion below.
  let(:tor_archive_url_str) do
    "http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/fa/wolfsparta/1496842943.wolfsparta_caught_pt2.png"
  end

  let(:file_url_str) do
    "https://d.facdn.net/art/wolfsparta/1496842943/1496842943.wolfsparta_caught_pt2.png"
  end

  let(:client_mock_config) do
    fur_archiver_miss = {
      uri: fur_archiver_url_str,
      status_code: 404,
      content_type: "text/html",
      contents: "not found",
    }
    # caused_by_entry_idx: 0 refers to the first entry in this list.
    tor_hit = {
      uri: tor_archive_url_str,
      status_code: 200,
      content_type: "image/png",
      contents: image_data,
      caused_by_entry_idx: 0,
    }

    [fur_archiver_miss, tor_hit]
  end

  it "creates a tor post file" do
    perform_now({ post: post })
    post.reload

    # should have original post file, the fur archiver post file, and the tor post file
    expect(post.files.count).to eq(3)

    # Sorting by id is what assigns the three roles here.
    original, furarchiver, tor = post.files.sort_by(&:id)
    expect(original.state).to eq("terminal_error")
    expect(furarchiver.state).to eq("terminal_error")
    expect(tor.state).to eq("ok")
    expect(tor.url_str).to eq(tor_archive_url_str)
  end

  it "does not perform the request twice" do
    2.times { perform_now({ post: post }) }
    post.reload

    expect(post.files.length).to eq(3)
  end
end
end
end