backup tor archive scraping

This commit is contained in:
Dylan Knutson
2025-07-25 00:25:12 +00:00
parent d86612ee2e
commit dffdef51cd
26 changed files with 614 additions and 187 deletions

View File

@@ -95,9 +95,3 @@ ENV PATH "/home/vscode/.exo/bin:$PATH"
# install just (command runner)
RUN curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | sudo bash -s -- --to /usr/local/bin
# RUN source /usr/local/share/nvm/nvm.sh && nvm install 18 && nvm use 18 && npm install -g yarn 2>&1
# ENV PATH /usr/local/share/nvm/current/bin:$PATH
# # install `backlog` tool
# RUN npm i -g backlog.md

View File

@@ -98,9 +98,19 @@ services:
- WIREGUARD_ADDRESSES=10.165.87.232/32,fd7d:76ee:e68f:a993:4d1b:a77a:b471:a606/128
- SERVER_CITIES="San Jose California, Fremont California"
tor:
image: dockurr/tor
container_name: tor
volumes:
- devcontainer-redux-tor-config:/etc/tor
- devcontainer-redux-tor-data:/var/lib/tor
restart: always
volumes:
postgres-17-data:
devcontainer-redux-gem-cache:
devcontainer-redux-blob-files:
devcontainer-redux-grafana-data:
devcontainer-redux-prometheus-data:
devcontainer-redux-tor-config:
devcontainer-redux-tor-data:

View File

@@ -25,3 +25,13 @@ function blob-files-stats
printf "apparent size: %s\n" (du -sh --apparent-size $files_dir)
printf "actual size: %s\n" (du -sh $files_dir)
end
# Fetch a path from the FA .onion archive mirror through the local Tor
# SOCKS5 proxy (the "tor" compose service on port 9050).
# Usage: curl-fa-onion <path-under-archive-root>
function curl-fa-onion
# --socks5-hostname resolves the hostname via the proxy itself, which is
# required for .onion addresses; the UA/header set mimics Firefox so the
# mirror serves content normally.
curl \
--socks5-hostname tor:9050 \
--compressed \
-A "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0" \
-H "Accept-Encoding: gzip, deflate" \
-H "Connection: keep-alive" \
"http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/$argv[1]"
end

View File

@@ -13,6 +13,7 @@ module FaUriHelper
const :original_file_posted, Integer
const :latest_file_posted, Integer
const :filename, String
const :filename_with_ts, String
sig { returns(Time) }
def original_file_posted_at
@@ -35,7 +36,7 @@ module FaUriHelper
path = uri.path
match =
path.match(
%r{/art/(?<url_name>[^/]+)/(?<latest_ts>\d+)/(?<original_ts>\d+)\.(?<filename>.*)},
%r{/art/(?<url_name>[^/]+)/(stories/)?(?<latest_ts>\d+)/(?<original_ts>\d+)\.(?<filename>.*)},
)
return nil unless match
url_name = match[:url_name]
@@ -47,6 +48,7 @@ module FaUriHelper
original_file_posted: original_ts,
latest_file_posted: latest_ts,
filename:,
filename_with_ts: path.split("/").last,
)
end

View File

@@ -11,11 +11,12 @@ class Domain::Fa::Job::ScanFuzzysearchJob < Domain::Fa::Job::Base
sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
def perform(args)
post_file = T.let(nil, T.nilable(Domain::PostFile))
post = post_from_args!
fs_client = Scraper::FuzzysearchApiClient.new(http_client)
logger.tagged(make_arg_tag(post)) do
if post.fuzzysearch_checked_at.present?
if post.fuzzysearch_checked_at.present? && !force_scan?
logger.warn("fuzzysearch already checked, skipping")
return
end
@@ -35,6 +36,7 @@ class Domain::Fa::Job::ScanFuzzysearchJob < Domain::Fa::Job::Base
post.fuzzysearch_checked_at = Time.now
if response.is_a?(HttpLogEntry)
post.fuzzysearch_entry = response
logger.error("fuzzysearch query failed")
return
end
@@ -83,5 +85,6 @@ class Domain::Fa::Job::ScanFuzzysearchJob < Domain::Fa::Job::Base
end
ensure
post.save! if post
post_file.save! if post_file
end
end

View File

@@ -2,8 +2,8 @@
class Job::FaPostFurArchiverPostFileJob < Scraper::JobBase
extend T::Sig
include Domain::StaticFileJobHelper
queue_as :fur_archiver
queue_as :static_file
discard_on Scraper::JobBase::JobError, ActiveJob::DeserializationError
sig { override.returns(Symbol) }
@@ -13,102 +13,89 @@ class Job::FaPostFurArchiverPostFileJob < Scraper::JobBase
sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
def perform(args)
post_file = T.cast(args[:post_file], Domain::PostFile)
logger.tagged(make_arg_tag(post_file), make_arg_tag(post_file.post)) do
handle(post_file)
post = T.cast(args[:post], Domain::Post::FaPost)
# todo - try multiple post files?
post_file =
post
.files
.to_a
.sort_by { |file| T.must(file.created_at) }
.reverse
.find do |file|
url_str = file.url_str || next
uri = Addressable::URI.parse(url_str)
FaUriHelper.is_fa_cdn_host?(uri.host)
end
fatal_error("no existing post file found, skipping") if post_file.nil?
logger.tagged(make_arg_tag(post), make_arg_tag(post_file)) do
if post_file.state_ok? && post_file.last_status_code == 200
logger.info("file already downloaded, skipping")
return :ok
end
unless post_file.last_status_code == 404
logger.warn("last status code is not 404, skipping")
return
end
unless post_file.state_terminal_error?
logger.warn("post file not in terminal error state, skipping")
return
end
file_url_str = post_file.url_str
fatal_error("no file url str") unless file_url_str
url_parsed = FaUriHelper.parse_fa_media_url(file_url_str)
fatal_error("failed to parse fa file url") unless url_parsed
creator_url_name = post.creator&.url_name
fatal_error("no creator url name") unless creator_url_name
unless creator_url_name == url_parsed.url_name
logger.tagged(
make_tag("in_db", creator_url_name),
make_tag("in_url", url_parsed.url_name),
) { fatal_error("creator name mismatch") }
end
next if try_from_furarchiver(post, url_parsed)
try_from_tor(post, url_parsed)
end
end
FA_URL_PATTERN =
%r{
https://(d\.facdn\.net|d\.furaffinity\.net)/art/([^\/]+)/(\d+)/([^\/]+)
}x
sig { params(post_file: Domain::PostFile).void }
def handle(post_file)
post = T.cast(post_file.post, Domain::Post::FaPost)
if post.file != post_file && post.file&.state_ok?
logger.info("file already downloaded, deleting old file")
post_file.destroy
return
end
if post_file.state_ok? && post_file.last_status_code == 200
logger.info("file already downloaded, skipping")
return
end
if post.tried_from_fur_archiver?
logger.warn("already tried to download from fur archiver, skipping")
return
end
unless post_file.last_status_code == 404
logger.warn("last status code is not 404, skipping")
return
end
unless post_file.state_terminal_error?
logger.warn("post file not in terminal error state, skipping")
return
end
user_url_name = post.creator&.url_name
fatal_error("no user url name") unless user_url_name
fa_file_url_str = post_file.url_str
fatal_error("no fa file url") unless fa_file_url_str
match = fa_file_url_str.match(FA_URL_PATTERN)
unless match
if fa_file_url_str.include?("#{user_url_name}/stories/")
logger.warn("old stories URL, force rescan")
post.reload
Domain::Fa::Job::ScanPostJob.perform_now(
{ post: post, force_scan: true },
)
post.reload
unless post.state_ok?
fatal_error("post not in ok state after rescan: #{post.state}")
end
return if post.file&.state_ok? || post.file&.state_pending?
match = fa_file_url_str.match(FA_URL_PATTERN)
unless match
fatal_error("invalid fa file url after rescan: #{fa_file_url_str}")
end
else
fatal_error("invalid fa file url: #{fa_file_url_str}")
end
end
unless url_user_url_name = match.captures[1]
fatal_error("no user url name in url: #{fa_file_url_str}")
end
unless url_file_name = match.captures[3]
fatal_error("no file name in url: #{fa_file_url_str}")
end
unless user_url_name == url_user_url_name
logger.tagged(
make_tag("in_db", user_url_name),
make_tag("in_url", url_user_url_name),
) { fatal_error("user name mismatch") }
end
# returns true if the post file was found and downloaded
sig do
params(
post: Domain::Post::FaPost,
url_parsed: FaUriHelper::FaMediaUrlInfo,
).returns(T::Boolean)
end
def try_from_furarchiver(post, url_parsed)
fur_archiver_url_str =
"https://furarchiver.net/File/View?artist=#{url_user_url_name}&filename=#{url_file_name}"
"https://furarchiver.net/File/View?artist=#{url_parsed.url_name}&filename=#{url_parsed.filename_with_ts}"
post.tried_from_fur_archiver = true
post_file = post.files.build(url_str: fur_archiver_url_str)
archiver_post_file =
post.files.find_or_create_by!(url_str: fur_archiver_url_str)
if archiver_post_file.state_ok?
logger.warn("already downloaded from fur archiver, skipping")
return true
elsif archiver_post_file.state_terminal_error?
logger.warn("previously failed to download from fur archiver, trying tor")
return false
end
begin
response = http_client.get(fur_archiver_url_str)
rescue Scraper::HttpClient::InvalidURLError,
Curl::Err::HostResolutionError => e
post_file.state_terminal_error!
post_file.error_message = e.message
archiver_post_file.state_terminal_error!
archiver_post_file.error_message = e.message
archiver_post_file.save!
logger.error(
format_tags(
"invalid fur archiver url, terminal error state",
@@ -116,14 +103,40 @@ class Job::FaPostFurArchiverPostFileJob < Scraper::JobBase
make_tag("url", fur_archiver_url_str),
),
)
return
return false
ensure
post.save! if post
post.tried_from_fur_archiver = true
post.save!
end
post_file.save!
post.reload
handle_file_download_response(archiver_post_file, response)
return archiver_post_file.state_ok?
end
handle_file_download_response(post_file, response)
sig do
params(
post: Domain::Post::FaPost,
url_parsed: FaUriHelper::FaMediaUrlInfo,
).void
end
def try_from_tor(post, url_parsed)
tor_path = "fa/#{url_parsed.url_name}/#{url_parsed.filename_with_ts}"
tor_url_str =
"http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/#{tor_path}"
tor_post_file =
post.files.find_by(url_str: tor_url_str) ||
post.files.create!(url_str: tor_url_str)
if tor_post_file.state_ok?
logger.warn("already downloaded from tor, skipping")
return
elsif tor_post_file.state_terminal_error?
logger.warn("previously failed to download from tor, skipping")
return
end
response = tor_http_client.get(tor_url_str)
handle_file_download_response(tor_post_file, response)
end
end

View File

@@ -73,6 +73,7 @@ class Scraper::JobBase < ApplicationJob
@deferred_jobs = T.let(Set.new, T::Set[DeferredJob])
@suppressed_jobs = T.let(Set.new, T::Set[SuppressedJob])
@http_client = T.let(nil, T.nilable(Scraper::HttpClient))
@tor_http_client = T.let(nil, T.nilable(Scraper::HttpClient))
@gallery_dl_client = T.let(nil, T.nilable(Scraper::GalleryDlClient))
@first_log_entry = T.let(nil, T.nilable(HttpLogEntry))
@last_log_entry = T.let(nil, T.nilable(HttpLogEntry))
@@ -88,6 +89,12 @@ class Scraper::JobBase < ApplicationJob
WrappedHttpClient.new(self, @http_client)
end
sig { returns(WrappedHttpClient) }
def tor_http_client
@tor_http_client ||= Scraper::ClientFactory.get_tor_http_client
WrappedHttpClient.new(self, @tor_http_client)
end
sig { returns(Scraper::GalleryDlClient) }
def gallery_dl_client
@gallery_dl_client ||= Scraper::ClientFactory.get_gallery_dl_client

View File

@@ -1,5 +1,7 @@
# typed: true
class Scraper::ClientFactory
extend T::Sig
@http_clients = Concurrent::ThreadLocalVar.new() { {} }
@gallery_dl_clients = Concurrent::ThreadLocalVar.new(nil)
@@ -88,6 +90,18 @@ class Scraper::ClientFactory
end
end
def self.get_tor_http_client
if Rails.env.test?
@http_client_mock || raise("no http client mock set")
else
_http_client_impl(
:tor,
Scraper::TorHttpClientConfig,
Scraper::TorCurlHttpPerformer,
)
end
end
def self._gallery_dl_client_impl
@gallery_dl_clients.value ||=
begin
@@ -105,9 +119,20 @@ class Scraper::ClientFactory
end
end
def self._http_client_impl(key, config_klass)
sig do
params(
key: Symbol,
config_klass: T.class_of(Scraper::HttpClientConfig),
performer_klass: T.class_of(Scraper::CurlHttpPerformer),
).returns(Scraper::HttpClient)
end
def self._http_client_impl(
key,
config_klass,
performer_klass = Scraper::CurlHttpPerformer
)
@http_clients.value[key] ||= begin
Scraper::HttpClient.new(config_klass.new, Scraper::CurlHttpPerformer.new)
Scraper::HttpClient.new(config_klass.new, performer_klass.new)
end
end
end

View File

@@ -38,32 +38,33 @@ class Scraper::CurlHttpPerformer
"direct"
end
sig { params(request: Request).returns(Response) }
sig(:final) { params(request: Request).returns(Response) }
def do_request(request)
do_request_impl(request)
end
private
sig { returns(String) }
def performed_by
proxy_url = ENV["HTTP_PROXY_URL"]
case proxy_url
when nil
"direct"
when /airvpn-netherlands-proxy:(\d+)/
"airvpn-1-netherlands"
when /airvpn-san-jose-proxy:(\d+)/
"airvpn-2-san-jose"
else
raise("Unknown proxy URL: #{proxy_url}")
end
end
sig { params(request: Request).returns(Response) }
def do_request_impl(request)
curl = get_curl
start_at = Time.now
proxy_url = ENV["HTTP_PROXY_URL"]
performed_by =
case proxy_url
when nil
"direct"
when /airvpn-netherlands-proxy:(\d+)/
"airvpn-1-netherlands"
when /airvpn-san-jose-proxy:(\d+)/
"airvpn-2-san-jose"
else
raise("Unknown proxy URL: #{proxy_url}")
end
curl.proxy_url = proxy_url
curl.timeout = 30
curl.url = request.uri.normalize.to_s
curl.follow_location = request.follow_redirects
@@ -120,7 +121,7 @@ class Scraper::CurlHttpPerformer
response_headers:,
response_time_ms:,
body: body_str,
performed_by:,
performed_by: performed_by,
)
end
@@ -131,6 +132,8 @@ class Scraper::CurlHttpPerformer
t.thread_variable_set(:curl, Curl::Easy.new)
end
curl = t.thread_variable_get(:curl)
proxy_url = ENV["HTTP_PROXY_URL"]
curl.proxy_url = proxy_url
curl.headers = {}
curl
end

View File

@@ -3,6 +3,7 @@
class Scraper::FuzzysearchApiClient
extend T::Sig
include HasColorLogger
API_BASE_URL = "https://api-next.fuzzysearch.net"
API_PATH_FA_ID_INFO = "/v1/file/furaffinity"
@@ -31,18 +32,45 @@ class Scraper::FuzzysearchApiClient
url = Addressable::URI.parse("#{API_BASE_URL}#{API_PATH_FA_ID_INFO}")
url.query_values = { search: fa_id.to_s }
response = @http_client.get(url)
return response.log_entry if response.status_code != 200
if response.status_code != 200
logger.error(
format_tags(
make_tag("status_code", response.status_code),
make_tag("uri", url.to_s),
"fuzzysearch query failed",
),
)
return response.log_entry
end
json = JSON.parse(response.body)
return response.log_entry unless json.is_a?(Array)
return response.log_entry if json.empty?
unless json.is_a?(Array)
logger.error("fuzzysearch response is not an array")
return response.log_entry
end
if json.empty?
logger.error("fuzzysearch response is empty")
return response.log_entry
end
json = json.first
return response.log_entry unless json.is_a?(Hash)
unless json.is_a?(Hash)
logger.error("fuzzysearch response is not a hash")
return response.log_entry
end
file_url = json["url"]
return response.log_entry unless file_url.present?
if file_url.blank?
logger.error("fuzzysearch response has no file url")
return response.log_entry
end
url_parsed = FaUriHelper.parse_fa_media_url(file_url)
return response.log_entry unless url_parsed.present?
if url_parsed.blank?
logger.error(
format_tags(make_tag("file_url", file_url), "failed to parse file url"),
)
return response.log_entry
end
FaIdInfo.new(
log_entry: response.log_entry,

View File

@@ -0,0 +1,28 @@
# typed: strict
# frozen_string_literal: true
# Curl-backed HTTP performer that routes every request through the Tor
# SOCKS5 proxy exposed by the "tor" service container. Used to fetch FA
# media from the .onion archive mirror.
class Scraper::TorCurlHttpPerformer < Scraper::CurlHttpPerformer
extend T::Sig

# Identifier recorded as HttpLogEntry#performed_by for requests made
# through this performer (matches the `tor-1` enum value on HttpLogEntry).
sig { override.returns(String) }
def performed_by
"tor-1"
end

# Returns the per-thread Curl::Easy handle, (re)configured for Tor on
# every call. The `socks5h://` scheme makes curl resolve hostnames
# through the proxy, which is required for .onion addresses.
#
# NOTE(review): this reuses the same `:curl` thread variable as the base
# Scraper::CurlHttpPerformer#get_curl, so a thread alternating between
# performers shares one handle. Each performer re-sets proxy_url and
# headers per call, but any other leftover handle state carries over —
# confirm that is intended.
sig { override.returns(Curl::Easy) }
def get_curl
t = Thread.current
unless t.thread_variable?(:curl)
t.thread_variable_set(:curl, Curl::Easy.new)
end
curl = T.cast(t.thread_variable_get(:curl), Curl::Easy)
curl.proxy_url = "socks5h://tor:9050"
curl.headers = {
"User-Agent" =>
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0",
"Accept-Encoding" => "gzip, deflate",
"Connection" => "keep-alive",
}
curl
end
end

View File

@@ -0,0 +1,29 @@
# typed: strict
# frozen_string_literal: true
# Configuration for the Tor-routed HTTP client used to pull FA media
# from the .onion archive mirror: no cookies, a one-request-per-second
# rate limit, and traffic restricted to the archive host only.
class Scraper::TorHttpClientConfig < Scraper::HttpClientConfig
extend T::Sig

# Onion-service hostname of the FA media archive mirror.
TOR_ARCHIVE_HOST =
"g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion"

# No cookies are sent to the archive.
sig { override.returns(T.nilable(T::Array[T::Hash[Symbol, T.untyped]])) }
def cookies = nil

# Throttle requests to the archive host to one per second.
sig { override.returns(T::Array[[String, Numeric]]) }
def ratelimit = [[TOR_ARCHIVE_HOST, 1.0]]

# Follow at most two redirects.
sig { override.returns(Integer) }
def redirect_limit = 2

# Only the archive host itself may be contacted by this client.
sig { override.returns(T::Array[String]) }
def allowed_domains = [TOR_ARCHIVE_HOST]
end

View File

@@ -28,6 +28,7 @@ class Domain::Post::FaPost < Domain::Post
# TODO - convert `file` to Domain::PostFile::FaPostFile and
# move this to Domain::PostFile::FaPostFile
attr_json :tried_from_fur_archiver, :boolean, default: false
attr_json :tried_from_tor, :boolean, default: false
belongs_to :last_user_page, class_name: "::HttpLogEntry", optional: true
belongs_to :first_browse_page, class_name: "::HttpLogEntry", optional: true

View File

@@ -14,6 +14,7 @@ class HttpLogEntry < ReduxApplicationRecord
serverhost-1
airvpn-1-netherlands
airvpn-2-san-jose
tor-1
],
prefix: true

View File

@@ -2015,6 +2015,9 @@ class Domain::Post::FaPost
sig { void }
def restore_tried_from_fur_archiver!; end
sig { void }
def restore_tried_from_tor!; end
sig { void }
def restore_type!; end
@@ -2189,6 +2192,12 @@ class Domain::Post::FaPost
sig { returns(T::Boolean) }
def saved_change_to_tried_from_fur_archiver?; end
sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
def saved_change_to_tried_from_tor; end
sig { returns(T::Boolean) }
def saved_change_to_tried_from_tor?; end
sig { returns(T.nilable([T.untyped, T.untyped])) }
def saved_change_to_type; end
@@ -2526,6 +2535,51 @@ class Domain::Post::FaPost
sig { void }
def tried_from_fur_archiver_will_change!; end
sig { returns(T.nilable(T::Boolean)) }
def tried_from_tor; end
sig { params(value: T.nilable(T::Boolean)).returns(T.nilable(T::Boolean)) }
def tried_from_tor=(value); end
sig { returns(T::Boolean) }
def tried_from_tor?; end
sig { returns(T.nilable(T::Boolean)) }
def tried_from_tor_before_last_save; end
sig { returns(T.untyped) }
def tried_from_tor_before_type_cast; end
sig { returns(T::Boolean) }
def tried_from_tor_came_from_user?; end
sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
def tried_from_tor_change; end
sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
def tried_from_tor_change_to_be_saved; end
sig { params(from: T.nilable(T::Boolean), to: T.nilable(T::Boolean)).returns(T::Boolean) }
def tried_from_tor_changed?(from: T.unsafe(nil), to: T.unsafe(nil)); end
sig { returns(T.nilable(T::Boolean)) }
def tried_from_tor_in_database; end
sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
def tried_from_tor_previous_change; end
sig { params(from: T.nilable(T::Boolean), to: T.nilable(T::Boolean)).returns(T::Boolean) }
def tried_from_tor_previously_changed?(from: T.unsafe(nil), to: T.unsafe(nil)); end
sig { returns(T.nilable(T::Boolean)) }
def tried_from_tor_previously_was; end
sig { returns(T.nilable(T::Boolean)) }
def tried_from_tor_was; end
sig { void }
def tried_from_tor_will_change!; end
sig { returns(T.untyped) }
def type; end
@@ -2710,6 +2764,9 @@ class Domain::Post::FaPost
sig { returns(T::Boolean) }
def will_save_change_to_tried_from_fur_archiver?; end
sig { returns(T::Boolean) }
def will_save_change_to_tried_from_tor?; end
sig { returns(T::Boolean) }
def will_save_change_to_type?; end

View File

@@ -445,6 +445,12 @@ class HttpLogEntry
sig { returns(T::Boolean) }
def performed_by_serverhost_1?; end
sig { void }
def performed_by_tor_1!; end
sig { returns(T::Boolean) }
def performed_by_tor_1?; end
sig { void }
def verb_get!; end
@@ -679,6 +685,9 @@ class HttpLogEntry
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def not_performed_by_serverhost_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def not_performed_by_tor_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def not_verb_get(*args, &blk); end
@@ -741,6 +750,9 @@ class HttpLogEntry
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def performed_by_serverhost_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def performed_by_tor_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def preload(*args, &blk); end
@@ -2046,6 +2058,9 @@ class HttpLogEntry
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def not_performed_by_serverhost_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def not_performed_by_tor_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def not_verb_get(*args, &blk); end
@@ -2108,6 +2123,9 @@ class HttpLogEntry
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def performed_by_serverhost_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def performed_by_tor_1(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def preload(*args, &blk); end

View File

@@ -0,0 +1,16 @@
# typed: true
# DO NOT EDIT MANUALLY
# This is an autogenerated file for dynamic methods in `Scraper::FuzzysearchApiClient`.
# Please instead update this file by running `bin/tapioca dsl Scraper::FuzzysearchApiClient`.
class Scraper::FuzzysearchApiClient
sig { returns(ColorLogger) }
def logger; end
class << self
sig { returns(ColorLogger) }
def logger; end
end
end

View File

@@ -0,0 +1,16 @@
# typed: true
# DO NOT EDIT MANUALLY
# This is an autogenerated file for dynamic methods in `Scraper::TorHttpClientConfig`.
# Please instead update this file by running `bin/tapioca dsl Scraper::TorHttpClientConfig`.
class Scraper::TorHttpClientConfig
sig { returns(ColorLogger) }
def logger; end
class << self
sig { returns(ColorLogger) }
def logger; end
end
end

View File

@@ -0,0 +1,16 @@
# typed: true
# DO NOT EDIT MANUALLY
# This is an autogenerated file for dynamic methods in `Tasks::Fa::QueryMissingPostsFromFuzzysearch`.
# Please instead update this file by running `bin/tapioca dsl Tasks::Fa::QueryMissingPostsFromFuzzysearch`.
class Tasks::Fa::QueryMissingPostsFromFuzzysearch
sig { returns(ColorLogger) }
def logger; end
class << self
sig { returns(ColorLogger) }
def logger; end
end
end

View File

@@ -13,6 +13,7 @@ RSpec.describe FaUriHelper do
original_file_posted: 1_740_700_581,
latest_file_posted: 1_740_700_581,
filename: "zzreg_stippling-crop.jpg",
filename_with_ts: "1740700581.zzreg_stippling-crop.jpg",
),
)
expect(parsed.original_file_posted_at).to eq(Time.at(1_740_700_581))
@@ -29,10 +30,27 @@ RSpec.describe FaUriHelper do
original_file_posted: 1_740_700_581,
latest_file_posted: 1_753_374_875,
filename: "zzreg_stippling-crop.jpg",
filename_with_ts: "1740700581.zzreg_stippling-crop.jpg",
),
)
expect(parsed.original_file_posted_at).to eq(Time.at(1_740_700_581))
expect(parsed.latest_file_posted_at).to eq(Time.at(1_753_374_875))
end
it "parses story uris" do
url =
"https://d.furaffinity.net/art/irontankris/stories/1753207806/1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg"
parsed = described_class.parse_fa_media_url(url)
expect(parsed).to eq(
FaUriHelper::FaMediaUrlInfo.new(
url_name: "irontankris",
original_file_posted: 1_753_207_806,
latest_file_posted: 1_753_207_806,
filename: "thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
filename_with_ts:
"1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
),
)
end
end
end

View File

@@ -14,70 +14,19 @@ describe Domain::Fa::Job::ScanFuzzysearchJob do
end
let(:fuzzysearch_response_51015903) do
[
{
id: 51_015_903,
file_id: 1_676_417_528,
artist: "crimetxt",
hash: 2_367_248_181_756_250_600,
hash_str: "2367248181756250660",
url:
"https://d.furaffinity.net/art/crimetxt/1676417528/1676417528.crimetxt_2023-02-15_00_18_48.png",
filename: "1676417528.crimetxt_2023-02-15_00_18_48.png",
rating: "adult",
posted_at: "2023-02-14T23:32:00Z",
file_size: 2_188_273,
sha256:
"d488dabd8eb22398a228fb662eb520bb4daaac3a9ab0dc9be8b8c5e1b9522efb",
updated_at: nil,
deleted: false,
tags: %w[some_tag another_tag],
},
]
JSON.parse(File.read("test/fixtures/files/fuzzysearch/51015903.json"))
end
let(:fuzzysearch_response_21275696) do
[
{
id: 21_275_696,
file_id: nil,
artist: nil,
hash: nil,
hash_str: nil,
url: nil,
filename: nil,
rating: nil,
posted_at: nil,
file_size: nil,
sha256: nil,
updated_at: "2023-08-16T08:02:27.746944Z",
deleted: true,
tags: [],
},
]
JSON.parse(File.read("test/fixtures/files/fuzzysearch/21275696.json"))
end
let(:fuzzysearch_response_53068507) do
[
{
id: 53_068_507,
file_id: 1_690_504_099,
artist: "Meesh",
hash: -5_278_701_664_616_650_000,
hash_str: "-5278701664616649812",
url:
"https://d.furaffinity.net/art/meesh/1690504135/1690504099.meesh_pamperingjack3_crop.png",
filename: "1690504099.meesh_pamperingjack3_crop.png",
rating: "adult",
posted_at: "2023-07-28T00:28:00Z",
file_size: 555_926,
sha256:
"f47e974ef7e72c53fce5a52b28b7c34576eb26af04299155d1cc3912379b0dea",
updated_at: nil,
deleted: false,
tags: %w[advertisement patreon female preview_limited beastars juno],
},
]
JSON.parse(File.read("test/fixtures/files/fuzzysearch/53068507.json"))
end
let(:fuzzysearch_response_61665194) do
JSON.parse(File.read("test/fixtures/files/fuzzysearch/61665194.json"))
end
describe "post was marked removed" do
@@ -209,5 +158,41 @@ describe Domain::Fa::Job::ScanFuzzysearchJob do
expect(post.creator.full_name).to eq("Meesh")
end
end
context "and the post has a story url" do
let(:fa_id) { 61_665_194 }
let(:client_mock_config) do
[
{
uri:
"https://api-next.fuzzysearch.net/v1/file/furaffinity?search=#{fa_id}",
status_code: 200,
content_type: "application/json",
contents: fuzzysearch_response_61665194.to_json,
},
]
end
it "does not change the post state" do
perform_now({ post: })
post.reload
expect(post.state).to eq("removed")
end
it "sets the artist" do
perform_now({ post: })
post.reload
expect(post.creator).to be_present
expect(post.creator.url_name).to eq("irontankris")
end
it "updates keywords", quiet: false do
post.keywords = []
post.save!
perform_now({ post: })
post.reload
expect(post.keywords).to include("female", "mlp", "little", "anthro")
end
end
end
end

View File

@@ -78,7 +78,7 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
it "downloads the file from fur archiver" do
expect do
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
end.to change { post.file&.log_entry }.from(old_log_entry).to(
have_attributes(uri: have_attributes(to_s: fur_archiver_url_str)),
@@ -87,7 +87,7 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
it "updates the post_file blob" do
expect do
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
end.to change { post.file&.blob }.from(old_log_entry.response).to(
@log_entries[0].response,
@@ -96,17 +96,24 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
it "sets the last status code" do
expect do
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
end.to change { post.file&.last_status_code }.from(404).to(200)
end
it "sets the post_file state to ok" do
expect do
perform_now({ post_file: post_file })
perform_now({ post: post })
post.reload
end.to change { post.file&.state }.from("terminal_error").to("ok")
end
it "does not perform the request twice" do
perform_now({ post: post })
perform_now({ post: post })
post.reload
expect(post.files.length).to eq(2)
end
end
context "with a d.facdn.net url" do
@@ -122,5 +129,50 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
end
include_examples "correct behavior"
end
context "when furarchiver returns 404" do
let(:client_mock_config) do
[
{
uri: fur_archiver_url_str,
status_code: 404,
content_type: "text/html",
contents: "not found",
},
{
uri:
"http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/fa/wolfsparta/1496842943.wolfsparta_caught_pt2.png",
status_code: 200,
content_type: "image/png",
contents: image_data,
caused_by_entry_idx: 0,
},
]
end
let(:file_url_str) do
"https://d.facdn.net/art/wolfsparta/1496842943/1496842943.wolfsparta_caught_pt2.png"
end
it "creates a tor post file" do
perform_now({ post: post })
post.reload
# should have original post file, the fur archiver post file, and the tor post file
expect(post.files.count).to eq(3)
original, furarchiver, tor = post.files.sort_by(&:id)
expect(original.state).to eq("terminal_error")
expect(furarchiver.state).to eq("terminal_error")
expect(tor.state).to eq("ok")
expect(tor.url_str).to eq(
"http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/fa/wolfsparta/1496842943.wolfsparta_caught_pt2.png",
)
end
it "does not perform the request twice" do
perform_now({ post: post })
perform_now({ post: post })
post.reload
expect(post.files.length).to eq(3)
end
end
end
end

View File

@@ -0,0 +1,18 @@
[
{
"id": 21275696,
"file_id": null,
"artist": null,
"hash": null,
"hash_str": null,
"url": null,
"filename": null,
"rating": null,
"posted_at": null,
"file_size": null,
"sha256": null,
"updated_at": "2023-08-16T08:02:27.746944Z",
"deleted": true,
"tags": []
}
]

View File

@@ -0,0 +1,18 @@
[
{
"id": 51015903,
"file_id": 1676417528,
"artist": "crimetxt",
"hash": 2367248181756250600,
"hash_str": "2367248181756250660",
"url": "https://d.furaffinity.net/art/crimetxt/1676417528/1676417528.crimetxt_2023-02-15_00_18_48.png",
"filename": "1676417528.crimetxt_2023-02-15_00_18_48.png",
"rating": "adult",
"posted_at": "2023-02-14T23:32:00Z",
"file_size": 2188273,
"sha256": "d488dabd8eb22398a228fb662eb520bb4daaac3a9ab0dc9be8b8c5e1b9522efb",
"updated_at": null,
"deleted": false,
"tags": ["some_tag", "another_tag"]
}
]

View File

@@ -0,0 +1,25 @@
[
{
"id": 53068507,
"file_id": 1690504099,
"artist": "Meesh",
"hash": -5278701664616650000,
"hash_str": "-5278701664616649812",
"url": "https://d.furaffinity.net/art/meesh/1690504135/1690504099.meesh_pamperingjack3_crop.png",
"filename": "1690504099.meesh_pamperingjack3_crop.png",
"rating": "adult",
"posted_at": "2023-07-28T00:28:00Z",
"file_size": 555926,
"sha256": "f47e974ef7e72c53fce5a52b28b7c34576eb26af04299155d1cc3912379b0dea",
"updated_at": null,
"deleted": false,
"tags": [
"advertisement",
"patreon",
"female",
"preview_limited",
"beastars",
"juno"
]
}
]

View File

@@ -0,0 +1,34 @@
[
{
"id": 61665194,
"file_id": 1753207806,
"artist": "irontankris",
"hash": 3088256223167493600,
"hash_str": "3088256223167493676",
"url": "https://d.furaffinity.net/art/irontankris/stories/1753207806/1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
"filename": "1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
"rating": "adult",
"posted_at": "2025-07-22T18:10:00Z",
"file_size": 10283,
"sha256": "3809e988c41506211bd2885061646432e021b3810afc9080d244dd63a84a783e",
"updated_at": null,
"deleted": false,
"tags": [
"female",
"mlp",
"little",
"pony",
"sex",
"lesbian",
"weight",
"gain",
"wg",
"incest",
"princess_celestia",
"princess_luna",
"celestia",
"luna",
"anthro"
]
}
]