From dffdef51cdafc1aab07863773c311875ac39d259 Mon Sep 17 00:00:00 2001
From: Dylan Knutson
Date: Fri, 25 Jul 2025 00:25:12 +0000
Subject: [PATCH] Add Tor archive fallback for backup scraping of FA post files

---
 .devcontainer/Dockerfile.devcontainer         |   6 -
 .devcontainer/docker-compose.yml              |  10 +
 .devcontainer/fish-shell-conf-d/utils.fish    |  12 +-
 app/helpers/fa_uri_helper.rb                  |   4 +-
 .../domain/fa/job/scan_fuzzysearch_job.rb     |   5 +-
 .../job/fa_post_fur_archiver_post_file_job.rb | 197 ++++++++++--------
 app/jobs/scraper/job_base.rb                  |   7 +
 app/lib/scraper/client_factory.rb             |  29 ++-
 app/lib/scraper/curl_http_performer.rb        |  35 ++--
 app/lib/scraper/fuzzysearch_api_client.rb     |  40 +++-
 app/lib/scraper/tor_curl_http_performer.rb    |  28 +++
 app/lib/scraper/tor_http_client_config.rb     |  29 +++
 app/models/domain/post/fa_post.rb             |   1 +
 app/models/http_log_entry.rb                  |   1 +
 sorbet/rbi/dsl/domain/post/fa_post.rbi        |  57 +++++
 sorbet/rbi/dsl/http_log_entry.rbi             |  18 ++
 .../dsl/scraper/fuzzysearch_api_client.rbi    |  16 ++
 .../dsl/scraper/tor_http_client_config.rbi    |  16 ++
 .../query_missing_posts_from_fuzzysearch.rbi  |  16 ++
 spec/helpers/fa_uri_helper_spec.rb            |  18 ++
 .../fa/job/scan_fuzzysearch_job_spec.rb       | 101 ++++-----
 ...fa_post_fur_archiver_post_file_job_spec.rb |  60 +++++-
 test/fixtures/files/fuzzysearch/21275696.json |  18 ++
 test/fixtures/files/fuzzysearch/51015903.json |  18 ++
 test/fixtures/files/fuzzysearch/53068507.json |  25 +++
 test/fixtures/files/fuzzysearch/61665194.json |  34 +++
 26 files changed, 614 insertions(+), 187 deletions(-)
 create mode 100644 app/lib/scraper/tor_curl_http_performer.rb
 create mode 100644 app/lib/scraper/tor_http_client_config.rb
 create mode 100644 sorbet/rbi/dsl/scraper/fuzzysearch_api_client.rbi
 create mode 100644 sorbet/rbi/dsl/scraper/tor_http_client_config.rbi
 create mode 100644 sorbet/rbi/dsl/tasks/fa/query_missing_posts_from_fuzzysearch.rbi
 create mode 100644 test/fixtures/files/fuzzysearch/21275696.json
 create mode 100644 test/fixtures/files/fuzzysearch/51015903.json
 create mode 100644 test/fixtures/files/fuzzysearch/53068507.json
 create mode 100644 test/fixtures/files/fuzzysearch/61665194.json

diff --git a/.devcontainer/Dockerfile.devcontainer b/.devcontainer/Dockerfile.devcontainer
index 5cd027be..9f2f8c44 100644
--- a/.devcontainer/Dockerfile.devcontainer
+++ b/.devcontainer/Dockerfile.devcontainer
@@ -95,9 +95,3 @@ ENV PATH "/home/vscode/.exo/bin:$PATH"
 
 # install just (command runner)
 RUN curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | sudo bash -s -- --to /usr/local/bin
-
-# RUN source /usr/local/share/nvm/nvm.sh && nvm install 18 && nvm use 18 && npm install -g yarn 2>&1
-# ENV PATH /usr/local/share/nvm/current/bin:$PATH
-
-# # install `backlog` tool
-# RUN npm i -g backlog.md

diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index 13b34e92..69613894 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -98,9 +98,19 @@ services:
       - WIREGUARD_ADDRESSES=10.165.87.232/32,fd7d:76ee:e68f:a993:4d1b:a77a:b471:a606/128
       - SERVER_CITIES="San Jose California, Fremont California"
 
+  tor:
+    image: dockurr/tor
+    container_name: tor
+    volumes:
+      - devcontainer-redux-tor-config:/etc/tor
+      - devcontainer-redux-tor-data:/var/lib/tor
+    restart: always
+
 volumes:
   postgres-17-data:
   devcontainer-redux-gem-cache:
   devcontainer-redux-blob-files:
   devcontainer-redux-grafana-data:
   devcontainer-redux-prometheus-data:
+  devcontainer-redux-tor-config:
+  devcontainer-redux-tor-data:
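Note on the tor sidecar above: jobs reach it over SOCKS5 at tor:9050, and DNS resolution must also go through the proxy (socks5h), or .onion hosts will never resolve. A minimal smoke test from the app container, assuming the compose service name "tor" resolves on the devcontainer network and using the curb gem the scraper already depends on (the check.torproject.org URL is just a convenient test endpoint, not something this patch uses):

    require "curb"

    # Fetch the Tor check page through the sidecar; socks5h resolves DNS
    # inside Tor, which is required for .onion addresses.
    curl = Curl::Easy.new("https://check.torproject.org/")
    curl.proxy_url = "socks5h://tor:9050"
    curl.perform
    puts curl.response_code # expect 200 once the circuit is up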
diff --git a/.devcontainer/fish-shell-conf-d/utils.fish b/.devcontainer/fish-shell-conf-d/utils.fish
index 1b161e79..d7243662 100755
--- a/.devcontainer/fish-shell-conf-d/utils.fish
+++ b/.devcontainer/fish-shell-conf-d/utils.fish
@@ -24,4 +24,14 @@ function blob-files-stats
   set -l files_dir (blob-files-dir || return 1)
   printf "apparent size: %s\n" (du -sh --apparent-size $files_dir)
   printf "actual size: %s\n" (du -sh $files_dir)
-end
\ No newline at end of file
+end
+
+function curl-fa-onion
+    curl \
+        --socks5-hostname tor:9050 \
+        --compressed \
+        -A "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0" \
+        -H "Accept-Encoding: gzip, deflate" \
+        -H "Connection: keep-alive" \
+        "http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/$argv[1]"
+end

diff --git a/app/helpers/fa_uri_helper.rb b/app/helpers/fa_uri_helper.rb
index d9fe1c7f..d39830e5 100644
--- a/app/helpers/fa_uri_helper.rb
+++ b/app/helpers/fa_uri_helper.rb
@@ -13,6 +13,7 @@ module FaUriHelper
     const :original_file_posted, Integer
     const :latest_file_posted, Integer
     const :filename, String
+    const :filename_with_ts, String
 
     sig { returns(Time) }
     def original_file_posted_at
@@ -35,7 +36,7 @@ module FaUriHelper
     path = uri.path
     match =
       path.match(
-        %r{/art/(?<url_name>[^/]+)/(?<original_ts>\d+)/(?<latest_ts>\d+)\.(?<filename>.*)},
+        %r{/art/(?<url_name>[^/]+)/(stories/)?(?<original_ts>\d+)/(?<latest_ts>\d+)\.(?<filename>.*)},
       )
     return nil unless match
     url_name = match[:url_name]
@@ -47,6 +48,7 @@ module FaUriHelper
       original_file_posted: original_ts,
       latest_file_posted: latest_ts,
       filename:,
+      filename_with_ts: path.split("/").last,
     )
   end
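The widened regex now accepts an optional stories/ path segment, and the new filename_with_ts field exposes the timestamped basename. Example usage, mirroring the spec added later in this patch:

    info =
      FaUriHelper.parse_fa_media_url(
        "https://d.furaffinity.net/art/irontankris/stories/1753207806/1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
      )
    info.url_name         # => "irontankris"
    info.filename         # => "thumbnail.irontankris_royal_rivalry_gains.rtf.jpg"
    info.filename_with_ts # => "1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg"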
diff --git a/app/jobs/domain/fa/job/scan_fuzzysearch_job.rb b/app/jobs/domain/fa/job/scan_fuzzysearch_job.rb
index db702eb0..510c3c28 100644
--- a/app/jobs/domain/fa/job/scan_fuzzysearch_job.rb
+++ b/app/jobs/domain/fa/job/scan_fuzzysearch_job.rb
@@ -11,11 +11,12 @@ class Domain::Fa::Job::ScanFuzzysearchJob < Domain::Fa::Job::Base
   sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
   def perform(args)
+    post_file = T.let(nil, T.nilable(Domain::PostFile))
     post = post_from_args!
     fs_client = Scraper::FuzzysearchApiClient.new(http_client)
 
     logger.tagged(make_arg_tag(post)) do
-      if post.fuzzysearch_checked_at.present?
+      if post.fuzzysearch_checked_at.present? && !force_scan?
         logger.warn("fuzzysearch already checked, skipping")
         return
       end
@@ -35,6 +36,7 @@ class Domain::Fa::Job::ScanFuzzysearchJob < Domain::Fa::Job::Base
       post.fuzzysearch_checked_at = Time.now
       if response.is_a?(HttpLogEntry)
         post.fuzzysearch_entry = response
+        logger.error("fuzzysearch query failed")
         return
       end
@@ -83,5 +85,6 @@ class Domain::Fa::Job::ScanFuzzysearchJob < Domain::Fa::Job::Base
     end
   ensure
     post.save! if post
+    post_file.save! if post_file
   end
 end
- logger.warn("post file not in terminal error state, skipping") - return - end - - user_url_name = post.creator&.url_name - fatal_error("no user url name") unless user_url_name - - fa_file_url_str = post_file.url_str - fatal_error("no fa file url") unless fa_file_url_str - - match = fa_file_url_str.match(FA_URL_PATTERN) - unless match - if fa_file_url_str.include?("#{user_url_name}/stories/") - logger.warn("old stories URL, force rescan") - post.reload - Domain::Fa::Job::ScanPostJob.perform_now( - { post: post, force_scan: true }, - ) - post.reload - unless post.state_ok? - fatal_error("post not in ok state after rescan: #{post.state}") - end - return if post.file&.state_ok? || post.file&.state_pending? - match = fa_file_url_str.match(FA_URL_PATTERN) - unless match - fatal_error("invalid fa file url after rescan: #{fa_file_url_str}") - end - else - fatal_error("invalid fa file url: #{fa_file_url_str}") - end - end - - unless url_user_url_name = match.captures[1] - fatal_error("no user url name in url: #{fa_file_url_str}") - end - - unless url_file_name = match.captures[3] - fatal_error("no file name in url: #{fa_file_url_str}") - end - - unless user_url_name == url_user_url_name - logger.tagged( - make_tag("in_db", user_url_name), - make_tag("in_url", url_user_url_name), - ) { fatal_error("user name mismatch") } - end - + # returns true if the post file was found and downloaded + sig do + params( + post: Domain::Post::FaPost, + url_parsed: FaUriHelper::FaMediaUrlInfo, + ).returns(T::Boolean) + end + def try_from_furarchiver(post, url_parsed) fur_archiver_url_str = - "https://furarchiver.net/File/View?artist=#{url_user_url_name}&filename=#{url_file_name}" + "https://furarchiver.net/File/View?artist=#{url_parsed.url_name}&filename=#{url_parsed.filename_with_ts}" - post.tried_from_fur_archiver = true - post_file = post.files.build(url_str: fur_archiver_url_str) + archiver_post_file = + post.files.find_or_create_by!(url_str: fur_archiver_url_str) + + if archiver_post_file.state_ok? + logger.warn("already downloaded from fur archiver, skipping") + return true + elsif archiver_post_file.state_terminal_error? + logger.warn("previously failed to download from fur archiver, trying tor") + return false + end begin response = http_client.get(fur_archiver_url_str) rescue Scraper::HttpClient::InvalidURLError, Curl::Err::HostResolutionError => e - post_file.state_terminal_error! - post_file.error_message = e.message + archiver_post_file.state_terminal_error! + archiver_post_file.error_message = e.message + archiver_post_file.save! logger.error( format_tags( "invalid fur archiver url, terminal error state", @@ -116,14 +103,40 @@ class Job::FaPostFurArchiverPostFileJob < Scraper::JobBase make_tag("url", fur_archiver_url_str), ), ) - return + return false ensure - post.save! if post + post.tried_from_fur_archiver = true + post.save! end - post_file.save! - post.reload + handle_file_download_response(archiver_post_file, response) + return archiver_post_file.state_ok? + end - handle_file_download_response(post_file, response) + sig do + params( + post: Domain::Post::FaPost, + url_parsed: FaUriHelper::FaMediaUrlInfo, + ).void + end + def try_from_tor(post, url_parsed) + tor_path = "fa/#{url_parsed.url_name}/#{url_parsed.filename_with_ts}" + tor_url_str = + "http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/#{tor_path}" + + tor_post_file = + post.files.find_by(url_str: tor_url_str) || + post.files.create!(url_str: tor_url_str) + + if tor_post_file.state_ok? 
+ logger.warn("already downloaded from tor, skipping") + return + elsif tor_post_file.state_terminal_error? + logger.warn("previously failed to download from tor, skipping") + return + end + + response = tor_http_client.get(tor_url_str) + handle_file_download_response(tor_post_file, response) end end diff --git a/app/jobs/scraper/job_base.rb b/app/jobs/scraper/job_base.rb index 1252411f..963a448e 100644 --- a/app/jobs/scraper/job_base.rb +++ b/app/jobs/scraper/job_base.rb @@ -73,6 +73,7 @@ class Scraper::JobBase < ApplicationJob @deferred_jobs = T.let(Set.new, T::Set[DeferredJob]) @suppressed_jobs = T.let(Set.new, T::Set[SuppressedJob]) @http_client = T.let(nil, T.nilable(Scraper::HttpClient)) + @tor_http_client = T.let(nil, T.nilable(Scraper::HttpClient)) @gallery_dl_client = T.let(nil, T.nilable(Scraper::GalleryDlClient)) @first_log_entry = T.let(nil, T.nilable(HttpLogEntry)) @last_log_entry = T.let(nil, T.nilable(HttpLogEntry)) @@ -88,6 +89,12 @@ class Scraper::JobBase < ApplicationJob WrappedHttpClient.new(self, @http_client) end + sig { returns(WrappedHttpClient) } + def tor_http_client + @tor_http_client ||= Scraper::ClientFactory.get_tor_http_client + WrappedHttpClient.new(self, @tor_http_client) + end + sig { returns(Scraper::GalleryDlClient) } def gallery_dl_client @gallery_dl_client ||= Scraper::ClientFactory.get_gallery_dl_client diff --git a/app/lib/scraper/client_factory.rb b/app/lib/scraper/client_factory.rb index 771c2762..aae1a82e 100644 --- a/app/lib/scraper/client_factory.rb +++ b/app/lib/scraper/client_factory.rb @@ -1,5 +1,7 @@ # typed: true class Scraper::ClientFactory + extend T::Sig + @http_clients = Concurrent::ThreadLocalVar.new() { {} } @gallery_dl_clients = Concurrent::ThreadLocalVar.new(nil) @@ -88,6 +90,18 @@ class Scraper::ClientFactory end end + def self.get_tor_http_client + if Rails.env.test? 
diff --git a/app/lib/scraper/curl_http_performer.rb b/app/lib/scraper/curl_http_performer.rb
index 8e2462fd..91b9fc5c 100644
--- a/app/lib/scraper/curl_http_performer.rb
+++ b/app/lib/scraper/curl_http_performer.rb
@@ -38,32 +38,33 @@ class Scraper::CurlHttpPerformer
     "direct"
   end
 
-  sig { params(request: Request).returns(Response) }
+  sig(:final) { params(request: Request).returns(Response) }
   def do_request(request)
     do_request_impl(request)
   end
 
   private
 
+  sig { returns(String) }
+  def performed_by
+    proxy_url = ENV["HTTP_PROXY_URL"]
+    case proxy_url
+    when nil
+      "direct"
+    when /airvpn-netherlands-proxy:(\d+)/
+      "airvpn-1-netherlands"
+    when /airvpn-san-jose-proxy:(\d+)/
+      "airvpn-2-san-jose"
+    else
+      raise("Unknown proxy URL: #{proxy_url}")
+    end
+  end
+
   sig { params(request: Request).returns(Response) }
   def do_request_impl(request)
     curl = get_curl
     start_at = Time.now
 
-    proxy_url = ENV["HTTP_PROXY_URL"]
-    performed_by =
-      case proxy_url
-      when nil
-        "direct"
-      when /airvpn-netherlands-proxy:(\d+)/
-        "airvpn-1-netherlands"
-      when /airvpn-san-jose-proxy:(\d+)/
-        "airvpn-2-san-jose"
-      else
-        raise("Unknown proxy URL: #{proxy_url}")
-      end
-
-    curl.proxy_url = proxy_url
     curl.timeout = 30
     curl.url = request.uri.normalize.to_s
     curl.follow_location = request.follow_redirects
@@ -120,7 +121,7 @@ class Scraper::CurlHttpPerformer
       response_headers:,
       response_time_ms:,
       body: body_str,
-      performed_by:,
+      performed_by: performed_by,
     )
   end
 
@@ -131,6 +132,8 @@ class Scraper::CurlHttpPerformer
       t.thread_variable_set(:curl, Curl::Easy.new)
     end
     curl = t.thread_variable_get(:curl)
+    proxy_url = ENV["HTTP_PROXY_URL"]
+    curl.proxy_url = proxy_url
     curl.headers = {}
     curl
   end

diff --git a/app/lib/scraper/fuzzysearch_api_client.rb b/app/lib/scraper/fuzzysearch_api_client.rb
index 19767eee..e217c114 100644
--- a/app/lib/scraper/fuzzysearch_api_client.rb
+++ b/app/lib/scraper/fuzzysearch_api_client.rb
@@ -3,6 +3,7 @@
 class Scraper::FuzzysearchApiClient
   extend T::Sig
+  include HasColorLogger
 
   API_BASE_URL = "https://api-next.fuzzysearch.net"
   API_PATH_FA_ID_INFO = "/v1/file/furaffinity"
@@ -31,18 +32,45 @@ class Scraper::FuzzysearchApiClient
     url = Addressable::URI.parse("#{API_BASE_URL}#{API_PATH_FA_ID_INFO}")
     url.query_values = { search: fa_id.to_s }
     response = @http_client.get(url)
-    return response.log_entry if response.status_code != 200
+    if response.status_code != 200
+      logger.error(
+        format_tags(
+          make_tag("status_code", response.status_code),
+          make_tag("uri", url.to_s),
+          "fuzzysearch query failed",
+        ),
+      )
+      return response.log_entry
+    end
 
     json = JSON.parse(response.body)
-    return response.log_entry unless json.is_a?(Array)
-    return response.log_entry if json.empty?
+    unless json.is_a?(Array)
+      logger.error("fuzzysearch response is not an array")
+      return response.log_entry
+    end
+    if json.empty?
+      logger.error("fuzzysearch response is empty")
+      return response.log_entry
+    end
 
     json = json.first
-    return response.log_entry unless json.is_a?(Hash)
+    unless json.is_a?(Hash)
+      logger.error("fuzzysearch response is not a hash")
+      return response.log_entry
+    end
 
     file_url = json["url"]
-    return response.log_entry unless file_url.present?
+    if file_url.blank?
+      logger.error("fuzzysearch response has no file url")
+      return response.log_entry
+    end
 
     url_parsed = FaUriHelper.parse_fa_media_url(file_url)
-    return response.log_entry unless url_parsed.present?
+    if url_parsed.blank?
+      logger.error(
+        format_tags(make_tag("file_url", file_url), "failed to parse file url"),
+      )
+      return response.log_entry
+    end
 
     FaIdInfo.new(
       log_entry: response.log_entry,
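For reference, the endpoint the client hits is a plain GET that returns a JSON array of submissions (schema as in the fixtures under test/fixtures/files/fuzzysearch/). A standalone sketch with Net::HTTP; whether unauthenticated access works is an assumption here, since the production path goes through Scraper::HttpClient:

    require "json"
    require "net/http"

    uri = URI("https://api-next.fuzzysearch.net/v1/file/furaffinity?search=51015903")
    body = Net::HTTP.get(uri)     # GET with no auth headers (assumption)
    info = JSON.parse(body).first # array of matches; take the first
    info["url"]                   # => "https://d.furaffinity.net/art/crimetxt/..."
    info["deleted"]               # => false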
diff --git a/app/lib/scraper/tor_curl_http_performer.rb b/app/lib/scraper/tor_curl_http_performer.rb
new file mode 100644
index 00000000..146f3894
--- /dev/null
+++ b/app/lib/scraper/tor_curl_http_performer.rb
@@ -0,0 +1,28 @@
+# typed: strict
+# frozen_string_literal: true
+
+class Scraper::TorCurlHttpPerformer < Scraper::CurlHttpPerformer
+  extend T::Sig
+
+  sig { override.returns(String) }
+  def performed_by
+    "tor-1"
+  end
+
+  sig { override.returns(Curl::Easy) }
+  def get_curl
+    t = Thread.current
+    unless t.thread_variable?(:curl)
+      t.thread_variable_set(:curl, Curl::Easy.new)
+    end
+    curl = T.cast(t.thread_variable_get(:curl), Curl::Easy)
+    curl.proxy_url = "socks5h://tor:9050"
+    curl.headers = {
+      "User-Agent" =>
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0",
+      "Accept-Encoding" => "gzip, deflate",
+      "Connection" => "keep-alive",
+    }
+    curl
+  end
+end

diff --git a/app/lib/scraper/tor_http_client_config.rb b/app/lib/scraper/tor_http_client_config.rb
new file mode 100644
index 00000000..ee836e79
--- /dev/null
+++ b/app/lib/scraper/tor_http_client_config.rb
@@ -0,0 +1,29 @@
+# typed: strict
+# frozen_string_literal: true
+
+class Scraper::TorHttpClientConfig < Scraper::HttpClientConfig
+  TOR_ARCHIVE_HOST =
+    "g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion"
+
+  extend T::Sig
+
+  sig { override.returns(T.nilable(T::Array[T::Hash[Symbol, T.untyped]])) }
+  def cookies
+    nil
+  end
+
+  sig { override.returns(T::Array[[String, Numeric]]) }
+  def ratelimit
+    [[TOR_ARCHIVE_HOST, 1.0]]
+  end
+
+  sig { override.returns(Integer) }
+  def redirect_limit
+    2
+  end
+
+  sig { override.returns(T::Array[String]) }
+  def allowed_domains
+    [TOR_ARCHIVE_HOST]
+  end
+end

diff --git a/app/models/domain/post/fa_post.rb b/app/models/domain/post/fa_post.rb
index c1ecb17c..d4f101ea 100644
--- a/app/models/domain/post/fa_post.rb
+++ b/app/models/domain/post/fa_post.rb
@@ -28,6 +28,7 @@ class Domain::Post::FaPost < Domain::Post
   # TODO - convert `file` to Domain::PostFile::FaPostFile and
   # move this to Domain::PostFile::FaPostFile
   attr_json :tried_from_fur_archiver, :boolean, default: false
+  attr_json :tried_from_tor, :boolean, default: false
 
   belongs_to :last_user_page, class_name: "::HttpLogEntry", optional: true
   belongs_to :first_browse_page, class_name: "::HttpLogEntry", optional: true

diff --git a/app/models/http_log_entry.rb b/app/models/http_log_entry.rb
index a12ecef6..b4bf8c54 100644
--- a/app/models/http_log_entry.rb
+++ b/app/models/http_log_entry.rb
@@ -14,6 +14,7 @@ class HttpLogEntry < ReduxApplicationRecord
            serverhost-1
            airvpn-1-netherlands
            airvpn-2-san-jose
+           tor-1
          ],
          prefix: true
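Since performed_by is a Rails enum with prefix: true, the new tor-1 value comes with the usual generated helpers (the regenerated RBIs below list them all); e.g.:

    entry = HttpLogEntry.performed_by_tor_1.last # scope: entries fetched via the tor proxy
    entry&.performed_by_tor_1?                   # => true
    HttpLogEntry.not_performed_by_tor_1          # entries from every other performer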
diff --git a/sorbet/rbi/dsl/domain/post/fa_post.rbi b/sorbet/rbi/dsl/domain/post/fa_post.rbi
index 75c28088..3ac314b0 100644
--- a/sorbet/rbi/dsl/domain/post/fa_post.rbi
+++ b/sorbet/rbi/dsl/domain/post/fa_post.rbi
@@ -2015,6 +2015,9 @@ class Domain::Post::FaPost
     sig { void }
     def restore_tried_from_fur_archiver!; end
 
+    sig { void }
+    def restore_tried_from_tor!; end
+
     sig { void }
     def restore_type!; end
 
@@ -2189,6 +2192,12 @@ class Domain::Post::FaPost
     sig { returns(T::Boolean) }
     def saved_change_to_tried_from_fur_archiver?; end
 
+    sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
+    def saved_change_to_tried_from_tor; end
+
+    sig { returns(T::Boolean) }
+    def saved_change_to_tried_from_tor?; end
+
     sig { returns(T.nilable([T.untyped, T.untyped])) }
     def saved_change_to_type; end
 
@@ -2526,6 +2535,51 @@ class Domain::Post::FaPost
     sig { void }
     def tried_from_fur_archiver_will_change!; end
 
+    sig { returns(T.nilable(T::Boolean)) }
+    def tried_from_tor; end
+
+    sig { params(value: T.nilable(T::Boolean)).returns(T.nilable(T::Boolean)) }
+    def tried_from_tor=(value); end
+
+    sig { returns(T::Boolean) }
+    def tried_from_tor?; end
+
+    sig { returns(T.nilable(T::Boolean)) }
+    def tried_from_tor_before_last_save; end
+
+    sig { returns(T.untyped) }
+    def tried_from_tor_before_type_cast; end
+
+    sig { returns(T::Boolean) }
+    def tried_from_tor_came_from_user?; end
+
+    sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
+    def tried_from_tor_change; end
+
+    sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
+    def tried_from_tor_change_to_be_saved; end
+
+    sig { params(from: T.nilable(T::Boolean), to: T.nilable(T::Boolean)).returns(T::Boolean) }
+    def tried_from_tor_changed?(from: T.unsafe(nil), to: T.unsafe(nil)); end
+
+    sig { returns(T.nilable(T::Boolean)) }
+    def tried_from_tor_in_database; end
+
+    sig { returns(T.nilable([T.nilable(T::Boolean), T.nilable(T::Boolean)])) }
+    def tried_from_tor_previous_change; end
+
+    sig { params(from: T.nilable(T::Boolean), to: T.nilable(T::Boolean)).returns(T::Boolean) }
+    def tried_from_tor_previously_changed?(from: T.unsafe(nil), to: T.unsafe(nil)); end
+
+    sig { returns(T.nilable(T::Boolean)) }
+    def tried_from_tor_previously_was; end
+
+    sig { returns(T.nilable(T::Boolean)) }
+    def tried_from_tor_was; end
+
+    sig { void }
+    def tried_from_tor_will_change!; end
+
     sig { returns(T.untyped) }
     def type; end
 
@@ -2710,6 +2764,9 @@ class Domain::Post::FaPost
     sig { returns(T::Boolean) }
     def will_save_change_to_tried_from_fur_archiver?; end
 
+    sig { returns(T::Boolean) }
+    def will_save_change_to_tried_from_tor?; end
+
     sig { returns(T::Boolean) }
     def will_save_change_to_type?; end

diff --git a/sorbet/rbi/dsl/http_log_entry.rbi b/sorbet/rbi/dsl/http_log_entry.rbi
index 4e11c615..81ac4175 100644
--- a/sorbet/rbi/dsl/http_log_entry.rbi
+++ b/sorbet/rbi/dsl/http_log_entry.rbi
@@ -445,6 +445,12 @@ class HttpLogEntry
     sig { returns(T::Boolean) }
     def performed_by_serverhost_1?; end
 
+    sig { void }
+    def performed_by_tor_1!; end
+
+    sig { returns(T::Boolean) }
+    def performed_by_tor_1?; end
+
     sig { void }
     def verb_get!; end
 
@@ -679,6 +685,9 @@ class HttpLogEntry
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
     def not_performed_by_serverhost_1(*args, &blk); end
 
+    sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
+    def not_performed_by_tor_1(*args, &blk); end
+
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
     def not_verb_get(*args, &blk); end
 
@@ -741,6 +750,9 @@ class HttpLogEntry
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
     def performed_by_serverhost_1(*args, &blk); end
 
+    sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
+    def performed_by_tor_1(*args, &blk); end
+
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
     def preload(*args, &blk); end
 
@@ -2046,6 +2058,9 @@ class HttpLogEntry
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
     def not_performed_by_serverhost_1(*args, &blk); end
 
+    sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
+    def not_performed_by_tor_1(*args, &blk); end
+
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
     def not_verb_get(*args, &blk); end
 
@@ -2108,6 +2123,9 @@ class HttpLogEntry
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
     def performed_by_serverhost_1(*args, &blk); end
 
+    sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
+    def performed_by_tor_1(*args, &blk); end
+
     sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
     def preload(*args, &blk); end
diff --git a/sorbet/rbi/dsl/scraper/fuzzysearch_api_client.rbi b/sorbet/rbi/dsl/scraper/fuzzysearch_api_client.rbi
new file mode 100644
index 00000000..7edf9df6
--- /dev/null
+++ b/sorbet/rbi/dsl/scraper/fuzzysearch_api_client.rbi
@@ -0,0 +1,16 @@
+# typed: true
+
+# DO NOT EDIT MANUALLY
+# This is an autogenerated file for dynamic methods in `Scraper::FuzzysearchApiClient`.
+# Please instead update this file by running `bin/tapioca dsl Scraper::FuzzysearchApiClient`.
+
+
+class Scraper::FuzzysearchApiClient
+  sig { returns(ColorLogger) }
+  def logger; end
+
+  class << self
+    sig { returns(ColorLogger) }
+    def logger; end
+  end
+end

diff --git a/sorbet/rbi/dsl/scraper/tor_http_client_config.rbi b/sorbet/rbi/dsl/scraper/tor_http_client_config.rbi
new file mode 100644
index 00000000..6ab67f4d
--- /dev/null
+++ b/sorbet/rbi/dsl/scraper/tor_http_client_config.rbi
@@ -0,0 +1,16 @@
+# typed: true
+
+# DO NOT EDIT MANUALLY
+# This is an autogenerated file for dynamic methods in `Scraper::TorHttpClientConfig`.
+# Please instead update this file by running `bin/tapioca dsl Scraper::TorHttpClientConfig`.
+
+
+class Scraper::TorHttpClientConfig
+  sig { returns(ColorLogger) }
+  def logger; end
+
+  class << self
+    sig { returns(ColorLogger) }
+    def logger; end
+  end
+end

diff --git a/sorbet/rbi/dsl/tasks/fa/query_missing_posts_from_fuzzysearch.rbi b/sorbet/rbi/dsl/tasks/fa/query_missing_posts_from_fuzzysearch.rbi
new file mode 100644
index 00000000..b47549eb
--- /dev/null
+++ b/sorbet/rbi/dsl/tasks/fa/query_missing_posts_from_fuzzysearch.rbi
@@ -0,0 +1,16 @@
+# typed: true
+
+# DO NOT EDIT MANUALLY
+# This is an autogenerated file for dynamic methods in `Tasks::Fa::QueryMissingPostsFromFuzzysearch`.
+# Please instead update this file by running `bin/tapioca dsl Tasks::Fa::QueryMissingPostsFromFuzzysearch`.
+
+
+class Tasks::Fa::QueryMissingPostsFromFuzzysearch
+  sig { returns(ColorLogger) }
+  def logger; end
+
+  class << self
+    sig { returns(ColorLogger) }
+    def logger; end
+  end
+end
diff --git a/spec/helpers/fa_uri_helper_spec.rb b/spec/helpers/fa_uri_helper_spec.rb
index 864d6616..6fb709fe 100644
--- a/spec/helpers/fa_uri_helper_spec.rb
+++ b/spec/helpers/fa_uri_helper_spec.rb
@@ -13,6 +13,7 @@ RSpec.describe FaUriHelper do
           original_file_posted: 1_740_700_581,
           latest_file_posted: 1_740_700_581,
           filename: "zzreg_stippling-crop.jpg",
+          filename_with_ts: "1740700581.zzreg_stippling-crop.jpg",
         ),
       )
       expect(parsed.original_file_posted_at).to eq(Time.at(1_740_700_581))
@@ -29,10 +30,27 @@ RSpec.describe FaUriHelper do
           original_file_posted: 1_740_700_581,
           latest_file_posted: 1_753_374_875,
           filename: "zzreg_stippling-crop.jpg",
+          filename_with_ts: "1740700581.zzreg_stippling-crop.jpg",
         ),
       )
       expect(parsed.original_file_posted_at).to eq(Time.at(1_740_700_581))
       expect(parsed.latest_file_posted_at).to eq(Time.at(1_753_374_875))
     end
+
+    it "parses story uris" do
+      url =
+        "https://d.furaffinity.net/art/irontankris/stories/1753207806/1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg"
+      parsed = described_class.parse_fa_media_url(url)
+      expect(parsed).to eq(
+        FaUriHelper::FaMediaUrlInfo.new(
+          url_name: "irontankris",
+          original_file_posted: 1_753_207_806,
+          latest_file_posted: 1_753_207_806,
+          filename: "thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
+          filename_with_ts:
+            "1753207806.thumbnail.irontankris_royal_rivalry_gains.rtf.jpg",
+        ),
+      )
+    end
   end
 end

diff --git a/spec/jobs/domain/fa/job/scan_fuzzysearch_job_spec.rb b/spec/jobs/domain/fa/job/scan_fuzzysearch_job_spec.rb
index a321b478..9caeba14 100644
--- a/spec/jobs/domain/fa/job/scan_fuzzysearch_job_spec.rb
+++ b/spec/jobs/domain/fa/job/scan_fuzzysearch_job_spec.rb
@@ -14,70 +14,19 @@ describe Domain::Fa::Job::ScanFuzzysearchJob do
   end
 
   let(:fuzzysearch_response_51015903) do
-    [
-      {
-        id: 51_015_903,
-        file_id: 1_676_417_528,
-        artist: "crimetxt",
-        hash: 2_367_248_181_756_250_600,
-        hash_str: "2367248181756250660",
-        url:
-          "https://d.furaffinity.net/art/crimetxt/1676417528/1676417528.crimetxt_2023-02-15_00_18_48.png",
-        filename: "1676417528.crimetxt_2023-02-15_00_18_48.png",
-        rating: "adult",
-        posted_at: "2023-02-14T23:32:00Z",
-        file_size: 2_188_273,
-        sha256:
-          "d488dabd8eb22398a228fb662eb520bb4daaac3a9ab0dc9be8b8c5e1b9522efb",
-        updated_at: nil,
-        deleted: false,
-        tags: %w[some_tag another_tag],
-      },
-    ]
+    JSON.parse(File.read("test/fixtures/files/fuzzysearch/51015903.json"))
   end
 
   let(:fuzzysearch_response_21275696) do
-    [
-      {
-        id: 21_275_696,
-        file_id: nil,
-        artist: nil,
-        hash: nil,
-        hash_str: nil,
-        url: nil,
-        filename: nil,
-        rating: nil,
-        posted_at: nil,
-        file_size: nil,
-        sha256: nil,
-        updated_at: "2023-08-16T08:02:27.746944Z",
-        deleted: true,
-        tags: [],
-      },
-    ]
+    JSON.parse(File.read("test/fixtures/files/fuzzysearch/21275696.json"))
  end
 
   let(:fuzzysearch_response_53068507) do
-    [
-      {
-        id: 53_068_507,
-        file_id: 1_690_504_099,
-        artist: "Meesh",
-        hash: -5_278_701_664_616_650_000,
-        hash_str: "-5278701664616649812",
-        url:
-          "https://d.furaffinity.net/art/meesh/1690504135/1690504099.meesh_pamperingjack3_crop.png",
-        filename: "1690504099.meesh_pamperingjack3_crop.png",
-        rating: "adult",
-        posted_at: "2023-07-28T00:28:00Z",
-        file_size: 555_926,
-        sha256:
-          "f47e974ef7e72c53fce5a52b28b7c34576eb26af04299155d1cc3912379b0dea",
-        updated_at: nil,
-        deleted: false,
-        tags: %w[advertisement patreon female preview_limited beastars juno],
-      },
-    ]
+    JSON.parse(File.read("test/fixtures/files/fuzzysearch/53068507.json"))
+  end
+
+  let(:fuzzysearch_response_61665194) do
+    JSON.parse(File.read("test/fixtures/files/fuzzysearch/61665194.json"))
   end
 
   describe "post was marked removed" do
@@ -209,5 +158,41 @@ describe Domain::Fa::Job::ScanFuzzysearchJob do
         expect(post.creator.full_name).to eq("Meesh")
       end
     end
+
+    context "and the post has a story url" do
+      let(:fa_id) { 61_665_194 }
+      let(:client_mock_config) do
+        [
+          {
+            uri:
+              "https://api-next.fuzzysearch.net/v1/file/furaffinity?search=#{fa_id}",
+            status_code: 200,
+            content_type: "application/json",
+            contents: fuzzysearch_response_61665194.to_json,
+          },
+        ]
+      end
+
+      it "does not change the post state" do
+        perform_now({ post: })
+        post.reload
+        expect(post.state).to eq("removed")
+      end
+
+      it "sets the artist" do
+        perform_now({ post: })
+        post.reload
+        expect(post.creator).to be_present
+        expect(post.creator.url_name).to eq("irontankris")
+      end
+
+      it "updates keywords", quiet: false do
+        post.keywords = []
+        post.save!
+        perform_now({ post: })
+        post.reload
+        expect(post.keywords).to include("female", "mlp", "little", "anthro")
+      end
+    end
   end
 end
diff --git a/spec/jobs/fa_post_fur_archiver_post_file_job_spec.rb b/spec/jobs/fa_post_fur_archiver_post_file_job_spec.rb
index 3aafeab9..0b50643d 100644
--- a/spec/jobs/fa_post_fur_archiver_post_file_job_spec.rb
+++ b/spec/jobs/fa_post_fur_archiver_post_file_job_spec.rb
@@ -78,7 +78,7 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
     it "downloads the file from fur archiver" do
       expect do
-        perform_now({ post_file: post_file })
+        perform_now({ post: post })
         post.reload
       end.to change { post.file&.log_entry }.from(old_log_entry).to(
         have_attributes(uri: have_attributes(to_s: fur_archiver_url_str)),
@@ -87,7 +87,7 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
     it "updates the post_file blob" do
       expect do
-        perform_now({ post_file: post_file })
+        perform_now({ post: post })
         post.reload
       end.to change { post.file&.blob }.from(old_log_entry.response).to(
         @log_entries[0].response,
@@ -96,17 +96,24 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
     it "sets the last status code" do
       expect do
-        perform_now({ post_file: post_file })
+        perform_now({ post: post })
         post.reload
       end.to change { post.file&.last_status_code }.from(404).to(200)
     end
 
     it "sets the post_file state to ok" do
       expect do
-        perform_now({ post_file: post_file })
+        perform_now({ post: post })
         post.reload
       end.to change { post.file&.state }.from("terminal_error").to("ok")
     end
+
+    it "does not perform the request twice" do
+      perform_now({ post: post })
+      perform_now({ post: post })
+      post.reload
+      expect(post.files.length).to eq(2)
+    end
   end
 
   context "with a d.facdn.net url" do
@@ -122,5 +129,50 @@ RSpec.describe Job::FaPostFurArchiverPostFileJob do
     end
     include_examples "correct behavior"
   end
+
+  context "when furarchiver returns 404" do
+    let(:client_mock_config) do
+      [
+        {
+          uri: fur_archiver_url_str,
+          status_code: 404,
+          content_type: "text/html",
+          contents: "not found",
+        },
+        {
+          uri:
+            "http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/fa/wolfsparta/1496842943.wolfsparta_caught_pt2.png",
+          status_code: 200,
+          content_type: "image/png",
+          contents: image_data,
+          caused_by_entry_idx: 0,
+        },
+      ]
+    end
+    let(:file_url_str) do
+      "https://d.facdn.net/art/wolfsparta/1496842943/1496842943.wolfsparta_caught_pt2.png"
+    end
+
+    it "creates a tor post file" do
+      perform_now({ post: post })
+      post.reload
+      # should have the original post file, the fur archiver post file, and
+      # the tor post file
+      expect(post.files.count).to eq(3)
+      original, furarchiver, tor = post.files.sort_by(&:id)
+      expect(original.state).to eq("terminal_error")
+      expect(furarchiver.state).to eq("terminal_error")
+      expect(tor.state).to eq("ok")
+      expect(tor.url_str).to eq(
+        "http://g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion/fa/wolfsparta/1496842943.wolfsparta_caught_pt2.png",
+      )
+    end
+
+    it "does not perform the request twice" do
+      perform_now({ post: post })
+      perform_now({ post: post })
+      post.reload
+      expect(post.files.length).to eq(3)
+    end
+  end
 end
diff --git a/test/fixtures/files/fuzzysearch/21275696.json b/test/fixtures/files/fuzzysearch/21275696.json
new file mode 100644
index 00000000..f3f2f267
--- /dev/null
+++ b/test/fixtures/files/fuzzysearch/21275696.json
@@ -0,0 +1,18 @@
+[
+  {
+    "id": 21275696,
+    "file_id": null,
+    "artist": null,
+    "hash": null,
+    "hash_str": null,
+    "url": null,
+    "filename": null,
+    "rating": null,
+    "posted_at": null,
+    "file_size": null,
+    "sha256": null,
+    "updated_at": "2023-08-16T08:02:27.746944Z",
+    "deleted": true,
+    "tags": []
+  }
+]

diff --git a/test/fixtures/files/fuzzysearch/51015903.json b/test/fixtures/files/fuzzysearch/51015903.json
new file mode 100644
index 00000000..7efb746c
--- /dev/null
+++ b/test/fixtures/files/fuzzysearch/51015903.json
@@ -0,0 +1,18 @@
+[
+  {
+    "id": 51015903,
+    "file_id": 1676417528,
+    "artist": "crimetxt",
+    "hash": 2367248181756250600,
+    "hash_str": "2367248181756250660",
+    "url": "https://d.furaffinity.net/art/crimetxt/1676417528/1676417528.crimetxt_2023-02-15_00_18_48.png",
+    "filename": "1676417528.crimetxt_2023-02-15_00_18_48.png",
+    "rating": "adult",
+    "posted_at": "2023-02-14T23:32:00Z",
+    "file_size": 2188273,
+    "sha256": "d488dabd8eb22398a228fb662eb520bb4daaac3a9ab0dc9be8b8c5e1b9522efb",
+    "updated_at": null,
+    "deleted": false,
+    "tags": ["some_tag", "another_tag"]
+  }
+]

diff --git a/test/fixtures/files/fuzzysearch/53068507.json b/test/fixtures/files/fuzzysearch/53068507.json
new file mode 100644
index 00000000..2d2d9b3d
--- /dev/null
+++ b/test/fixtures/files/fuzzysearch/53068507.json
@@ -0,0 +1,25 @@
+[
+  {
+    "id": 53068507,
+    "file_id": 1690504099,
+    "artist": "Meesh",
+    "hash": -5278701664616650000,
+    "hash_str": "-5278701664616649812",
+    "url": "https://d.furaffinity.net/art/meesh/1690504135/1690504099.meesh_pamperingjack3_crop.png",
+    "filename": "1690504099.meesh_pamperingjack3_crop.png",
+    "rating": "adult",
+    "posted_at": "2023-07-28T00:28:00Z",
+    "file_size": 555926,
+    "sha256": "f47e974ef7e72c53fce5a52b28b7c34576eb26af04299155d1cc3912379b0dea",
+    "updated_at": null,
+    "deleted": false,
+    "tags": [
+      "advertisement",
+      "patreon",
+      "female",
+      "preview_limited",
+      "beastars",
+      "juno"
+    ]
+  }
+]
"3809e988c41506211bd2885061646432e021b3810afc9080d244dd63a84a783e", + "updated_at": null, + "deleted": false, + "tags": [ + "female", + "mlp", + "little", + "pony", + "sex", + "lesbian", + "weight", + "gain", + "wg", + "incest", + "princess_celestia", + "princess_luna", + "celestia", + "luna", + "anthro" + ] + } +]