diff --git a/app/lib/scraper/curl_http_performer.rb b/app/lib/scraper/curl_http_performer.rb
index 30a4a6cb..e66e9c84 100644
--- a/app/lib/scraper/curl_http_performer.rb
+++ b/app/lib/scraper/curl_http_performer.rb
@@ -65,10 +65,12 @@ class Scraper::CurlHttpPerformer
     curl = get_curl
     start_at = Time.now
 
-    curl.url = request.uri.normalize.to_s
+    # TODO - normalizing the URL breaks URLs with utf-8 characters
+    # curl.url = request.uri.normalize.to_s
+    curl.url = request.uri.to_s
     curl.follow_location = request.follow_redirects
     request.request_headers.each { |key, value| curl.headers[key.to_s] = value }
-    curl.headers["User-Agent"] = "FurryArchiver/1.0 / dhelta"
+    curl.headers["User-Agent"] = "FurryArchiver/1.0 / telegram: @DeltaNoises"
     case request.http_method
     when Method::Get
       curl.get
diff --git a/rake/posts.rake b/rake/posts.rake
new file mode 100644
index 00000000..3454b102
--- /dev/null
+++ b/rake/posts.rake
@@ -0,0 +1,61 @@
+# typed: true
+# frozen_string_literal: true
+T.bind(self, T.all(Rake::DSL, Object))
+namespace :posts do
+  # Prints one line per candidate post file in the form
+  #   <post param> ::::: <post file param> ::::: <url>
+  # which is the format the redownload task in this namespace parses.
+  desc "Find 404 post files with mismatched normalized URLs"
+  task find_404_post_files_with_mismatched_normalized_urls: :environment do
+    # Candidates: terminal-error files whose URL contains a non-ASCII character.
+    query =
+      Domain::PostFile
+        .where(state: "terminal_error")
+        .where("url_str ~ '[^\\x00-\\x7F]'")
+        .includes(:log_entry, post: :files)
+
+    query.find_each(batch_size: 32, order: :desc) do |post_file|
+      le = post_file.log_entry
+      next if le.nil?
+      next if le.status_code != 404
+      next if post_file.url_str.blank?
+      uri = Addressable::URI.parse(post_file.url_str)
+      # Only URLs that normalization actually rewrites are of interest.
+      next if uri.to_s == uri.normalize.to_s
+
+      post = post_file.post
+      if post.nil?
+        # The redownload task strips each line before splitting it, so an
+        # empty first field would shift the columns; report on stderr and skip.
+        warn "skipping post file #{post_file.to_param}: no associated post"
+        next
+      end
+
+      files = post.files
+      # Skip posts that already have a file in the ok state ...
+      next if files.any? { |file| file.state_ok? }
+      # ... or a file whose URL mentions furarchiver.net (literal match).
+      next if files.any? { |file| file.url_str.to_s.include?("furarchiver.net") }
+
+      puts "#{post.to_param} ::::: #{post_file.to_param} ::::: #{post_file.url_str}"
+    end
+  end
+
+  # Reads the report written by the find task (path taken from the `file`
+  # environment variable), calls state_pending! on each listed post file and
+  # runs ScanFileJob for it inline via perform_now.
+  desc "Redownload 404 post files with mismatched normalized URLs"
+  task redownload_404_post_files_with_mismatched_normalized_urls:
+         :environment do
+    path = ENV["file"].presence || raise("must provide file")
+    File.foreach(path, chomp: true) do |raw_line|
+      line = raw_line.strip
+      next if line.empty? # tolerate blank lines in the report
+
+      _, file_id, _ = line.split(" ::::: ")
+      raise("no file id") if file_id.blank?
+      file = Domain::PostFile.find(file_id)
+      file.state_pending!
+      Domain::Fa::Job::ScanFileJob.perform_now(file:)
+    end
+  end
+end