Do not normalize URLs

This commit is contained in:
Dylan Knutson
2025-08-05 18:52:50 +00:00
parent 65b0c519aa
commit 6b8fce7ddc
2 changed files with 48 additions and 2 deletions

View File

@@ -65,10 +65,12 @@ class Scraper::CurlHttpPerformer
curl = get_curl curl = get_curl
start_at = Time.now start_at = Time.now
curl.url = request.uri.normalize.to_s # TODO - normalizing the URL breaks URLs with utf-8 characters
# curl.url = request.uri.normalize.to_s
curl.url = request.uri.to_s
curl.follow_location = request.follow_redirects curl.follow_location = request.follow_redirects
request.request_headers.each { |key, value| curl.headers[key.to_s] = value } request.request_headers.each { |key, value| curl.headers[key.to_s] = value }
curl.headers["User-Agent"] = "FurryArchiver/1.0 / dhelta" curl.headers["User-Agent"] = "FurryArchiver/1.0 / telegram: @DeltaNoises"
case request.http_method case request.http_method
when Method::Get when Method::Get
curl.get curl.get

44
rake/posts.rake Normal file
View File

@@ -0,0 +1,44 @@
# typed: true
# frozen_string_literal: true
T.bind(self, T.all(Rake::DSL, Object))
namespace :posts do
  # Prints one line per candidate post file, in the form
  #   "<post param> ::::: <post file param> ::::: <url>"
  # A post file is reported when all of the following hold:
  #   * its state is "terminal_error" and its URL contains a non-ASCII character
  #   * its log entry recorded a 404
  #   * Addressable normalization would change the URL string
  #   * no file on the same post is already in the ok state
  #   * no file on the same post has a URL mentioning furarchiver.net
  # The output is meant to be saved to a file and fed to the redownload task
  # below via `file=...`.
  desc "Find 404 post files with mismatched normalized URLs"
  task find_404_post_files_with_mismatched_normalized_urls: :environment do
    # The SQL regex match keeps only URLs with at least one character outside
    # the \x00-\x7F (ASCII) range.
    candidates =
      Domain::PostFile
        .where(state: "terminal_error")
        .where("url_str ~ '[^\\x00-\\x7F]'")
        .includes(:log_entry, post: :files)

    candidates.find_each(batch_size: 32, order: :desc) do |post_file|
      log_entry = post_file.log_entry
      next if log_entry.nil? || log_entry.status_code != 404
      next if post_file.url_str.blank?

      # Only URLs that normalization would have rewritten are of interest.
      uri = Addressable::URI.parse(post_file.url_str)
      next if uri.to_s == uri.normalize.to_s

      # The post may be missing; every access below tolerates nil.
      post = post_file.post
      sibling_files = post&.files
      next if sibling_files&.any? { |file| file.state_ok? }

      # Literal substring check: the previous /furarchiver.net/ pattern left
      # the dot unescaped, so it also matched e.g. "furarchiverXnet".
      next if sibling_files&.any? { |file| file.url_str&.include?("furarchiver.net") }

      puts "#{post&.to_param} ::::: #{post_file.to_param} ::::: #{post_file.url_str}"
    end
  end

  # Reads the report produced by the finder task (path given in ENV["file"]),
  # and for each line marks the referenced post file as pending and runs
  # Domain::Fa::Job::ScanFileJob on it via perform_now.
  #
  # Raises "must provide file" when ENV["file"] is unset and "no file id" when
  # a non-blank line has no second field. Blank lines are skipped.
  desc "Redownload 404 post files with mismatched normalized URLs"
  task redownload_404_post_files_with_mismatched_normalized_urls: :environment do
    path = ENV["file"] || raise("must provide file")

    File.foreach(path, chomp: true) do |line|
      next if line.strip.empty?

      # Split the unstripped line: the delimiter is space-padded, so stripping
      # first would eat the leading space of a line whose first field is empty
      # (emitted when a post file has no post) and shift every field by one.
      _post_param, file_id, _url = line.split(" ::::: ", 3)
      file_id = file_id&.strip
      raise("no file id") if file_id.blank?

      file = Domain::PostFile.find(file_id)
      file.state_pending!
      Domain::Fa::Job::ScanFileJob.perform_now(file:)
    end
  end
end