# redux-scraper/app/lib/scraper/http_client.rb
# typed: strict
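
# HTTP client for the scraper: wraps a Curl-based performer with
# per-domain rate limiting, cookie handling, an optional cached-response
# lookup via HttpLogEntry, and logging/metrics around every request.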
class Scraper::HttpClient
  extend T::Sig
  include HasColorLogger

  sig { returns(Scraper::HttpClientConfig) }
  attr_reader :config
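
  # Value object returned by #get and #post: the HTTP status code, the
  # response body, and the persisted HttpLogEntry for the exchange.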
  class Response < T::Struct
    include T::Struct::ActsAsComparable

    const :status_code, Integer
    const :body, String
    const :log_entry, HttpLogEntry
  end
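
  # Raised when a request targets a host outside config.allowed_domains.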
  class InvalidURLError < ArgumentError
  end
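
  # +http_performer+ executes the actual HTTP calls; the config is given a
  # chance to log in and to seed the cookie jar before any request is made.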
  sig do
    params(config: Scraper::HttpClientConfig, http_performer: T.untyped).void
  end
  def initialize(config, http_performer)
    @config = T.let(config, Scraper::HttpClientConfig)
    @http_performer = T.let(http_performer, Scraper::CurlHttpPerformer)
    @domain_last_requested_at = T.let({}, T::Hash[String, Time])
    @config.do_login(http_performer)
    @cookie_jar = T.let(HTTP::CookieJar.new, HTTP::CookieJar)
    init_cookie_jar
  end
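
  # Performs a GET request. When +use_http_cache+ is true, a previously
  # recorded HttpLogEntry for the same URI is returned without hitting
  # the network.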
  sig do
    params(
      url: String,
      caused_by_entry: T.untyped,
      use_http_cache: T::Boolean,
    ).returns(Response)
  end
  def get(url, caused_by_entry: nil, use_http_cache: false)
    do_request(
      Scraper::CurlHttpPerformer::Method::Get,
      url,
      caused_by_entry,
      use_http_cache,
    )
  end
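
  # Performs a POST request with the same caching semantics as #get.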
  sig do
    params(
      url: String,
      caused_by_entry: T.untyped,
      use_http_cache: T::Boolean,
    ).returns(Response)
  end
  def post(url, caused_by_entry: nil, use_http_cache: false)
    do_request(
      Scraper::CurlHttpPerformer::Method::Post,
      url,
      caused_by_entry,
      use_http_cache,
    )
  end

  private
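
  # Logs a one-line summary of a completed request: log entry id, verb,
  # colorized status code, response size, and timings.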
  sig do
    params(
      from_cache: T::Boolean,
      log_entry: HttpLogEntry,
      total_time_ms: Integer,
    ).void
  end
  def print_request_performed_log_line(from_cache:, log_entry:, total_time_ms:)
    response_code = T.must(log_entry.status_code)
    response_code_colorized =
      if response_code == 200
        response_code.to_s.light_green
      elsif (response_code / 100) == 4 || (response_code / 100) == 5
        response_code.to_s.red.bold
      else
        response_code.to_s.yellow.bold
      end
    response_blob_entry = T.must(log_entry.response)
    response_time_ms = T.must(log_entry.response_time_ms)
    logger.info(
      [
        from_cache ? "[" + "CACHED".light_green.bold + "]" : nil,
        "[hle #{log_entry.id.to_s.bold} /",
        "#{log_entry.verb.to_s.upcase} #{response_code_colorized} /",
        "#{HexUtil.humansize(T.must(response_blob_entry.size_bytes)).bold}]",
        "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
        log_entry.uri.to_s.black,
      ].compact.join(" "),
    )
  end
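
  # Core request pipeline: normalize and validate the URI, consult the
  # HttpLogEntry cache, apply the per-domain rate limit, perform the
  # request, persist the response, then emit logs and metrics.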
  sig do
    params(
      method: Scraper::CurlHttpPerformer::Method,
      url: String,
      caused_by_entry: T.untyped,
      use_http_cache: T::Boolean,
    ).returns(Response)
  end
  def do_request(method, url, caused_by_entry, use_http_cache)
    requested_at = Time.now
    uri = Addressable::URI.parse(url)
    uri.scheme = "https" if uri.scheme.blank?
    uri.path = "/" if uri.path.blank?
    uri = @config.map_uri(uri)
    url = uri.to_s
    unless allowed_domain?(uri.host)
      raise InvalidURLError.new(
        "domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}",
      )
    end
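
    # Serve from the HttpLogEntry cache when a complete prior response
    # (status code and stored body) exists for this URI.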
    if use_http_cache
      if (cached_response = HttpLogEntry.find_by_uri(uri)) &&
           (status_code = cached_response.status_code) &&
           (body = cached_response.response&.content_bytes)
        print_request_performed_log_line(
          from_cache: true,
          log_entry: cached_response,
          total_time_ms: ((Time.now - requested_at) * 1000).to_i,
        )
        return(
          Response.new(
            status_code: status_code,
            body: body,
            log_entry: cached_response,
          )
        )
      end
    end
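
    # Rate limiting: the first fnmatch pattern in config.ratelimit that
    # matches the host wins; a delay of :none disables throttling.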
    ratelimit_conf =
      config.ratelimit.find { |conf| File.fnmatch? conf[0], uri.host } ||
        ["*", :none]
    ratelimit_delay = ratelimit_conf[1]
    if ratelimit_delay == :none
      logger.debug("no rate limit for #{uri.host}")
    elsif ratelimit_delay.is_a?(Numeric)
      last_requested_at = @domain_last_requested_at[uri.host]
      if last_requested_at
        sleep_for = ratelimit_delay - (Time.now - last_requested_at)
        if sleep_for > 0
          logger.info("wait #{sleep_for.round(2)} sec / domain = #{uri.host}")
          sleep sleep_for
        end
      end
    else
      raise "unknown ratelimit delay: #{ratelimit_delay}"
    end
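
    # Attach cookies matching the normalized URI, then let the config
    # rewrite the request (e.g. to add headers or route through a proxy)
    # before sending it.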
    request_headers = {
      "Cookie" =>
        HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri.normalize.to_s)),
    }
    request =
      Scraper::CurlHttpPerformer::Request.new(
        http_method: method,
        uri: uri,
        request_headers: request_headers,
      )
    request = @config.map_request(request)
    Scraper::Metrics::HttpClientMetrics.observe_request_start(request)
    response = @http_performer.do_request(request)
    response = @config.map_response(response)
    response_code = response.response_code
    response_headers = response.response_headers
    response_time_ms = response.response_time_ms
    response_body = response.body
    performed_by = response.performed_by
    @domain_last_requested_at[uri.host] = requested_at +
      (response_time_ms.to_f / 1000)
    content_type =
      response_headers["Content-Type"] || response_headers["content-type"] ||
        "none/none"
    if method == Scraper::CurlHttpPerformer::Method::Get &&
         uri.host == "furarchiver.net" && response_code == 200
      if response_body.include?(
           "Error 404 - File Not Found -- - Fur Affinity [dot] net",
         )
        logger.warn("fixing buggy fur archiver 404 response (Not Found XML)")
        response_code = 404
        content_type = "text/xml"
      elsif response_body.size == 3072 &&
            Digest::SHA256.hexdigest(response_body) ==
              "fbe9b8727e0ae24baacf63b6553d331c84779e40b743380628a5181e0e9fa2ff"
        logger.warn(
          "fixing buggy fur archiver 404 response (image not found gif)",
        )
        response_code = 404
        content_type = "image/gif"
      end
    end
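
    # Persist the body as a (deduplicated) BlobFile and record the
    # exchange as an HttpLogEntry, retrying the save once on error.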
    retries = 0
    total_time_ms = -1
    begin
      response_blob_entry =
        BlobFile.find_or_initialize_from_contents(response_body) do |blob_file|
          blob_file.content_type = content_type
        end
      scrubbed_uri = @config.scrub_stored_uri(uri)
      log_entry =
        HttpLogEntry.new(
          {
            uri: scrubbed_uri.to_s,
            verb: method.serialize.downcase,
            content_type: content_type,
            status_code: response_code,
            request_headers:
              HttpLogEntryHeader.find_or_build(headers: request_headers),
            response_headers:
              HttpLogEntryHeader.find_or_build(headers: response_headers),
            response: response_blob_entry,
            response_time_ms: response_time_ms,
            requested_at: requested_at,
            caused_by_entry: caused_by_entry,
            performed_by: performed_by,
          },
        )
      total_time_ms = ((Time.now - requested_at) * 1000).to_i
      Scraper::Metrics::HttpClientMetrics.observe_request_finish(
        method: method,
        host: uri.host,
        content_type: clean_content_type(content_type),
        status_code: response_code,
        response_time_ms: response_time_ms,
        response_size_bytes: response_body.bytesize,
      )
      log_entry.save!
    rescue StandardError
      retries += 1
      retry if retries < 2
      raise
    end

    print_request_performed_log_line(
      from_cache: false,
      log_entry: log_entry,
      total_time_ms: total_time_ms,
    )
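
    # Back off when the server signals throttling or an outage; a hidden
    # favorites page is an expected 403 and is only logged.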
    if response_code == 403 &&
         response_body.include?("This users favorites are hidden")
      logger.warn("403 - favorites are hidden")
    elsif response_code == 524 || response_code == 502 ||
          response_code == 503 || response_code == 403
      logger.error("HTTP #{response_code} - sleeping to not throttle server")
      sleep 15
    end
    # A temporary IP ban arrives as a plain-text notice; back off for five
    # minutes, then abort the request.
    if content_type =~ %r{text/plain} &&
         response_body =~ /Your IP address has been temporarily banned/
      logger.error("IP temp banned, sleeping & raising...")
      sleep 60 * 5
      raise "IP address temporarily banned"
    end

    Response.new(
      status_code: response_code,
      body: response_body,
      log_entry: log_entry,
    )
  end
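
  # True if +domain+ matches any fnmatch pattern in config.allowed_domains.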
  sig { params(domain: String).returns(T::Boolean) }
  def allowed_domain?(domain)
    config.allowed_domains.any? do |domain_to_check|
      File.fnmatch? domain_to_check, domain
    end
  end
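
  # Seeds the cookie jar from config.cookies, a per-domain list of
  # name/value (and optional path) entries.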
  sig { void }
  def init_cookie_jar
    cookies = config.cookies || []
    cookies.each do |cookie_domain_conf|
      domain = cookie_domain_conf[:domain]
      cookie_domain_conf[:cookies].each do |cookie_key_conf|
        conf = {
          domain: domain,
          name: cookie_key_conf[:name],
          value: cookie_key_conf[:value],
          path: cookie_key_conf[:path] || "",
          for_domain: true,
        }
        @cookie_jar.add(HTTP::Cookie.new(conf))
      end
    end
  end
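
  # Strips any media type parameters, e.g. "text/html; charset=utf-8"
  # becomes "text/html".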
  sig { params(ct: String).returns(String) }
  def clean_content_type(ct)
    T.must(ct.split(";").first).strip
  end
end