# redux-scraper/app/lib/scraper/http_client.rb
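# Rate-limited, cookie-aware HTTP client used by the scraper. Requests are
# sent through an injected http_performer, and every request/response pair is
# persisted as an HttpLogEntry whose body goes through BlobEntryP.find_or_build
# (which appears to deduplicate stored bodies against recently seen ones).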
class Scraper::HttpClient
  include HasColorLogger

  attr_reader :config

  Response = Struct.new(:status_code, :body, :log_entry)

  class InvalidURLError < ArgumentError
  end

  def initialize(config, http_performer)
    @config = config
    @http_performer = http_performer
    @domain_last_requested_at = {}
    init_cookie_jar
    # TODO: populate AdaptiveCache with domains this client cares about
    @max_cache_size = 8
    @blob_entry_cache =
      Hash.new do |hash, key|
        hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
      end
    logger.level = :info
    @config.do_login(http_performer)
  end

  def close!
    @http_performer.close!
  end
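
  # get and post are thin wrappers around do_request. caused_by_entry, when
  # given, is the HttpLogEntry that triggered this request; it is recorded on
  # the new log entry and its response is offered as a dedup candidate.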
  def get(url, caused_by_entry: nil)
    do_request(:get, url, caused_by_entry)
  end

  def post(url, caused_by_entry: nil)
    do_request(:post, url, caused_by_entry)
  end

  private
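
  # Core request pipeline: normalize the URI, enforce the allowed-domain list,
  # apply the per-domain rate limit, send the request through the performer
  # with cookies from the jar, store the response body via BlobEntryP,
  # persist an HttpLogEntry, then back off on throttling-related status codes
  # before returning a Response struct.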
  def do_request(method, url, caused_by_entry)
    uri = Addressable::URI.parse(url)
    uri.scheme = "https" if uri.scheme.blank?
    uri.path = "/" if uri.path.blank?
    uri = @config.map_uri(uri)
    url = uri.to_s

    unless allowed_domain?(uri.host)
      raise InvalidURLError,
            "domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}"
    end
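
    # config.ratelimit is expected to be a list of [host-glob, delay] pairs;
    # the first glob that matches the host wins, and a delay of :none (also
    # the fallback) disables rate limiting for that host. Otherwise, sleep
    # until `delay` seconds have passed since the previous request to the
    # same host.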
    ratelimit_conf =
      config.ratelimit.find { |conf| File.fnmatch?(conf[0], uri.host) } ||
        ["*", :none]
    ratelimit_delay = ratelimit_conf[1]
    if ratelimit_delay == :none
      logger.debug("no rate limit for #{uri.host}")
    else
      last_requested_at = @domain_last_requested_at[uri.host]
      if last_requested_at
        sleep_for = ratelimit_delay - (Time.now - last_requested_at)
        if sleep_for >= 0
          logger.info("wait #{sleep_for.round(2)} sec / domain = #{uri.host}")
          sleep sleep_for
        end
      end
    end
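
    # Serialize the cookies in the jar that match this URL into a single
    # Cookie request header.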
    request_headers = {
      "cookie" =>
        HTTP::Cookie.cookie_value(
          @cookie_jar.cookies(Addressable::URI.encode(url))
        )
    }
    requested_at = Time.now
    response = @http_performer.do_request(method, url, request_headers)
    response_code = response.response_code
    response_headers = response.response_headers
    response_time_ms = response.response_time_ms
    response_body = response.body
    @domain_last_requested_at[uri.host] = requested_at +
      (response_time_ms.to_f / 1000)

    content_type =
      response_headers["Content-Type"] || response_headers["content-type"] ||
        "none/none"
    cache_key = "#{uri.host}|#{content_type}"
    blob_entry_cache = @blob_entry_cache[cache_key]
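
    # Candidate blobs the new body may be stored against: once the cache for
    # this host/content-type is at capacity, its entries are offered as
    # candidates (skipped ~5% of the time, presumably to let fresh bases in),
    # plus the response that caused this request and that response's base.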
    candidates =
      if blob_entry_cache.at_capacity? && rand(0..100) >= 5
        blob_entry_cache.candidates
      else
        []
      end
    candidates << caused_by_entry.response if caused_by_entry&.response
    candidates << caused_by_entry.response.base if caused_by_entry&.response&.base
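
    # Deduplicate the body into blob storage and build the log row; the whole
    # block is retried once on any StandardError (e.g. a transient save failure).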
    retries = 0
    begin
      response_blob_entry =
        BlobEntryP.find_or_build(
          content_type: content_type,
          contents: response_body,
          candidates: candidates
        )
      scrubbed_uri = @config.scrub_stored_uri(uri)
      log_entry =
        HttpLogEntry.new(
          uri: scrubbed_uri.to_s,
          verb: method,
          content_type: content_type,
          status_code: response_code,
          request_headers:
            HttpLogEntryHeader.find_or_build(headers: request_headers),
          response_headers:
            HttpLogEntryHeader.find_or_build(headers: response_headers),
          response: response_blob_entry,
          response_time_ms: response_time_ms,
          requested_at: requested_at,
          caused_by_entry: caused_by_entry,
          performed_by: @http_performer.name
        )
      log_entry.save!
    rescue StandardError
      retries += 1
      retry if retries < 2
      raise
    end
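
    # If the body was stored against an existing base (base_sha256 is set),
    # reward that base's cache entry so it remains a preferred candidate;
    # otherwise insert the new blob as a candidate base for future responses.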
    if response_blob_entry.base_sha256
      blob_entry_cache.reward(
        HexUtil.bin2hex(response_blob_entry.base_sha256)[0..8]
      )
    else
      blob_entry_cache.insert(
        HexUtil.bin2hex(response_blob_entry.sha256)[0..8],
        response_blob_entry,
        scrubbed_uri
      )
    end

    response_code_colorized =
      if response_code == 200
        response_code.to_s.light_green
      elsif (response_code / 100) == 4 || (response_code / 100) == 5
        response_code.to_s.red.bold
      else
        response_code.to_s.yellow.bold
      end
    total_time_ms = ((Time.now - requested_at) * 1000).round(0)
    logger.info(
      [
        "[entry #{log_entry.id.to_s.bold} /",
        "#{method.to_s.upcase} #{response_code_colorized} /",
        "#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
        "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
        scrubbed_uri.to_s.black
      ].compact.join(" ")
    )

    if [524, 502, 503, 403].include?(response_code)
      logger.error("HTTP #{response_code} - sleeping to not throttle server")
      sleep 15
    end
    if content_type =~ %r{text/plain} &&
         response_body =~ /Your IP address has been temporarily banned/
      logger.error("IP temp banned, sleeping & raising...")
      sleep 60 * 5
      raise "IP address temporarily banned (#{uri.host})"
    end

    Response.new(response_code, response_body, log_entry)
  end
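
  # A host is allowed if it matches any glob pattern in config.allowed_domains.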
  def allowed_domain?(domain)
    config.allowed_domains.any? do |domain_to_check|
      File.fnmatch?(domain_to_check, domain)
    end
  end
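
  # Seed the cookie jar from config.cookies, a per-domain list of cookie
  # name/value/path entries.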
  def init_cookie_jar
    @cookie_jar = HTTP::CookieJar.new
    cookies = config.cookies || []
    cookies.each do |cookie_domain_conf|
      domain = cookie_domain_conf[:domain]
      cookie_domain_conf[:cookies].each do |cookie_key_conf|
        conf = {
          domain: domain,
          name: cookie_key_conf[:name],
          value: cookie_key_conf[:value],
          path: cookie_key_conf[:path] || "",
          for_domain: true
        }
        @cookie_jar.add(HTTP::Cookie.new(conf))
      end
    end
  end
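
  # Strips parameters such as "; charset=utf-8" from a Content-Type value.
  # Appears unused within this file at the moment.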
  def clean_content_type(ct)
    ct.split(";").first.strip
  end
end
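
# A minimal usage sketch (hypothetical config and performer objects; neither
# class below is defined in this file):
#
#   config = Scraper::Config.load("example_site")   # assumed config loader
#   performer = Scraper::CurlPerformer.new          # assumed HttpPerformer
#   client = Scraper::HttpClient.new(config, performer)
#   response = client.get("https://example.com/")
#   puts response.status_code
#   client.close!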