279 lines
7.8 KiB
Ruby
279 lines
7.8 KiB
Ruby
# typed: strict
|
|
class Scraper::HttpClient
|
|
extend T::Sig
|
|
include HasColorLogger
|
|
|
|
# Client configuration. It is consulted for the allowed domains, rate limits,
# initial cookies, URI mapping/scrubbing and the login step.
sig { returns(Scraper::HttpClientConfig) }
attr_reader :config
|
|
|
|
# Immutable result of a request made through this client: the HTTP status
# code, the response body, and the HttpLogEntry associated with the request
# (either freshly saved or found in the cache).
class Response < T::Struct
  include T::Struct::ActsAsComparable

  const :status_code, Integer
  const :body, String
  const :log_entry, HttpLogEntry
end
|
|
|
|
# Raised when a request targets a domain outside the configured allow-list.
class InvalidURLError < ArgumentError; end
|
|
|
|
# Builds a client around the given configuration and HTTP performer.
#
# Sets up empty per-domain rate-limit bookkeeping, seeds the cookie jar from
# the configuration, and finally lets the configuration run its login step
# against the performer.
#
# @param config [Scraper::HttpClientConfig]
# @param http_performer [Scraper::CurlHttpPerformer] declared untyped in the
#   sig, but checked against Scraper::CurlHttpPerformer by the T.let below.
sig { params(config: Scraper::HttpClientConfig, http_performer: T.untyped).void }
def initialize(config, http_performer)
  @config = T.let(config, Scraper::HttpClientConfig)
  @http_performer = T.let(http_performer, Scraper::CurlHttpPerformer)
  # Keyed by host name; read and written by the rate limiter in do_request.
  @domain_last_requested_at = T.let({}, T::Hash[String, Time])
  @cookie_jar = T.let(HTTP::CookieJar.new, HTTP::CookieJar)

  init_cookie_jar
  @config.do_login(@http_performer)
end
|
|
|
|
# Issues a GET request for +url+.
#
# @param url [String] target URL
# @param caused_by_entry [Object, nil] stored on the resulting log entry as
#   its +caused_by_entry+
# @param use_http_cache [Boolean] when true, a previously logged response for
#   the same URI may be returned instead of performing the request
# @return [Response]
sig { params(url: String, caused_by_entry: T.untyped, use_http_cache: T::Boolean).returns(Response) }
def get(url, caused_by_entry: nil, use_http_cache: false)
  do_request(:get, url, caused_by_entry, use_http_cache)
end
|
|
|
|
# Issues a POST request for +url+. No request body is passed along; only the
# cookie header built by do_request is sent.
#
# @param url [String] target URL
# @param caused_by_entry [Object, nil] stored on the resulting log entry as
#   its +caused_by_entry+
# @param use_http_cache [Boolean] when true, a previously logged response for
#   the same URI may be returned instead of performing the request
# @return [Response]
sig { params(url: String, caused_by_entry: T.untyped, use_http_cache: T::Boolean).returns(Response) }
def post(url, caused_by_entry: nil, use_http_cache: false)
  do_request(:post, url, caused_by_entry, use_http_cache)
end
|
|
|
|
private
|
|
|
|
# Logs a single colorized info line summarising a request.
#
# The line consists of an optional "[CACHED]" tag, the log entry id, the HTTP
# verb and status code, the human-readable response size, the response time
# and total time in milliseconds, and the request URI.
#
# The verb is read from the log entry rather than hard-coded, so POST
# requests are no longer reported as "GET".
#
# @param from_cache [Boolean] whether the response was served from the cache
# @param log_entry [HttpLogEntry] entry describing the request; its
#   status_code, response and response_time_ms must be present (T.must)
# @param total_time_ms [Integer] total time reported for the call
# @return [void]
sig do
  params(from_cache: T::Boolean, log_entry: HttpLogEntry, total_time_ms: Integer).void
end
def print_request_performed_log_line(from_cache:, log_entry:, total_time_ms:)
  response_code = T.must(log_entry.status_code)
  code_text = response_code.to_s
  # 200 is green, any 4xx/5xx is red, everything else is yellow.
  response_code_colorized =
    case response_code
    when 200
      code_text.light_green
    when 400..599
      code_text.red.bold
    else
      code_text.yellow.bold
    end

  response_blob_entry = T.must(log_entry.response)
  response_time_ms = T.must(log_entry.response_time_ms)
  verb_text = log_entry.verb.to_s.upcase

  line_parts = [
    from_cache ? "[" + "CACHED".light_green.bold + "]" : nil,
    "[entry #{log_entry.id.to_s.bold} /",
    "#{verb_text} #{response_code_colorized} /",
    "#{HexUtil.humansize(T.must(response_blob_entry.size_bytes)).bold}]",
    "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
    log_entry.uri.to_s.black,
  ]

  logger.info(line_parts.compact.join(" "))
end
|
|
|
|
# Performs an HTTP request and records it as an HttpLogEntry.
#
# Steps, in order:
# 1. Normalise the URL (default scheme "https", default path "/") and pass it
#    through the configuration's URI mapping.
# 2. Reject hosts that are missing or not matched by the allow-list.
# 3. Optionally return a previously logged response for the same URI.
# 4. Wait as required by the per-host rate limit.
# 5. Perform the request with the cookies from the cookie jar.
# 6. Persist the response body and a log entry (one retry on failure).
# 7. Back off on 403/502/503/524 and on a temporary IP ban.
#
# @param method [Symbol] HTTP verb, e.g. :get or :post
# @param url [String] target URL
# @param caused_by_entry [Object, nil] stored on the new log entry
# @param use_http_cache [Boolean] whether a logged response may be reused
# @return [Response]
# @raise [InvalidURLError] if the host is missing or not allow-listed
# @raise [RuntimeError] if the rate-limit configuration is unrecognised, or
#   if the server reports a temporary IP ban (after sleeping five minutes)
sig do
  params(
    method: Symbol,
    url: String,
    caused_by_entry: T.untyped,
    use_http_cache: T::Boolean,
  ).returns(Response)
end
def do_request(method, url, caused_by_entry, use_http_cache)
  requested_at = Time.now

  uri = Addressable::URI.parse(url)
  uri.scheme = "https" if uri.scheme.blank?
  uri.path = "/" if uri.path.blank?
  uri = @config.map_uri(uri)
  url = uri.to_s

  # A URL without an authority has no host; report it as an invalid URL
  # instead of letting nil reach File.fnmatch? and fail with a TypeError.
  host = uri.host
  if host.nil? || host.empty? || !allowed_domain?(host)
    raise InvalidURLError,
          "domain not permitted: #{host} - allowed: #{config.allowed_domains}"
  end

  if use_http_cache
    cached = cached_response_for(uri, requested_at)
    return cached if cached
  end

  wait_for_rate_limit(host)

  request_headers = {
    "cookie" =>
      HTTP::Cookie.cookie_value(
        @cookie_jar.cookies(Addressable::URI.encode(url)),
      ),
  }

  Scraper::Metrics::HttpClientMetrics.observe_request_start(method, host)

  response = @http_performer.do_request(method, url, request_headers)

  response_code = response.response_code
  response_headers = response.response_headers
  response_time_ms = response.response_time_ms
  response_body = response.body

  # Stamp the moment the response arrived. requested_at is taken before any
  # rate-limit sleep, so deriving this value from it would make the next
  # request to the same host wait too little.
  @domain_last_requested_at[host] = Time.now

  content_type =
    response_headers["Content-Type"] || response_headers["content-type"] ||
      "none/none"

  # Recorded once, before the persistence retry loop, so that a retried save
  # cannot count the same request twice.
  Scraper::Metrics::HttpClientMetrics.observe_request_finish(
    method: method,
    host: host,
    content_type: clean_content_type(content_type),
    status_code: response_code,
    response_time_ms: response_time_ms,
    response_size_bytes: response_body.bytesize,
  )

  retries = 0
  total_time_ms = -1
  begin
    response_blob_entry =
      BlobFile.find_or_initialize_from_contents(response_body) do |blob_file|
        blob_file.content_type = content_type
      end

    scrubbed_uri = @config.scrub_stored_uri(uri)
    log_entry =
      HttpLogEntry.new(
        {
          uri: scrubbed_uri.to_s,
          verb: method,
          content_type: content_type,
          status_code: response_code,
          request_headers:
            HttpLogEntryHeader.find_or_build(headers: request_headers),
          response_headers:
            HttpLogEntryHeader.find_or_build(headers: response_headers),
          response: response_blob_entry,
          response_time_ms: response_time_ms,
          requested_at: requested_at,
          caused_by_entry: caused_by_entry,
          performed_by: "direct",
        },
      )

    total_time_ms = ((Time.now - requested_at) * 1000).to_i

    log_entry.save!
  rescue StandardError
    # Building/saving the entry is attempted twice before giving up.
    retries += 1
    retry if retries < 2
    raise
  end

  print_request_performed_log_line(
    from_cache: false,
    log_entry: log_entry,
    total_time_ms: total_time_ms,
  )

  if response_code == 403 &&
       response_body.include?("This users favorites are hidden")
    logger.warn("403 - favorites are hidden")
  elsif [403, 502, 503, 524].include?(response_code)
    logger.error("HTTP #{response_code} - sleeping to not throttle server")
    sleep 15
  end

  if content_type.include?("text/plain") &&
       response_body.include?("Your IP address has been temporarily banned")
    logger.error("IP temp banned, sleeping & raising...")
    sleep 60 * 5
    # A bare `raise` here has no active exception to re-raise and would
    # surface as "unhandled exception"; raise with an explicit message.
    raise "IP address temporarily banned (host: #{host})"
  end

  Response.new(
    status_code: response_code,
    body: response_body,
    log_entry: log_entry,
  )
end

# Looks up a previously logged response for +uri+ and, when it has both a
# status code and stored body bytes, logs the cache hit and wraps it in a
# Response. Returns nil when nothing usable is found.
#
# NOTE(review): the lookup is keyed by URI only, so the HTTP method is not
# taken into account - confirm this is intended for POST requests.
sig do
  params(uri: T.untyped, requested_at: Time).returns(T.nilable(Response))
end
def cached_response_for(uri, requested_at)
  entry = HttpLogEntry.find_by_uri(uri)
  return nil unless entry

  status_code = entry.status_code
  return nil unless status_code

  body = entry.response&.content_bytes
  return nil unless body

  print_request_performed_log_line(
    from_cache: true,
    log_entry: entry,
    total_time_ms: ((Time.now - requested_at) * 1000).to_i,
  )

  Response.new(status_code: status_code, body: body, log_entry: entry)
end

# Sleeps as long as needed to honour the rate limit configured for +host+.
# The first ratelimit entry whose glob pattern matches the host wins; when
# none matches, no limit applies. Raises if the configured delay is neither
# :none nor a Numeric.
sig { params(host: String).void }
def wait_for_rate_limit(host)
  ratelimit_conf =
    config.ratelimit.find { |conf| File.fnmatch?(conf[0], host) } ||
      ["*", :none]
  ratelimit_delay = ratelimit_conf[1]

  if ratelimit_delay == :none
    logger.debug("no rate limit for #{host}")
    return
  end

  unless ratelimit_delay.is_a?(Numeric)
    raise "unknown ratelimit delay: #{ratelimit_delay}"
  end

  last_requested_at = @domain_last_requested_at[host]
  return unless last_requested_at

  sleep_for = ratelimit_delay - (Time.now - last_requested_at)
  # Only wait (and log) when there is actually time left to wait.
  return unless sleep_for > 0

  logger.info("wait #{sleep_for.round(2)} sec / domain = #{host}")
  sleep sleep_for
end
|
|
|
|
# Tells whether +domain+ matches at least one of the configured allowed-domain
# glob patterns (matched with File.fnmatch?).
#
# @param domain [String] host name to check
# @return [Boolean]
sig { params(domain: String).returns(T::Boolean) }
def allowed_domain?(domain)
  config.allowed_domains.any? { |pattern| File.fnmatch?(pattern, domain) }
end
|
|
|
|
# Fills the cookie jar from the configuration. Each configured entry carries
# a :domain and a list of :cookies (with :name, :value and an optional :path,
# defaulting to ""); every cookie is added with for_domain set to true.
# Does nothing when no cookies are configured.
sig { void }
def init_cookie_jar
  (config.cookies || []).each do |domain_entry|
    cookie_domain = domain_entry[:domain]

    domain_entry[:cookies].each do |cookie_entry|
      attributes = {
        domain: cookie_domain,
        name: cookie_entry[:name],
        value: cookie_entry[:value],
        path: cookie_entry[:path] || "",
        for_domain: true,
      }
      @cookie_jar.add(HTTP::Cookie.new(attributes))
    end
  end
end
|
|
|
|
# Reduces a Content-Type header value to its part before the first ";",
# with surrounding whitespace removed
# (e.g. "text/html; charset=utf-8" becomes "text/html").
#
# @param ct [String] raw Content-Type value
# @return [String]
sig { params(ct: String).returns(String) }
def clean_content_type(ct)
  media_type = ct.split(";").first
  T.must(media_type).strip
end
|
|
end
|