215 lines
6.0 KiB
Ruby
215 lines
6.0 KiB
Ruby
# HTTP client used by scrapers.
#
# Each request made through #get / #post is:
#   * checked against config.allowed_domains,
#   * delayed according to the per-host rules in config.ratelimit,
#   * sent through the injected performer with cookies from the cookie jar,
#   * persisted as an HttpLogEntry together with a BlobEntryP for the body.
#
# `logger` is presumably provided by HasColorLogger - confirm against that
# module.
class Scraper::HttpClient
  include HasColorLogger

  attr_reader :config

  # Value returned to callers of #get / #post.
  Response = Struct.new(:status_code, :body, :log_entry)

  # Raised when the request host does not match config.allowed_domains.
  class InvalidURLError < ArgumentError
  end

  # Raised when the response body announces a temporary IP ban. It subclasses
  # RuntimeError because the code previously used a bare `raise` here, which
  # surfaces as RuntimeError; existing `rescue RuntimeError` callers keep
  # working.
  class TemporarilyBannedError < RuntimeError
  end

  # Status codes after which the client pauses before returning.
  BACKOFF_STATUS_CODES = [524, 502, 503, 403].freeze
  # Seconds to pause after one of BACKOFF_STATUS_CODES.
  BACKOFF_SLEEP_SECONDS = 15
  # Seconds to pause before raising TemporarilyBannedError.
  BAN_SLEEP_SECONDS = 60 * 5
  # Number of attempts made at persisting the blob entry / log entry.
  MAX_PERSIST_ATTEMPTS = 2

  # @param config object supplying allowed_domains, ratelimit, cookies,
  #   map_uri, scrub_stored_uri and do_login (all are called by this class)
  # @param http_performer object responding to do_request(method, url, headers),
  #   name and close!
  def initialize(config, http_performer)
    @config = config
    @http_performer = http_performer
    @domain_last_requested_at = {}
    init_cookie_jar
    # TODO - populate AdaptiveCache with domains this client cares about
    @max_cache_size = 8
    # One AdaptiveCache per "host|content-type" key, created lazily on first use.
    @blob_entry_cache =
      Hash.new do |hash, key|
        hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
      end
    logger.level = :info
    @config.do_login(http_performer)
  end

  # Closes the underlying performer.
  def close!
    @http_performer.close!
  end

  # Performs a GET request.
  #
  # @param url [String] target URL; a missing scheme defaults to https
  # @param caused_by_entry log entry that led to this request, if any
  # @return [Response]
  # @raise [InvalidURLError] when the host is not permitted
  # @raise [TemporarilyBannedError] when the server reports a temporary IP ban
  def get(url, caused_by_entry: nil)
    do_request(:get, url, caused_by_entry)
  end

  # Performs a POST request. No request body is passed to the performer.
  #
  # @param url [String] target URL; a missing scheme defaults to https
  # @param caused_by_entry log entry that led to this request, if any
  # @return [Response]
  # @raise [InvalidURLError] when the host is not permitted
  # @raise [TemporarilyBannedError] when the server reports a temporary IP ban
  def post(url, caused_by_entry: nil)
    do_request(:post, url, caused_by_entry)
  end

  private

  # Shared implementation of #get and #post: validates the host, applies the
  # rate limit, performs the request, stores the log entry and returns a
  # Response.
  def do_request(method, url, caused_by_entry)
    uri = normalize_uri(url)
    url = uri.to_s

    unless allowed_domain?(uri.host)
      raise InvalidURLError,
            "domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}"
    end

    wait_for_rate_limit(uri.host)

    request_headers = { "cookie" => cookie_header_for(url) }
    requested_at = Time.now
    response = @http_performer.do_request(method, url, request_headers)

    response_code = response.response_code
    response_headers = response.response_headers
    response_time_ms = response.response_time_ms
    response_body = response.body
    # Rate limiting is measured from the moment the response finished.
    @domain_last_requested_at[uri.host] =
      requested_at + (response_time_ms.to_f / 1000)

    content_type =
      response_headers["Content-Type"] || response_headers["content-type"] ||
        "none/none"

    blob_entry_cache = @blob_entry_cache["#{uri.host}|#{content_type}"]
    candidates = blob_candidates(blob_entry_cache, caused_by_entry)

    attempts = 0
    begin
      response_blob_entry =
        BlobEntryP.find_or_build(
          content_type: content_type,
          contents: response_body,
          candidates: candidates
        )

      scrubbed_uri = @config.scrub_stored_uri(uri)
      log_entry =
        HttpLogEntry.new(
          {
            uri: scrubbed_uri.to_s,
            verb: method,
            content_type: content_type,
            status_code: response_code,
            request_headers:
              HttpLogEntryHeader.find_or_build(headers: request_headers),
            response_headers:
              HttpLogEntryHeader.find_or_build(headers: response_headers),
            response: response_blob_entry,
            response_time_ms: response_time_ms,
            requested_at: requested_at,
            caused_by_entry: caused_by_entry,
            performed_by: @http_performer.name
          }
        )

      log_entry.save!
    rescue StandardError
      # Persisting is retried once; the second failure propagates.
      attempts += 1
      retry if attempts < MAX_PERSIST_ATTEMPTS
      raise
    end

    record_in_blob_cache(blob_entry_cache, response_blob_entry, scrubbed_uri)

    total_time_ms = ((Time.now - requested_at) * 1000).round(0)
    logger.info(
      [
        "[entry #{log_entry.id.to_s.bold} /",
        # Log the verb actually used (this used to be a hard-coded "GET").
        "#{method.to_s.upcase} #{colorize_status(response_code)} /",
        "#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
        "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
        scrubbed_uri.to_s.black
      ].compact.join(" ")
    )

    if BACKOFF_STATUS_CODES.include?(response_code)
      logger.error("HTTP #{response_code} - sleeping to not throttle server")
      sleep BACKOFF_SLEEP_SECONDS
    end

    if content_type =~ %r{text/plain} &&
         response_body =~ /Your IP address has been temporarily banned/
      logger.error("IP temp banned, sleeping & raising...")
      sleep BAN_SLEEP_SECONDS
      # A bare `raise` here has no active exception to re-raise, so raise an
      # explicit, descriptive error instead.
      raise TemporarilyBannedError, "IP temporarily banned by #{uri.host}"
    end

    Response.new(response_code, response_body, log_entry)
  end

  # Parses +url+, defaults the scheme to https and the path to "/", then lets
  # the config remap the URI.
  def normalize_uri(url)
    uri = Addressable::URI.parse(url)
    uri.scheme = "https" if uri.scheme.blank?
    uri.path = "/" if uri.path.blank?
    @config.map_uri(uri)
  end

  # Sleeps as long as needed to respect the first config.ratelimit rule whose
  # glob pattern matches +host+. Hosts without a matching rule, or with a
  # delay of :none, are never delayed.
  def wait_for_rate_limit(host)
    ratelimit_conf =
      config.ratelimit.find { |conf| File.fnmatch?(conf[0], host) } ||
        ["*", :none]
    ratelimit_delay = ratelimit_conf[1]

    if ratelimit_delay == :none
      logger.debug("no rate limit for #{host}")
      return
    end

    last_requested_at = @domain_last_requested_at[host]
    return unless last_requested_at

    sleep_for = ratelimit_delay - (Time.now - last_requested_at)
    # Only wait (and log) when there is actually time left to wait.
    return unless sleep_for > 0

    logger.info("wait #{sleep_for.round(2)} sec / domain = #{host}")
    sleep sleep_for
  end

  # Builds the value of the "cookie" request header for +url+ from the jar.
  def cookie_header_for(url)
    HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode(url)))
  end

  # Collects the candidates passed to BlobEntryP.find_or_build: the cache's
  # candidates (only when the cache is at capacity, and skipped for 5 of the
  # 101 possible random draws), plus the causing entry's response and that
  # response's base when present.
  #
  # NOTE(review): this appends to the array returned by
  # blob_entry_cache.candidates; confirm that AdaptiveCache returns a fresh
  # array rather than internal state.
  def blob_candidates(blob_entry_cache, caused_by_entry)
    candidates =
      if blob_entry_cache.at_capacity? && rand(0..100) >= 5
        blob_entry_cache.candidates
      else
        []
      end

    caused_by_response = caused_by_entry&.response
    candidates << caused_by_response if caused_by_response
    candidates << caused_by_response.base if caused_by_response&.base
    candidates
  end

  # Updates the cache after a response was stored: rewards the base entry when
  # the blob has a base_sha256, otherwise inserts the blob itself.
  def record_in_blob_cache(blob_entry_cache, blob_entry, scrubbed_uri)
    if blob_entry.base_sha256
      blob_entry_cache.reward(HexUtil.bin2hex(blob_entry.base_sha256)[0..8])
    else
      blob_entry_cache.insert(
        HexUtil.bin2hex(blob_entry.sha256)[0..8],
        blob_entry,
        scrubbed_uri
      )
    end
  end

  # Colors a status code for log output: 200 green, 4xx/5xx red, others yellow.
  def colorize_status(response_code)
    text = response_code.to_s
    if response_code == 200
      text.light_green
    elsif [4, 5].include?(response_code / 100)
      text.red.bold
    else
      text.yellow.bold
    end
  end

  # @param domain [String, nil] host of the request URI
  # @return [Boolean] true when +domain+ matches any glob pattern in
  #   config.allowed_domains; a nil host is never allowed
  def allowed_domain?(domain)
    # File.fnmatch? raises TypeError on nil; treat a missing host as not
    # allowed so the caller raises InvalidURLError instead.
    return false if domain.nil?

    config.allowed_domains.any? do |domain_to_check|
      File.fnmatch?(domain_to_check, domain)
    end
  end

  # Creates the cookie jar and loads it from config.cookies, which is read as
  # a list of { domain:, cookies: [{ name:, value:, path: }] } hashes.
  def init_cookie_jar
    @cookie_jar = HTTP::CookieJar.new
    cookies = config.cookies || []
    cookies.each do |cookie_domain_conf|
      domain = cookie_domain_conf[:domain]
      cookie_domain_conf[:cookies].each do |cookie_key_conf|
        conf = {
          domain: domain,
          name: cookie_key_conf[:name],
          value: cookie_key_conf[:value],
          path: cookie_key_conf[:path] || "",
          for_domain: true
        }
        @cookie_jar.add(HTTP::Cookie.new(conf))
      end
    end
  end

  # Strips parameters from a Content-Type value, e.g.
  # "text/html; charset=utf-8" => "text/html". Returns "" for nil or empty
  # input.
  def clean_content_type(ct)
    ct.to_s.split(";").first.to_s.strip
  end
end
|