http client creates log entries
This commit is contained in:
@@ -1,11 +1,14 @@
|
||||
class Scraper::HttpClientBase
|
||||
# Value object returned by #perform describing one completed HTTP exchange.
#
# Members, in positional order (this is the order #perform passes them):
#   status_code      - response code reported by the curl client
#   request_headers  - headers read back from the curl client
#   response_headers - parsed response headers (a Hash)
#   body             - response body string
#   log_entry        - the HttpLogEntry saved for this request
#
# The superseded `:status` member is no longer a struct member: keeping it
# next to `:status_code` shifted every positional argument by one slot.
Response = Struct.new(
  :status_code,
  :request_headers,
  :response_headers,
  :body,
  :log_entry
) do
  # Read-only alias so code that still calls `response.status` keeps working.
  alias_method :status, :status_code
end
|
||||
|
||||
# Error type for URLs this client refuses to request; #perform raises it
# with a "domain not permitted" message. It is an ArgumentError subclass.
class InvalidURLError < ArgumentError
end
|
||||
|
||||
def initialize(curl_client)
|
||||
# @cookie_jar
|
||||
init_cookie_jar
|
||||
@@ -14,9 +17,6 @@ class Scraper::HttpClientBase
|
||||
@curl_client = curl_client
|
||||
end
|
||||
|
||||
# No-op hook: the body is empty, so the +client+ argument is ignored and
# nil is returned.
def setup_faraday_client_hook(client)
  nil
end
|
||||
|
||||
# Placeholder accessor: this base implementation always raises, so a
# subclass has to provide its own #cookies.
#
# @raise [NotImplementedError] always
def cookies
  raise(NotImplementedError)
end
|
||||
@@ -46,24 +46,46 @@ class Scraper::HttpClientBase
|
||||
|
||||
# Requests +url+ through the injected curl client and records the exchange
# as a saved HttpLogEntry.
#
# The URL is normalised before use: a blank scheme becomes "https" and a
# blank path becomes "/". The cookie jar's cookies for the URL are sent in
# the "cookie" request header.
#
# @param method the requested HTTP verb.
#   NOTE(review): this body never reads +method+; the log entry always
#   records :get. Confirm whether other verbs are meant to be supported.
# @param url [String] the URL to fetch
# @return [Response] status code, request/response headers, body and the
#   persisted log entry
# @raise [InvalidURLError] when allowed_domain? rejects the URL's host
# @raise [RuntimeError] when the response carries no content-type header
def perform(method, url)
  uri = Addressable::URI.parse(url)
  uri.scheme = "https" if uri.scheme.blank?
  uri.path = "/" if uri.path.blank?
  url = uri.to_s

  unless allowed_domain?(uri.host)
    raise InvalidURLError, "domain not permitted: #{uri.host} - allowed: #{allowed_domains}"
  end

  @curl_client.url = url
  @curl_client.headers["cookie"] = HTTP::Cookie.cookie_value(@cookie_jar.cookies(url))

  requested_at = Time.now
  @curl_client.perform
  # Time - Time yields seconds as a Float; convert to whole milliseconds so
  # the value matches the response_time_ms attribute it is stored in.
  response_time_ms = ((Time.now - requested_at) * 1000).round

  response_code = @curl_client.response_code
  body_str = @curl_client.body_str
  response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
  request_headers = @curl_client.headers
  # Accept either capitalisation of the header name; fail loudly if neither
  # is present because the blob entry below needs a content type.
  content_type = response_headers["Content-Type"] ||
    response_headers["content-type"] ||
    raise("no content type provided: #{response_headers}")

  log_entry = HttpLogEntry.new({
    uri: url,
    verb: :get,
    # Use the resolved value so a "Content-Type" header is not logged as nil.
    content_type: content_type,
    status_code: response_code,
    request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
    response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
    response: BlobEntry.find_or_build(content_type: content_type, contents: body_str),
    response_time_ms: response_time_ms,
    requested_at: requested_at,
  })
  log_entry.save!

  Response.new(
    response_code,
    request_headers,
    response_headers,
    body_str,
    log_entry
  )
end
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ class BlobEntry < ReduxApplicationRecord
|
||||
file_mime = `file -ib #{file_path}`
|
||||
raise("error running `file` on #{file_path}: #{file_mime}") if $?.exitstatus != 0
|
||||
file_mime.chomp!
|
||||
record = find_or_build(file_mime, file_contents)
|
||||
record = find_or_build(content_type: file_mime, contents: file_contents)
|
||||
|
||||
# guess the name is a sha256 hash
|
||||
if file_name.length == 64
|
||||
@@ -76,10 +76,15 @@ class BlobEntry < ReduxApplicationRecord
|
||||
record
|
||||
end
|
||||
|
||||
# Returns the BlobEntry whose sha256 matches the digest of +contents+, or
# the result of build_record when no such row is found.
#
# Keyword arguments only: every call site visible in this change passes
# content_type:/contents:, so the superseded positional form is not kept.
#
# @param content_type [String] passed through to build_record
# @param contents [String] data to hash and store
# @param candidates [Array] forwarded unchanged to build_record; how it is
#   used is not visible from here
# @return [BlobEntry] the existing record, or whatever build_record returns
def self.find_or_build(content_type:, contents:, candidates: [])
  # Digest::SHA256.digest returns the raw digest bytes, not a hex string.
  sha256 = Digest::SHA256.digest(contents)

  BlobEntry.find_by(sha256: sha256) ||
    build_record(
      content_type: content_type,
      sha256: sha256,
      contents: contents,
      candidates: candidates,
    )
end
|
||||
|
||||
|
||||
@@ -89,8 +89,8 @@ class Domain::Fa::Post < ReduxApplicationRecord
|
||||
response_time_ms: -1,
|
||||
content_type: blob_entry.content_type,
|
||||
requested_at: Time.now,
|
||||
request_headers: ::HttpLogEntryHeader.find_or_create({}),
|
||||
response_headers: ::HttpLogEntryHeader.find_or_create({}),
|
||||
request_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
|
||||
response_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
|
||||
response: blob_entry,
|
||||
})
|
||||
|
||||
|
||||
@@ -51,8 +51,8 @@ class HttpLogEntry < ReduxApplicationRecord
|
||||
content_type: legacy_model.content_type,
|
||||
status_code: legacy_model.status,
|
||||
response_time_ms: legacy_model.response_time,
|
||||
request_headers: ::HttpLogEntryHeader.find_or_build(legacy_model.req_headers),
|
||||
response_headers: ::HttpLogEntryHeader.find_or_build(legacy_model.res_headers),
|
||||
request_headers: ::HttpLogEntryHeader.find_or_build(headers: legacy_model.req_headers),
|
||||
response_headers: ::HttpLogEntryHeader.find_or_build(headers: legacy_model.res_headers),
|
||||
response: blob_entry,
|
||||
requested_at: legacy_model.requested_at,
|
||||
created_at: legacy_model.created_at,
|
||||
|
||||
@@ -18,19 +18,19 @@ class HttpLogEntryHeader < ReduxApplicationRecord
|
||||
)
|
||||
end
|
||||
|
||||
# Builds a record for +headers+ and returns the already-stored record with
# the same sha256 when one exists; otherwise returns the freshly built
# record.
#
# Keyword argument only, matching the call sites that pass `headers:`.
#
# @param headers [Hash] header name/value pairs
# @return [HttpLogEntryHeader] existing row or the new record
def self.find_or_build(headers:)
  candidate = build(headers: headers)
  find_by(sha256: candidate.sha256) || candidate
end
|
||||
|
||||
# Like find_or_build, but guarantees the returned record is saved.
#
# @param headers [Hash] header name/value pairs
# @return [HttpLogEntryHeader] a persisted record
# @raise whatever save! raises when the new record cannot be saved
def self.find_or_create(headers:)
  record = find_or_build(headers: headers)
  # An existing row comes back already persisted; only save new records.
  record.save! unless record.persisted?
  record
end
|
||||
|
||||
# Replaces the value stored under +key+ with the literal "(scrubbed)",
# mutating +hash+ in place. Keys that are absent (or whose value is nil or
# false) are left untouched, so scrubbing never adds a key.
#
# @param hash [Hash] headers to modify in place
# @param key [Object] header name to scrub
# @return [String, nil] "(scrubbed)" when a value was replaced, else nil
def self.scrub_header(hash, key)
  hash[key] = "(scrubbed)" if hash[key]
end
|
||||
|
||||
def to_bulk_insert_hash
|
||||
|
||||
@@ -2,22 +2,18 @@ class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
|
||||
# Drives the FurAffinity client against a mocked curl handle and checks
# that the Response exposes the status code, body and both header hashes.
test "creates an http log entry" do
  client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_curl_easy(
    "https://www.furaffinity.net/",
    response_code: 200,
    body_str: "a plain text body",
  ))
  response = client.get("https://www.furaffinity.net/")

  assert_equal 200, response.status_code
  assert_equal "a plain text body", response.body

  # The mock's default request headers are an empty hash.
  expected = {}
  assert_equal expected, response.request_headers

  # These come from the mock's default response header string.
  expected = { "resp" => "respheader", "content-type" => "text/plain" }
  assert_equal expected, response.response_headers
end
|
||||
end
|
||||
|
||||
@@ -1,7 +1,42 @@
|
||||
# Tests for Scraper::HttpClientBase: header-string parsing, the domain
# allow-list check, and creation of an HttpLogEntry per request.
class Scraper::HttpClientBaseTest < ActiveSupport::TestCase
  test "can parse header string" do
    headers = Scraper::HttpClientBase.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
    # The status line is not part of the result: index 0 is the first header.
    assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
    assert_equal ["content-encoding", "gzip"], headers[8]
  end

  # Minimal concrete client: no cookies, and only example.com hosts allowed.
  class Target < Scraper::HttpClientBase
    def cookies
      []
    end

    def allowed_domains
      ["*.example.com"]
    end
  end

  test "throws on unallowed domain" do
    client = Target.new(TestUtil.mock_curl_easy(""))
    assert_raises(Scraper::HttpClientBase::InvalidURLError) do
      client.get("https://foobar.com")
    end
  end

  test "creates an http log entry" do
    # Random suffix keeps the body (and so its blob digest) unique per run.
    body_str = "the response body " + TestUtil.random_string(16)
    client = Target.new(TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str))

    # note the lack of trailing slash - http client should set path to '/'
    response = client.get("https://www.example.com")
    assert_equal 200, response.status_code

    log_entry = response.log_entry
    assert log_entry.persisted?
    assert_equal "text/plain", log_entry.content_type
    assert_in_delta Time.now, log_entry.requested_at, 50
    assert_equal "get", log_entry.verb
    assert_equal 200, log_entry.status_code
    assert_equal "text/plain", log_entry.response.content_type
    assert_equal body_str, log_entry.response.contents
  end
end
|
||||
|
||||
@@ -2,10 +2,11 @@ ENV["RAILS_ENV"] ||= "test"
|
||||
require_relative "../config/environment"
|
||||
require "rails/test_help"
|
||||
require "minitest/mock"
|
||||
require_relative "./test_util"
|
||||
|
||||
class ActiveSupport::TestCase
|
||||
# Run tests in parallel with specified workers
|
||||
parallelize(workers: :number_of_processors)
|
||||
# parallelize(workers: :number_of_processors)
|
||||
|
||||
# Setup all fixtures in test/fixtures/*.yml for all tests in alphabetical order.
|
||||
fixtures :all
|
||||
@@ -29,57 +30,3 @@ class ActiveSupport::TestCase
|
||||
File.read File.join("test/fixtures/files", path)
|
||||
end
|
||||
end
|
||||
|
||||
# Shared factories and mocks for the test suite.
module TestUtil
  # Returns +length+ random uppercase ASCII letters (A-Z).
  def self.random_string(length)
    (0...length).map { (65 + rand(26)).chr }.join
  end

  # Builds (does not save) a valid HttpLogEntry for http://example.com/.
  # Raises with the validation messages if the entry is not valid.
  def self.build_http_log_entry
    entry = ::HttpLogEntry.new({
      uri: "http://example.com/",
      verb: :get,
      status_code: 200,
      response_time_ms: rand(20..100),
      request_headers: create_http_headers,
      response_headers: create_http_headers,
      response: build_blob_entry,
      content_type: "text/plain",
      requested_at: Time.now,
    })
    raise entry.errors.full_messages.join(", ") unless entry.valid?

    entry
  end

  # Returns a text/plain BlobEntry with 1024 random characters of content.
  # Uses the keyword form of BlobEntry.find_or_build.
  def self.build_blob_entry
    BlobEntry.find_or_build(content_type: "text/plain", contents: random_string(1024))
  end

  # Same as build_blob_entry, but saved.
  def self.create_blob_entry
    be = build_blob_entry
    be.save!
    be
  end

  # Returns a persisted HttpLogEntryHeader with one randomised test header.
  # Uses the keyword form of HttpLogEntryHeader.find_or_create.
  def self.create_http_headers
    ::HttpLogEntryHeader.find_or_create(headers: { test_header_key: "test header value #{random_string(16)}" })
  end

  # Builds (does not save) a valid Domain::Fa::User with a random name.
  def self.build_fa_user
    user = ::Domain::Fa::User.new(name: "Test-User-#{random_string(8)}")
    user.valid? || raise(user.errors.full_messages.join(", "))
    user
  end

  # Returns a mock curl handle that expects exactly the calls one request
  # makes: url=, headers, perform, response_code, body_str, header_str, and
  # headers again.
  #
  # +response+ is a Hash with optional keys :response_code, :body_str,
  # :request_headers and :response_headers_str; omitted keys fall back to a
  # 200 text/plain response. It may be passed as a literal hash or as
  # trailing `key: value` pairs, and may be left out entirely.
  def self.mock_curl_easy(expected_url, response = {})
    response = {
      response_code: 200,
      body_str: "http body",
      request_headers: {},
      response_headers_str: "HTTP 200\r\nresp: respheader\r\ncontent-type: text/plain\r\n\r\n",
    }.merge(response)

    # `Minitest` is the canonical constant; `MiniTest` is a legacy alias.
    mock = Minitest::Mock.new

    mock.expect(:url=, nil, [expected_url])
    # First #headers call: the hash the client writes its cookie header into.
    mock.expect(:headers, {})
    mock.expect(:perform, nil)
    mock.expect(:response_code, response[:response_code])
    mock.expect(:body_str, response[:body_str])
    mock.expect(:header_str, response[:response_headers_str])
    # Second #headers call: the request headers read back after the request.
    mock.expect(:headers, response[:request_headers])

    mock
  end
end
|
||||
|
||||
62
test/test_util.rb
Normal file
62
test/test_util.rb
Normal file
@@ -0,0 +1,62 @@
|
||||
# Shared factories and mocks for the test suite.
module TestUtil
  # Returns +length+ random uppercase ASCII letters (A-Z).
  def self.random_string(length)
    (0...length).map { (65 + rand(26)).chr }.join
  end

  # Builds (does not save) a valid HttpLogEntry for http://example.com/.
  # Raises with the validation messages if the entry is not valid.
  def self.build_http_log_entry
    entry = ::HttpLogEntry.new({
      uri: "http://example.com/",
      verb: :get,
      status_code: 200,
      response_time_ms: rand(20..100),
      request_headers: create_http_headers,
      response_headers: create_http_headers,
      response: build_blob_entry,
      content_type: "text/plain",
      requested_at: Time.now,
    })
    raise entry.errors.full_messages.join(", ") unless entry.valid?

    entry
  end

  # Returns a text/plain BlobEntry with 1024 random characters of content.
  def self.build_blob_entry
    BlobEntry.find_or_build(content_type: "text/plain", contents: random_string(1024))
  end

  # Same as build_blob_entry, but saved.
  def self.create_blob_entry
    be = build_blob_entry
    be.save!
    be
  end

  # Returns a persisted HttpLogEntryHeader with one randomised test header.
  def self.create_http_headers
    ::HttpLogEntryHeader.find_or_create(headers: { test_header_key: "test header value #{random_string(16)}" })
  end

  # Builds (does not save) a valid Domain::Fa::User with a random name.
  def self.build_fa_user
    user = ::Domain::Fa::User.new(name: "Test-User-#{random_string(8)}")
    user.valid? || raise(user.errors.full_messages.join(", "))
    user
  end

  # Returns a mock curl handle that expects exactly the calls one request
  # makes: url=, headers, perform, response_code, body_str, header_str, and
  # headers again. The keyword defaults describe a 200 text/plain response.
  def self.mock_curl_easy(
    expected_url,
    response_code: 200,
    body_str: "http body",
    request_headers: {},
    response_headers_str: "HTTP 200\r\nresp: respheader\r\ncontent-type: text/plain\r\n\r\n"
  )
    # `Minitest` is the canonical constant; the `MiniTest` alias is no longer
    # defined by default in newer Minitest releases.
    mock = Minitest::Mock.new

    mock.expect(:url=, nil, [expected_url])
    # First #headers call: the hash the client writes its cookie header into.
    mock.expect(:headers, {})
    mock.expect(:perform, nil)
    mock.expect(:response_code, response_code)
    mock.expect(:body_str, body_str)
    mock.expect(:header_str, response_headers_str)
    # Second #headers call: the request headers read back after the request.
    mock.expect(:headers, request_headers)

    mock
  end
end
|
||||
Reference in New Issue
Block a user