http client creates log entries

This commit is contained in:
2023-02-19 18:57:00 -08:00
parent c28ab1435f
commit d0a09a4615
9 changed files with 150 additions and 83 deletions

View File

@@ -1,11 +1,14 @@
class Scraper::HttpClientBase
Response = Struct.new(
:status,
:status_code,
:request_headers,
:response_headers,
:body,
:log_entry,
)
class InvalidURLError < ArgumentError; end
def initialize(curl_client)
# @cookie_jar
init_cookie_jar
@@ -14,9 +17,6 @@ class Scraper::HttpClientBase
@curl_client = curl_client
end
def setup_faraday_client_hook(client)
end
def cookies
raise NotImplementedError
end
@@ -46,24 +46,46 @@ class Scraper::HttpClientBase
# Issues a request for +url+ through the wrapped curl client and records the
# exchange as a persisted HttpLogEntry.
#
# The URL is normalized before use: a missing scheme becomes "https" and an
# empty path becomes "/". The cookie jar's cookies for the normalized URL are
# sent in the "cookie" request header.
#
# @param method [Object] requested HTTP verb.
#   NOTE(review): this value is neither forwarded to the curl client nor
#   written to the log entry (the verb is always recorded as :get) - confirm
#   that callers only ever issue GET requests.
# @param url [String] the URL to fetch
# @return [Response] status code, request/response headers, body and the
#   saved log entry
# @raise [InvalidURLError] if allowed_domain? rejects the URL's host
# @raise [RuntimeError] if the response has no content-type header
def perform(method, url)
  uri = Addressable::URI.parse(url)
  uri.scheme = "https" if uri.scheme.blank?
  uri.path = "/" if uri.path.blank?
  url = uri.to_s

  # Raise the typed error only; an untyped raise ahead of it would make
  # InvalidURLError unreachable for callers that rescue it.
  unless allowed_domain?(uri.host)
    raise InvalidURLError, "domain not permitted: #{uri.host} - allowed: #{allowed_domains}"
  end

  @curl_client.url = url
  @curl_client.headers["cookie"] = HTTP::Cookie.cookie_value(@cookie_jar.cookies(url))

  # Wall clock for the "when" of the request, monotonic clock for its
  # duration so the measurement is in milliseconds and immune to clock
  # adjustments.
  requested_at = Time.now
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
  @curl_client.perform
  finished_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
  response_time_ms = (finished_ms - started_ms).round

  response_code = @curl_client.response_code
  body_str = @curl_client.body_str
  response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
  request_headers = @curl_client.headers

  # Header names may arrive in either casing; the same resolved value is used
  # for both the log entry and the stored blob.
  content_type = response_headers["Content-Type"] ||
                 response_headers["content-type"] ||
                 raise("no content type provided: #{response_headers}")

  log_entry = HttpLogEntry.new({
    uri: url,
    verb: :get,
    content_type: content_type,
    status_code: response_code,
    request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
    response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
    response: BlobEntry.find_or_build(content_type: content_type, contents: body_str),
    response_time_ms: response_time_ms,
    requested_at: requested_at,
  })
  log_entry.save!

  Response.new(
    response_code,
    request_headers,
    response_headers,
    body_str,
    log_entry
  )
end

View File

@@ -56,7 +56,7 @@ class BlobEntry < ReduxApplicationRecord
file_mime = `file -ib #{file_path}`
raise("error running `file` on #{file_path}: #{file_mime}") if $?.exitstatus != 0
file_mime.chomp!
record = find_or_build(file_mime, file_contents)
record = find_or_build(content_type: file_mime, contents: file_contents)
# guess the name is a sha256 hash
if file_name.length == 64
@@ -76,10 +76,15 @@ class BlobEntry < ReduxApplicationRecord
record
end
def self.find_or_build(content_type, contents)
def self.find_or_build(content_type:, contents:, candidates: [])
sha256 = Digest::SHA256.digest(contents)
BlobEntry.find_by(sha256: sha256) || begin
build_record(content_type: content_type, sha256: sha256, contents: contents)
build_record(
content_type: content_type,
sha256: sha256,
contents: contents,
candidates: candidates,
)
end
end

View File

@@ -89,8 +89,8 @@ class Domain::Fa::Post < ReduxApplicationRecord
response_time_ms: -1,
content_type: blob_entry.content_type,
requested_at: Time.now,
request_headers: ::HttpLogEntryHeader.find_or_create({}),
response_headers: ::HttpLogEntryHeader.find_or_create({}),
request_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
response_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
response: blob_entry,
})

View File

@@ -51,8 +51,8 @@ class HttpLogEntry < ReduxApplicationRecord
content_type: legacy_model.content_type,
status_code: legacy_model.status,
response_time_ms: legacy_model.response_time,
request_headers: ::HttpLogEntryHeader.find_or_build(legacy_model.req_headers),
response_headers: ::HttpLogEntryHeader.find_or_build(legacy_model.res_headers),
request_headers: ::HttpLogEntryHeader.find_or_build(headers: legacy_model.req_headers),
response_headers: ::HttpLogEntryHeader.find_or_build(headers: legacy_model.res_headers),
response: blob_entry,
requested_at: legacy_model.requested_at,
created_at: legacy_model.created_at,

View File

@@ -18,19 +18,19 @@ class HttpLogEntryHeader < ReduxApplicationRecord
)
end
def self.find_or_build(headers)
def self.find_or_build(headers:)
record = build(headers: headers)
find_by(sha256: record.sha256) || record
end
def self.find_or_create(headers)
record = find_or_build(headers)
def self.find_or_create(headers:)
record = find_or_build(headers: headers)
record.save! unless record.persisted?
record
end
def self.scrub_header(headers, key)
headers[key] = "(scrubbed)" if headers[key]
def self.scrub_header(hash, key)
hash[key] = "(scrubbed)" if hash[key]
end
def to_bulk_insert_hash

View File

@@ -2,22 +2,18 @@ class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
test "creates an http log entry" do
client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_curl_easy(
"https://www.furaffinity.net/",
{
response_code: 200,
body_str: "the body",
request_headers: { "req" => "reqheader" },
response_headers_str: "HTTP 200\r\nresp: respheader\r\n\r\n",
}
response_code: 200,
body_str: "a plain text body",
))
response = client.get("https://www.furaffinity.net/")
assert_equal 200, response.status
assert_equal "the body", response.body
assert_equal 200, response.status_code
assert_equal "a plain text body", response.body
expected = { "req" => "reqheader" }
expected = {}
assert_equal expected, response.request_headers
expected = { "resp" => "respheader" }
expected = { "resp" => "respheader", "content-type" => "text/plain" }
assert_equal expected, response.response_headers
end
end

View File

@@ -1,7 +1,42 @@
class Scraper::HttpClientBaseTest < ActiveSupport::TestCase
def test_parse_header_str
test "can parse header string" do
headers = Scraper::HttpClientBase.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
assert_equal ["content-encoding", "gzip"], headers[8]
end
# Minimal concrete subclass of Scraper::HttpClientBase used by the tests in
# this file; it supplies the two hooks the tests rely on.
class Target < Scraper::HttpClientBase
  # Domain patterns this client reports as permitted.
  # NOTE(review): presumably "*" is treated as a subdomain wildcard by the
  # base class's allowed_domain? check - confirm there.
  def allowed_domains
    %w[*.example.com]
  end

  # This test client carries no cookies.
  def cookies
    []
  end
end
test "throws on unallowed domain" do
client = Target.new(TestUtil.mock_curl_easy(""))
assert_raises(Scraper::HttpClientBase::InvalidURLError) do
client.get("https://foobar.com")
end
end
test "creates an http log entry" do
body_str = "the response body " + TestUtil.random_string(16)
client = Target.new(TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str))
# note the lack of trailing slash - http client should set path to '/'
response = client.get("https://www.example.com")
assert_equal 200, response.status_code
log_entry = response.log_entry
assert log_entry.persisted?
assert_equal "text/plain", log_entry.content_type
assert_in_delta Time.now, log_entry.requested_at, 50
assert_equal "get", log_entry.verb
assert_equal 200, log_entry.status_code
assert_equal "text/plain", log_entry.response.content_type
assert_equal body_str, log_entry.response.contents
end
end

View File

@@ -2,10 +2,11 @@ ENV["RAILS_ENV"] ||= "test"
require_relative "../config/environment"
require "rails/test_help"
require "minitest/mock"
require_relative "./test_util"
class ActiveSupport::TestCase
# Run tests in parallel with specified workers
parallelize(workers: :number_of_processors)
# parallelize(workers: :number_of_processors)
# Setup all fixtures in test/fixtures/*.yml for all tests in alphabetical order.
fixtures :all
@@ -29,57 +30,3 @@ class ActiveSupport::TestCase
File.read File.join("test/fixtures/files", path)
end
end
module TestUtil
def self.random_string(length)
(0...length).map { (65 + rand(26)).chr }.join
end
def self.build_http_log_entry
entry = ::HttpLogEntry.new({
uri: "http://example.com/",
verb: :get,
status_code: 200,
response_time_ms: rand(20..100),
request_headers: create_http_headers,
response_headers: create_http_headers,
response: build_blob_entry,
content_type: "text/plain",
requested_at: Time.now,
})
raise entry.errors.full_messages.join(", ") unless entry.valid?
entry
end
def self.build_blob_entry
BlobEntry.find_or_build("text/plain", random_string(1024))
end
def self.create_blob_entry
be = build_blob_entry
be.save!
be
end
def self.create_http_headers
::HttpLogEntryHeader.find_or_create({ test_header_key: "test header value #{random_string(16)}" })
end
def self.build_fa_user
user = ::Domain::Fa::User.new(name: "Test-User-#{random_string(8)}")
user.valid? || raise(user.errors.full_messages.join(", "))
user
end
def self.mock_curl_easy(expected_url, response)
mock = MiniTest::Mock.new
mock.expect(:url=, nil, [expected_url])
mock.expect(:headers, {})
mock.expect(:perform, nil)
mock.expect(:response_code, response[:response_code])
mock.expect(:body_str, response[:body_str])
mock.expect(:header_str, response[:response_headers_str])
mock.expect(:headers, response[:request_headers])
mock
end
end

62
test/test_util.rb Normal file
View File

@@ -0,0 +1,62 @@
# Factories and mocks shared across the test suite.
module TestUtil
  # Returns a random string of +length+ uppercase ASCII letters (A-Z).
  def self.random_string(length)
    Array.new(length) { (65 + rand(26)).chr }.join
  end

  # Builds (without saving) an HttpLogEntry populated with placeholder values.
  # Raises with the validation messages if the built entry is not valid.
  def self.build_http_log_entry
    entry = ::HttpLogEntry.new({
      uri: "http://example.com/",
      verb: :get,
      status_code: 200,
      response_time_ms: rand(20..100),
      request_headers: create_http_headers,
      response_headers: create_http_headers,
      response: build_blob_entry,
      content_type: "text/plain",
      requested_at: Time.now,
    })
    raise entry.errors.full_messages.join(", ") unless entry.valid?
    entry
  end

  # Finds or builds a text/plain BlobEntry holding 1024 random characters.
  def self.build_blob_entry
    BlobEntry.find_or_build(content_type: "text/plain", contents: random_string(1024))
  end

  # Same as build_blob_entry, but saves the record before returning it.
  def self.create_blob_entry
    be = build_blob_entry
    be.save!
    be
  end

  # Finds or creates an HttpLogEntryHeader with one randomized header value.
  def self.create_http_headers
    ::HttpLogEntryHeader.find_or_create(headers: { test_header_key: "test header value #{random_string(16)}" })
  end

  # Builds (without saving) a Domain::Fa::User with a randomized name.
  # Raises with the validation messages if the built user is not valid.
  def self.build_fa_user
    user = ::Domain::Fa::User.new(name: "Test-User-#{random_string(8)}")
    raise user.errors.full_messages.join(", ") unless user.valid?
    user
  end

  # Returns a Minitest mock standing in for the curl client.
  #
  # The mock expects `url=` with +expected_url+, then one call each to
  # `perform`, `response_code`, `body_str` and `header_str`, and two calls to
  # `headers`: the first returns an empty hash, the second returns
  # +request_headers+.
  def self.mock_curl_easy(
    expected_url,
    response_code: 200,
    body_str: "http body",
    request_headers: {},
    response_headers_str: "HTTP 200\r\nresp: respheader\r\ncontent-type: text/plain\r\n\r\n"
  )
    # `Minitest` is the canonical constant; the legacy `MiniTest` alias is no
    # longer defined by default in newer Minitest releases.
    mock = Minitest::Mock.new
    mock.expect(:url=, nil, [expected_url])
    mock.expect(:headers, {})
    mock.expect(:perform, nil)
    mock.expect(:response_code, response_code)
    mock.expect(:body_str, body_str)
    mock.expect(:header_str, response_headers_str)
    mock.expect(:headers, request_headers)
    mock
  end
end