add performed_by to HttpLogEntry (HLE)

2023-02-28 23:16:16 -08:00
parent d9a1ce3e74
commit a9ef897c42
26 changed files with 423 additions and 126 deletions

View File

@@ -103,6 +103,7 @@ gem "delayed_job_web"
gem "colorize"
gem "daemons"
gem "delayed_job_worker_pool"
gem "ripcord"
# gem 'cli-ui'
# gem "paper_trail"
# gem "paper_trail-hashdiff"

View File

@@ -238,6 +238,7 @@ GEM
reline (0.3.2)
io-console (~> 0.5)
rexml (3.2.5)
ripcord (2.0.0)
ruby-prof (1.4.5)
ruby-prof-speedscope (0.3.0)
ruby-prof (~> 1.0)
@@ -324,6 +325,7 @@ DEPENDENCIES
rack-mini-profiler
rails (~> 7.0.4, >= 7.0.4.2)
rb-bsdiff!
ripcord
ruby-prof
ruby-prof-speedscope
selenium-webdriver

View File

@@ -11,6 +11,7 @@ Rake.application.rake_require "log_entry"
Rake.application.rake_require "worker"
Rake.application.rake_require "fa"
Rake.application.rake_require "e621"
task :set_ar_stdout => :environment do
ActiveRecord::Base.logger = Logger.new(STDOUT)

View File

@@ -1,4 +1,6 @@
class Domain::Fa::ApiController < ApplicationController
include ActionController::Live
skip_before_action :verify_authenticity_token,
only: %i[ enqueue_objects ]
@@ -109,6 +111,10 @@ class Domain::Fa::ApiController < ApplicationController
depths: queue_depths,
},
}
response.stream.close
perform_enqueues!
ensure
response.stream.close
end
private
@@ -150,7 +156,16 @@ class Domain::Fa::ApiController < ApplicationController
def enqueue_manual(klass, args, priority, queue = "manual")
@@enqueue_deduper ||= Set.new
return unless @@enqueue_deduper.add?([klass, args, priority])
klass.set(priority: priority, queue: queue).perform_later(args)
@enqueue_manual ||= []
@enqueue_manual << [klass, args, priority, queue]
end
def perform_enqueues!
while job = (@enqueue_manual || []).shift
klass, args, priority, queue = job
klass.set(priority: priority, queue: queue).perform_later(args)
end
end
def time_ago_or_never(time)
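Note: the controller now buffers manual enqueues and drains them only after the live stream has been closed, so perform_later calls can no longer delay the streamed response. A condensed sketch of the intended call order (the job class and its arguments here are placeholders, not taken from this commit):
enqueue_manual(Domain::Fa::Job::ScanPostJob, { fa_id: 123 }, -10)  # buffered only, nothing enqueued yet
response.stream.write(payload.to_json)
response.stream.close    # client sees the streamed response first
perform_enqueues!        # buffered jobs are handed to Delayed::Job afterwards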

View File

@@ -3,8 +3,9 @@ class Domain::Fa::Job::FaJobBase < Scraper::JobBase
def self.build_http_client
@@fa_base_http_client ||= begin
puts "Building FA HTTP client"
Domain::Fa::Scraper::HttpClient.new(Curl::Easy.new)
proxy = ENV["proxy"]
puts "Building FA HTTP client (proxy: #{proxy})"
Domain::Fa::Scraper::HttpClient.new(proxy || Scraper::CurlHttpPerformer.new)
end
end
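With this change the FA jobs pick their HTTP path from the proxy environment variable: a proxy name string makes the client build a ProxyHttpPerformer, anything else falls back to direct curl. A minimal sketch of the two cases, assuming the proxy and cookie keys added later in this commit:
Domain::Fa::Scraper::HttpClient.new("proxy-1")                         # RPC proxy + the "proxy-1" cookie jar
Domain::Fa::Scraper::HttpClient.new(Scraper::CurlHttpPerformer.new)    # direct curl + the "direct" cookie jar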

View File

@@ -66,6 +66,10 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::FaJobBase
submission = page.submission
unless submission.id.to_i == @post.fa_id
raise("id mismatch: #{submission.id} != #{@post.fa_id}")
end
@post.last_submission_page = @submission_entry
@post.title = submission.title
@post.creator = Domain::Fa::User.find_or_build_from_submission_parser(submission)

View File

@@ -2,12 +2,17 @@ class Domain::Fa::Scraper::HttpClient < Scraper::BaseHttpClient
DEFAULT_ALLOWED_DOMAINS = [
"*.furaffinity.net",
"*.facdn.net",
"ipinfo.io",
]
def initialize(curl_client, cookies: nil, allowed_domains: nil)
@cookies = cookies || Rails.application.config.x.cookies.fa
@allowed_domains = allowed_domains || DEFAULT_ALLOWED_DOMAINS
super(curl_client)
def initialize(http_performer_or_proxy)
if http_performer_or_proxy.is_a?(String)
@cookies = Rails.application.config.x.cookies.fa[http_performer_or_proxy]
else
@cookies = Rails.application.config.x.cookies.fa["direct"]
end
super(http_performer_or_proxy)
end
def cookies
@@ -23,6 +28,6 @@ class Domain::Fa::Scraper::HttpClient < Scraper::BaseHttpClient
end
def allowed_domains
@allowed_domains
DEFAULT_ALLOWED_DOMAINS
end
end

View File

@@ -1,8 +1,6 @@
class Scraper::BaseHttpClient
Response = Struct.new(
:status_code,
:request_headers,
:response_headers,
:body,
:log_entry,
)
@@ -11,7 +9,7 @@ class Scraper::BaseHttpClient
class InvalidURLError < ArgumentError; end
def initialize(curl_client = nil)
def initialize(http_performer_or_proxy)
cookies || raise("Subclass must define non-nil #cookies")
allowed_domains || raise("Subclass must define non-nil #allowed_domains")
@@ -22,7 +20,12 @@ class Scraper::BaseHttpClient
@blob_entry_cache = Hash.new do |hash, key|
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
end
@curl_client = curl_client
if http_performer_or_proxy.is_a?(String)
@http_performer = Scraper::ProxyHttpPerformer.new(http_performer_or_proxy)
else
@http_performer = http_performer_or_proxy
end
@domain_last_requested_at = {}
@logger ||= ColorLogger.make($stdout, self)
@logger.level = :info
@@ -50,15 +53,6 @@ class Scraper::BaseHttpClient
get_impl(url, caused_by_entry)
end
def self.parse_header_str(header_str)
header_str.split("\r\n").each_with_index.map do |str, idx|
next nil if idx == 0
idx = str.index(": ")
next nil unless idx
[str[0...idx], str[idx + 2..-1]]
end.reject(&:nil?)
end
private
def get_impl(url, caused_by_entry)
@@ -71,9 +65,6 @@ class Scraper::BaseHttpClient
raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{allowed_domains}")
end
@curl_client.url = url
@curl_client.headers["cookie"] = HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url))
ratelimit_conf = ratelimit.find do |conf|
File.fnmatch? conf[0], uri.host
end || ["*", :none]
@@ -86,21 +77,24 @@ class Scraper::BaseHttpClient
if last_requested_at
sleep_for = ratelimit_delay - (Time.now - last_requested_at)
if sleep_for >= 0
logger.debug("wait #{sleep_for.round(2)} sec / domain = #{uri.host}")
logger.info("wait #{sleep_for.round(2)} sec / domain = #{uri.host}")
sleep sleep_for
end
end
end
request_headers = {
"cookie" => HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url)),
}
requested_at = Time.now
@curl_client.perform
response_time_ms = ((Time.now - requested_at) * 1000).to_i
response = @http_performer.get(url, request_headers)
@domain_last_requested_at[uri.host] = Time.now
response_code = @curl_client.response_code
body_str = @curl_client.body_str
response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
request_headers = @curl_client.headers
response_code = response.response_code
response_headers = response.response_headers
response_time_ms = response.response_time_ms
response_body = response.body
content_type = response_headers["Content-Type"] ||
response_headers["content-type"] ||
raise("no content type provided: #{response_headers}")
@@ -115,32 +109,30 @@ class Scraper::BaseHttpClient
candidates << caused_by_entry.response if caused_by_entry&.response
candidates << caused_by_entry.response.base if caused_by_entry&.response&.base
response_blob_entry = BlobEntry.find_or_build(
content_type: content_type,
contents: body_str,
candidates: candidates,
)
log_entry = HttpLogEntry.new({
uri: url,
verb: :get,
content_type: response_headers["content-type"],
status_code: response_code,
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
response: response_blob_entry,
response_time_ms: response_time_ms,
requested_at: requested_at,
caused_by_entry: caused_by_entry,
})
retries = 0
begin
response_blob_entry = BlobEntry.find_or_build(
content_type: content_type,
contents: response_body,
candidates: candidates,
)
log_entry = HttpLogEntry.new({
uri: url,
verb: :get,
content_type: response_headers["content-type"],
status_code: response_code,
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
response: response_blob_entry,
response_time_ms: response_time_ms,
requested_at: requested_at,
caused_by_entry: caused_by_entry,
performed_by: @http_performer.name,
})
log_entry.save!
rescue
log_entry.errors.clear
log_entry.request_headers.errors.clear
log_entry.response_headers.errors.clear
retries += 1
retry if retries < 2
raise
@@ -174,9 +166,7 @@ class Scraper::BaseHttpClient
Response.new(
response_code,
request_headers,
response_headers,
body_str,
response_body,
log_entry
)
end
@@ -196,7 +186,7 @@ class Scraper::BaseHttpClient
domain: domain,
name: cookie_key_conf[:name],
value: cookie_key_conf[:value],
path: cookie_key_conf[:path],
path: cookie_key_conf[:path] || "",
for_domain: true,
}
@cookie_jar.add(HTTP::Cookie.new(conf))

View File

@@ -0,0 +1,41 @@
class Scraper::CurlHttpPerformer < Scraper::HttpPerformer
def initialize(curl = nil)
@curl = curl || Curl::Easy.new
end
def name
"direct"
end
def get(url, request_headers)
start_at = Time.now
@curl.url = url
@curl.headers = {}
request_headers.each do |key, value|
@curl.headers[key.to_s] = value
end
@curl.perform
response_time_ms = ((Time.now - start_at) * 1000).to_i
response_code = @curl.response_code
body_str = @curl.body_str
response_headers = self.class.parse_header_str(@curl.header_str).to_h
Scraper::HttpPerformer::Response.new(
response_code,
response_headers,
response_time_ms,
body_str
)
end
private
def self.parse_header_str(header_str)
header_str.split("\r\n").each_with_index.map do |str, idx|
next nil if idx == 0
idx = str.index(": ")
next nil unless idx
[str[0...idx], str[idx + 2..-1]]
end.reject(&:nil?)
end
end
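For reference, a minimal usage sketch of the new performer (assumes the curb gem and network access; the URL and cookie values are illustrative):
performer = Scraper::CurlHttpPerformer.new
resp = performer.get("https://www.furaffinity.net/", { "cookie" => "a=...; b=..." })
resp.response_code      # e.g. 200
resp.response_headers   # parsed from curl's raw header string, e.g. { "content-type" => "text/html; charset=UTF-8" }
resp.response_time_ms   # wall-clock duration of the request in milliseconds
resp.body               # raw response body string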

View File

@@ -0,0 +1,12 @@
class Scraper::HttpPerformer
Response = Struct.new(
:response_code,
:response_headers,
:response_time_ms,
:body
)
def get(url, request_headers)
raise NotImplementedError, "implement in subclass"
end
end
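The base class only fixes the contract: #get(url, request_headers) returns a Response struct, and #name identifies the performer for HttpLogEntry#performed_by. A hypothetical Net::HTTP-backed performer (not part of this commit; its name would also need adding to the performed_by enum) could look like:
require "net/http"

class Scraper::NetHttpPerformer < Scraper::HttpPerformer
  def name
    "net_http"
  end

  def get(url, request_headers)
    start_at = Time.now
    uri = URI(url)
    res = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.get(uri.request_uri, request_headers)
    end
    Scraper::HttpPerformer::Response.new(
      res.code.to_i,
      res.to_hash.transform_values(&:first),   # Net::HTTP exposes header values as arrays
      ((Time.now - start_at) * 1000).to_i,
      res.body
    )
  end
end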

View File

@@ -0,0 +1,24 @@
require "base64"
class Scraper::ProxyHttpPerformer < Scraper::HttpPerformer
def initialize(proxy)
@name = proxy
proxy_url = Rails.application.config.x.proxies[proxy]
@client = Ripcord::Client.new(proxy_url)
end
def name
@name
end
def get(uri, headers)
send_token = SecureRandom.uuid
rpc_result = @client.call("get", [uri, headers, send_token])
raise "rpc error: #{rpc_result.error.inspect}" unless rpc_result.successful?
recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(rpc_result.result))
if send_token != recv_token
raise "token mismatch: #{send_token} != #{recv_token}"
end
Scraper::HttpPerformer::Response.new(code, headers, time_ms, body)
end
end
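The proxy service itself is not part of this commit; from the client code above, whatever answers on config.x.proxies[name] is expected to expose a JSON-RPC "get" method taking [url, headers, token] and packing its result like this (a sketch of the wire format only, inferred from the unpacking code):
payload = Base64.encode64(Marshal.dump([token, status_code, response_headers, response_time_ms, body]))
# the client then unpacks and verifies it:
recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(payload))
raise "token mismatch: #{send_token} != #{recv_token}" if recv_token != send_token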

View File

@@ -127,7 +127,9 @@ class BlobEntry < ReduxApplicationRecord
record.contents = contents
end
raise RuntimeError.new("invariant!") if record.contents != contents
if record.contents != contents
raise RuntimeError.new("invariant!")
end
record
end

View File

@@ -88,8 +88,8 @@ class Domain::Fa::Post < ReduxApplicationRecord
def scanned_at=(time)
unless time.is_a?(Time)
raise ArgumentError("time must be Time, was #{time.class}")
end
self.state_detail["scanned_at"] = time.to_i
end unless time.nil?
self.state_detail["scanned_at"] = time&.to_i
end
def last_submission_page=(log_entry)
@@ -160,6 +160,7 @@ class Domain::Fa::Post < ReduxApplicationRecord
request_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
response_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
response: blob_entry,
performed_by: "legacy",
})
raise("mismatch") unless log_entry.response == blob_entry

View File

@@ -1,7 +1,8 @@
class HttpLogEntry < ReduxApplicationRecord
include ImmutableModel
enum verb: %i[get post]
enum verb: %i[get post], _prefix: true
enum performed_by: %i[direct legacy proxy-1], _prefix: true
belongs_to :response,
foreign_key: :response_sha256,
@@ -25,12 +26,12 @@ class HttpLogEntry < ReduxApplicationRecord
class_name: "::HttpLogEntry",
foreign_key: :caused_by_id
validates_inclusion_of(:verb, in: ::HttpLogEntry.verbs.keys)
validates_presence_of(
:uri_scheme,
:uri_host,
:uri_path,
:verb,
:performed_by,
:status_code,
:response_time_ms,
:content_type,
@@ -68,6 +69,7 @@ class HttpLogEntry < ReduxApplicationRecord
requested_at: legacy_model.requested_at,
created_at: legacy_model.created_at,
updated_at: legacy_model.updated_at,
performed_by: "legacy",
})
return record
end
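Because both enums are declared with _prefix: true, the generated helpers are namespaced by the column name and cannot collide with methods like #get. A quick sketch (other required attributes elided, so this record would not pass validation):
entry = HttpLogEntry.new(verb: :get, performed_by: "direct")
entry.verb_get?              # => true
entry.performed_by_direct?   # => true
HttpLogEntry.performed_by_legacy   # scope over entries recorded by the legacy importer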

View File

@@ -23,5 +23,8 @@ module ReduxScraper
# config.time_zone = "Central Time (US & Canada)"
# config.eager_load_paths << Rails.root.join("extras")
config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
config.x.proxies = {
"proxy-1" => "http://proxy-1.local:8999",
}
end
end

View File

@@ -1,42 +1,59 @@
ddwhatnow: &ddwhatnow
- domain: .furaffinity.net
cookies:
- name: a
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
path: /
- name: b
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d4247fd1b8b755a4064f2e030dec919f
path: /
vipvillageworker: &vipvillageworker
- domain: .furaffinity.net
cookies:
- name: a
value: 5a9437e0-ded0-4b8d-8255-ff0198664025
- name: b
value: 2d7da6a8-959f-407e-abd1-af8f51c110be
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d8ebf7f9508511d8ba7307e3cb36f03c
path: /
- domain: www.furaffinity.net
cookies:
- name: sz
value: 1905x976
path: /
testcookies: &testcookies
- domain: .furaffinity.net
cookies:
- name: a
value: avalue
path: /
- name: b
value: bvalue
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: oaidvalue
path: /
development:
- domain: .furaffinity.net
cookies:
- name: a
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
path: /
- name: b
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d4247fd1b8b755a4064f2e030dec919f
path: /
direct: *ddwhatnow
proxy-1: *vipvillageworker
production:
- domain: .furaffinity.net
cookies:
- name: a
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
path: /
- name: b
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d4247fd1b8b755a4064f2e030dec919f
path: /
direct: *ddwhatnow
proxy-1: *vipvillageworker
test:
- domain: .furaffinity.net
cookies:
- name: a
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
path: /
- name: b
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d4247fd1b8b755a4064f2e030dec919f
path: /
direct: *testcookies
proxy-1: *testcookies
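The cookie file is now keyed by performer name instead of holding a single jar per environment, which is what lets HttpClient look up config.x.cookies.fa["direct"] or config.x.cookies.fa[proxy_name] above. A sketch of how it resolves, per the anchors in this file:
Rails.application.config.x.cookies.fa["direct"]    # development/production => the ddwhatnow jar
Rails.application.config.x.cookies.fa["proxy-1"]   # development/production => the vipvillageworker jar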

View File

@@ -10,10 +10,13 @@ end
# This runs in the worker processes after it has been forked
on_worker_boot do |worker_info|
ActiveJob::Base.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new(STDOUT)
Delayed::Worker.logger.level = :error
Rails.logger = Logger.new(STDOUT)
puts "Worker #{Process.pid} started"
sleep rand(1..5)
# Reconnect to the database
ActiveRecord::Base.establish_connection
end
@@ -34,7 +37,7 @@ end
[:default, 1],
[:fa_user_page, 2],
[:fa_user_gallery, 1],
[:fa_post, 5],
[:fa_post, 4],
].each do |queue, workers|
worker_group(queue) do |g|
g.read_ahead = 8

View File

@@ -10,7 +10,8 @@ end
# This runs in the worker processes after it has been forked
on_worker_boot do |worker_info|
ActiveJob::Base.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new(STDOUT)
Delayed::Worker.logger.level = :error
Rails.logger = Logger.new(STDOUT)
puts "Worker #{Process.pid} started"

View File

@@ -10,7 +10,8 @@ end
# This runs in the worker processes after it has been forked
on_worker_boot do |worker_info|
ActiveJob::Base.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new(STDOUT)
Delayed::Worker.logger.level = :error
Rails.logger = Logger.new(STDOUT)
puts "Worker #{Process.pid} started"
@@ -33,7 +34,7 @@ end
worker_group(:static_file) do |g|
g.read_ahead = 1
g.sleep_delay = 5
g.workers = 8
g.workers = 2
g.queues = ["static_file"]
end

rake/e621.rake (new file, 24 lines)
View File

@@ -0,0 +1,24 @@
namespace :e621 do
desc "import legacy e621 posts"
task :import_legacy => :environment do |t, args|
batch_size = args[:batch_size]&.to_i || ENV["batch_size"]&.to_i
forks = args[:forks]&.to_i || ENV["forks"]&.to_i
start_at = args[:start_at]&.to_i || ENV["start_at"]&.to_i
LegacyImport::E621LegacyPostImporter.
new(batch_size: batch_size, forks: forks, start_at: start_at).
run
end
desc "import e621 data from csv"
task :import_csv => :environment do |t, args|
batch_size = ENV["batch_size"]&.to_i
forks = ENV["forks"]&.to_i
start_at = ENV["start_at"]&.to_i
posts_csv_path = ENV["posts_csv_path"] || raise("must supply `posts_csv_path`")
LegacyImport::E621CsvPostImporter.
new(batch_size: batch_size, forks: forks, start_at: start_at).
run
end
end
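Both tasks read their knobs from ENV (import_legacy also accepts task arguments), so a typical invocation looks like this (paths and numbers illustrative):
batch_size=500 forks=4 posts_csv_path=tmp/posts.csv bundle exec rake e621:import_csv
batch_size=1000 start_at=0 bundle exec rake e621:import_legacy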

View File

@@ -115,4 +115,106 @@ namespace :fa do
})
end
end
# treat this like a bomb
# desc "delete / undo changes made by buggy proxy responses"
# task :fix_buggy_posts => [:set_logger_stdout, :environment] do
# raise "Already ran, would destroy everything if it ran again"
# # log_entries = HttpLogEntry.where("id > ?", 54040076).to_a
# earliest_created_at = log_entries.map(&:created_at).min
# puts "earliest: #{earliest_created_at}"
# users_to_revert = Set.new
# posts_to_revert = Set.new
# files_to_revert = Set.new
# log_entries.each do |log_entry|
# uri = log_entry.uri
# # puts "[id #{log_entry.id} / uri #{uri}]"
# case uri.host
# when "ipinfo.io"
# puts "ipinfo, do nothing"
# when "d.furaffinity.net"
# # puts "fix static file #{uri.path.bold}"
# files_to_revert.add(uri.to_s)
# post = Domain::Fa::Post.find_by(file_id: log_entry.id)
# posts_to_revert.add(post) if post
# when "www.furaffinity.net"
# if uri.path.start_with?("/user/")
# url_name = uri.path.split("/")[2]
# user = Domain::Fa::User.find_by(url_name: url_name) || raise
# # puts "fix fa user #{url_name.bold}"
# users_to_revert.add(user)
# elsif uri.path.start_with?("/view/")
# fa_id = uri.path.split("/")[2]
# post = Domain::Fa::Post.find_by(fa_id: fa_id) || raise
# # puts "fix fa post #{fa_id.to_s.bold}"
# posts_to_revert.add(post)
# elsif uri.path.start_with?("/gallery/") || uri.path.start_with?("/scraps/")
# url_name = uri.path.split("/")[2]
# user = Domain::Fa::User.find_by(url_name: url_name) || raise
# # puts "fix fa user gallery #{url_name.bold}"
# users_to_revert.add(user)
# elsif uri.path == "/"
# next
# else
# raise("unsupported path: #{uri.path}")
# end
# else raise("unsupported host: #{uri.host}")
# end
# end
# puts "entries: #{log_entries.size}"
# puts "users: #{users_to_revert.size}"
# puts "posts: #{posts_to_revert.size}"
# puts "files: #{files_to_revert.size}"
# # puts "----- users -----"
# # users_to_revert.each do |user|
# # puts user.to_json
# # end
# # puts "----- posts -----"
# # posts_to_revert.each do |post|
# # puts post.to_json
# # end
# reset_detail = ({ :info => "reverted on #{Time.now}" })
# job_params = ({ :queue => "manual", :priority => -20 })
# ReduxApplicationRecord.transaction do
# users_to_revert.each do |user|
# user.state = :ok
# user.state_detail = reset_detail
# user.log_entry_detail = reset_detail
# user.scanned_page_at = nil
# user.scanned_gallery_at = nil
# user.save!
# Domain::Fa::Job::UserGalleryJob.set(job_params).perform_later({
# user: user,
# })
# Domain::Fa::Job::UserPageJob.set(job_params).perform_later({
# user: user,
# })
# end
# posts_to_revert.each do |post|
# post.state = :ok
# post.state_detail = reset_detail
# post.log_entry_detail = reset_detail
# post.file = nil
# post.file_url_str = nil
# post.save!
# Domain::Fa::Job::ScanPostJob.set(job_params).perform_later({
# post: post,
# })
# end
# log_entries.reverse.each(&:destroy!)
# end
# end
end

View File

@@ -1,19 +1,15 @@
class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
test "creates an http log entry" do
client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_curl_easy(
client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_http_performer(
"https://www.furaffinity.net/",
request_headers: Hash,
response_code: 200,
body_str: "a plain text body",
response_body: "a plain text body",
))
response = client.get("https://www.furaffinity.net/")
assert_equal 200, response.status_code
assert_equal "a plain text body", response.body
expected = {}
assert_equal expected, response.request_headers
expected = { "resp" => "respheader", "content-type" => "text/plain" }
assert_equal expected, response.response_headers
refute_nil response.log_entry
end
end

View File

@@ -9,34 +9,37 @@ class Scraper::BaseHttpClientTest < ActiveSupport::TestCase
end
end
test "can parse header string" do
headers = Scraper::BaseHttpClient.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
assert_equal ["content-encoding", "gzip"], headers[8]
end
test "throws on unallowed domain" do
client = TestHttpClient.new(TestUtil.mock_curl_easy(""))
client = TestHttpClient.new(TestUtil.mock_http_performer(""))
assert_raises(Scraper::BaseHttpClient::InvalidURLError) do
client.get("https://foobar.com")
end
end
test "creates an http log entry" do
body_str = "the response body " + TestUtil.random_string(16)
client = TestHttpClient.new(TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str))
# note the lack of trailing slash - http client should set path to '/'
client = TestHttpClient.new(TestUtil.mock_http_performer(
"https://www.example.com/",
request_headers: { "cookie" => "" },
response_code: 200,
response_time_ms: 15,
response_headers: { "content-type" => "text/plain" },
response_body: "the response " + TestUtil.random_string(16),
))
# note the lack of trailing slash - http client should set path to '/'
response = client.get("https://www.example.com")
assert_equal 200, response.status_code
assert_match /the response /, response.body
log_entry = response.log_entry
assert log_entry.persisted?
assert_equal "text/plain", log_entry.content_type
assert_in_delta Time.now, log_entry.requested_at, 50
assert_equal 15, log_entry.response_time_ms
assert_equal "get", log_entry.verb
assert_equal 200, log_entry.status_code
assert_equal "text/plain", log_entry.response.content_type
assert_equal body_str, log_entry.response.contents
assert_match /the response/, log_entry.response.contents
end
end

View File

@@ -0,0 +1,20 @@
class Scraper::CurlHttpPerformerTest < ActiveSupport::TestCase
test "can parse header string" do
headers = Scraper::CurlHttpPerformer.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
assert_equal ["content-encoding", "gzip"], headers[8]
end
test "correctly wraps curl" do
body_str = "the response body " + TestUtil.random_string(16)
client = Scraper::CurlHttpPerformer.new(
TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str)
)
response = client.get("https://www.example.com/", {})
assert_equal 200, response.response_code
assert_equal ({ "content-type" => "text/plain", "resp" => "respheader" }),
response.response_headers
assert_equal 0, response.response_time_ms
end
end

View File

@@ -14,6 +14,7 @@ class HttpLogEntryTest < ActiveSupport::TestCase
uri_host: "example.com",
uri_path: "/foo/bar",
status_code: 200,
performed_by: "direct",
}
end

View File

@@ -7,6 +7,7 @@ module TestUtil
entry = ::HttpLogEntry.new({
uri: "http://example.com/",
verb: :get,
performed_by: "direct",
status_code: 200,
response_time_ms: rand(20..100),
request_headers: create_http_headers,
@@ -61,8 +62,11 @@ module TestUtil
)
mock = MiniTest::Mock.new
def mock.singleton_method_added(_); {}; end
mock.expect(:url=, nil, [expected_url])
mock.expect(:headers, {})
def mock.headers; {}; end
def mock.headers=(_); {}; end
mock.expect(:perform, nil)
mock.expect(:response_code, response_code)
mock.expect(:body_str, body_str)
@@ -71,4 +75,25 @@ module TestUtil
mock
end
def self.mock_http_performer(
expected_url,
request_headers: {},
response_code: 200,
response_time_ms: 15,
response_headers: { "content-type" => "text/plain" },
response_body: "http body"
)
mock = MiniTest::Mock.new
mock.expect(:is_a?, false, [String])
mock.expect(:is_a?, false, [String])
mock.expect(:name, "direct")
mock.expect(:get, Scraper::HttpPerformer::Response.new(
response_code,
response_headers,
response_time_ms,
response_body
), [expected_url, request_headers])
mock
end
end