add basic http client

This commit is contained in:
2023-02-19 18:19:57 -08:00
parent ea24542ea1
commit c28ab1435f
14 changed files with 221 additions and 17 deletions

View File

@@ -74,6 +74,7 @@ group :test do
gem "capybara"
gem "selenium-webdriver"
gem "webdrivers"
# gem "minitest-rspec_mocks"
end
gem "xdiff", path: "../xdiff-rb"
@@ -87,6 +88,8 @@ gem "ruby-prof-speedscope"
gem "table_print"
gem "addressable"
gem "nokogiri"
gem "http-cookie"
gem "curb"
# gem "concurrent-ruby-ext", require: "concurrent"
# gem 'cli-ui'
# gem "paper_trail"

View File

@@ -96,15 +96,20 @@ GEM
coderay (1.1.3)
concurrent-ruby (1.2.0)
crass (1.0.6)
curb (1.0.5)
date (3.3.3)
debug (1.7.1)
irb (>= 1.5.0)
reline (>= 0.3.1)
debug_inspector (1.1.0)
diffy (3.4.2)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
erubi (1.12.0)
globalid (1.1.0)
activesupport (>= 5.0)
http-cookie (1.0.5)
domain_name (~> 0.5)
i18n (1.12.0)
concurrent-ruby (~> 1.0)
importmap-rails (1.1.5)
@@ -217,6 +222,9 @@ GEM
railties (>= 6.0.0)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.2)
web-console (4.2.0)
actionview (>= 6.0.0)
activemodel (>= 6.0.0)
@@ -241,8 +249,10 @@ DEPENDENCIES
addressable
bootsnap
capybara
curb
debug
diffy
http-cookie
importmap-rails
jbuilder
nokogiri

View File

@@ -12,14 +12,3 @@ require_relative "rake/fa_post"
task :log => :environment do
ActiveRecord::Base.logger = Logger.new(STDOUT)
end
namespace :test do
desc "Test lib source"
Rake::TestTask.new(:lib) do |t|
t.libs << "test"
t.pattern = "test/lib/**/*_test.rb"
t.verbose = true
end
end
Rake::Task[:test].enhance { Rake::Task["test:lib"].invoke }

View File

@@ -0,0 +1,13 @@
class Domain::Fa::Scraper::HttpClient < Scraper::HttpClientBase
  # HTTP client scoped to FurAffinity: supplies the FA session cookies from
  # application config and restricts requests to furaffinity.net subdomains.

  # Glob patterns (File.fnmatch) of hosts this client may request.
  # Frozen so the shared constant cannot be mutated by callers.
  # NOTE(review): "*.furaffinity.net" does not match the bare apex host
  # "furaffinity.net" — confirm that is intended.
  ALLOWED_DOMAINS = [
    "*.furaffinity.net",
  ].freeze

  # Cookie configuration loaded from config/cookies/fa.yml
  # (see config.x.cookies.fa in the application config).
  def cookies
    Rails.application.config.x.cookies.fa
  end

  def allowed_domains
    ALLOWED_DOMAINS
  end
end

View File

@@ -0,0 +1,9 @@
class Domain::Fa::Scraper::Job::BrowsePageJob < Scraper::JobBase
  # Job that fetches the FurAffinity /browse/ listing page through the
  # injected HTTP client.

  # http_client - a Scraper::HttpClientBase subclass instance used to
  #               perform the request (and enforce domain/cookie policy).
  def initialize(http_client)
    @http_client = http_client
  end

  # Fetch the browse page and return the client's Response struct.
  # (The original assigned the result to an unused local; returning it
  # lets callers inspect the response.)
  def perform
    @http_client.get("https://www.furaffinity.net/browse/")
  end
end

View File

@@ -0,0 +1,92 @@
class Scraper::HttpClientBase
  # Base class for scraper HTTP clients wrapping a Curl::Easy-style object.
  #
  # Subclasses must implement:
  #   #cookies         - cookie configuration (array of {domain:, cookies: [...]})
  #   #allowed_domains - File.fnmatch glob patterns of permitted hosts
  #
  # Every request is checked against #allowed_domains before it is performed,
  # and cookies from the jar built out of #cookies are sent with each request.

  # Summary of one completed HTTP exchange.
  Response = Struct.new(
    :status,
    :request_headers,
    :response_headers,
    :body,
  )

  # curl_client - a Curl::Easy-compatible object responding to #url=,
  #               #headers, #perform, #response_code, #body_str, #header_str.
  def initialize(curl_client)
    # @cookie_jar
    init_cookie_jar
    # TODO - populate AdaptiveCache with domains this client cares about
    @blob_cache = LegacyImport::AdaptiveCache.new
    @curl_client = curl_client
  end

  # Hook for subclasses that need to customize the underlying client.
  def setup_faraday_client_hook(client)
  end

  # Cookie configuration consumed by #init_cookie_jar. Subclass responsibility.
  def cookies
    raise NotImplementedError
  end

  # Permitted host glob patterns. Subclass responsibility.
  def allowed_domains
    raise NotImplementedError
  end

  # Maximum redirects to follow.
  # NOTE(review): not yet enforced anywhere in #perform — confirm plan.
  def redirect_limit
    6
  end

  def get(url)
    perform(:get, url)
  end

  # Parse a raw curl header blob ("Name: value\r\n" lines) into an array of
  # [name, value] pairs. The status line (index 0) and any line lacking a
  # ": " separator are dropped.
  def self.parse_header_str(header_str)
    header_str.split("\r\n").each_with_index.filter_map do |line, line_no|
      next if line_no.zero? # status line, e.g. "HTTP/2 200"
      sep = line.index(": ")
      next unless sep
      [line[0...sep], line[sep + 2..-1]]
    end
  end

  private

  # Perform the request and wrap the result in a Response.
  # Raises when the URL's host is not covered by #allowed_domains.
  # NOTE(review): +method+ is currently ignored — the curl client always
  # performs a GET; confirm before adding non-GET verbs.
  def perform(method, url)
    uri = Addressable::URI.parse(url)
    unless allowed_domain?(uri.host)
      raise("domain not permitted: #{uri.host} - allowed: #{allowed_domains}")
    end
    @curl_client.url = url
    @curl_client.headers["cookie"] = HTTP::Cookie.cookie_value(@cookie_jar.cookies(url))
    @curl_client.perform
    response_code = @curl_client.response_code
    body_str = @curl_client.body_str
    response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
    request_headers = @curl_client.headers
    Response.new(
      response_code,
      request_headers,
      response_headers,
      body_str,
    )
  end

  # True when +domain+ matches any allowed glob pattern.
  def allowed_domain?(domain)
    allowed_domains.any? do |pattern|
      File.fnmatch?(pattern, domain)
    end
  end

  # Build @cookie_jar from the subclass's #cookies configuration.
  def init_cookie_jar
    @cookie_jar = HTTP::CookieJar.new
    cookies.each do |cookie_domain_conf|
      domain = cookie_domain_conf[:domain]
      cookie_domain_conf[:cookies].each do |cookie_key_conf|
        @cookie_jar.add(HTTP::Cookie.new(
          domain: domain,
          name: cookie_key_conf[:name],
          value: cookie_key_conf[:value],
          path: cookie_key_conf[:path],
          for_domain: true,
        ))
      end
    end
  end
end

View File

@@ -0,0 +1,2 @@
# Marker base class for scraper jobs (e.g. Domain::Fa::Scraper::Job::*).
# Currently empty; shared job behavior is expected to accumulate here.
class Scraper::JobBase
end

View File

@@ -6,9 +6,10 @@ class HttpLogEntryHeader < ReduxApplicationRecord
raise("must be a hash") unless headers.is_a?(Hash)
headers = headers.dup
headers.delete("date")
headers.delete("expires")
headers.delete("cf-ray")
scrub_header(headers, "date")
scrub_header(headers, "expires")
scrub_header(headers, "cf-ray")
scrub_header(headers, "cf-cache-status")
headers = headers.sort.to_h
sha256 = Digest::SHA256.digest(headers.to_s)
HttpLogEntryHeader.new(
@@ -28,6 +29,10 @@ class HttpLogEntryHeader < ReduxApplicationRecord
record
end
# Replace the value stored under +key+ with a "(scrubbed)" placeholder when
# one is present, mutating +headers+ in place. Absent keys are left alone so
# the scrub never introduces new entries.
def self.scrub_header(headers, key)
  return unless headers[key]
  headers[key] = "(scrubbed)"
end
def to_bulk_insert_hash
{
sha256: sha256,

View File

@@ -20,5 +20,6 @@ module ReduxScraper
#
# config.time_zone = "Central Time (US & Canada)"
# config.eager_load_paths << Rails.root.join("extras")
config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
end
end

28
config/cookies/fa.yml Normal file
View File

@@ -0,0 +1,28 @@
development:
- domain: .furaffinity.net
cookies:
- name: a
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
path: /
- name: b
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d4247fd1b8b755a4064f2e030dec919f
path: /
test:
- domain: .furaffinity.net
cookies:
- name: a
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
path: /
- name: b
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
path: /
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: d4247fd1b8b755a4064f2e030dec919f
path: /

View File

@@ -0,0 +1,23 @@
class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
  # Drives the FA client against a mocked curl object and checks that the
  # Response struct reflects the canned status, body and header data.
  test "creates an http log entry" do
    curl = TestUtil.mock_curl_easy(
      "https://www.furaffinity.net/",
      {
        response_code: 200,
        body_str: "the body",
        request_headers: { "req" => "reqheader" },
        response_headers_str: "HTTP 200\r\nresp: respheader\r\n\r\n",
      }
    )
    client = Domain::Fa::Scraper::HttpClient.new(curl)

    response = client.get("https://www.furaffinity.net/")

    assert_equal 200, response.status
    assert_equal "the body", response.body
    assert_equal({ "req" => "reqheader" }, response.request_headers)
    assert_equal({ "resp" => "respheader" }, response.response_headers)
  end
end

View File

@@ -0,0 +1,7 @@
class Scraper::HttpClientBaseTest < ActiveSupport::TestCase
  # Uses the declarative `test` macro for consistency with the other test
  # classes in this suite (previously a bare def test_* method).
  test "parse_header_str splits a curl header blob into [name, value] pairs" do
    headers = Scraper::HttpClientBase.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
    # Status line is dropped, so the first pair is the date header.
    assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
    assert_equal ["content-encoding", "gzip"], headers[8]
    assert_equal 12, headers.size
  end
end

View File

@@ -1,7 +1,14 @@
require "test_helper"
class HttpLogEntryHeaderTest < ActiveSupport::TestCase
  test "blacklisted headers are scrubbed" do
    input = {
      "foo" => "bar",
      "date" => "Tue, 15 Nov 1994 08:12:31 GMT",
    }

    header = HttpLogEntryHeader.build(headers: input)

    # Ordinary headers pass through untouched...
    assert_equal "bar", header.headers["foo"]
    # ...while blacklisted (volatile) ones are replaced with a placeholder.
    assert_equal "(scrubbed)", header.headers["date"]
  end
end

View File

@@ -1,6 +1,7 @@
ENV["RAILS_ENV"] ||= "test"
require_relative "../config/environment"
require "rails/test_help"
require "minitest/mock"
class ActiveSupport::TestCase
# Run tests in parallel with specified workers
@@ -67,4 +68,18 @@ module TestUtil
user.valid? || raise(user.errors.full_messages.join(", "))
user
end
# Build a mock standing in for the Curl::Easy object that
# Scraper::HttpClientBase drives for one request/response cycle.
#
# expected_url - the URL the client is expected to assign via #url=.
# response     - hash with :response_code, :body_str, :request_headers and
#                :response_headers_str describing the canned response.
#
# The expectations mirror HttpClientBase#perform's call sequence: set the
# url, read #headers (to inject the cookie header), #perform, then read the
# response fields; the second #headers expectation serves the final
# request-header read.
def self.mock_curl_easy(expected_url, response)
  # Minitest::Mock is the current constant; the MiniTest alias is
  # deprecated and removed in newer minitest releases.
  mock = Minitest::Mock.new
  mock.expect(:url=, nil, [expected_url])
  mock.expect(:headers, {})
  mock.expect(:perform, nil)
  mock.expect(:response_code, response[:response_code])
  mock.expect(:body_str, response[:body_str])
  mock.expect(:header_str, response[:response_headers_str])
  mock.expect(:headers, response[:request_headers])
  mock
end
end