add basic http client
Gemfile (3 additions)

@@ -74,6 +74,7 @@ group :test do
   gem "capybara"
   gem "selenium-webdriver"
   gem "webdrivers"
+  # gem "minitest-rspec_mocks"
 end
 
 gem "xdiff", path: "../xdiff-rb"

@@ -87,6 +88,8 @@ gem "ruby-prof-speedscope"
 gem "table_print"
 gem "addressable"
 gem "nokogiri"
+gem "http-cookie"
+gem "curb"
 # gem "concurrent-ruby-ext", require: "concurrent"
 # gem 'cli-ui'
 # gem "paper_trail"
Gemfile.lock (10 additions)

@@ -96,15 +96,20 @@ GEM
     coderay (1.1.3)
     concurrent-ruby (1.2.0)
     crass (1.0.6)
+    curb (1.0.5)
     date (3.3.3)
     debug (1.7.1)
       irb (>= 1.5.0)
       reline (>= 0.3.1)
     debug_inspector (1.1.0)
     diffy (3.4.2)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
     erubi (1.12.0)
     globalid (1.1.0)
       activesupport (>= 5.0)
+    http-cookie (1.0.5)
+      domain_name (~> 0.5)
     i18n (1.12.0)
       concurrent-ruby (~> 1.0)
     importmap-rails (1.1.5)

@@ -217,6 +222,9 @@ GEM
       railties (>= 6.0.0)
     tzinfo (2.0.6)
       concurrent-ruby (~> 1.0)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.8.2)
     web-console (4.2.0)
       actionview (>= 6.0.0)
       activemodel (>= 6.0.0)

@@ -241,8 +249,10 @@ DEPENDENCIES
   addressable
   bootsnap
   capybara
+  curb
   debug
   diffy
+  http-cookie
   importmap-rails
   jbuilder
   nokogiri
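Note: the two new runtime gems carry the whole client. curb is a libcurl binding (the Curl::Easy handle injected into the client below) and http-cookie supplies the RFC 6265 cookie jar used to build the cookie request header; the extra Gemfile.lock entries (domain_name, unf, unf_ext) are its transitive dependencies. A minimal sketch of the http-cookie calls the client relies on:

require "http/cookie"

jar = HTTP::CookieJar.new
jar.add(HTTP::Cookie.new(domain: ".furaffinity.net", name: "a",
                         value: "xyz", path: "/", for_domain: true))

# serialize the cookies matching a URL into a header value,
# the same way the client's perform method does
HTTP::Cookie.cookie_value(jar.cookies("https://www.furaffinity.net/"))
# => "a=xyz"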
Rakefile (11 additions)

@@ -12,3 +12,14 @@ require_relative "rake/fa_post"
 task :log => :environment do
   ActiveRecord::Base.logger = Logger.new(STDOUT)
 end
+
+namespace :test do
+  desc "Test lib source"
+  Rake::TestTask.new(:lib) do |t|
+    t.libs << "test"
+    t.pattern = "test/lib/**/*_test.rb"
+    t.verbose = true
+  end
+end
+
+Rake::Task[:test].enhance { Rake::Task["test:lib"].invoke }
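Note: Rake::Task#enhance with a block appends the block to the task's action list, so it runs after the task's existing actions; the effect here is that a plain `rake test` now also runs the new test:lib task. A toy Rakefile showing the same pattern (task names are illustrative):

task :primary do
  puts "primary actions"
end

task :extra do
  puts "extra actions"
end

# the block is appended to :primary's actions, so it runs last
Rake::Task[:primary].enhance { Rake::Task[:extra].invoke }

# $ rake primary
# primary actions
# extra actions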
app/lib/domain/fa/scraper/http_client.rb (new file, 13 lines)

@@ -0,0 +1,13 @@
+class Domain::Fa::Scraper::HttpClient < Scraper::HttpClientBase
+  def cookies
+    Rails.application.config.x.cookies.fa
+  end
+
+  ALLOWED_DOMAINS = [
+    "*.furaffinity.net",
+  ]
+
+  def allowed_domains
+    ALLOWED_DOMAINS
+  end
+end
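Note: the FA client only fills in the two abstract hooks of HttpClientBase (cookies and allowed_domains); everything else comes from the base class. Rough usage, assuming the app environment is loaded (the Curl::Easy construction is not part of this commit):

require "curb"

client = Domain::Fa::Scraper::HttpClient.new(Curl::Easy.new)
response = client.get("https://www.furaffinity.net/browse/")
response.status  # => 200 on success
response.body    # => the raw HTML body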
app/lib/domain/fa/scraper/job/browse_page_job.rb (new file, 9 lines)

@@ -0,0 +1,9 @@
+class Domain::Fa::Scraper::Job::BrowsePageJob < Scraper::JobBase
+  def initialize(http_client)
+    @http_client = http_client
+  end
+
+  def perform
+    content = @http_client.get("https://www.furaffinity.net/browse/")
+  end
+end
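Note: BrowsePageJob is a stub so far — `content` is assigned but never used, so the fetched page is discarded (presumably parsing arrives in a later commit). Driving it is plain dependency injection (hypothetical glue code, not in this commit):

client = Domain::Fa::Scraper::HttpClient.new(Curl::Easy.new)
Domain::Fa::Scraper::Job::BrowsePageJob.new(client).perform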
app/lib/scraper/http_client_base.rb (new file, 92 lines)

@@ -0,0 +1,92 @@
+class Scraper::HttpClientBase
+  Response = Struct.new(
+    :status,
+    :request_headers,
+    :response_headers,
+    :body,
+  )
+
+  def initialize(curl_client)
+    # @cookie_jar
+    init_cookie_jar
+    # TODO - populate AdaptiveCache with domains this client cares about
+    @blob_cache = LegacyImport::AdaptiveCache.new
+    @curl_client = curl_client
+  end
+
+  def setup_faraday_client_hook(client)
+  end
+
+  def cookies
+    raise NotImplementedError
+  end
+
+  def allowed_domains
+    raise NotImplementedError
+  end
+
+  def redirect_limit
+    6
+  end
+
+  def get(url)
+    perform(:get, url)
+  end
+
+  def self.parse_header_str(header_str)
+    header_str.split("\r\n").each_with_index.map do |str, idx|
+      next nil if idx == 0
+      idx = str.index(": ")
+      next nil unless idx
+      [str[0...idx], str[idx + 2..-1]]
+    end.reject(&:nil?)
+  end
+
+  private
+
+  def perform(method, url)
+    uri = Addressable::URI.parse(url)
+    unless allowed_domain?(uri.host)
+      raise("domain not permitted: #{uri.host} - allowed: #{allowed_domains}")
+    end
+
+    @curl_client.url = url
+    @curl_client.headers["cookie"] = HTTP::Cookie.cookie_value(@cookie_jar.cookies(url))
+    @curl_client.perform
+
+    response_code = @curl_client.response_code
+    body_str = @curl_client.body_str
+    response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
+    request_headers = @curl_client.headers
+
+    Response.new(
+      response_code,
+      request_headers,
+      response_headers,
+      body_str,
+    )
+  end
+
+  def allowed_domain?(domain)
+    allowed_domains.any? do |domain_to_check|
+      File.fnmatch? domain_to_check, domain
+    end
+  end
+
+  def init_cookie_jar
+    @cookie_jar = HTTP::CookieJar.new
+    self.cookies.each do |cookie_domain_conf|
+      domain = cookie_domain_conf[:domain]
+      cookie_domain_conf[:cookies].each do |cookie_key_conf|
+        conf = {
+          domain: domain,
+          name: cookie_key_conf[:name],
+          value: cookie_key_conf[:value],
+          path: cookie_key_conf[:path],
+          for_domain: true,
+        }
+        @cookie_jar.add(HTTP::Cookie.new(conf))
+      end
+    end
+  end
+end
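Note: two details of the base class are easy to miss. parse_header_str skips the status line (index 0) and any line without a ": " separator, yielding [name, value] pairs; and allowed_domain? runs each allowlist entry through File.fnmatch?, so entries are shell-style globs, not regexes. Concretely:

Scraper::HttpClientBase.parse_header_str("HTTP/2 200 \r\nserver: cloudflare\r\n\r\n")
# => [["server", "cloudflare"]]   (status line dropped)

File.fnmatch?("*.furaffinity.net", "www.furaffinity.net")  # => true
File.fnmatch?("*.furaffinity.net", "furaffinity.net")      # => false -- the bare
                                                           #    apex domain does not match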
app/lib/scraper/job_base.rb (new file, 2 lines)

@@ -0,0 +1,2 @@
+class Scraper::JobBase
+end
app/models/http_log_entry_header.rb (path inferred from the class name)

@@ -6,9 +6,10 @@ class HttpLogEntryHeader < ReduxApplicationRecord
     raise("must be a hash") unless headers.is_a?(Hash)
 
     headers = headers.dup
-    headers.delete("date")
-    headers.delete("expires")
-    headers.delete("cf-ray")
+    scrub_header(headers, "date")
+    scrub_header(headers, "expires")
+    scrub_header(headers, "cf-ray")
+    scrub_header(headers, "cf-cache-status")
     headers = headers.sort.to_h
     sha256 = Digest::SHA256.digest(headers.to_s)
     HttpLogEntryHeader.new(

@@ -28,6 +29,10 @@ class HttpLogEntryHeader < ReduxApplicationRecord
     record
   end
 
+  def self.scrub_header(headers, key)
+    headers[key] = "(scrubbed)" if headers[key]
+  end
+
   def to_bulk_insert_hash
     {
       sha256: sha256,
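Note: switching from headers.delete to scrub_header changes the fingerprint semantics — a volatile header now still contributes its name to the SHA-256 input, and only its value is normalized to "(scrubbed)". A quick illustration of why that deduplicates log entries (mirroring the build logic above):

require "digest"

a = { "date" => "Tue, 15 Nov 1994 08:12:31 GMT", "foo" => "bar" }
b = { "date" => "Wed, 16 Nov 1994 09:00:00 GMT", "foo" => "bar" }

[a, b].map do |headers|
  headers = headers.dup
  headers["date"] = "(scrubbed)" if headers["date"]
  Digest::SHA256.hexdigest(headers.sort.to_h.to_s)
end
# => both digests are equal: differing date values no longer
#    produce distinct HttpLogEntryHeader rows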
config/application.rb (path inferred from the ReduxScraper::Application context)

@@ -20,5 +20,6 @@ module ReduxScraper
     #
     # config.time_zone = "Central Time (US & Canada)"
     # config.eager_load_paths << Rails.root.join("extras")
+    config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
   end
 end
config/cookies/fa.yml (new file, 28 lines)

@@ -0,0 +1,28 @@
+development:
+  - domain: .furaffinity.net
+    cookies:
+      - name: a
+        value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
+        path: /
+      - name: b
+        value: a29dec05-d47b-4f51-a9d8-5a607d12e482
+        path: /
+  - domain: rv.furaffinity.net
+    cookies:
+      - name: OAID
+        value: d4247fd1b8b755a4064f2e030dec919f
+        path: /
+test:
+  - domain: .furaffinity.net
+    cookies:
+      - name: a
+        value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
+        path: /
+      - name: b
+        value: a29dec05-d47b-4f51-a9d8-5a607d12e482
+        path: /
+  - domain: rv.furaffinity.net
+    cookies:
+      - name: OAID
+        value: d4247fd1b8b755a4064f2e030dec919f
+        path: /
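Note: Rails.application.config_for selects the section named after the current Rails.env and deep-symbolizes keys (on Rails 6+), which is why init_cookie_jar indexes with :domain, :cookies, :name and friends rather than strings. In a development console the loaded config should come out roughly as:

Rails.application.config.x.cookies.fa.first
# => {
#      domain: ".furaffinity.net",
#      cookies: [
#        { name: "a", value: "5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e", path: "/" },
#        { name: "b", value: "a29dec05-d47b-4f51-a9d8-5a607d12e482", path: "/" },
#      ],
#    }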
test/lib/domain/fa/scraper/http_client_test.rb (new file, 23 lines)

@@ -0,0 +1,23 @@
+class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
+  test "creates an http log entry" do
+    client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_curl_easy(
+      "https://www.furaffinity.net/",
+      {
+        response_code: 200,
+        body_str: "the body",
+        request_headers: { "req" => "reqheader" },
+        response_headers_str: "HTTP 200\r\nresp: respheader\r\n\r\n",
+      }
+    ))
+    response = client.get("https://www.furaffinity.net/")
+
+    assert_equal 200, response.status
+    assert_equal "the body", response.body
+
+    expected = { "req" => "reqheader" }
+    assert_equal expected, response.request_headers
+
+    expected = { "resp" => "respheader" }
+    assert_equal expected, response.response_headers
+  end
+end
test/lib/scraper/http_client_base_test.rb (new file, 7 lines)

@@ -0,0 +1,7 @@
+class Scraper::HttpClientBaseTest < ActiveSupport::TestCase
+  def test_parse_header_str
+    headers = Scraper::HttpClientBase.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
+    assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
+    assert_equal ["content-encoding", "gzip"], headers[8]
+  end
+end
test/models/http_log_entry_header_test.rb (path inferred from the class name)

@@ -1,7 +1,14 @@
 require "test_helper"
 
 class HttpLogEntryHeaderTest < ActiveSupport::TestCase
-  # test "the truth" do
-  #   assert true
-  # end
+  test "blacklisted headers are scrubbed" do
+    header = HttpLogEntryHeader.build(
+      headers: {
+        "foo" => "bar",
+        "date" => "Tue, 15 Nov 1994 08:12:31 GMT",
+      },
+    )
+    assert_equal "bar", header.headers["foo"]
+    assert_equal "(scrubbed)", header.headers["date"]
+  end
 end
test/test_helper.rb

@@ -1,6 +1,7 @@
 ENV["RAILS_ENV"] ||= "test"
 require_relative "../config/environment"
 require "rails/test_help"
+require "minitest/mock"
 
 class ActiveSupport::TestCase
   # Run tests in parallel with specified workers

@@ -67,4 +68,18 @@ module TestUtil
     user.valid? || raise(user.errors.full_messages.join(", "))
     user
   end
+
+  def self.mock_curl_easy(expected_url, response)
+    mock = MiniTest::Mock.new
+
+    mock.expect(:url=, nil, [expected_url])
+    mock.expect(:headers, {})
+    mock.expect(:perform, nil)
+    mock.expect(:response_code, response[:response_code])
+    mock.expect(:body_str, response[:body_str])
+    mock.expect(:header_str, response[:response_headers_str])
+    mock.expect(:headers, response[:request_headers])
+
+    mock
+  end
 end
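Note: one subtlety in mock_curl_easy — Minitest::Mock queues expectations per method name and consumes them in order, so the two :headers expectations are distinct: the first call (where perform writes the cookie header) gets the empty hash, the second (reading request headers back) gets response[:request_headers]. In miniature:

require "minitest/mock"

mock = Minitest::Mock.new
mock.expect(:headers, {})                  # first call returns {}
mock.expect(:headers, { "req" => "val" })  # second call returns the stub

mock.headers  # => {}
mock.headers  # => { "req" => "val" }
mock.verify   # => true once every expectation was consumed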