add performed_by to HLE (HttpLogEntry)
Gemfile (1 line changed)
@@ -103,6 +103,7 @@ gem "delayed_job_web"
|
||||
gem "colorize"
|
||||
gem "daemons"
|
||||
gem "delayed_job_worker_pool"
|
||||
gem "ripcord"
|
||||
# gem 'cli-ui'
|
||||
# gem "paper_trail"
|
||||
# gem "paper_trail-hashdiff"
|
||||
Gemfile.lock
@@ -238,6 +238,7 @@ GEM
|
||||
reline (0.3.2)
|
||||
io-console (~> 0.5)
|
||||
rexml (3.2.5)
|
||||
ripcord (2.0.0)
|
||||
ruby-prof (1.4.5)
|
||||
ruby-prof-speedscope (0.3.0)
|
||||
ruby-prof (~> 1.0)
|
||||
@@ -324,6 +325,7 @@ DEPENDENCIES
|
||||
rack-mini-profiler
|
||||
rails (~> 7.0.4, >= 7.0.4.2)
|
||||
rb-bsdiff!
|
||||
ripcord
|
||||
ruby-prof
|
||||
ruby-prof-speedscope
|
||||
selenium-webdriver
|
||||
|
||||
Rakefile (1 line changed)
@@ -11,6 +11,7 @@ Rake.application.rake_require "log_entry"
|
||||
Rake.application.rake_require "worker"
|
||||
|
||||
Rake.application.rake_require "fa"
|
||||
Rake.application.rake_require "e621"
|
||||
|
||||
task :set_ar_stdout => :environment do
|
||||
ActiveRecord::Base.logger = Logger.new(STDOUT)
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
class Domain::Fa::ApiController < ApplicationController
|
||||
include ActionController::Live
|
||||
|
||||
skip_before_action :verify_authenticity_token,
|
||||
only: %i[ enqueue_objects ]
|
||||
|
||||
@@ -109,6 +111,10 @@ class Domain::Fa::ApiController < ApplicationController
|
||||
depths: queue_depths,
|
||||
},
|
||||
}
|
||||
response.stream.close
|
||||
perform_enqueues!
|
||||
ensure
|
||||
response.stream.close
|
||||
end
|
||||
|
||||
private
|
||||
@@ -150,7 +156,16 @@ class Domain::Fa::ApiController < ApplicationController
|
||||
def enqueue_manual(klass, args, priority, queue = "manual")
|
||||
@@enqueue_deduper ||= Set.new
|
||||
return unless @@enqueue_deduper.add?([klass, args, priority])
|
||||
klass.set(priority: priority, queue: queue).perform_later(args)
|
||||
|
||||
@enqueue_manual ||= []
|
||||
@enqueue_manual << [klass, args, priority, queue]
|
||||
end
|
||||
|
||||
def perform_enqueues!
|
||||
while job = (@enqueue_manual || []).shift
|
||||
klass, args, priority, queue = job
|
||||
klass.set(priority: priority, queue: queue).perform_later(args)
|
||||
end
|
||||
end
|
||||
|
||||
def time_ago_or_never(time)
|
||||
|
||||
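The controller change above buffers job enqueues while the response is being streamed and only writes them to the queue once the stream is closed, so Delayed::Job inserts never stall the live response. A minimal sketch of the resulting call order (the ScanPostJob arguments are illustrative, not taken from this hunk):

    # Inside the streaming action (sketch): record jobs first, flush after the stream is done.
    enqueue_manual(Domain::Fa::Job::ScanPostJob, { post: post }, -10)  # only records the tuple
    response.stream.close                                              # finish the live response
    perform_enqueues!                                                  # now call perform_later on each recorded job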
@@ -3,8 +3,9 @@ class Domain::Fa::Job::FaJobBase < Scraper::JobBase
|
||||
|
||||
def self.build_http_client
|
||||
@@fa_base_http_client ||= begin
|
||||
puts "Building FA HTTP client"
|
||||
Domain::Fa::Scraper::HttpClient.new(Curl::Easy.new)
|
||||
proxy = ENV["proxy"]
|
||||
puts "Building FA HTTP client (proxy: #{proxy})"
|
||||
Domain::Fa::Scraper::HttpClient.new(proxy || Scraper::CurlHttpPerformer.new)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -66,6 +66,10 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::FaJobBase
|
||||
|
||||
submission = page.submission
|
||||
|
||||
unless submission.id.to_i == @post.fa_id
|
||||
raise("id mismatch: #{submission.id} != #{@post.fa_id}")
|
||||
end
|
||||
|
||||
@post.last_submission_page = @submission_entry
|
||||
@post.title = submission.title
|
||||
@post.creator = Domain::Fa::User.find_or_build_from_submission_parser(submission)
|
||||
|
||||
@@ -2,12 +2,17 @@ class Domain::Fa::Scraper::HttpClient < Scraper::BaseHttpClient
|
||||
DEFAULT_ALLOWED_DOMAINS = [
|
||||
"*.furaffinity.net",
|
||||
"*.facdn.net",
|
||||
"ipinfo.io",
|
||||
]
|
||||
|
||||
def initialize(curl_client, cookies: nil, allowed_domains: nil)
|
||||
@cookies = cookies || Rails.application.config.x.cookies.fa
|
||||
@allowed_domains = allowed_domains || DEFAULT_ALLOWED_DOMAINS
|
||||
super(curl_client)
|
||||
def initialize(http_performer_or_proxy)
|
||||
if http_performer_or_proxy.is_a?(String)
|
||||
@cookies = Rails.application.config.x.cookies.fa[http_performer_or_proxy]
|
||||
else
|
||||
@cookies = Rails.application.config.x.cookies.fa["direct"]
|
||||
end
|
||||
|
||||
super(http_performer_or_proxy)
|
||||
end
|
||||
|
||||
def cookies
|
||||
@@ -23,6 +28,6 @@ class Domain::Fa::Scraper::HttpClient < Scraper::BaseHttpClient
|
||||
end
|
||||
|
||||
def allowed_domains
|
||||
@allowed_domains
|
||||
DEFAULT_ALLOWED_DOMAINS
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
class Scraper::BaseHttpClient
|
||||
Response = Struct.new(
|
||||
:status_code,
|
||||
:request_headers,
|
||||
:response_headers,
|
||||
:body,
|
||||
:log_entry,
|
||||
)
|
||||
@@ -11,7 +9,7 @@ class Scraper::BaseHttpClient
|
||||
|
||||
class InvalidURLError < ArgumentError; end
|
||||
|
||||
def initialize(curl_client = nil)
|
||||
def initialize(http_performer_or_proxy)
|
||||
cookies || raise("Subclass must define non-nil #cookies")
|
||||
allowed_domains || raise("Subclass must define non-nil #allowed_domains")
|
||||
|
||||
@@ -22,7 +20,12 @@ class Scraper::BaseHttpClient
|
||||
@blob_entry_cache = Hash.new do |hash, key|
|
||||
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
|
||||
end
|
||||
@curl_client = curl_client
|
||||
|
||||
if http_performer_or_proxy.is_a?(String)
|
||||
@http_performer = Scraper::ProxyHttpPerformer.new(http_performer_or_proxy)
|
||||
else
|
||||
@http_performer = http_performer_or_proxy
|
||||
end
|
||||
@domain_last_requested_at = {}
|
||||
@logger ||= ColorLogger.make($stdout, self)
|
||||
@logger.level = :info
|
||||
@@ -50,15 +53,6 @@ class Scraper::BaseHttpClient
|
||||
get_impl(url, caused_by_entry)
|
||||
end
|
||||
|
||||
def self.parse_header_str(header_str)
|
||||
header_str.split("\r\n").each_with_index.map do |str, idx|
|
||||
next nil if idx == 0
|
||||
idx = str.index(": ")
|
||||
next nil unless idx
|
||||
[str[0...idx], str[idx + 2..-1]]
|
||||
end.reject(&:nil?)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def get_impl(url, caused_by_entry)
|
||||
@@ -71,9 +65,6 @@ class Scraper::BaseHttpClient
|
||||
raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{allowed_domains}")
|
||||
end
|
||||
|
||||
@curl_client.url = url
|
||||
@curl_client.headers["cookie"] = HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url))
|
||||
|
||||
ratelimit_conf = ratelimit.find do |conf|
|
||||
File.fnmatch? conf[0], uri.host
|
||||
end || ["*", :none]
|
||||
@@ -86,21 +77,24 @@ class Scraper::BaseHttpClient
|
||||
if last_requested_at
|
||||
sleep_for = ratelimit_delay - (Time.now - last_requested_at)
|
||||
if sleep_for >= 0
|
||||
logger.debug("wait #{sleep_for.round(2)} sec / domain = #{uri.host}")
|
||||
logger.info("wait #{sleep_for.round(2)} sec / domain = #{uri.host}")
|
||||
sleep sleep_for
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
request_headers = {
|
||||
"cookie" => HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url)),
|
||||
}
|
||||
requested_at = Time.now
|
||||
@curl_client.perform
|
||||
response_time_ms = ((Time.now - requested_at) * 1000).to_i
|
||||
response = @http_performer.get(url, request_headers)
|
||||
@domain_last_requested_at[uri.host] = Time.now
|
||||
|
||||
response_code = @curl_client.response_code
|
||||
body_str = @curl_client.body_str
|
||||
response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
|
||||
request_headers = @curl_client.headers
|
||||
response_code = response.response_code
|
||||
response_headers = response.response_headers
|
||||
response_time_ms = response.response_time_ms
|
||||
response_body = response.body
|
||||
|
||||
content_type = response_headers["Content-Type"] ||
|
||||
response_headers["content-type"] ||
|
||||
raise("no content type provided: #{response_headers}")
|
||||
@@ -115,32 +109,30 @@ class Scraper::BaseHttpClient
|
||||
candidates << caused_by_entry.response if caused_by_entry&.response
|
||||
candidates << caused_by_entry.response.base if caused_by_entry&.response&.base
|
||||
|
||||
response_blob_entry = BlobEntry.find_or_build(
|
||||
content_type: content_type,
|
||||
contents: body_str,
|
||||
candidates: candidates,
|
||||
)
|
||||
|
||||
log_entry = HttpLogEntry.new({
|
||||
uri: url,
|
||||
verb: :get,
|
||||
content_type: response_headers["content-type"],
|
||||
status_code: response_code,
|
||||
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
|
||||
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
|
||||
response: response_blob_entry,
|
||||
response_time_ms: response_time_ms,
|
||||
requested_at: requested_at,
|
||||
caused_by_entry: caused_by_entry,
|
||||
})
|
||||
|
||||
retries = 0
|
||||
begin
|
||||
response_blob_entry = BlobEntry.find_or_build(
|
||||
content_type: content_type,
|
||||
contents: response_body,
|
||||
candidates: candidates,
|
||||
)
|
||||
|
||||
log_entry = HttpLogEntry.new({
|
||||
uri: url,
|
||||
verb: :get,
|
||||
content_type: response_headers["content-type"],
|
||||
status_code: response_code,
|
||||
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
|
||||
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
|
||||
response: response_blob_entry,
|
||||
response_time_ms: response_time_ms,
|
||||
requested_at: requested_at,
|
||||
caused_by_entry: caused_by_entry,
|
||||
performed_by: @http_performer.name,
|
||||
})
|
||||
|
||||
log_entry.save!
|
||||
rescue
|
||||
log_entry.errors.clear
|
||||
log_entry.request_headers.errors.clear
|
||||
log_entry.response_headers.errors.clear
|
||||
retries += 1
|
||||
retry if retries < 2
|
||||
raise
|
||||
@@ -174,9 +166,7 @@ class Scraper::BaseHttpClient
|
||||
|
||||
Response.new(
|
||||
response_code,
|
||||
request_headers,
|
||||
response_headers,
|
||||
body_str,
|
||||
response_body,
|
||||
log_entry
|
||||
)
|
||||
end
|
||||
@@ -196,7 +186,7 @@ class Scraper::BaseHttpClient
|
||||
domain: domain,
|
||||
name: cookie_key_conf[:name],
|
||||
value: cookie_key_conf[:value],
|
||||
path: cookie_key_conf[:path],
|
||||
path: cookie_key_conf[:path] || "",
|
||||
for_domain: true,
|
||||
}
|
||||
@cookie_jar.add(HTTP::Cookie.new(conf))
|
||||
|
||||
app/lib/scraper/curl_http_performer.rb (new file, 41 lines)
@@ -0,0 +1,41 @@
class Scraper::CurlHttpPerformer < Scraper::HttpPerformer
  def initialize(curl = nil)
    @curl = curl || Curl::Easy.new
  end

  def name
    "direct"
  end

  def get(url, request_headers)
    start_at = Time.now
    @curl.url = url
    @curl.headers = {}
    request_headers.each do |key, value|
      @curl.headers[key.to_s] = value
    end
    @curl.perform
    response_time_ms = ((Time.now - start_at) * 1000).to_i
    response_code = @curl.response_code
    body_str = @curl.body_str
    response_headers = self.class.parse_header_str(@curl.header_str).to_h

    Scraper::HttpPerformer::Response.new(
      response_code,
      response_headers,
      response_time_ms,
      body_str
    )
  end

  private

  def self.parse_header_str(header_str)
    header_str.split("\r\n").each_with_index.map do |str, idx|
      next nil if idx == 0
      idx = str.index(": ")
      next nil unless idx
      [str[0...idx], str[idx + 2..-1]]
    end.reject(&:nil?)
  end
end
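CurlHttpPerformer is the "direct" strategy: it drives a single Curl::Easy handle and repackages the result into the shared Response struct, while cookies and rate limiting stay in Scraper::BaseHttpClient. A small usage sketch (the URL and header are illustrative):

    performer = Scraper::CurlHttpPerformer.new
    response = performer.get("https://ipinfo.io/json", { "accept" => "application/json" })
    response.response_code     # e.g. 200
    response.response_headers  # header pairs parsed from curl's header string
    response.body              # raw body string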
|
||||
app/lib/scraper/http_performer.rb (new file, 12 lines)
@@ -0,0 +1,12 @@
class Scraper::HttpPerformer
  Response = Struct.new(
    :response_code,
    :response_headers,
    :response_time_ms,
    :body
  )

  def get(url, request_headers)
    raise NotImplementedError, "implement in subclass"
  end
end
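Scraper::HttpPerformer is the abstract strategy interface; the contract that both performers and the test mock rely on is the positional field order of the Response struct. A minimal sketch:

    # Response is positional: code, headers, time in milliseconds, body.
    resp = Scraper::HttpPerformer::Response.new(200, { "content-type" => "text/plain" }, 12, "hello")
    resp.response_code     # => 200
    resp.response_time_ms  # => 12
    resp.body              # => "hello"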
|
||||
app/lib/scraper/proxy_http_performer.rb (new file, 24 lines)
@@ -0,0 +1,24 @@
require "base64"

class Scraper::ProxyHttpPerformer < Scraper::HttpPerformer
  def initialize(proxy)
    @name = proxy
    proxy_url = Rails.application.config.x.proxies[proxy]
    @client = Ripcord::Client.new(proxy_url)
  end

  def name
    @name
  end

  def get(uri, headers)
    send_token = SecureRandom.uuid
    rpc_result = @client.call("get", [uri, headers, send_token])
    raise "rpc error: #{rpc_result.error.inspect}" unless rpc_result.successful?
    recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(rpc_result.result))
    if send_token != recv_token
      raise "token mismatch: #{send_token} != #{recv_token}"
    end
    Scraper::HttpPerformer::Response.new(code, headers, time_ms, body)
  end
end
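ProxyHttpPerformer talks JSON-RPC (via the newly added ripcord gem) to a remote fetcher and uses a random echo token to confirm the response belongs to this request. The serving side is not part of this commit; a hypothetical sketch of what its "get" handler would have to return for this client to work (handle_get and fetch_url are assumed names, not from this diff):

    # Hypothetical proxy-side handler: fetch the URL, then echo the token back
    # with the response fields, Marshal-dumped and Base64-encoded so they
    # survive the JSON-RPC transport.
    def handle_get(url, headers, token)
      code, resp_headers, time_ms, body = fetch_url(url, headers)  # assumed helper
      Base64.encode64(Marshal.dump([token, code, resp_headers, time_ms, body]))
    end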
|
||||
@@ -127,7 +127,9 @@ class BlobEntry < ReduxApplicationRecord
|
||||
record.contents = contents
|
||||
end
|
||||
|
||||
raise RuntimeError.new("invariant!") if record.contents != contents
|
||||
if record.contents != contents
|
||||
raise RuntimeError.new("invariant!")
|
||||
end
|
||||
|
||||
record
|
||||
end
|
||||
|
||||
@@ -88,8 +88,8 @@ class Domain::Fa::Post < ReduxApplicationRecord
|
||||
def scanned_at=(time)
|
||||
unless time.is_a?(Time)
|
||||
raise ArgumentError("time must be Time, was #{time.class}")
|
||||
end
|
||||
self.state_detail["scanned_at"] = time.to_i
|
||||
end unless time.nil?
|
||||
self.state_detail["scanned_at"] = time&.to_i
|
||||
end
|
||||
|
||||
def last_submission_page=(log_entry)
|
||||
@@ -160,6 +160,7 @@ class Domain::Fa::Post < ReduxApplicationRecord
|
||||
request_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
|
||||
response_headers: ::HttpLogEntryHeader.find_or_create(headers: {}),
|
||||
response: blob_entry,
|
||||
performed_by: "legacy",
|
||||
})
|
||||
|
||||
raise("mismatch") unless log_entry.response == blob_entry
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
class HttpLogEntry < ReduxApplicationRecord
|
||||
include ImmutableModel
|
||||
|
||||
enum verb: %i[get post]
|
||||
enum verb: %i[get post], _prefix: true
|
||||
enum performed_by: %i[direct legacy proxy-1], _prefix: true
|
||||
|
||||
belongs_to :response,
|
||||
foreign_key: :response_sha256,
|
||||
@@ -25,12 +26,12 @@ class HttpLogEntry < ReduxApplicationRecord
|
||||
class_name: "::HttpLogEntry",
|
||||
foreign_key: :caused_by_id
|
||||
|
||||
validates_inclusion_of(:verb, in: ::HttpLogEntry.verbs.keys)
|
||||
validates_presence_of(
|
||||
:uri_scheme,
|
||||
:uri_host,
|
||||
:uri_path,
|
||||
:verb,
|
||||
:performed_by,
|
||||
:status_code,
|
||||
:response_time_ms,
|
||||
:content_type,
|
||||
@@ -68,6 +69,7 @@ class HttpLogEntry < ReduxApplicationRecord
|
||||
requested_at: legacy_model.requested_at,
|
||||
created_at: legacy_model.created_at,
|
||||
updated_at: legacy_model.updated_at,
|
||||
performed_by: "legacy",
|
||||
})
|
||||
return record
|
||||
end
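The new performed_by enum needs a backing column, but the migration is not part of this diff; a minimal sketch of what it presumably looks like (table name inferred from the model by Rails convention, integer-backed because the enum is declared with an array):

    class AddPerformedByToHttpLogEntries < ActiveRecord::Migration[7.0]
      def change
        # Integer column backing `enum performed_by: %i[direct legacy proxy-1]`.
        add_column :http_log_entries, :performed_by, :integer
      end
    end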
|
||||
|
||||
@@ -23,5 +23,8 @@ module ReduxScraper
|
||||
# config.time_zone = "Central Time (US & Canada)"
|
||||
# config.eager_load_paths << Rails.root.join("extras")
|
||||
config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
|
||||
config.x.proxies = {
|
||||
"proxy-1" => "http://proxy-1.local:8999",
|
||||
}
|
||||
end
|
||||
end
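The "proxy-1" key added here is the same string that selects a performer, an RPC endpoint, and a cookie set; a sketch of the lookup chain using only values from this diff (assuming config_for keeps the cookie keys addressable as shown):

    Rails.application.config.x.proxies["proxy-1"]      # => "http://proxy-1.local:8999" (Ripcord endpoint)
    Rails.application.config.x.cookies.fa["proxy-1"]   # cookie set aliased to &vipvillageworker
    Scraper::ProxyHttpPerformer.new("proxy-1").name    # => "proxy-1", recorded as HttpLogEntry#performed_by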
|
||||
|
||||
@@ -1,42 +1,59 @@
|
||||
ddwhatnow: &ddwhatnow
|
||||
- domain: .furaffinity.net
|
||||
cookies:
|
||||
- name: a
|
||||
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
|
||||
path: /
|
||||
- name: b
|
||||
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
|
||||
path: /
|
||||
- domain: rv.furaffinity.net
|
||||
cookies:
|
||||
- name: OAID
|
||||
value: d4247fd1b8b755a4064f2e030dec919f
|
||||
path: /
|
||||
|
||||
vipvillageworker: &vipvillageworker
|
||||
- domain: .furaffinity.net
|
||||
cookies:
|
||||
- name: a
|
||||
value: 5a9437e0-ded0-4b8d-8255-ff0198664025
|
||||
- name: b
|
||||
value: 2d7da6a8-959f-407e-abd1-af8f51c110be
|
||||
- domain: rv.furaffinity.net
|
||||
cookies:
|
||||
- name: OAID
|
||||
value: d8ebf7f9508511d8ba7307e3cb36f03c
|
||||
path: /
|
||||
- domain: www.furaffinity.net
|
||||
cookies:
|
||||
- name: sz
|
||||
value: 1905x976
|
||||
path: /
|
||||
|
||||
testcookies: &testcookies
|
||||
- domain: .furaffinity.net
|
||||
cookies:
|
||||
- name: a
|
||||
value: avalue
|
||||
path: /
|
||||
- name: b
|
||||
value: bvalue
|
||||
path: /
|
||||
- domain: rv.furaffinity.net
|
||||
cookies:
|
||||
- name: OAID
|
||||
value: oaidvalue
|
||||
path: /
|
||||
|
||||
development:
|
||||
- domain: .furaffinity.net
|
||||
cookies:
|
||||
- name: a
|
||||
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
|
||||
path: /
|
||||
- name: b
|
||||
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
|
||||
path: /
|
||||
- domain: rv.furaffinity.net
|
||||
cookies:
|
||||
- name: OAID
|
||||
value: d4247fd1b8b755a4064f2e030dec919f
|
||||
path: /
|
||||
direct: *ddwhatnow
|
||||
proxy-1: *vipvillageworker
|
||||
|
||||
production:
|
||||
- domain: .furaffinity.net
|
||||
cookies:
|
||||
- name: a
|
||||
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
|
||||
path: /
|
||||
- name: b
|
||||
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
|
||||
path: /
|
||||
- domain: rv.furaffinity.net
|
||||
cookies:
|
||||
- name: OAID
|
||||
value: d4247fd1b8b755a4064f2e030dec919f
|
||||
path: /
|
||||
direct: *ddwhatnow
|
||||
proxy-1: *vipvillageworker
|
||||
|
||||
test:
|
||||
- domain: .furaffinity.net
|
||||
cookies:
|
||||
- name: a
|
||||
value: 5a1b4c3d-7b51-41e1-a00a-88420d7d0d3e
|
||||
path: /
|
||||
- name: b
|
||||
value: a29dec05-d47b-4f51-a9d8-5a607d12e482
|
||||
path: /
|
||||
- domain: rv.furaffinity.net
|
||||
cookies:
|
||||
- name: OAID
|
||||
value: d4247fd1b8b755a4064f2e030dec919f
|
||||
path: /
|
||||
direct: *testcookies
|
||||
proxy-1: *testcookies
|
||||
|
||||
@@ -10,10 +10,13 @@ end
|
||||
# This runs in the worker processes after it has been forked
|
||||
on_worker_boot do |worker_info|
|
||||
ActiveJob::Base.logger = Logger.new("/dev/null")
|
||||
Delayed::Worker.logger = Logger.new("/dev/null")
|
||||
Delayed::Worker.logger = Logger.new(STDOUT)
|
||||
Delayed::Worker.logger.level = :error
|
||||
Rails.logger = Logger.new(STDOUT)
|
||||
|
||||
puts "Worker #{Process.pid} started"
|
||||
sleep rand(1..5)
|
||||
|
||||
# Reconnect to the database
|
||||
ActiveRecord::Base.establish_connection
|
||||
end
|
||||
@@ -34,7 +37,7 @@ end
|
||||
[:default, 1],
|
||||
[:fa_user_page, 2],
|
||||
[:fa_user_gallery, 1],
|
||||
[:fa_post, 5],
|
||||
[:fa_post, 4],
|
||||
].each do |queue, workers|
|
||||
worker_group(queue) do |g|
|
||||
g.read_ahead = 8
|
||||
|
||||
@@ -10,7 +10,8 @@ end
|
||||
# This runs in the worker processes after it has been forked
|
||||
on_worker_boot do |worker_info|
|
||||
ActiveJob::Base.logger = Logger.new("/dev/null")
|
||||
Delayed::Worker.logger = Logger.new("/dev/null")
|
||||
Delayed::Worker.logger = Logger.new(STDOUT)
|
||||
Delayed::Worker.logger.level = :error
|
||||
Rails.logger = Logger.new(STDOUT)
|
||||
|
||||
puts "Worker #{Process.pid} started"
|
||||
|
||||
@@ -10,7 +10,8 @@ end
|
||||
# This runs in the worker processes after it has been forked
|
||||
on_worker_boot do |worker_info|
|
||||
ActiveJob::Base.logger = Logger.new("/dev/null")
|
||||
Delayed::Worker.logger = Logger.new("/dev/null")
|
||||
Delayed::Worker.logger = Logger.new(STDOUT)
|
||||
Delayed::Worker.logger.level = :error
|
||||
Rails.logger = Logger.new(STDOUT)
|
||||
|
||||
puts "Worker #{Process.pid} started"
|
||||
@@ -33,7 +34,7 @@ end
|
||||
worker_group(:static_file) do |g|
|
||||
g.read_ahead = 1
|
||||
g.sleep_delay = 5
|
||||
g.workers = 8
|
||||
g.workers = 2
|
||||
g.queues = ["static_file"]
|
||||
end
|
||||
|
||||
|
||||
rake/e621.rake (new file, 24 lines)
@@ -0,0 +1,24 @@
namespace :e621 do
  desc "import legacy e621 posts"
  task :import_legacy => :environment do |t, args|
    batch_size = args[:batch_size]&.to_i || ENV["batch_size"]&.to_i
    forks = args[:forks]&.to_i || ENV["forks"]&.to_i
    start_at = args[:start_at]&.to_i || ENV["start_at"]&.to_i

    LegacyImport::E621LegacyPostImporter.
      new(batch_size: batch_size, forks: forks, start_at: start_at).
      run
  end

  desc "import e621 data from csv"
  task :import_csv => :environment do |t, args|
    batch_size = ENV["batch_size"]&.to_i
    forks = ENV["forks"]&.to_i
    start_at = ENV["start_at"]&.to_i
    posts_csv_path = ENV["posts_csv_path"] || raise("must supply `posts_csv_path`")

    LegacyImport::E621CsvPostImporter.
      new(batch_size: batch_size, forks: forks, start_at: start_at).
      run
  end
end
|
||||
rake/fa.rake (102 lines changed)
@@ -115,4 +115,106 @@ namespace :fa do
|
||||
})
|
||||
end
|
||||
end
|
||||
|
||||
# treat this like a bomb
|
||||
# desc "delete / undo changes made by buggy proxy responses"
|
||||
# task :fix_buggy_posts => [:set_logger_stdout, :environment] do
|
||||
# raise "Already ran, would destroy everything if it ran again"
|
||||
# # log_entries = HttpLogEntry.where("id > ?", 54040076).to_a
|
||||
|
||||
# earliest_created_at = log_entries.map(&:created_at).min
|
||||
# puts "earliest: #{earliest_created_at}"
|
||||
|
||||
# users_to_revert = Set.new
|
||||
# posts_to_revert = Set.new
|
||||
# files_to_revert = Set.new
|
||||
|
||||
# log_entries.each do |log_entry|
|
||||
# uri = log_entry.uri
|
||||
# # puts "[id #{log_entry.id} / uri #{uri}]"
|
||||
|
||||
# case uri.host
|
||||
# when "ipinfo.io"
|
||||
# puts "ipinfo, do nothing"
|
||||
# when "d.furaffinity.net"
|
||||
# # puts "fix static file #{uri.path.bold}"
|
||||
# files_to_revert.add(uri.to_s)
|
||||
# post = Domain::Fa::Post.find_by(file_id: log_entry.id)
|
||||
# posts_to_revert.add(post) if post
|
||||
# when "www.furaffinity.net"
|
||||
# if uri.path.start_with?("/user/")
|
||||
# url_name = uri.path.split("/")[2]
|
||||
# user = Domain::Fa::User.find_by(url_name: url_name) || raise
|
||||
# # puts "fix fa user #{url_name.bold}"
|
||||
# users_to_revert.add(user)
|
||||
# elsif uri.path.start_with?("/view/")
|
||||
# fa_id = uri.path.split("/")[2]
|
||||
# post = Domain::Fa::Post.find_by(fa_id: fa_id) || raise
|
||||
# # puts "fix fa post #{fa_id.to_s.bold}"
|
||||
# posts_to_revert.add(post)
|
||||
# elsif uri.path.start_with?("/gallery/") || uri.path.start_with?("/scraps/")
|
||||
# url_name = uri.path.split("/")[2]
|
||||
# user = Domain::Fa::User.find_by(url_name: url_name) || raise
|
||||
# # puts "fix fa user gallery #{url_name.bold}"
|
||||
# users_to_revert.add(user)
|
||||
# elsif uri.path == "/"
|
||||
# next
|
||||
# else
|
||||
# raise("unsupported path: #{uri.path}")
|
||||
# end
|
||||
# else raise("unsupported host: #{uri.host}")
|
||||
# end
|
||||
# end
|
||||
|
||||
# puts "entries: #{log_entries.size}"
|
||||
# puts "users: #{users_to_revert.size}"
|
||||
# puts "posts: #{posts_to_revert.size}"
|
||||
# puts "files: #{files_to_revert.size}"
|
||||
|
||||
# # puts "----- users -----"
|
||||
# # users_to_revert.each do |user|
|
||||
# # puts user.to_json
|
||||
# # end
|
||||
|
||||
# # puts "----- posts -----"
|
||||
# # posts_to_revert.each do |post|
|
||||
# # puts post.to_json
|
||||
# # end
|
||||
|
||||
# reset_detail = ({ :info => "reverted on #{Time.now}" })
|
||||
# job_params = ({ :queue => "manual", :priority => -20 })
|
||||
|
||||
# ReduxApplicationRecord.transaction do
|
||||
# users_to_revert.each do |user|
|
||||
# user.state = :ok
|
||||
# user.state_detail = reset_detail
|
||||
# user.log_entry_detail = reset_detail
|
||||
# user.scanned_page_at = nil
|
||||
# user.scanned_gallery_at = nil
|
||||
# user.save!
|
||||
|
||||
# Domain::Fa::Job::UserGalleryJob.set(job_params).perform_later({
|
||||
# user: user,
|
||||
# })
|
||||
# Domain::Fa::Job::UserPageJob.set(job_params).perform_later({
|
||||
# user: user,
|
||||
# })
|
||||
# end
|
||||
|
||||
# posts_to_revert.each do |post|
|
||||
# post.state = :ok
|
||||
# post.state_detail = reset_detail
|
||||
# post.log_entry_detail = reset_detail
|
||||
# post.file = nil
|
||||
# post.file_url_str = nil
|
||||
# post.save!
|
||||
|
||||
# Domain::Fa::Job::ScanPostJob.set(job_params).perform_later({
|
||||
# post: post,
|
||||
# })
|
||||
# end
|
||||
|
||||
# log_entries.reverse.each(&:destroy!)
|
||||
# end
|
||||
# end
|
||||
end
|
||||
|
||||
@@ -1,19 +1,15 @@
|
||||
class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
|
||||
test "creates an http log entry" do
|
||||
client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_curl_easy(
|
||||
client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_http_performer(
|
||||
"https://www.furaffinity.net/",
|
||||
request_headers: Hash,
|
||||
response_code: 200,
|
||||
body_str: "a plain text body",
|
||||
response_body: "a plain text body",
|
||||
))
|
||||
response = client.get("https://www.furaffinity.net/")
|
||||
|
||||
assert_equal 200, response.status_code
|
||||
assert_equal "a plain text body", response.body
|
||||
|
||||
expected = {}
|
||||
assert_equal expected, response.request_headers
|
||||
|
||||
expected = { "resp" => "respheader", "content-type" => "text/plain" }
|
||||
assert_equal expected, response.response_headers
|
||||
refute_nil response.log_entry
|
||||
end
|
||||
end
|
||||
|
||||
@@ -9,34 +9,37 @@ class Scraper::BaseHttpClientTest < ActiveSupport::TestCase
|
||||
end
|
||||
end
|
||||
|
||||
test "can parse header string" do
|
||||
headers = Scraper::BaseHttpClient.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
|
||||
assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
|
||||
assert_equal ["content-encoding", "gzip"], headers[8]
|
||||
end
|
||||
|
||||
test "throws on unallowed domain" do
|
||||
client = TestHttpClient.new(TestUtil.mock_curl_easy(""))
|
||||
client = TestHttpClient.new(TestUtil.mock_http_performer(""))
|
||||
assert_raises(Scraper::BaseHttpClient::InvalidURLError) do
|
||||
client.get("https://foobar.com")
|
||||
end
|
||||
end
|
||||
|
||||
test "creates an http log entry" do
|
||||
body_str = "the response body " + TestUtil.random_string(16)
|
||||
client = TestHttpClient.new(TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str))
|
||||
# note the lack of trailing slash - http client should set path to '/'
|
||||
client = TestHttpClient.new(TestUtil.mock_http_performer(
|
||||
"https://www.example.com/",
|
||||
request_headers: { "cookie" => "" },
|
||||
response_code: 200,
|
||||
response_time_ms: 15,
|
||||
response_headers: { "content-type" => "text/plain" },
|
||||
response_body: "the response " + TestUtil.random_string(16),
|
||||
))
|
||||
|
||||
# note the lack of trailing slash - http client should set path to '/'
|
||||
response = client.get("https://www.example.com")
|
||||
|
||||
assert_equal 200, response.status_code
|
||||
assert_match /the response /, response.body
|
||||
|
||||
log_entry = response.log_entry
|
||||
assert log_entry.persisted?
|
||||
assert_equal "text/plain", log_entry.content_type
|
||||
assert_in_delta Time.now, log_entry.requested_at, 50
|
||||
assert_equal 15, log_entry.response_time_ms
|
||||
assert_equal "get", log_entry.verb
|
||||
assert_equal 200, log_entry.status_code
|
||||
assert_equal "text/plain", log_entry.response.content_type
|
||||
assert_equal body_str, log_entry.response.contents
|
||||
assert_match /the response/, log_entry.response.contents
|
||||
end
|
||||
end
|
||||
|
||||
test/lib/scraper/curl_http_performer_test.rb (new file, 20 lines)
@@ -0,0 +1,20 @@
class Scraper::CurlHttpPerformerTest < ActiveSupport::TestCase
  test "can parse header string" do
    headers = Scraper::CurlHttpPerformer.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
    assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
    assert_equal ["content-encoding", "gzip"], headers[8]
  end

  test "correctly wraps curl" do
    body_str = "the response body " + TestUtil.random_string(16)
    client = Scraper::CurlHttpPerformer.new(
      TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str)
    )

    response = client.get("https://www.example.com/", {})
    assert_equal 200, response.response_code
    assert_equal ({ "content-type" => "text/plain", "resp" => "respheader" }),
                 response.response_headers
    assert_equal 0, response.response_time_ms
  end
end
|
||||
@@ -14,6 +14,7 @@ class HttpLogEntryTest < ActiveSupport::TestCase
|
||||
uri_host: "example.com",
|
||||
uri_path: "/foo/bar",
|
||||
status_code: 200,
|
||||
performed_by: "direct",
|
||||
}
|
||||
end
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ module TestUtil
|
||||
entry = ::HttpLogEntry.new({
|
||||
uri: "http://example.com/",
|
||||
verb: :get,
|
||||
performed_by: "direct",
|
||||
status_code: 200,
|
||||
response_time_ms: rand(20..100),
|
||||
request_headers: create_http_headers,
|
||||
@@ -61,8 +62,11 @@ module TestUtil
|
||||
)
|
||||
mock = MiniTest::Mock.new
|
||||
|
||||
def mock.singleton_method_added(_); {}; end
|
||||
mock.expect(:url=, nil, [expected_url])
|
||||
mock.expect(:headers, {})
|
||||
def mock.headers; {}; end
|
||||
def mock.headers=(_); {}; end
|
||||
|
||||
mock.expect(:perform, nil)
|
||||
mock.expect(:response_code, response_code)
|
||||
mock.expect(:body_str, body_str)
|
||||
@@ -71,4 +75,25 @@ module TestUtil
|
||||
|
||||
mock
|
||||
end
|
||||
|
||||
def self.mock_http_performer(
|
||||
expected_url,
|
||||
request_headers: {},
|
||||
response_code: 200,
|
||||
response_time_ms: 15,
|
||||
response_headers: { "content-type" => "text/plain" },
|
||||
response_body: "http body"
|
||||
)
|
||||
mock = MiniTest::Mock.new
|
||||
mock.expect(:is_a?, false, [String])
|
||||
mock.expect(:is_a?, false, [String])
|
||||
mock.expect(:name, "direct")
|
||||
mock.expect(:get, Scraper::HttpPerformer::Response.new(
|
||||
response_code,
|
||||
response_headers,
|
||||
response_time_ms,
|
||||
response_body
|
||||
), [expected_url, request_headers])
|
||||
mock
|
||||
end
|
||||
end