add good_job, add spec for browse page job
7
Gemfile
@@ -99,15 +99,14 @@ gem "nokogiri"
gem "http-cookie"
gem "curb"
gem "kaminari"
gem "delayed_job_active_record"
# gem "delayed-web"
gem "delayed_job_web"
gem "colorize"
gem "daemons"
gem "delayed_job_worker_pool"
gem "ripcord"
gem "influxdb-client"
gem "discard"

gem "good_job"

# gem 'cli-ui'
# gem "paper_trail"
# gem "paper_trail-hashdiff"
42
Gemfile.lock
@@ -109,18 +109,6 @@ GEM
      irb (>= 1.5.0)
      reline (>= 0.3.1)
    debug_inspector (1.1.0)
    delayed_job (4.1.11)
      activesupport (>= 3.0, < 8.0)
    delayed_job_active_record (4.1.7)
      activerecord (>= 3.0, < 8.0)
      delayed_job (>= 3.0, < 5)
    delayed_job_web (1.4.4)
      activerecord (> 3.0.0)
      delayed_job (> 2.0.3)
      rack-protection (>= 1.5.5)
      sinatra (>= 1.4.4)
    delayed_job_worker_pool (1.0.0)
      delayed_job (>= 3.0, < 4.2)
    diff-lcs (1.5.0)
    diffy (3.4.2)
    discard (1.2.1)
@@ -128,9 +116,22 @@ GEM
    domain_name (0.5.20190701)
      unf (>= 0.0.5, < 1.0.0)
    erubi (1.12.0)
    et-orbi (1.2.7)
      tzinfo
    ffi (1.15.5)
    fugit (1.8.1)
      et-orbi (~> 1, >= 1.2.7)
      raabro (~> 1.4)
    globalid (1.1.0)
      activesupport (>= 5.0)
    good_job (3.14.2)
      activejob (>= 6.0.0)
      activerecord (>= 6.0.0)
      concurrent-ruby (>= 1.0.2)
      fugit (>= 1.1)
      railties (>= 6.0.0)
      thor (>= 0.14.1)
      webrick (>= 1.3)
    http-cookie (1.0.5)
      domain_name (~> 0.5)
    i18n (1.12.0)
@@ -175,8 +176,6 @@ GEM
    mini_portile2 (2.8.1)
    minitest (5.17.0)
    msgpack (1.6.0)
    mustermann (3.0.0)
      ruby2_keywords (~> 0.0.1)
    net-imap (0.3.4)
      date
      net-protocol
@@ -200,12 +199,11 @@ GEM
    public_suffix (5.0.1)
    puma (5.6.5)
      nio4r (~> 2.0)
    raabro (1.4.0)
    racc (1.6.2)
    rack (2.2.6.2)
    rack-mini-profiler (3.0.0)
      rack (>= 1.2.0)
    rack-protection (3.0.5)
      rack
    rack-test (2.0.2)
      rack (>= 1.3)
    rails (7.0.4.2)
@@ -263,18 +261,12 @@ GEM
    ruby-prof (1.4.5)
    ruby-prof-speedscope (0.3.0)
      ruby-prof (~> 1.0)
    ruby2_keywords (0.0.5)
    rubyzip (2.3.2)
    rufo (0.15.1)
    selenium-webdriver (4.8.0)
      rexml (~> 3.2, >= 3.2.5)
      rubyzip (>= 1.2.2, < 3.0)
      websocket (~> 1.0)
    sinatra (3.0.5)
      mustermann (~> 3.0)
      rack (~> 2.2, >= 2.2.4)
      rack-protection (= 3.0.5)
      tilt (~> 2.0)
    sprockets (4.2.0)
      concurrent-ruby (~> 1.0)
      rack (>= 2.2.4, < 4)
@@ -288,7 +280,6 @@ GEM
      railties (>= 6.0.0)
    table_print (1.5.7)
    thor (1.2.1)
    tilt (2.1.0)
    timeout (0.3.1)
    turbo-rails (1.3.3)
      actionpack (>= 6.0.0)
@@ -308,6 +299,7 @@ GEM
      nokogiri (~> 1.6)
      rubyzip (>= 1.3.0)
      selenium-webdriver (~> 4.0)
    webrick (1.8.1)
    websocket (1.2.9)
    websocket-driver (0.7.5)
      websocket-extensions (>= 0.1.0)
@@ -330,11 +322,9 @@ DEPENDENCIES
  curses
  daemons
  debug
  delayed_job_active_record
  delayed_job_web
  delayed_job_worker_pool
  diffy
  discard
  good_job
  http-cookie
  importmap-rails
  influxdb-client
@@ -4,4 +4,10 @@ class ApplicationJob < ActiveJob::Base
  # Most jobs are safe to ignore if the underlying records are no longer available
  # discard_on ActiveJob::DeserializationError

  def self.ignore_signature_args(*args)
    @ignore_signature_args ||= []
    @ignore_signature_args += (args || [])
    @ignore_signature_args
  end
end
@@ -1,14 +1,8 @@
class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::FaJobBase
  queue_as :fa_browse_page
  ignore_signature_args [:caused_by_entry]
  ignore_signature_args :caused_by_entry

  def perform(args)
    if self.class.ran_recently?
      logger.warn("skipping browse page job, ran too recently")
      return
    end
    self.class.mark_ran!

    @caused_by_entry = args[:caused_by_entry]
    @first_browse_page_entry = nil
@@ -25,23 +19,13 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::FaJobBase
    logger.info("finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages")
  end

  def self.ran_recently?
    @@last_ran_at ||= nil
    return false unless @@last_ran_at
    @@last_ran_at > 1.minute.ago
  end

  def self.mark_ran!
    @@last_ran_at = Time.now
  end

  private

  def scan_browse_page
    if @page_number == 1
      url = "https://www.furaffinity.net/browse/"
    else
      url = "https://www.furaffinity.net/browse/#{@page_number}"
      url = "https://www.furaffinity.net/browse/#{@page_number}/"
    end

    response = http_client.get(url, caused_by_entry: @first_browse_page_entry || @caused_by_entry)
@@ -58,10 +42,6 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::FaJobBase
      page_desc: "Browse@#{@page_number}",
    )

    if listing_page_stats.total_seen == 0
      fatal_error("0 posts on browse page - log entry #{log_entry.id.to_s.bold}")
    end

    @total_num_new_posts_seen += listing_page_stats.new_seen
    @total_num_posts_seen += listing_page_stats.total_seen
    listing_page_stats.new_seen > 0
@@ -1,18 +1,12 @@
class Domain::Fa::Job::FaJobBase < Scraper::JobBase
  discard_on ActiveJob::DeserializationError

  def self.build_http_client
    @@fa_base_http_client ||= begin
      Scraper::FaHttpClient.new
    end
  def self.http_factory_method
    :new_fa_http_client
  end

  protected

  def http_client
    @http_client ||= self.class.build_http_client
  end

  def find_or_intitialize_user_from_args(args, caused_by_entry: nil)
    args[:user] || begin
      url_name = args[:url_name]&.downcase
@@ -1,6 +1,6 @@
class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::FaJobBase
  queue_as :static_file
  ignore_signature_args [:caused_by_entry]
  ignore_signature_args :caused_by_entry

  def perform(args)
    logger.level = :warn
@@ -1,6 +1,6 @@
class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::FaJobBase
  queue_as :fa_post
  ignore_signature_args [:caused_by_entry]
  ignore_signature_args :caused_by_entry

  def perform(args)
    @post = args[:post] || begin
@@ -1,6 +1,6 @@
class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::FaJobBase
  queue_as :fa_user_gallery
  ignore_signature_args [:caused_by_entry]
  ignore_signature_args :caused_by_entry

  MAX_PAGE_NUMBER = 350
@@ -1,6 +1,6 @@
class Domain::Fa::Job::UserPageJob < Domain::Fa::Job::FaJobBase
  queue_as :fa_user_page
  ignore_signature_args [:caused_by_entry]
  ignore_signature_args :caused_by_entry

  def perform(args)
    @caused_by_entry = args[:caused_by_entry]
@@ -1,6 +1,6 @@
class Domain::Twitter::Job::MediaJob < Domain::Twitter::Job::TwitterJobBase
  queue_as :static_file
  ignore_signature_args [:caused_by_entry]
  ignore_signature_args :caused_by_entry

  def perform(args)
    logger.level = :warn
@@ -1,8 +1,6 @@
class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::TwitterJobBase
  GDL = Scraper::GalleryDlClient

  queue_as :twitter_timeline_tweets
  ignore_signature_args [:caused_by_entry]

  def perform(args)
    @name = args[:name]
@@ -1,5 +1,39 @@
class Scraper::JobBase < ApplicationJob
  include GoodJob::ActiveJobExtensions::Concurrency
  include HasColorLogger
  attr_reader :http_client

  def initialize(...)
    @http_client = Scraper::HttpFactory.send(self.class.http_factory_method)
    super(...)
  end

  def self.http_factory_method
    raise NotImplementedError.new("implement in #{self.name}")
  end

  good_job_control_concurrency_with(
    total_limit: 1,
    key: proc do
      raise("too many arguments") if arguments.size > 1
      first_argument = arguments || []
      first_argument = first_argument[0] || {}

      ignore_signature_args = self.class.ignore_signature_args
      ignore_signature_args << :_aj_symbol_keys
      sig_arguments = first_argument.reject do |key, value|
        ignore_signature_args.include?(key.to_sym)
      end.to_h

      sig = []
      sig << self.class.name || raise
      sig << self.queue_name || "*"
      sig << self.priority || "*"
      sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
      sig = sig.join("|")
      sig
    end,
  )

  around_perform do |job, block|
    level = job.logger.level
@@ -8,12 +42,6 @@ class Scraper::JobBase < ApplicationJob
    job.logger.level = level
  end

  def self.ignore_signature_args(args = nil)
    @ignore_signature_args ||= []
    @ignore_signature_args = args if args
    @ignore_signature_args
  end

  def write_point(name, tags: {}, fields: {})
    Metrics::Reporter.singleton.write_point(
      name,
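A minimal sketch of how the concurrency key above is meant to deduplicate enqueues; the job class, queue, and arguments here are hypothetical and only illustrate the mechanism shown in the diff:

# Hypothetical subclass; any job inheriting the base class behaves the same way.
class Domain::Fa::Job::ExampleJob < Domain::Fa::Job::FaJobBase
  queue_as :fa_example
  ignore_signature_args :caused_by_entry

  def perform(args); end
end

# :caused_by_entry is excluded from the signature hash, so both calls produce the
# same concurrency key; with total_limit: 1 only one good_jobs row is enqueued.
Domain::Fa::Job::ExampleJob.perform_later({ url_name: "someone", caused_by_entry: 1 })
Domain::Fa::Job::ExampleJob.perform_later({ url_name: "someone", caused_by_entry: 2 })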
@@ -1,75 +0,0 @@
require "delayed_job"
require "active_record/transactions"

# https://github.com/rails/rails/pull/46746/files
module ActiveRecord::Transactions
  def trigger_transactional_callbacks? # :nodoc:
    (@_new_record_before_last_commit && !persisted? && has_transactional_callbacks?) ||
      (@_new_record_before_last_commit || _trigger_update_callback) && persisted? ||
      _trigger_destroy_callback && destroyed?
  end
end

class DelayedDuplicatePreventionPlugin < Delayed::Plugin
  module SignatureConcern
    extend ActiveSupport::Concern

    included do
      before_validation :set_signature

      before_save do
        prevent_duplicate
      end

      after_rollback do
        update_existing
      end

      def prevent_duplicate
        @existing = self.class.find_by(signature: signature)
        return true if @existing.nil?
        return true if @existing == self
        self.errors.add(:signature, :duplicate, message: "Signature is already in use by #{@existing.id}")
        throw :abort
      end

      def update_existing
        return unless @existing
        return unless self.priority
        if @existing.priority
          @existing.priority = [@existing.priority, self.priority].min
        else
          @existing.priority = self.priority
        end
        @existing.save if @existing.changed?
      end
    end

    def set_signature
      self.signature = generate_signature
      self.args = self.payload_object.job_data["arguments"]
    end

    private

    def generate_signature
      job_data = payload_object.job_data
      ignore_signature_args = job_data["job_class"].
        constantize.
        ignore_signature_args
      ignore_signature_args << :_aj_symbol_keys

      arguments = job_data["arguments"]&.first
      arguments = arguments.reject do |key, value|
        ignore_signature_args.include?(key.to_sym)
      end.to_h if arguments

      sig = []
      sig << job_data["job_class"]
      sig << job_data["queue_name"] || "*"
      sig << Digest::SHA256.hexdigest(arguments.inspect)[0...16]
      sig = sig.join("|")
      sig
    end
  end
end
@@ -1,19 +1,15 @@
class Scraper::FaHttpClient < Scraper::BaseHttpClient
class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig
  DEFAULT_ALLOWED_DOMAINS = [
    "*.furaffinity.net",
    "*.facdn.net",
    "ipinfo.io",
  ]

  def initialize(http_performer_or_proxy = nil)
    @cookies = Rails.application.config.x.cookies.fa[
      Rails.application.config.x.proxy_name.to_s
    ]
    super(http_performer_or_proxy)
  end

  def cookies
    @cookies
    proxy_name = Rails.application.config.x.proxy_name
    Rails.application.config.x.cookies.fa[
      proxy_name.to_s
    ] || raise("no fa cookies defined for #{proxy_name}")
  end

  def ratelimit
@@ -27,4 +23,8 @@ class Scraper::FaHttpClient < Scraper::BaseHttpClient
  def allowed_domains
    DEFAULT_ALLOWED_DOMAINS
  end

  def redirect_limit
    4
  end
end
@@ -1,5 +1,6 @@
class Scraper::BaseHttpClient
class Scraper::HttpClient
  include HasColorLogger
  attr_reader :config

  Response = Struct.new(
    :status_code,
@@ -9,24 +10,16 @@ class Scraper::BaseHttpClient
  class InvalidURLError < ArgumentError; end

  def initialize(http_performer = nil)
    cookies || raise("Subclass must define non-nil #cookies")
    allowed_domains || raise("Subclass must define non-nil #allowed_domains")

  # @cookie_jar
  def initialize(config, http_performer)
    @config = config
    @http_performer = http_performer
    @domain_last_requested_at = {}
    init_cookie_jar
    # TODO - populate AdaptiveCache with domains this client cares about
    @max_cache_size = 8
    @blob_entry_cache = Hash.new do |hash, key|
      hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
    end

    @http_performer = http_performer || begin
      proxy_config = Rails.application.config.x.proxy
      Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
    end

    @domain_last_requested_at = {}
    logger.level = :info
  end
@@ -34,24 +27,6 @@ class Scraper::BaseHttpClient
    @http_performer.close!
  end

  def cookies
    raise NotImplementedError
  end

  def allowed_domains
    raise NotImplementedError
  end

  def redirect_limit
    6
  end

  def ratelimit
    [
      ["*", :none],
    ]
  end

  def get(url, caused_by_entry: nil)
    get_impl(url, caused_by_entry)
  end
@@ -69,10 +44,10 @@ class Scraper::BaseHttpClient
    url = uri.to_s

    unless allowed_domain?(uri.host)
      raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{allowed_domains}")
      raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}")
    end

    ratelimit_conf = ratelimit.find do |conf|
    ratelimit_conf = config.ratelimit.find do |conf|
      File.fnmatch? conf[0], uri.host
    end || ["*", :none]
    ratelimit_delay = ratelimit_conf[1]
@@ -197,14 +172,14 @@ class Scraper::BaseHttpClient
  end

  def allowed_domain?(domain)
    allowed_domains.any? do |domain_to_check|
    config.allowed_domains.any? do |domain_to_check|
      File.fnmatch? domain_to_check, domain
    end
  end

  def init_cookie_jar
    @cookie_jar = HTTP::CookieJar.new
    self.cookies.each do |cookie_domain_conf|
    self.config.cookies.each do |cookie_domain_conf|
      domain = cookie_domain_conf[:domain]
      cookie_domain_conf[:cookies].each do |cookie_key_conf|
        conf = {
23
app/lib/scraper/http_client_config.rb
Normal file
@@ -0,0 +1,23 @@
class Scraper::HttpClientConfig
  def cookies
    not_implemented!
  end

  def ratelimit
    not_implemented!
  end

  def allowed_domains
    not_implemented!
  end

  def redirect_limit
    not_implemented!
  end

  private

  def not_implemented!
    raise NotImplementedError.new("implement in #{self.class.name}")
  end
end
30
app/lib/scraper/http_factory.rb
Normal file
@@ -0,0 +1,30 @@
class Scraper::HttpFactory
  def self.http_client_mock=(mock)
    @http_client_mock = mock
  end

  def self.new_fa_http_client
    if Rails.env.test?
      @http_client_mock || raise("no mock set yet")
    else
      get_or_create_tvar_http_client(:fa, Scraper::FaHttpClientConfig.new)
    end
  end

  def self.new_gallery_dl_client
  end

  def self.get_or_create_tvar_http_client(key, config)
    attr_name = :"#{self.class.name.underscore}_#{key}_klass_http_client_tvar"
    thread = Thread.current
    if thread.thread_variable?(attr_name)
      thread.thread_variable_get(attr_name)
    else
      proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
      performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
      http_client = Scraper::HttpClient.new(config, performer)
      thread.thread_variable_set(attr_name, http_client)
      http_client
    end
  end
end
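A small usage sketch of the factory's per-thread memoization, assuming a non-test environment where config.x.proxy is set (in test the stubbed http_client_mock is returned instead):

client_a = Scraper::HttpFactory.new_fa_http_client
client_b = Scraper::HttpFactory.new_fa_http_client
client_a.equal?(client_b) # => true, the same thread reuses its thread-variable client

Thread.new do
  # A different worker thread builds its own performer and HttpClient instance.
  Scraper::HttpFactory.new_fa_http_client.equal?(client_a) # => false
end.join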
@@ -11,8 +11,8 @@ class Scraper::HttpPerformer
  )

  def initialize(name, host)
    name || raise("must provdie name")
    host || raise("must provdie host")
    name || raise("must provide name")
    host || raise("must provide host")
    logger.info("#{Thread.name.to_s.light_black} - build performer #{name.to_s.green} - #{host.to_s.green.bold}")
    @name = name
    @client = Ripcord::Client.new(host)
@@ -1,5 +0,0 @@
#!/usr/bin/env ruby

require File.expand_path(File.join(File.dirname(__FILE__), '..', 'config', 'environment'))
require 'delayed/command'
Delayed::Command.new(ARGV).daemonize
@@ -13,7 +13,15 @@ module ReduxScraper
    config.load_defaults 7.0
    config.active_record.legacy_connection_handling = false
    config.autoload_paths << config.root.join("app/lib")
    config.active_job.queue_adapter = :delayed_job

    # GoodJob configuration - applies to all environments (including test)
    config.active_job.queue_adapter = :good_job
    config.good_job.inline_execution_respects_schedule = true
    config.good_job.execution_mode = :async
    config.good_job.active_record_parent_class = "ReduxApplicationRecord"
    config.good_job.preserve_job_records = false
    config.good_job.retry_on_unhandled_error = true
    config.good_job.smaller_number_is_higher_priority = true

    # Configuration for the application, engines, and railties goes here.
    #
@@ -88,7 +88,7 @@ development:
  dedipath-1: *blazeandwish
  serverhost-1: *cottoniq

production:
async_worker:
  direct: *ddwhatnow
  proxy-1: *vipvillageworker
  dedipath-1: *blazeandwish
@@ -70,3 +70,11 @@ production:
    <<: *redux_prod
  legacy:
    <<: *legacy_prod

async_worker:
  redux:
    <<: *redux_prod
    pool: 16
  legacy:
    <<: *legacy_prod
    pool: 16
@@ -1,4 +0,0 @@
require "delayed_duplicate_prevention_plugin"

Delayed::Backend::ActiveRecord::Job.send(:include, DelayedDuplicatePreventionPlugin::SignatureConcern)
Delayed::Worker.plugins << DelayedDuplicatePreventionPlugin
@@ -1,4 +1,7 @@
Rails.application.routes.draw do
  # Define your application routes per the DSL in https://guides.rubyonrails.org/routing.html
  mount GoodJob::Engine => "jobs"

  namespace :api do
    namespace :fa do
      post :enqueue_objects, to: "/domain/fa/api#enqueue_objects"
@@ -18,9 +21,6 @@ Rails.application.routes.draw do
      end
    end
  end
  # Define your application routes per the DSL in https://guides.rubyonrails.org/routing.html
  # mount Delayed::Web::Engine, at: "/jobs"
  match "/jobs" => DelayedJobWeb, :anchor => false, :via => [:get, :post]

  # Defines the root path route ("/")
  # root "articles#index"
66
db/redux_migrate/20230325124842_create_good_jobs.rb
Normal file
@@ -0,0 +1,66 @@
# frozen_string_literal: true
class CreateGoodJobs < ActiveRecord::Migration[7.0]
  def change
    enable_extension "pgcrypto"

    create_table :good_jobs, id: :uuid do |t|
      t.text :queue_name
      t.integer :priority
      t.jsonb :serialized_params
      t.datetime :scheduled_at
      t.datetime :performed_at
      t.datetime :finished_at
      t.text :error

      t.timestamps

      t.uuid :active_job_id
      t.text :concurrency_key
      t.text :cron_key
      t.uuid :retried_good_job_id
      t.datetime :cron_at

      t.uuid :batch_id
      t.uuid :batch_callback_id
    end

    create_table :good_job_batches, id: :uuid do |t|
      t.timestamps
      t.text :description
      t.jsonb :serialized_properties
      t.text :on_finish
      t.text :on_success
      t.text :on_discard
      t.text :callback_queue_name
      t.integer :callback_priority
      t.datetime :enqueued_at
      t.datetime :discarded_at
      t.datetime :finished_at
    end

    create_table :good_job_processes, id: :uuid do |t|
      t.timestamps
      t.jsonb :state
    end

    create_table :good_job_settings, id: :uuid do |t|
      t.timestamps
      t.text :key
      t.jsonb :value
      t.index :key, unique: true
    end

    add_index :good_jobs, :scheduled_at, where: "(finished_at IS NULL)", name: "index_good_jobs_on_scheduled_at"
    add_index :good_jobs, [:queue_name, :scheduled_at], where: "(finished_at IS NULL)", name: :index_good_jobs_on_queue_name_and_scheduled_at
    add_index :good_jobs, [:active_job_id, :created_at], name: :index_good_jobs_on_active_job_id_and_created_at
    add_index :good_jobs, :concurrency_key, where: "(finished_at IS NULL)", name: :index_good_jobs_on_concurrency_key_when_unfinished
    add_index :good_jobs, [:cron_key, :created_at], name: :index_good_jobs_on_cron_key_and_created_at
    add_index :good_jobs, [:cron_key, :cron_at], name: :index_good_jobs_on_cron_key_and_cron_at, unique: true
    add_index :good_jobs, [:active_job_id], name: :index_good_jobs_on_active_job_id
    add_index :good_jobs, [:finished_at], where: "retried_good_job_id IS NULL AND finished_at IS NOT NULL", name: :index_good_jobs_jobs_on_finished_at
    add_index :good_jobs, [:priority, :created_at], order: { priority: "DESC NULLS LAST", created_at: :asc },
      where: "finished_at IS NULL", name: :index_good_jobs_jobs_on_priority_created_at_when_unfinished
    add_index :good_jobs, [:batch_id], where: "batch_id IS NOT NULL"
    add_index :good_jobs, [:batch_callback_id], where: "batch_callback_id IS NOT NULL"
  end
end
62
db/schema.rb
generated
@@ -10,10 +10,11 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema[7.0].define(version: 2023_03_08_044852) do
ActiveRecord::Schema[7.0].define(version: 2023_03_25_124842) do
  # These are extensions that must be enabled in order to support this database
  enable_extension "pg_stat_statements"
  enable_extension "pg_trgm"
  enable_extension "pgcrypto"
  enable_extension "plpgsql"

  create_table "blob_entries", id: false, force: :cascade do |t|
@@ -216,6 +217,65 @@ ActiveRecord::Schema[7.0].define(version: 2023_03_08_044852) do
    t.index ["key"], name: "index_flat_sst_entries_on_key", unique: true
  end

  create_table "good_job_batches", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
    t.text "description"
    t.jsonb "serialized_properties"
    t.text "on_finish"
    t.text "on_success"
    t.text "on_discard"
    t.text "callback_queue_name"
    t.integer "callback_priority"
    t.datetime "enqueued_at"
    t.datetime "discarded_at"
    t.datetime "finished_at"
  end

  create_table "good_job_processes", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
    t.jsonb "state"
  end

  create_table "good_job_settings", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
    t.text "key"
    t.jsonb "value"
    t.index ["key"], name: "index_good_job_settings_on_key", unique: true
  end

  create_table "good_jobs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
    t.text "queue_name"
    t.integer "priority"
    t.jsonb "serialized_params"
    t.datetime "scheduled_at"
    t.datetime "performed_at"
    t.datetime "finished_at"
    t.text "error"
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
    t.uuid "active_job_id"
    t.text "concurrency_key"
    t.text "cron_key"
    t.uuid "retried_good_job_id"
    t.datetime "cron_at"
    t.uuid "batch_id"
    t.uuid "batch_callback_id"
    t.index ["active_job_id", "created_at"], name: "index_good_jobs_on_active_job_id_and_created_at"
    t.index ["active_job_id"], name: "index_good_jobs_on_active_job_id"
    t.index ["batch_callback_id"], name: "index_good_jobs_on_batch_callback_id", where: "(batch_callback_id IS NOT NULL)"
    t.index ["batch_id"], name: "index_good_jobs_on_batch_id", where: "(batch_id IS NOT NULL)"
    t.index ["concurrency_key"], name: "index_good_jobs_on_concurrency_key_when_unfinished", where: "(finished_at IS NULL)"
    t.index ["cron_key", "created_at"], name: "index_good_jobs_on_cron_key_and_created_at"
    t.index ["cron_key", "cron_at"], name: "index_good_jobs_on_cron_key_and_cron_at", unique: true
    t.index ["finished_at"], name: "index_good_jobs_jobs_on_finished_at", where: "((retried_good_job_id IS NULL) AND (finished_at IS NOT NULL))"
    t.index ["priority", "created_at"], name: "index_good_jobs_jobs_on_priority_created_at_when_unfinished", order: { priority: "DESC NULLS LAST" }, where: "(finished_at IS NULL)"
    t.index ["queue_name", "scheduled_at"], name: "index_good_jobs_on_queue_name_and_scheduled_at", where: "(finished_at IS NULL)"
    t.index ["scheduled_at"], name: "index_good_jobs_on_scheduled_at", where: "(finished_at IS NULL)"
  end

  create_table "http_log_entries", force: :cascade do |t|
    t.string "uri_scheme", null: false
    t.string "uri_host", null: false
78
spec/jobs/domain/fa/job/browse_page_job_spec.rb
Normal file
@@ -0,0 +1,78 @@
require "rails_helper"

describe Domain::Fa::Job::BrowsePageJob do
  Subject = Domain::Fa::Job::BrowsePageJob

  before do
    @http_client_mock = instance_double("::Scraper::HttpClient")
    Scraper::HttpFactory.http_client_mock = @http_client_mock
  end

  it "enqueues one" do
    expect do
      Subject.perform_later({})
    end.to change { GoodJob::Job.count }.by(1)
  end

  it "does not enqueue more than one" do
    expect do
      Subject.perform_later({})
      Subject.perform_later({})
    end.to change { GoodJob::Job.count }.by(1)
  end

  it "works for a browse page with no submissions" do
    SpecUtil.init_http_client_mock(
      @http_client_mock, [
        {
          uri: "https://www.furaffinity.net/browse/",
          status_code: 200,
          content_type: "text/html",
          contents: SpecUtil.read_fixture_file("domain/fa/job/browse_page_no_submissions.html"),
          caused_by_entry: nil,
        },
      ]
    )

    Subject.new.perform({})
  end

  it "requests the next page if a submission is found" do
    SpecUtil.init_http_client_mock(
      @http_client_mock, [
        {
          uri: "https://www.furaffinity.net/browse/",
          status_code: 200,
          content_type: "text/html",
          contents: SpecUtil.read_fixture_file("domain/fa/job/browse_page_one_submission.html"),
          caused_by_entry: nil,
        },
        {
          uri: "https://www.furaffinity.net/browse/2/",
          status_code: 200,
          content_type: "text/html",
          contents: SpecUtil.read_fixture_file("domain/fa/job/browse_page_no_submissions.html"),
          caused_by_entry: 0,
        },
      ]
    )

    expect do
      Subject.new.perform({})
    end.to(
      change { Domain::Fa::Post.count }.by(1).and(
        change { Domain::Fa::User.count }.by(1)
      )
    )

    creator = Domain::Fa::User.find_by(url_name: "ruby69r")
    refute_nil creator
    expect(creator.name).to eq("Ruby_69r")

    post = Domain::Fa::Post.find_by(fa_id: 51509268)
    refute_nil post
    expect(post.state).to eq("ok")
    expect(post.title).to eq("reminder YCH AUCTION")
    expect(post.creator).to eq(creator)
  end
end
@@ -1,7 +1,7 @@
require "rails_helper"

describe Scraper::BaseHttpClient do
  class TestHttpClient < Scraper::BaseHttpClient
describe Scraper::HttpClient do
  class TestHttpClientConfig < Scraper::HttpClientConfig
    def cookies
      []
    end
@@ -9,17 +9,27 @@ describe Scraper::BaseHttpClient do
    def allowed_domains
      ["*.example.com"]
    end

    def ratelimit
      [
        ["*", :none],
      ]
    end

    def redirect_limit
      1
    end
  end

  it "throws on unallowed domain" do
    client = TestHttpClient.new(SpecUtil.mock_http_performer(""))
    assert_raises(Scraper::BaseHttpClient::InvalidURLError) do
    client = Scraper::HttpClient.new(TestHttpClientConfig.new, SpecUtil.mock_http_performer(""))
    assert_raises(Scraper::HttpClient::InvalidURLError) do
      client.get("https://foobar.com")
    end
  end

  it "creates an http log entry" do
    client = TestHttpClient.new(SpecUtil.mock_http_performer(
    client = Scraper::HttpClient.new(TestHttpClientConfig.new, SpecUtil.mock_http_performer(
      "https://www.example.com/",
      request_headers: { "cookie" => "" },
      response_code: 200,
@@ -1,18 +0,0 @@
require "rails_helper"

describe Scraper::FaHttpClient do
  it "creates an http log entry" do
    client = Scraper::FaHttpClient.new(SpecUtil.mock_http_performer(
      "https://www.furaffinity.net/",
      request_headers: Hash,
      response_code: 200,
      response_body: "a plain text body",
    ))
    client.logger.level = :error
    response = client.get("https://www.furaffinity.net/")

    assert_equal 200, response.status_code
    assert_equal "a plain text body", response.body
    refute_nil response.log_entry
  end
end
@@ -49,4 +49,81 @@ class SpecUtil
    mock
  end

  def self.build_http_log_entry(
    uri: "http://example.com",
    status_code: 200,
    content_type: "text/plain",
    contents: nil
  )
    entry = ::HttpLogEntry.new({
      uri: uri,
      verb: :get,
      performed_by: "direct",
      status_code: status_code,
      response_time_ms: rand(20..100),
      request_headers: create_http_headers,
      response_headers: create_http_headers,
      response: build_blob_entry(
        content_type: content_type,
        contents: contents,
      ),
      content_type: content_type,
      requested_at: Time.now,
    })
    raise entry.errors.full_messages.join(", ") unless entry.valid?

    entry
  end

  def self.build_blob_entry(
    content_type: "text/plain",
    contents: nil
  )
    BlobEntry.find_or_build(
      content_type: content_type,
      contents: contents || random_string(1024),
    )
  end

  def self.create_http_headers
    ::HttpLogEntryHeader.find_or_create(headers: { test_header_key: "test header value #{random_string(16)}" })
  end

  def self.init_http_client_mock(
    http_client_mock, requests
  )
    log_entries = []

    requests.each do |request|
      log_entry = SpecUtil.build_http_log_entry(
        uri: request[:uri],
        contents: request[:contents],
        content_type: request[:content_type],
        status_code: request[:status_code],
      )
      log_entry.save!
      log_entries << log_entry

      caused_by_entry = nil
      if request[:caused_by_entry]
        caused_by_entry = log_entries[request[:caused_by_entry]]
      end

      expect(http_client_mock).to(receive(:get).
        with(
          log_entry.uri.to_s,
          { caused_by_entry: caused_by_entry }
        ).
        and_return(Scraper::HttpClient::Response.new(
          log_entry.status_code,
          log_entry.response.contents,
          log_entry
        )))
    end
  end

  def self.read_fixture_file(path)
    File.read Rails.root.join("test/fixtures/files", path)
  end
end
1291
test/fixtures/files/domain/fa/job/browse_page_no_submissions.html
vendored
Normal file
File diff suppressed because one or more lines are too long
1305
test/fixtures/files/domain/fa/job/browse_page_one_submission.html
vendored
Normal file
File diff suppressed because one or more lines are too long
@@ -3,17 +3,25 @@ module TestUtil
    (0...length).map { (65 + rand(26)).chr }.join
  end

  def self.build_http_log_entry
  def self.build_http_log_entry(
    uri: "http://example.com",
    status_code: 200,
    content_type: "text/plain",
    contents: nil
  )
    entry = ::HttpLogEntry.new({
      uri: "http://example.com/",
      uri: uri,
      verb: :get,
      performed_by: "direct",
      status_code: 200,
      status_code: status_code,
      response_time_ms: rand(20..100),
      request_headers: create_http_headers,
      response_headers: create_http_headers,
      response: build_blob_entry,
      content_type: "text/plain",
      response: build_blob_entry(
        content_type: content_type,
        contents: contents,
      ),
      content_type: content_type,
      requested_at: Time.now,
    })
    raise entry.errors.full_messages.join(", ") unless entry.valid?
@@ -21,8 +29,14 @@ module TestUtil
    entry
  end

  def self.build_blob_entry
    BlobEntry.find_or_build(content_type: "text/plain", contents: random_string(1024))
  def self.build_blob_entry(
    content_type: "text/plain",
    contents: nil
  )
    BlobEntry.find_or_build(
      content_type: content_type,
      contents: contents || random_string(1024),
    )
  end

  def self.create_blob_entry