# redux-scraper/app/jobs/scraper/job_base.rb
# typed: false
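# Base class for all scraper jobs: provides signature-based GoodJob
# concurrency control, HTTP client accessors, link discovery on fetched
# documents, and deferred bulk enqueueing of follow-up jobs.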
class Scraper::JobBase < ApplicationJob
  ignore_signature_args :caused_by_entry

  class JobError < RuntimeError
  end

  def initialize(...)
    @deferred_jobs = []
    super(...)
  end
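
  # subclasses return the name of the Scraper::ClientFactory method that
  # builds their HTTP client, e.g. (hypothetical) :get_fa_client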
  def self.http_factory_method
    raise NotImplementedError, "implement in #{name}"
  end

  def http_client
    @http_client ||= Scraper::ClientFactory.send(self.class.http_factory_method)
  end

  def gallery_dl_client
    @gallery_dl_client ||= Scraper::ClientFactory.get_gallery_dl_client
  end
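
  # allow at most one enqueued or running job per "signature": job class,
  # queue, priority, and a short digest of the first argument (minus any
  # keys declared via ignore_signature_args)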
  good_job_control_concurrency_with(
    total_limit: 1,
    key:
      proc do
        raise("too many arguments") if arguments.size > 1
        first_argument = arguments[0] || {}

        # collect ignore_signature_args from this class and every superclass
        ignore_signature_args = []
        target = self.class
        while target.respond_to?(:ignore_signature_args)
          ignore_signature_args += target.ignore_signature_args
          target = target.superclass
        end
        ignore_signature_args << :_aj_symbol_keys
        ignore_signature_args << :_aj_ruby2_keywords

        sig_arguments =
          first_argument
            .reject { |key, _value| ignore_signature_args.include?(key.to_sym) }
            .to_h

        sig = []
        # parenthesize: `<<` binds tighter than `||`, so `sig << x || y`
        # would never evaluate the right-hand side
        sig << (self.class.name || raise)
        sig << (self.queue_name || "*")
        sig << (self.priority || "*")
        sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
        sig.join("|")
      end,
  )
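
  # an illustrative key (not real data):
  #   "Domain::Fa::Job::UserPageJob|scraper|10|3f9a0c1d2b4e5f67"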
  # make the concurrency config thread-local so it can be temporarily
  # overridden per-thread by HasGoodJobSuppressConcurrency
  gjcc = self.good_job_concurrency_config.dup
  @@gjcc_tl ||= Concurrent::ThreadLocalVar.new { gjcc.dup }

  def self.good_job_concurrency_config
    @@gjcc_tl.value
  end
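
  # a concern such as HasGoodJobSuppressConcurrency can then swap the config
  # for the current thread only, e.g. (sketch, assuming it restores the
  # previous value when done):
  #   previous = @@gjcc_tl.value
  #   @@gjcc_tl.value = {}
  #   begin
  #     ...
  #   ensure
  #     @@gjcc_tl.value = previous
  #   end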
  PERMITTED_CONTENT_TYPES = [%r{text/html}, %r{application/json}]
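
  # scan a fetched HTML/JSON document for links to known entity types and
  # defer follow-up scrape jobs for any that are new or due for a rescan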
  def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
    return unless ENV["enqueue_jobs_from_found_links"] == "1"
    start_time = Time.now

    unless PERMITTED_CONTENT_TYPES.any? { |ct| ct.match(log_entry.content_type) }
      raise("unsupported content type: #{log_entry.content_type}")
    end

    document = log_entry.response.contents
    link_finder = Scraper::LinkFinder.new(log_entry.uri_host, document)
    link_finder.logger.level = :error
    links = link_finder.find_links

    job_defs = []

    # pluck one attribute from every link of the given type
    filter_mapper =
      proc do |list, type, key|
        list.filter { |item| item[:type] == type }.map { |item| item[key] }
      end
    # bulk-load models matching those values and index them by the attribute
    attr_to_mapper =
      proc do |model_class, attr_sym, in_values|
        model_class
          .where({ attr_sym => in_values })
          .map { |model| [model.send(attr_sym), model] }
          .to_h
      end
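
    # e.g. (illustrative):
    #   filter_mapper.call(links, :fa_user, :url_name)
    #     #=> ["somename", "othername"]
    #   attr_to_mapper.call(Domain::Fa::User, :url_name, ["somename"])
    #     #=> { "somename" => #<Domain::Fa::User ...> }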
    url_name_to_fa_user =
      attr_to_mapper.call(
        Domain::Fa::User,
        :url_name,
        filter_mapper.call(links, :fa_user, :url_name),
      )
    fa_id_to_fa_post =
      attr_to_mapper.call(
        Domain::Fa::Post,
        :fa_id,
        filter_mapper.call(links, :fa_post, :fa_id),
      )
    name_to_twitter_user =
      attr_to_mapper.call(
        Domain::Twitter::User,
        :name,
        filter_mapper.call(links, :twitter_user, :name),
      )
    links.each do |link|
      link_type = link[:type]
      case link_type
      when :fa_user
        url_name = link[:url_name]
        user = url_name_to_fa_user[url_name]
        params = (user ? { user: user } : { url_name: url_name })
        job_def = { params: params, desc: "fa user " + (user&.name || url_name).bold }
        if !user || user.due_for_page_scan?
          job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob)
        elsif user.due_for_gallery_scan?
          # unknown users are handled by the branch above, so only known
          # users reach this check
          job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob)
        end
      when :fa_post
        fa_id = link[:fa_id]
        post = fa_id_to_fa_post[fa_id]
        params = (post ? { post: post } : { fa_id: fa_id })
        job_def = { params: params, desc: "fa post #{fa_id.to_s.bold}" }
        if !post || (post.state == "ok" && !post.scanned?)
          job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob)
        end
        if post && post.state == "ok" && post.scanned? && !post.have_file?
          job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob)
        end
      when :twitter_user
        name = link[:name]
        user = name_to_twitter_user[name]
        params = (user ? { user: user } : { name: name })
        job_def = { params: params, desc: "twitter user " + (user&.name || name).bold }
        if !user || user.due_for_timeline_tweets_scan?
          job_defs << job_def.merge(job: Domain::Twitter::Job::UserTimelineTweetsJob)
        end
      else
        logger.warn("unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}")
      end
    end
    job_defs.uniq!
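
    # a suppress_jobs entry is a partial job_def hash matched key-by-key;
    # e.g. (illustrative) suppress_jobs: [{ job: Domain::Fa::Job::ScanFileJob }]
    # drops every file-scan def found on this page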
    job_defs.reject! do |job_def|
      suppress_jobs.any? do |suppress|
        suppress == job_def.slice(*suppress.keys)
      end
    end

    job_defs.each do |job_def|
      job_class = job_def[:job]
      params = job_def[:params]
      desc = job_def[:desc]
      logger.debug(
        [
          "link finder -",
          job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
          desc,
        ].join(" "),
      )
      defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
    end

    duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
    logger.info(
      "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
    )
  end
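
  # on transient network errors, log and back off for a random few seconds
  # before re-raising so the normal retry machinery can reschedule the job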
  around_perform do |job, block|
    block.call
  rescue Net::ReadTimeout, Errno::ECONNREFUSED => e
    logger.error "#{e.class.name} - sleep for a bit"
    sleep rand(2.0..7.0)
    raise e
  end
  after_perform { |job| ColorLogger.quiet { job.enqueue_deferred_jobs! } }
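
  # follow-up jobs are collected during perform and bulk-enqueued once the
  # job finishes (see after_perform above), e.g. (illustrative):
  #   defer_job(Domain::Fa::Job::ScanPostJob, { post: post })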
  def defer_job(job_class, params, set_args = {})
    @deferred_jobs << [job_class, params, set_args]
  end

  def enqueue_deferred_jobs!
    GoodJob::Bulk.enqueue do
      @deferred_jobs.each do |job_class, params, set_args|
        job_class.set(set_args).perform_later(params)
      end
    end
  end
  def fatal_error(msg)
    logger.error(msg)
    raise JobError, msg.uncolorize
  end
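
  # borrow ActionView's time_ago_in_words helper outside of a view context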
  DATE_HELPER = Class.new.extend(ActionView::Helpers::DateHelper)

  def time_ago_in_words(maybe_time)
    return "never".bold if maybe_time.nil?
    "#{DATE_HELPER.time_ago_in_words(maybe_time)} ago".bold
  end
end