Files
redux-scraper/app/jobs/domain/fa/job/base.rb
2025-01-01 03:29:53 +00:00

309 lines
8.9 KiB
Ruby

# typed: false
class Domain::Fa::Job::Base < Scraper::JobBase
discard_on ActiveJob::DeserializationError
# Returns the symbol :get_fa_http_client.
# NOTE(review): presumably the parent job class uses this name to pick the
# HTTP client factory for this job family - confirm in Scraper::JobBase.
def self.http_factory_method
:get_fa_http_client
end
protected
# Loads the shared job arguments into instance state and resolves the user.
#
# Sets @force_scan (coerced to true/false), @caused_by_entry and @user, then
# rewrites the logger prefix to name the resolved user and its state. When the
# resolved user is an unsaved record it is either saved (and @created_user is
# set to true) or reported through fatal_error, depending on
# +require_user_exists+.
#
# @param args [Hash] job arguments; :force_scan and :caused_by_entry are read
#   here, :url_name is used as a fallback for the logger prefix
# @param build_user [Boolean] true to look the user up and build one when the
#   lookup finds nothing, false to only look the user up
# @param require_user_exists [Boolean] true to call fatal_error instead of
#   saving when the resolved user is a new record
# @return [true, nil] true after saving a new user; nil when no user was
#   resolved or the user was already persisted
def init_from_args!(args, build_user: true, require_user_exists: false)
  @force_scan = args[:force_scan] ? true : false
  @caused_by_entry = args[:caused_by_entry]
  @user =
    if build_user
      find_or_build_user_from_args(args, caused_by_entry: @caused_by_entry)
    else
      find_user_from_args(args)
    end

  # Prefer the stored url_name, then the display name, then the raw argument.
  display_name = @user&.url_name || @user&.name || args[:url_name]
  logger.prefix = "[user #{display_name&.bold} / #{@user&.state&.bold}]"

  return nil unless @user
  return nil unless @user.new_record?
  return fatal_error("user must already exist") if require_user_exists

  @user.save!
  @created_user = true
end
# Returns the user identified by +args+, building an unsaved one when the
# lookup finds nothing.
#
# A built user gets the normalized form of args[:url_name] as both url_name
# and name. When +caused_by_entry+ is given, its id is recorded under
# state_detail["first_seen_entry"].
#
# @param args [Hash] job arguments, passed through to find_user_from_args;
#   :url_name is read when a user has to be built
# @param caused_by_entry [Object, nil] entry whose id is recorded on a newly
#   built user
# @return the found user, or a new unsaved Domain::Fa::User
def find_or_build_user_from_args(args, caused_by_entry: nil)
  found = find_user_from_args(args)
  return found if found

  normalized = Domain::Fa::User.name_to_url_name(args[:url_name])
  Domain::Fa::User.new.tap do |built|
    built.url_name = normalized
    built.name = normalized
    built.state_detail ||= {}
    built.state_detail["first_seen_entry"] = caused_by_entry.id if caused_by_entry
  end
end
# Resolves a user from job arguments without creating one.
#
# args[:user] wins when present. Otherwise args[:url_name] is required
# (fatal_error is called when it is blank), normalized, and used for a
# find_by lookup.
#
# @param args [Hash] job arguments; reads :user and :url_name
# @return the given or looked-up user, or nil when the lookup finds nothing
def find_user_from_args(args)
  given = args[:user]
  return given if given

  fatal_error("arg 'url_name' is required if arg 'user' is nil") if args[:url_name].blank?

  normalized = Domain::Fa::User.name_to_url_name(args[:url_name])
  Domain::Fa::User.find_by(url_name: normalized)
end
# Decides whether the current @user should be scanned for +scan_type+.
#
# A user that reports scan_due? is always scanned. Otherwise the decision
# follows @force_scan, and a warning naming the time since the last scan is
# logged either way.
#
# @param scan_type scan identifier handed to @user.scan_due? and
#   @user.scanned_ago_in_words
# @return [Boolean] true when the scan should proceed
def user_due_for_scan?(scan_type)
  return true if @user.scan_due?(scan_type)

  scanned_ago = @user.scanned_ago_in_words(scan_type).bold
  if @force_scan
    logger.warn("scanned #{scanned_ago} - force scanning")
    true
  else
    logger.warn("scanned #{scanned_ago} - skipping")
    false
  end
end
# Tallies returned by update_and_enqueue_posts_from_listings_page:
#   new_seen     - count of submissions whose post record was new
#   total_seen   - count of submissions processed
#   last_was_new - whether the most recently processed submission was new
ListingsPageScanStats = Struct.new(:new_seen, :total_seen, :last_was_new)
# Records every submission found on a listings page and schedules follow-up
# scans for the posts, their creators, and optionally nearby fa_ids that the
# page did not list.
#
# @param job_type [Symbol] forwarded to update_and_save_post_from_listings_page
# @param page parsed page; must answer probably_listings_page? and
#   submissions_parsed, otherwise fatal_error is called
# @param caused_by_entry entry forwarded to every save and enqueue helper
# @param enqueue_posts_pri priority token forwarded to the post enqueue helpers
# @param enqueue_page_scan [Boolean] forwarded to enqueue_user_scan
# @param enqueue_gallery_scan [Boolean] forwarded to enqueue_user_scan
# @param page_desc optional label used in the summary log line
# @param fill_id_gaps [Boolean] when true (and the page has submissions), posts
#   are created for ids missing between the page's lowest and highest fa_id,
#   provided that span is at most 250
# @param continue_for [Integer, nil] when set (and the page has submissions),
#   replaces the extra-id set with the ids in the +continue_for+ range below
#   the page's highest fa_id that are neither on the page nor already stored
# @return [ListingsPageScanStats] counts gathered while walking the page
def update_and_enqueue_posts_from_listings_page(
  job_type,
  page,
  caused_by_entry,
  enqueue_posts_pri:,
  enqueue_page_scan: true,
  enqueue_gallery_scan: true,
  page_desc: nil,
  fill_id_gaps: false,
  continue_for: nil
)
  fatal_error("not a listings page") unless page.probably_listings_page?

  submissions = page.submissions_parsed
  extra_ids = Set.new
  seen_ids = Set.new(submissions.map(&:id))
  create_unseen_posts = false

  if fill_id_gaps && submissions.any?
    create_unseen_posts = true
    lowest, highest = seen_ids.minmax
    # sanity check so we don't enqueue too many post jobs
    if highest - lowest <= 250
      extra_ids.merge((lowest..highest).reject { |fa_id| seen_ids.include?(fa_id) })
    end
  end

  if continue_for && submissions.any?
    highest = seen_ids.max
    lowest = [highest - continue_for, 0].max
    # Note: this replaces (not extends) whatever the gap-filling step collected.
    extra_ids = Set.new(lowest..highest).subtract(seen_ids)
    already_stored =
      Domain::Fa::Post.where(
        "fa_id >= ? AND fa_id <= ?",
        lowest,
        highest,
      ).pluck(:fa_id)
    extra_ids.subtract(already_stored)
  end

  page_label =
    if page_desc
      "page #{page_desc.to_s.bold}"
    else
      "page"
    end

  stats = ListingsPageScanStats.new(0, 0, false)

  submissions.each do |submission|
    post = Domain::Fa::Post.find_or_initialize_by(fa_id: submission.id)

    # Capture newness before the post is saved below.
    unseen = post.new_record?
    stats.last_was_new = unseen
    stats.new_seen += 1 if unseen
    stats.total_seen += 1

    update_and_save_post_from_listings_page(job_type, post, submission, caused_by_entry)

    creator = post.creator
    if creator
      enqueue_user_scan(
        creator,
        caused_by_entry,
        enqueue_page_scan: enqueue_page_scan,
        enqueue_gallery_scan: enqueue_gallery_scan,
      )
    end

    case post.state.to_sym
    when :ok
      enqueue_post_scan(post, caused_by_entry, enqueue_posts_pri)
    when :removed
      logger.info("(todo) removed post seen in listing page, enqueue scan for fa_id #{post.fa_id}")
    when :scan_error
      logger.info("(todo) scan_error'd post seen in listing page for fa_id #{post.fa_id}")
    when :file_error
      logger.info("(todo) file_error'd post seen in listing page for fa_id #{post.fa_id}")
    else
      logger.info("unknown post state `#{post.state}` for fa_id #{post.fa_id}")
    end
  end

  # Walk the extra ids from highest to lowest.
  extra_ids.sort.reverse_each do |fa_id|
    unless create_unseen_posts
      enqueue_fa_id_scan(fa_id, caused_by_entry, enqueue_posts_pri)
      next
    end

    # when filling gaps, only enqueue if the post wasn't found
    gap_post = Domain::Fa::Post.find_or_initialize_by(fa_id: fa_id)
    next unless gap_post.new_record?

    gap_post.save!
    enqueue_post_scan(gap_post, caused_by_entry, enqueue_posts_pri)
  end

  logger.info(
    "#{page_label} has #{submissions.count.to_s.bold} posts, " +
      "#{stats.new_seen.to_s.bold} new",
  )
  stats
end
# Copies listing-page data from +submission+ onto +post+ and saves it.
#
# Records caused_by_entry.id in post.log_entry_detail under a key chosen by
# +job_type+ (only if that key is not set yet), fills in the creator when the
# post has none, and overwrites title and thumbnail_uri.
#
# fatal_error is called for an unhandled job_type, a missing title, or a
# missing thumb_path.
#
# @param job_type [Symbol] :browse_page or :gallery_page
# @param post post record to update
# @param submission parsed submission; title and thumb_path are read
# @param caused_by_entry entry whose id is recorded
# @return the result of post.save!
def update_and_save_post_from_listings_page(job_type, post, submission, caused_by_entry)
  case job_type
  when :browse_page
    post.log_entry_detail["first_browse_page_id"] ||= caused_by_entry.id
  when :gallery_page
    post.log_entry_detail["first_gallery_page_id"] ||= caused_by_entry.id
  else
    fatal_error("unhandled job_type: #{job_type}")
  end

  unless post.creator
    post.creator = Domain::Fa::User.find_or_build_from_submission_parser(submission)
  end

  title = submission.title
  post.title = title || fatal_error("blank title")

  thumb_path = submission.thumb_path
  post.thumbnail_uri = thumb_path || fatal_error("blank thumb_path")

  post.save!
end
# Defers page, gallery and favs scan jobs for +user+, at most once per scan
# kind and url_name for the lifetime of this job instance.
#
# A persisted user is passed to the jobs as :user; an unsaved one is passed by
# :url_name. An unsaved user without a url_name is skipped with a warning.
# Each scan kind is deferred only when its flag is set, the url_name was not
# already recorded for that kind, and the user reports it is due.
#
# @param user user record (persisted or not)
# @param caused_by_entry entry forwarded to the deferred jobs
# @param enqueue_page_scan [Boolean] consider the user page job
# @param enqueue_gallery_scan [Boolean] consider the gallery job
# @param enqueue_favs_scan [Boolean] consider the favs job
# @return the result of the favs defer_job call when it ran, otherwise nil
def enqueue_user_scan(
  user,
  caused_by_entry,
  enqueue_page_scan: true,
  enqueue_gallery_scan: true,
  enqueue_favs_scan: true
)
  @users_enqueued_for_page_scan ||= Set.new
  @users_enqueued_for_gallery_scan ||= Set.new
  @users_enqueued_for_favs_scan ||= Set.new

  if user.persisted?
    job_args = { user: user, caused_by_entry: caused_by_entry }
  elsif user.url_name
    job_args = { url_name: user.url_name, caused_by_entry: caused_by_entry }
  else
    logger.warn "user does not have a url name and is not persisted, skipping (#{user.name})"
    return
  end

  # add? returns nil when the url_name was already recorded, so each kind is
  # considered only the first time a given user is seen.
  page_wanted = enqueue_page_scan && @users_enqueued_for_page_scan.add?(user.url_name)
  if page_wanted && user.due_for_page_scan?
    logger.info(
      "enqueue user page job for #{user.url_name.bold}, " +
        "last scanned #{time_ago_in_words(user.scanned_page_at)}",
    )
    defer_job(Domain::Fa::Job::UserPageJob, job_args)
  end

  gallery_wanted = enqueue_gallery_scan && @users_enqueued_for_gallery_scan.add?(user.url_name)
  if gallery_wanted && user.due_for_gallery_scan?
    logger.info(
      "enqueue user gallery job for #{user.url_name.bold}, " +
        "last scanned #{time_ago_in_words(user.scanned_gallery_at)}",
    )
    defer_job(Domain::Fa::Job::UserGalleryJob, job_args)
  end

  favs_wanted = enqueue_favs_scan && @users_enqueued_for_favs_scan.add?(user.url_name)
  if favs_wanted && user.due_for_favs_scan?
    logger.info(
      "enqueue user favs job for #{user.url_name.bold}, " +
        "last scanned #{time_ago_in_words(user.scanned_favs_at)}",
    )
    defer_job(Domain::Fa::Job::FavsJob, job_args)
  end
end
# Maps a symbolic priority onto the integer handed to defer_job.
#
# :low becomes -5, :high becomes -15, and every other value becomes -10.
# NOTE(review): presumably a more negative number runs sooner - confirm
# against the queue backend.
#
# @param enqueue_pri [Symbol, Object] :low, :high, or anything else
# @return [Integer] -5, -15 or -10
def normalize_enqueue_pri(enqueue_pri)
  { low: -5, high: -15 }.fetch(enqueue_pri, -10)
end
# Defers a ScanPostJob for a bare fa_id, at most once per fa_id for the
# lifetime of this job instance (tracked in @posts_enqueued_for_scan).
#
# @param fa_id id to scan
# @param caused_by_entry entry forwarded to the deferred job
# @param enqueue_pri priority token, converted by normalize_enqueue_pri
# @return the result of defer_job, or nil when the fa_id was already enqueued
def enqueue_fa_id_scan(fa_id, caused_by_entry, enqueue_pri)
  priority = normalize_enqueue_pri(enqueue_pri)
  @posts_enqueued_for_scan ||= Set.new
  # add? returns nil when the id is already in the set.
  return unless @posts_enqueued_for_scan.add?(fa_id)

  logger.info("enqueue post scan for fa_id #{fa_id}")
  defer_job(
    Domain::Fa::Job::ScanPostJob,
    { fa_id: fa_id, caused_by_entry: caused_by_entry },
    { priority: priority },
  )
end
# Defers the next scan a post needs, at most once per fa_id for the lifetime
# of this job instance (tracked in @posts_enqueued_for_scan).
#
# An unscanned post gets a ScanPostJob; a scanned post without a file gets a
# ScanFileJob; a scanned post that has its file gets nothing. The fa_id is
# recorded as handled in all three cases.
#
# @param post post record; fa_id, scanned? and have_file? are read
# @param caused_by_entry entry forwarded to the deferred job
# @param enqueue_pri priority token, converted by normalize_enqueue_pri
# @return the result of defer_job, or nil when nothing was deferred
def enqueue_post_scan(post, caused_by_entry, enqueue_pri)
  priority = normalize_enqueue_pri(enqueue_pri)
  @posts_enqueued_for_scan ||= Set.new
  # add? returns nil when the id is already in the set.
  return unless @posts_enqueued_for_scan.add?(post.fa_id)

  fa_id_label = (post.fa_id || "(nil)").to_s.bold

  unless post.scanned?
    logger.info("enqueue post scan for fa_id #{fa_id_label}")
    return defer_job(
      Domain::Fa::Job::ScanPostJob,
      { post: post, caused_by_entry: caused_by_entry },
      { priority: priority },
    )
  end

  return if post.have_file?

  logger.info("enqueue file scan for fa_id #{fa_id_label}")
  defer_job(
    Domain::Fa::Job::ScanFileJob,
    { post: post, caused_by_entry: caused_by_entry },
    { priority: priority },
  )
end
end