Files
redux-scraper/app/jobs/domain/fa/job/favs_job.rb
2025-01-01 03:29:53 +00:00

183 lines
4.9 KiB
Ruby

# typed: true
# Walks a user's favorites listing on furaffinity.net page by page and
# reconciles what it sees with the user's stored `fav_post_joins` rows.
#
# Two modes exist:
# * full scan (`args[:full_scan]` or `@force_scan`): every page is visited;
#   favs that were seen are upserted and stored favs that were not seen are
#   flagged `removed`.
# * incremental scan (default): scanning stops at the first page that
#   contains no post that is not already an active fav. Favs discovered on
#   the pages visited so far are still persisted, but nothing is flagged as
#   removed because the remaining pages were never looked at.
class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
  include HasBulkEnqueueJobs

  # NOTE(review): not referenced anywhere in this class; kept because code
  # outside this file may read it — confirm before removing.
  USERS_PER_FULL_PAGE = Rails.env.test? ? 9 : 190

  # Page index after which the scan gives up (original author's note:
  # "bail out at 100,000 items").
  MAX_PAGE_NUMBER = 500

  queue_as :fa_user_favs

  # Job entry point.
  #
  # @param args [Hash] job arguments; this method reads `:full_scan` and
  #   `:url_name` directly and hands the whole hash to `init_from_args!`
  #   (presumably what populates `@user`, `@force_scan` and
  #   `@caused_by_entry` — defined in the base class, not visible here).
  # @return [void]
  def perform(args)
    @first_job_entry = nil
    init_from_args!(args, build_user: false)
    @full_scan = !!args[:full_scan] || @force_scan

    unless @user
      # The user record is missing: queue a user page scan for it, then abort.
      Domain::Fa::Job::UserPageJob.perform_later(
        { url_name: args[:url_name], caused_by_entry: best_caused_by_entry },
      )
      fatal_error("user does not exist: #{args}")
    end

    logger.prefix =
      "[#{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
    return unless user_due_for_scan?(:favs)

    @page_number = 0
    @page_id = nil
    @total_items_seen = 0
    @seen_post_ids = Set.new
    existing_faved_ids = Set.new(@user.fav_post_joins.active.pluck(:post_id))

    outcome = scan_pages(existing_faved_ids)
    return if outcome == :stop

    to_add = @seen_post_ids - existing_faved_ids
    if outcome == :caught_up && to_add.empty?
      # Nothing changed since the last scan; only record that we looked.
      mark_favs_scanned!
      return
    end

    # Only a scan that reached the final page has seen every fav, so only
    # then is "stored but not seen" evidence that a fav was removed.
    to_remove =
      if outcome == :complete
        existing_faved_ids - @seen_post_ids
      else
        Set.new
      end

    logger.info "[calc change favs] [add #{to_add.size.to_s.bold}] [remove #{to_remove.size.to_s.bold}]"
    apply_fav_changes(to_add, to_remove)
    logger.info "[updated favs list] [posts: #{@user.fav_post_joins.count.to_s.bold}]"
  end

  private

  # Fetches listing pages until one of the stop conditions is hit.
  #
  # @param existing_faved_ids [Set] post ids that are already active favs
  # @return [Symbol]
  #   `:complete`  - the last listing page was reached,
  #   `:caught_up` - incremental scan hit a page without any new fav,
  #   `:truncated` - the page cap was reached before the last page,
  #   `:stop`      - `scan_page` asked to abort without touching stored favs.
  def scan_pages(existing_faved_ids)
    loop do
      case scan_page
      when :break
        return :complete
      when :stop
        return :stop
      end

      if !@full_scan && (@last_page_post_ids - existing_faved_ids).empty?
        logger.info "[no new favs] [stopping scan]"
        return :caught_up
      end

      if @page_number > MAX_PAGE_NUMBER
        logger.info "[page limit reached] [skipping removals]"
        return :truncated
      end

      @page_number += 1
    end
  end

  # Records the current time as the user's last favs scan.
  def mark_favs_scanned!
    @user.scanned_favs_at = Time.now
    @user.save!
  end

  # Applies the computed fav changes and stamps the scan time inside a single
  # transaction.
  #
  # @param to_add [Set] post ids to upsert as active (`removed: false`) favs
  # @param to_remove [Set] post ids whose active fav rows get `removed: true`
  def apply_fav_changes(to_add, to_remove)
    ReduxApplicationRecord.transaction do
      unless to_remove.empty?
        @user
          .fav_post_joins
          .active
          .where(post_id: to_remove)
          .update_all(removed: true)
      end

      # Larger change sets are written in larger batches.
      slice_size = to_add.size <= 2500 ? 100 : 1000
      to_add.each_slice(slice_size) do |slice|
        @user.fav_post_joins.upsert_all(
          slice.map { |id| { post_id: id, removed: false } },
          unique_by: :index_domain_fa_favs_on_user_id_and_post_id,
          update_only: [:removed],
        )
      end

      mark_favs_scanned!
    end
  end

  # Fetches and processes one favorites listing page: creates post rows for
  # submissions that have none yet, enqueues a scan job for each created
  # post, and records the page's post ids in `@last_page_post_ids` and
  # `@seen_post_ids`. `@page_id` is advanced to the id used for the next page
  # URL.
  #
  # @return [Symbol, nil] `:stop` when the account is disabled / not found,
  #   `:break` when the page has no "next" button id, `nil` otherwise
  def scan_page
    url =
      if @page_id
        "https://www.furaffinity.net/favorites/#{@user.url_name}/#{@page_id}/next"
      else
        "https://www.furaffinity.net/favorites/#{@user.url_name}/"
      end
    response = http_client.get(url, caused_by_entry: best_caused_by_entry)
    # The first response of this job becomes the "cause" of everything after.
    @first_job_entry ||= response.log_entry

    if response.status_code != 200
      fatal_error(
        "http #{response.status_code.to_s.red.bold}, " +
          "log entry #{response.log_entry.id.to_s.bold}",
      )
    end

    if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(
         @user,
         response,
       )
      logger.error("account disabled / not found, abort")
      return :stop
    end

    page = Domain::Fa::Parser::Page.new(response.body)
    fatal_error("not a favs listing page") unless page.probably_listings_page?

    submissions = page.submissions_parsed
    @page_id = page.favorites_next_button_id
    @total_items_seen += submissions.length

    existing_fa_id_to_post_id =
      Domain::Fa::Post
        .where(fa_id: submissions.map(&:id))
        .pluck(:fa_id, :id)
        .to_h

    posts_to_create_hashes =
      submissions
        .reject { |submission| existing_fa_id_to_post_id[submission.id] }
        .map do |submission|
          Domain::Fa::Post.hash_from_submission_parser_helper(
            submission,
            first_seen_log_entry: response.log_entry,
          )
        end

    # The insert is skipped entirely when there is nothing to create.
    created_post_ids =
      if posts_to_create_hashes.empty?
        []
      else
        Domain::Fa::Post
          .insert_all!(posts_to_create_hashes, returning: %i[id fa_id])
          .map { |row| row["id"] }
      end

    enqueue_new_post_scan_jobs(
      posts_to_create_hashes.map { |hash| hash[:fa_id] },
    )

    @last_page_post_ids =
      Set.new(created_post_ids + existing_fa_id_to_post_id.values)
    @seen_post_ids.merge(@last_page_post_ids)

    logger.info [
                  "[page #{@page_number.to_s.bold}]",
                  "[posts: #{submissions.length.to_s.bold}]",
                  "[created: #{posts_to_create_hashes.size.to_s.bold}]",
                ].join(" ")

    # No "next" button id means this was the final page.
    :break if @page_id.nil?
  end

  # Enqueues one post scan job per given id, inside `bulk_enqueue_jobs`.
  #
  # @param fa_ids [Array] `fa_id` values of the posts created by this page
  def enqueue_new_post_scan_jobs(fa_ids)
    bulk_enqueue_jobs do
      fa_ids.each do |fa_id|
        Domain::Fa::Job::ScanPostJob.perform_later(
          { fa_id: fa_id, caused_by_entry: best_caused_by_entry },
        )
      end
    end
  end

  # Log entry to attribute follow-up requests and jobs to: the first response
  # fetched by this job if there is one, else the entry that caused this job.
  def best_caused_by_entry
    @first_job_entry || @caused_by_entry
  end
end