# redux-scraper/rake/fa.rake
# typed: false
# frozen_string_literal: true
T.bind(self, T.all(Rake::DSL, Object))
namespace :fa do
desc "enqueue waiting posts"
task enqueue_waiting_posts: %i[set_logger_stdout environment] do |t, args|
start_at = (ENV["start_at"] || 0).to_i
low_water_mark = 50
high_water_mark = 300
poll_duration = 10
enqueuer =
Domain::Fa::PostEnqueuer.new(
reverse_scan_holes: false,
start_at: start_at,
low_water_mark: low_water_mark,
high_water_mark: high_water_mark,
)
loop { sleep poll_duration if enqueuer.run_once == :sleep }
end
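  # No desc in the original; a usage sketch, assuming the same ENV-variable
  # convention as the other tasks in this namespace (start_at is required and
  # is the highest fa_id already present; stop_at is optional; the numbers
  # below are placeholders):
  #   rake fa:enqueue_missing_posts start_at=1000000 stop_at=1200000
  desc "enqueue missing posts"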
  task enqueue_missing_posts: %i[set_logger_stdout environment] do |t, args|
    start_at =
      ENV["start_at"]&.to_i ||
        raise("need start_at (highest fa_id already present)")
    stop_at = ENV["stop_at"]&.to_i
    low_water_mark = 50
    high_water_mark = 300
    poll_duration = 10
    enqueuer =
      Domain::Fa::PostEnqueuer.new(
        start_at: start_at,
        stop_at: stop_at,
        low_water_mark: low_water_mark,
        high_water_mark: high_water_mark,
      )
    loop { sleep poll_duration if enqueuer.run_once == :sleep }
  end
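  # Usage sketch (per the parsing below, start_at is optional and may be a
  # numeric fa_id or the literal string "last"):
  #   rake fa:enqueue_unscanned_ok_posts start_at=last
  desc "enqueue unscanned ok posts"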
  task enqueue_unscanned_ok_posts: %i[set_logger_stdout environment] do
    start_at = ENV["start_at"]
    start_at = start_at.to_i if start_at && start_at != "last"
    Tasks::Fa::EnqueueUnscannedOkPostsTask.new(start_at: start_at).run
  end
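  # Usage sketch (start_at defaults to 0 when unset):
  #   rake fa:enqueue_waiting_users start_at=0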
desc "enqueue waiting users"
task enqueue_waiting_users: %i[set_logger_stdout environment] do |t, args|
start_at = (ENV["start_at"] || 0).to_i
low_water_mark = 50
high_water_mark = 300
poll_duration = 10
enqueuer =
Domain::Fa::UserEnqueuer.new(
start_at: start_at,
low_water_mark: low_water_mark,
high_water_mark: high_water_mark,
)
loop { sleep poll_duration if enqueuer.run_once == :sleep }
end
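  # Usage sketch (takes no arguments; enqueues a single job on the "manual"
  # queue):
  #   rake fa:browse_page_job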
desc "run a single browse page job"
task browse_page_job: %i[set_logger_stdout environment] do
Domain::Fa::Job::BrowsePageJob.set(
priority: -20,
queue: "manual",
).perform_later({})
puts "#{Time.now} - browse_page_job - Domain::Fa::Job::BrowsePageJob"
end
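  # Usage sketch (fa_id is required and "12345" is a placeholder; the job is
  # enqueued with force_scan):
  #   rake fa:scan_post_job fa_id=12345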
desc "run a single post scan job"
task scan_post_job: %i[set_logger_stdout environment] do |t, args|
fa_id = ENV["fa_id"] || raise("must provide fa_id")
Domain::Fa::Job::ScanPostJob.set(
priority: -10,
queue: "manual",
).perform_later({ fa_id: fa_id, force_scan: true })
end
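  # Usage sketch: pass either a single fa_id or an inclusive
  # fa_id_start..fa_id_end range (the numbers below are placeholders):
  #   rake fa:scan_post_job_noforce fa_id=12345
  #   rake fa:scan_post_job_noforce fa_id_start=12345 fa_id_end=12400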
desc "run a post scan job (skip force)"
task scan_post_job_noforce: %i[set_logger_stdout environment] do |t, args|
fa_id_start = ENV["fa_id_start"]
fa_id_end = ENV["fa_id_end"]
if fa_id_start || fa_id_end
if !fa_id_start || !fa_id_end
raise("need both fa_id_start and fa_id_end")
else
fa_id_start = fa_id_start.to_i
fa_id_end = fa_id_end.to_i
if fa_id_start > fa_id_end
raise("fa_id_start must be less than fa_id_end")
end
end
else
fa_id = ENV["fa_id"] || raise("must provide fa_id")
fa_id_start = fa_id.to_i
fa_id_end = fa_id.to_i
end
puts "enqueue #{fa_id_start}..#{fa_id_end} (#{fa_id_end - fa_id_start + 1})"
for fa_id in (fa_id_start..fa_id_end)
Domain::Fa::Job::ScanPostJob.set(
priority: -10,
queue: "manual",
).perform_later({ fa_id: fa_id })
end
end
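  # Usage sketch (url_name is the FA user's URL slug; "example_user" is a
  # placeholder):
  #   rake fa:user_page_job url_name=example_user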
desc "run a user page scan job"
task user_page_job: %i[set_logger_stdout environment] do
url_name = ENV["url_name"] || raise("must provide url_name")
Domain::Fa::Job::UserPageJob.set(
priority: -10,
queue: "manual",
).perform_later({ url_name: url_name, force_scan: true })
end
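  # Usage sketch (same url_name convention as user_page_job):
  #   rake fa:user_gallery_job url_name=example_user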
desc "run a user gallery scan job"
task user_gallery_job: %i[set_logger_stdout environment] do
url_name = ENV["url_name"] || raise("must provide url_name")
Domain::Fa::Job::UserGalleryJob.set(
priority: -10,
queue: "manual",
).perform_later({ url_name: url_name, force_scan: true })
end
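  # Usage sketch (mode defaults to "both" per the code below; start_at is
  # optional and passed through to the task):
  #   rake fa:backfill_favs_and_dates mode=both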
desc "backfill FaFavIdAndDate from exisitng user page and favs scans"
task backfill_favs_and_dates: %i[set_logger_stdout environment] do
start_at = ENV["start_at"]
mode = ENV["mode"] || "both"
mode = Tasks::Fa::BackfillFavsAndDatesTask::Mode.deserialize(mode)
Tasks::Fa::BackfillFavsAndDatesTask.new(mode:, start_at:).run
end
  # task export_to_sqlite: %i[environment set_logger_stdout] do
  #   profile = !!ENV["profile"]
  #   sample = !!ENV["sample"]
  #   outfile = ENV["outfile"] || raise("'outfile' required")
  #   tables =
  #     ENV["tables"] ||
  #       raise(
  #         "'tables' required (all, #{Domain::Fa::SqliteExporter::TABLES.keys.join(", ")})",
  #       )
  #   tables = tables.split(",").map(&:to_sym)
  #   db = SQLite3::Database.new(outfile)
  #   exporter = Domain::Fa::SqliteExporter.new(db, sample, tables)
  #   exporter.start_profiling! if profile
  #   exporter.run
  #   exporter.end_profiling! if profile
  # end
  # desc "Backfill favs by scanning historical HTTP logs for favorites pages"
  # task backfill_favs: :environment do
  #   FaBackfillFavs.new(
  #     start_at: ENV["start_at"]&.to_i,
  #     limit: ENV["limit"]&.to_i,
  #     batch_size: ENV["batch_size"]&.to_i,
  #   ).run
  # end
desc "Enqueue pending favs jobs"
task enqueue_pending_favs: :environment do
Tasks::Fa::EnqueueDueUserFavsScansTask.new.run
end
desc "Enqueue pending page jobs"
task enqueue_pending_user_pages: :environment do
Tasks::Fa::EnqueueDueUserPageScansTask.new.run
end
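  # Usage sketch (url_name is optional; without it the task walks every FA
  # post file with a 404 terminal error that has not yet been tried against
  # FurArchiver):
  #   rake fa:get_404_files_from_fur_archiver url_name=example_user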
desc "Get 404 files from FurArchiver"
task get_404_files_from_fur_archiver: :set_logger_stdout do
url_name = ENV["url_name"]
if url_name
query =
Domain::User
.find_by_param("fa@#{url_name}")
.posts
.flat_map do |post|
post.files.where(state: "terminal_error", last_status_code: 404)
end
method = :each
else
query =
Domain::PostFile
.joins(:post)
.for_post_type(Domain::Post::FaPost)
.where(state: "terminal_error", last_status_code: 404)
.where(
"((\"post\".\"json_attributes\"->>'tried_from_fur_archiver')::bool) IS NULL OR ((\"post\".\"json_attributes\"->>'tried_from_fur_archiver')::bool) != TRUE",
)
method = :find_each
end
puts "counting..."
total = query.count
puts "total: #{total}"
pb = ProgressBar.create(total: total, format: "%t: %c/%C %B %p%% %a %e")
counter = 0
query.send(method) do |post_file|
next if post_file.url_str.include?("/stories/")
Job::FaPostFurArchiverPostFileJob.perform_now({ post_file: })
post = post_file.post
puts "processed #{post.to_param} / #{post.title_for_view}".bold
counter += 1
pb.progress = [pb.progress + 1, total].min
end
end
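  # Usage sketch (url_name is required; "example_user" is a placeholder):
  #   rake fa:backfill_add_tracked_object_favs_scans url_name=example_user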
desc "Backfill Domain::UserJobEvent::AddTrackedObject favs scans for a user"
task backfill_add_tracked_object_favs_scans: :set_logger_stdout do
url_name = ENV["url_name"] || raise("must provide url_name")
user =
Domain::User.find_by_param("fa@#{url_name}") || raise("user not found")
Domain::Fa::BackfillTrackedObjectUserFavs.new(user:).run
end
end