# redux-scraper/Rakefile

# Add your own tasks in files placed in lib/tasks ending in .rake,
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.
require "rake/testtask"
require_relative "config/application"
Rails.application.load_tasks
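
# Make the app's rake/ directory requirable and load its task files below.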
$LOAD_PATH << Rails.root.join("rake")
Rake.application.rake_require "sst"
Rake.application.rake_require "log_entry"
Rake.application.rake_require "metrics"
Rake.application.rake_require "fa"
Rake.application.rake_require "e621"
Rake.application.rake_require "twitter"
task :set_ar_stdout => :environment do
  ActiveRecord::Base.logger = Logger.new($stdout)
end
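
# Prerequisite task: replace the Rails logger with a terse stdout logger
# ("SEVERITY: message"), silence ActiveRecord/ActiveJob, and point GoodJob at it.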
task :set_logger_stdout => :environment do
  Rails.logger = Logger.new($stdout)
  Rails.logger.formatter = proc do |severity, datetime, progname, msg|
    "#{severity}: #{msg}\n"
  end
  ActiveRecord::Base.logger = nil
  ActiveJob::Base.logger = nil
  GoodJob.logger = Rails.logger
end
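
# Long-running supervisor: background threads capture PgHero space/query stats
# and enqueue the scraper's periodic FA/e621 jobs on fixed intervals.
# Usage: rake periodic_tasks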
task :periodic_tasks => [:environment, :set_logger_stdout] do
  Thread.new do
    loop do
      Rake::Task["pghero:capture_space_stats"].execute
      puts "logged space stats"
      sleep 6.hours
    end
  end
  Thread.new do
    loop do
      Rake::Task["pghero:capture_query_stats"].execute
      puts "logged query stats"
      sleep 5.minutes
    end
  end
  Thread.new do
    loop do
      Rake::Task["fa:browse_page_job"].execute
      Rake::Task["fa:home_page_job"].execute
      Rake::Task["e621:posts_index_job"].execute
      puts "enqueued periodic jobs"
      sleep 1.minute
    end
  end
  # Keep the main thread alive so the worker threads keep running.
  loop do
    sleep 10
  end
end
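
# Export, import, and migrate blob-entry samples via BlobEntrySampleExporter,
# e.g. `rake blob_entries:export_samples limit=100 outfile=samples.json`.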
namespace :blob_entries do
  task :export_samples => :environment do
    limit = ENV["limit"]&.to_i || raise("need 'limit' (num)")
    outfile = ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
    BlobEntrySampleExporter.new.export_samples(limit, outfile)
  end
  task :import_samples => :environment do
    infile = ENV["infile"] || raise("need 'infile' (file path, .json encoded)")
    BlobEntrySampleExporter.new.import_samples(infile)
  end
  task :migrate_entries => :environment do
    start_at = ENV["start_at"]
    batch_size = ENV["batch_size"]&.to_i || 64
    BlobEntrySampleExporter.new.migrate_blob_entries(start_at, batch_size)
  end
end
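
# Stream a database sample for the given url_names to stdout, or read one back
# from stdin; the file-based variants are left commented out.
# Usage: rake db_sampler:export url_names=a,b > sample.dat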
namespace :db_sampler do
  task :export => :environment do
    url_names = ENV["url_names"] || raise("need 'url_names' (comma-separated)")
    # outfile_path = ENV["outfile"] || raise("need 'outfile' (file path)")
    # outfile = File.open(outfile_path, "wb")
    outfile = $stdout
    DbSampler.new(outfile).export(url_names.split(","))
  ensure
    outfile.close if outfile
  end
  task :import => [:environment] do
    # infile_path = ENV["infile"] || raise("need 'infile' (file path)")
    # infile = File.open(infile_path, "rb")
    infile = $stdin
    DbSampler.new(infile).import
  ensure
    infile.close if infile
  end
end
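
# Launch a GoodJob worker with its tuning knobs set via ENV. 'proxy' is
# validated against the known proxy list and presumably read by the worker
# process from ENV. Usage: rake good_job proxy=direct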
task :good_job do
  proxies = ["direct", "proxy-1", "dedipath-1", "serverhost-1"]
  proxy = ENV["proxy"]
  raise("'proxy' must be set") unless proxy
  raise("'proxy' must be one of #{proxies}") unless proxies.include?(proxy)
  env_hash = {
    "RAILS_ENV" => "worker",
    "GOOD_JOB_POLL_INTERVAL" => "5",
    "GOOD_JOB_MAX_CACHE" => "10000",
    "GOOD_JOB_QUEUE_SELECT_LIMIT" => "4096",
    "GOOD_JOB_MAX_THREADS" => "4",
    "GOOD_JOB_QUEUES" => [
      "manual:4",
      "fa_post,e621:2",
      "*:6",
    ].compact.join(";"),
  }
  env_hash.each do |key, value|
    ENV[key] = value
    puts "$> #{key.light_black.bold} = #{value.bold}"
  end
  cmd = "bundle exec good_job"
  puts "$> #{cmd.bold}"
  exec(cmd)
end
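
# Recompute the signature on every Delayed::Job row; jobs that fail to save
# afterwards (presumably duplicate signatures) are destroyed. 'start_at'
# optionally skips ahead to a starting id.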
task :recompute_job_signatures => :environment do
  ActiveRecord::Base.logger = Logger.new($stdout)
  ActiveRecord::Base.logger.level = :error
  start_at = ENV["start_at"]&.to_i || 0
  count = 0
  destroyed = 0
  puts "# jobs: #{Delayed::Job.count}"
  Delayed::Job.find_each(start: start_at) do |job|
    job.set_signature
    unless job.save
      job.destroy
      destroyed += 1
    end
    count += 1
    if count % 50 == 0
      puts "processed #{count}, destroyed #{destroyed} - last id: #{job.id}"
    end
  end
end
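
# Re-run failed jobs (last_error set, at most 2 attempts) one at a time
# through a fresh Delayed::Worker, with full logging on stdout.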
task :workoff_failed_jobs => [:environment, :set_ar_stdout, :set_logger_stdout] do
  worker = Delayed::Worker.new
  Delayed::Job.where(
    "last_error is not null and attempts <= 2"
  ).find_each(batch_size: 1) do |job|
    worker.run(job)
  end
end
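
# Rewrite a CSV with its data rows in reverse order, preserving the header row;
# output is written to "rev_" + file. Usage: rake reverse_csv file=data.csv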
task :reverse_csv do
  require "csv"
  file = ENV["file"] || raise("need 'file' (file path)")
  in_csv = CSV.parse(File.open(file, "r"), headers: true)
  out_csv = CSV.new(File.open("rev_" + file, "w"), write_headers: true, headers: in_csv.headers)
  in_csv.reverse_each do |row|
    # Each row enumerates as [header, value] pairs; keep just the values
    # (Array#second comes from ActiveSupport, loaded above).
    out_csv << row.map(&:second)
  end
  out_csv.close
end