# Add your own tasks in files placed in lib/tasks ending in .rake,
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.

require "rake/testtask"
require_relative "config/application"

Rails.application.load_tasks
Dir.glob(Rails.root.join("rake", "*.rake")).each { |rake_file| load rake_file }

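# Helper prerequisites used by the ad-hoc tasks below: they point the relevant
# loggers at stdout so task output is readable in a terminal.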
task set_ar_stdout: :environment do
  ActiveRecord::Base.logger = Logger.new($stdout)
end

task set_logger_stdout: :environment do
  Rails.logger = Logger.new($stdout)
  Rails.logger.formatter =
    proc { |severity, _datetime, _progname, msg| "#{severity}: #{msg}\n" }
  ActiveRecord::Base.logger = nil
  ActiveJob::Base.logger = nil
  GoodJob.logger = Rails.logger
end

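# Captures pghero statistics on a fixed schedule from two background threads
# (space stats every 6 hours, query stats every 5 minutes) while the main
# thread sleeps forever; presumably run as its own long-lived process, e.g.:
#   bin/rake periodic_tasks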
task periodic_tasks: %i[environment set_logger_stdout] do
  Thread.new do
    loop do
      Rake::Task["pghero:capture_space_stats"].execute
      puts "logged space stats"
      sleep 6.hours
    end
  end

  Thread.new do
    loop do
      Rake::Task["pghero:capture_query_stats"].execute
      puts "logged query stats"
      sleep 5.minutes
    end
  end

  loop { sleep 10 }
end

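# Export/import a sample of the database via DbSampler. Export writes to
# stdout and import reads from stdin, so the two ends are presumably joined by
# a file or a shell pipeline, e.g. (url_names values are illustrative):
#   bin/rake db_sampler:export url_names=alice,bob > sample.dump
#   bin/rake db_sampler:import < sample.dump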
namespace :db_sampler do
  task export: :environment do
    url_names = ENV["url_names"] || raise("need 'url_names' (comma-separated)")
    outfile = $stdout
    DbSampler.new(outfile).export(url_names.split(","))
  ensure
    outfile.close if outfile
  end

  task import: [:environment] do
    infile = $stdin
    DbSampler.new(infile).import
  ensure
    infile.close if infile
  end
end

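# Boots a GoodJob worker with a hard-coded environment, then execs
# `bundle exec good_job`. The GOOD_JOB_QUEUES value uses GoodJob's
# "queue[,queue]:threads;..." format: `manual` gets 4 threads, `fa_post` and
# `e621` share 2, and everything else (`*`) gets 6. Override by setting
# GOOD_JOB_QUEUES before invoking the task.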
task good_job: %i[environment set_ar_stdout set_logger_stdout] do
  env_hash = {
    "RAILS_ENV" => "worker",
    "GOOD_JOB_POLL_INTERVAL" => "5",
    "GOOD_JOB_MAX_CACHE" => "10000",
    "GOOD_JOB_QUEUE_SELECT_LIMIT" => "4096",
    "GOOD_JOB_MAX_THREADS" => "4",
    "GOOD_JOB_ENABLE_CRON" => "1",
    "GOOD_JOB_QUEUES" =>
      ENV["GOOD_JOB_QUEUES"] || %w[manual:4 fa_post,e621:2 *:6].join(";"),
  }

  env_hash.each do |key, value|
    ENV[key] = value
    puts "$> #{key.light_black.bold} = #{value.bold}"
  end

  cmd = "bundle exec good_job"
  puts "$> #{cmd.bold}"
  exec(cmd)
end

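# Reverses the row order of a CSV file, writing the result to a "rev_"-prefixed
# copy of the given path, e.g.:
#   bin/rake reverse_csv file=posts.csv   # writes rev_posts.csv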
task :reverse_csv do
  require "csv"

  file = ENV["file"] || raise("need 'file' (file path)")
  in_csv = CSV.parse(File.read(file), headers: true)
  out_csv =
    CSV.new(
      File.open("rev_" + file, "w"),
      write_headers: true,
      headers: in_csv.headers,
    )
  in_csv.reverse_each { |row| out_csv << row.fields }
  out_csv.close
end

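# Migrates per-site records (e621, FA, Inkbunny) into the Domain::* models via
# Domain::MigrateToDomain. Scope with ENV (values below are illustrative):
#   bin/rake migrate_to_domain only_domains=fa,e621 only_user=somename
# only_domains defaults to all of e621, fa, ib and must stay a subset of them.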
task migrate_to_domain: :environment do
  only_user = ENV["only_user"]
  allowed_domains = %w[e621 fa ib]
  only_domains = (ENV["only_domains"] || "").split(",")
  only_domains = allowed_domains if only_domains.empty?
  if (only_domains - allowed_domains).any?
    raise "only_domains must be a subset of #{allowed_domains.join(", ")}"
  end

  migrator = Domain::MigrateToDomain.new

  if only_domains.include?("e621")
    migrator.migrate_e621_users(only_user: only_user)
    migrator.migrate_e621_posts(only_user: only_user)
    migrator.migrate_e621_users_favs(only_user: only_user)
  end

  if only_domains.include?("fa")
    migrator.migrate_fa_users(only_user: only_user)
    migrator.migrate_fa_posts(only_user: only_user)
    migrator.migrate_fa_users_favs(only_user: only_user)
    migrator.migrate_fa_users_followed_users(only_user: only_user)
  end

  if only_domains.include?("ib")
    migrator.migrate_inkbunny_users(only_user: only_user)
    migrator.migrate_inkbunny_posts(only_user: only_user)
    migrator.migrate_inkbunny_pools(only_user: nil) if only_user.nil?
  end
end

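# Walks FA posts in the "ok" state that are missing last_submission_page_id or
# posted_at, re-parses each post's most recent cached submission page, and
# backfills those fields (marking the post as removed when the page says the
# submission is gone). Logs one line per post. Optional ENV:
#   only_fa_id - restrict to a single FA submission id
#   start_at   - resume find_each from a given primary key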
task infer_last_submission_log_entries: :environment do
  only_fa_id = ENV["only_fa_id"]
  start = ENV["start_at"]&.to_i

  relation =
    if only_fa_id
      Domain::Fa::Post.where(fa_id: only_fa_id)
    else
      Domain::Fa::Post
        .where(state: :ok)
        .where(last_submission_page_id: nil)
        .or(Domain::Fa::Post.where(state: :ok).where(posted_at: nil))
    end

  relation.find_each(batch_size: 10, start:) do |post|
    parts = ["[id: #{post.id}]", "[fa_id: #{post.fa_id}]"]

    log_entry = post.guess_last_submission_page
    unless log_entry
      parts << "[no log entry]"
      next
    end

    contents = log_entry.response&.contents
    unless contents
      parts << "[no contents]"
      next
    end

    parser = Domain::Fa::Parser::Page.new(contents)
    if parser.submission_not_found?
      parts << "[removed]"
      post.state = :removed
    else
      posted_at = parser.submission.posted_date
      post.posted_at ||= posted_at
      parts << "[posted at: #{posted_at}]"
    end

    if post.last_submission_page_id.present? &&
         log_entry.id != post.last_submission_page_id
      parts << "[overwrite]"
    end
    post.last_submission_page_id = log_entry.id

    parts << "[log entry: #{log_entry.id}]"
    parts << "[uri: #{log_entry.uri}]"
    post.save!
  rescue => e
    parts << "[error: #{e.message}]"
  ensure
    puts parts.join(" ")
  end
end

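# Re-resolves the attached file for FA posts whose file_id is in the given
# list, e.g.:
#   bin/rake fix_fa_post_files file_ids=123,456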
task fix_fa_post_files: :environment do
  file_ids = ENV["file_ids"]&.split(",") || raise("need 'file_ids'")
  Domain::Fa::Post
    .where(file_id: file_ids)
    .find_each { |post| post.fix_file_by_uri! }
end

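# Same repair as fix_fa_post_files, but driven by a CSV file with an "id"
# column of FA post primary keys (the file name below is illustrative):
#   bin/rake fix_fa_post_files_by_csv csv_file=broken_posts.csv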
task fix_fa_post_files_by_csv: :environment do
  require "csv"

  csv_file = ENV["csv_file"] || raise("need 'csv_file'")
  CSV.foreach(csv_file, headers: true) do |row|
    post = Domain::Fa::Post.find(row["id"].to_i)
    post.fix_file_by_uri!
  end
end

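# One-off repair for a hard-coded set of FA posts with bad file records:
# re-parses each post's cached submission page and clears the file when the
# submission no longer exists. The URL-mismatch branch is left commented out
# below, apparently from an earlier debugging session.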
task fix_buggy_fa_posts: :environment do
  require "uri"

  post_fa_ids = %w[7704069 7704068 6432347 6432346].map(&:to_i)

  post_fa_ids.each do |fa_id|
    post = Domain::Fa::Post.find_by(fa_id: fa_id)
    next unless post&.file
    post_file_url_str = Addressable::URI.parse(post.file_url_str).to_s
    file_url_str = Addressable::URI.parse(CGI.unescape(post.file.uri.to_s)).to_s
    hle = post.guess_last_submission_page

    parser = Domain::Fa::Parser::Page.new(hle.response.contents)
    if parser.submission_not_found?
      post.file = nil
      post.save!
      puts "submission not found"
    else
      submission = parser.submission
      full_res_img = Addressable::URI.parse(submission.full_res_img)
      full_res_img.scheme = "https" if full_res_img.scheme.blank?
      matches = full_res_img.to_s == post.file_url_str
    end

    # if post_file_url_str != file_url_str
    #   post.file = nil
    #   post.save!
    #   puts "url mismatch: #{post_file_url_str} != #{file_url_str}"
    # end
    # binding.pry
  end
end

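# Repairs e621 posts that are in the "ok" state but have no attached files,
# delegating the actual work to Domain::E621::Task::FixE621PostMissingFiles.
# Set limit to stop after that many posts, e.g.:
#   bin/rake fix_e621_post_files limit=100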
task fix_e621_post_files: :environment do
  query = Domain::Post::E621Post.where(state: "ok").where.missing(:files)
  limit = ENV["limit"]&.to_i
  puts "query: #{query.to_sql}"

  query.find_each(batch_size: 10) do |post|
    Domain::E621::Task::FixE621PostMissingFiles.new.run(post)
    if limit
      limit -= 1
      if limit.zero?
        puts "limit reached"
        break
      end
    end
  end
end

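# Runs queued GoodJob jobs inline in the current process instead of through a
# worker, which is handy for debugging a single job or draining a small
# backlog. Select jobs with ENV (SomeJob is a placeholder class name):
#   job_id    - run one specific job
#   job_class - run queued jobs of this class, oldest first
#   limit     - stop after this many jobs
#   e.g. bin/rake perform_good_jobs job_class=SomeJob limit=5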
task perform_good_jobs: :environment do
  job_class = ENV["job_class"]
  job_id = ENV["job_id"]
  limit = ENV["limit"]&.to_i

  if job_id.blank? && job_class.blank?
    raise "need 'job_id' or 'job_class'"
  end

  relation =
    if job_id
      GoodJob::Job.where(id: job_id)
    else
      GoodJob::Job.queued.where(job_class: job_class).order(created_at: :asc)
    end

  relation.find_each(batch_size: 1) do |job|
    job = T.cast(job, GoodJob::Job)

    # Get the actual job instance and deserialize arguments
    serialized_args = job.serialized_params["arguments"]
    if serialized_args.nil?
      puts "No arguments found for job #{job.id}"
      next
    end

    deserialized_args = ActiveJob::Arguments.deserialize(serialized_args)
    job_instance = job.job_class.constantize.new
    job_instance.deserialize(job.serialized_params)

    puts "Running job #{job.id} (#{job.job_class})"

    # Create execution record
    execution =
      GoodJob::Execution.create!(
        active_job_id: job.active_job_id,
        job_class: job.job_class,
        queue_name: job.queue_name,
        serialized_params: job.serialized_params,
        scheduled_at: job.scheduled_at,
        created_at: Time.current,
        updated_at: Time.current,
        process_id: SecureRandom.uuid,
      )

    start_time = Time.current

    # Temporarily disable concurrency limits (use a separate local so the
    # job_class ENV string above is not clobbered)
    job_klass = job.job_class.constantize
    old_config = job_klass.good_job_concurrency_config
    job_klass.good_job_concurrency_config = { total_limit: nil }

    begin
      # Perform the job with deserialized arguments
      GoodJob::CurrentThread.job = job
      job.update!(performed_at: Time.current)
      job_instance.arguments = deserialized_args
      job_instance.perform_now

      # Update execution and job records
      execution.update!(
        finished_at: Time.current,
        error: nil,
        error_event: nil,
        duration: Time.current - start_time,
      )
      job.update!(finished_at: Time.current)
      puts "Job completed successfully"
    rescue => e
      puts "Job failed: #{e.message}"
      # Update execution and job records with error
      execution.update!(
        finished_at: Time.current,
        error: e.message,
        error_event: "execution_failed",
        error_backtrace: e.backtrace,
        duration: Time.current - start_time,
      )
      job.update!(
        error: "#{e.class}: #{e.message}",
        error_event: "execution_failed",
      )
      raise e
    ensure
      job.update!(
        executions_count: GoodJob::Execution.where(active_job_id: job.id).count,
      )
      # Restore original concurrency config
      job_klass.good_job_concurrency_config = old_config
      GoodJob::CurrentThread.job = nil
    end

    if limit
      limit -= 1
      if limit.zero?
        puts "limit reached"
        break
      end
    end
  end
end