Script for fixing e621 posts

This commit is contained in:
Dylan Knutson
2025-02-13 21:18:54 +00:00
parent fb9e36f527
commit 049f83660c
17 changed files with 215 additions and 31 deletions

View File

@@ -234,12 +234,18 @@ end
task fix_e621_post_files: :environment do
query = Domain::Post::E621Post.where(state: "ok").where.missing(:files)
limit = ENV["limit"]&.to_i
puts "query: #{query.to_sql}"
query.find_each(batch_size: 1) do |post|
puts "post: #{post.id} / #{post.e621_id}"
Domain::E621::Job::ScanPostJob.perform_now(post: post)
break
query.find_each(batch_size: 10) do |post|
Domain::E621::Task::FixE621PostMissingFiles.new.run(post)
if limit
limit -= 1
if limit.zero?
puts "limit reached"
break
end
end
end
end
@@ -256,12 +262,7 @@ task perform_good_jobs: :environment do
if job_id
GoodJob::Job.where(id: job_id)
else
GoodJob::Job.where(
job_class: job_class,
finished_at: nil,
error: nil,
performed_at: nil,
)
GoodJob::Job.queued.where(job_class: job_class).order(created_at: :asc)
end
relation.find_each(batch_size: 1) do |job|
@@ -279,7 +280,6 @@ task perform_good_jobs: :environment do
job_instance.deserialize(job.serialized_params)
puts "Running job #{job.id} (#{job.job_class})"
puts "Arguments: #{deserialized_args.inspect}"
# Create execution record
execution =
@@ -333,6 +333,9 @@ task perform_good_jobs: :environment do
)
raise e
ensure
job.update!(
executions_count: GoodJob::Execution.where(active_job_id: job.id).count,
)
# Restore original concurrency config
job_class.good_job_concurrency_config = old_config
GoodJob::CurrentThread.job = nil

View File

@@ -103,6 +103,7 @@ class Domain::E621::Job::ScanUserFavsJob < Domain::E621::Job::Base
caused_by_entry: causing_log_entry,
)
was_new = post.new_record?
post.last_index_page ||= response.log_entry
post.save!
e621_id_to_post_id[e621_post_id] = T.must(post.id)
if was_new

View File

@@ -95,6 +95,7 @@ class Domain::Inkbunny::Job::UpdatePostsJob < Domain::Inkbunny::Job::Base
pools_to_update
)
logger.prefix = "ib_post_id #{submission_json["submission_id"].to_s.bold}"
logger.info "update post #{submission_json["submission_id"].to_s.bold}"
post =
Domain::Inkbunny::Post.includes(:pools).find_by!(

View File

@@ -45,8 +45,7 @@ class Domain::E621::TagUtil
file.url_str =
begin
file = post_json["file"]
file["url"] ||
"https://static1.e621.net/data/#{e621_md5[0...2]}/#{e621_md5[2...4]}/#{e621_md5}.#{file["ext"]}"
file["url"] || md5_to_file_url(e621_md5, file["ext"])
end
e621_post.md5 = e621_md5
e621_post.posted_at = post_json["created_at"]
@@ -68,4 +67,9 @@ class Domain::E621::TagUtil
e621_post
end
sig { params(md5: String, ext: String).returns(String) }
# Builds the canonical e621 static-file URL for a file identified by its
# MD5 hex digest and extension. e621 shards files into two directory
# levels taken from the first four hex characters of the digest.
def self.md5_to_file_url(md5, ext)
  shard_one = md5[0...2]
  shard_two = md5[2...4]
  "https://static1.e621.net/data/#{shard_one}/#{shard_two}/#{md5}.#{ext}"
end
end

View File

@@ -0,0 +1,52 @@
# typed: strict
# Repairs an E621 post that is in state "ok" but has no file records, by
# replaying post initialization from the cached index-page response that
# originally discovered the post. Every inconsistency is logged and causes
# an early return; this task never raises for missing or mismatched data.
class Domain::E621::Task::FixE621PostMissingFiles
  extend T::Sig
  include HasColorLogger

  sig { params(post: Domain::Post::E621Post).void }
  def run(post)
    index_page = post.last_index_page
    unless index_page
      logger.error("no index page for post #{post.id}")
      return
    end

    index_page_contents = index_page.response&.contents
    unless index_page_contents
      logger.error("no index page contents for post #{post.id}")
      return
    end

    # The guard above already returned when contents are missing, so parse
    # unconditionally (the previous trailing `if` was dead code).
    index_page_json = JSON.parse(index_page_contents)
    post_json = index_page_json["posts"].find { |p| p["id"] == post.e621_id }
    unless post_json
      logger.error("no post json for post #{post.id}")
      return
    end

    # Refuse to rebuild from a stale index page whose file digest no longer
    # matches the post we are trying to fix.
    if post_json["file"]["md5"] != post.md5
      logger.error(
        "md5 mismatch for post #{post.id}: #{post.md5} != #{post_json["file"]["md5"]}",
      )
      return
    end

    post_updated =
      Domain::E621::TagUtil.initialize_or_update_post(
        post_json: post_json,
        caused_by_entry: index_page,
      )
    # initialize_or_update_post should resolve to the same record; anything
    # else means the JSON matched a different post and must not be saved.
    unless post_updated.id == post.id
      logger.error(
        "new post id mismatch for post #{post.id}: #{post_updated.id} != #{post.id}",
      )
      return
    end

    # Persist before announcing success so a failed save! is not logged as fixed.
    post_updated.save!
    logger.info(
      "[fixed missing file][post id: #{post.id}][post e621_id: #{post.e621_id}]",
    )
  end
end

View File

@@ -165,13 +165,8 @@ class Domain::Post < ReduxApplicationRecord
nil
end
# Fallback creator name shown in views when no creator record is linked;
# the base Post has no fallback, subclasses override as needed.
sig { overridable.returns(T.nilable(String)) }
def primary_creator_name_fallback_for_view
nil
end
sig { overridable.returns(T.nilable(ActiveSupport::TimeWithZone)) }
def scanned_post_at
sig { overridable.returns(T.nilable(HttpLogEntry)) }
def scanned_post_log_entry_for_view
nil
end
@@ -193,4 +188,9 @@ class Domain::Post < ReduxApplicationRecord
# Favorite count for display; the base Post reports none (subclasses
# with favorite data override this).
def num_favorites_for_view
nil
end
# Fallback creator name shown in views when no creator record is linked;
# the base Post has no fallback, subclasses override as needed.
sig { overridable.returns(T.nilable(String)) }
def primary_creator_name_fallback_for_view
nil
end
end

View File

@@ -50,6 +50,8 @@ class Domain::Post::E621Post < Domain::Post
belongs_to :last_index_page, class_name: "HttpLogEntry", optional: true
belongs_to :scan_log_entry, class_name: "HttpLogEntry", optional: true
belongs_to :caused_by_entry, class_name: "HttpLogEntry", optional: true
validates :state, inclusion: { in: %w[ok removed scan_error file_error] }
validates :rating,
inclusion: {
@@ -67,6 +69,13 @@ class Domain::Post::E621Post < Domain::Post
self.artists_array ||= []
end
# Records +log_entry+ as the most recent index page that referenced this
# post and appends its id to the running history in +index_page_ids+.
sig { params(log_entry: HttpLogEntry).void }
def set_index_page_entry(log_entry)
  self.last_index_page = log_entry
  # Reassign rather than mutate in place (`<<`): in-place mutation of an
  # array attribute can bypass ActiveRecord dirty tracking, so the appended
  # id could be silently dropped on save. NOTE(review): assumes
  # index_page_ids is a tracked array column — confirm against the schema.
  self.index_page_ids = (self.index_page_ids || []) + [log_entry.id]
end
sig { override.returns([String, Symbol]) }
def self.param_prefix_and_attribute
["e621", :e621_id]
@@ -97,6 +106,11 @@ class Domain::Post::E621Post < Domain::Post
"E621"
end
# Best available log entry documenting when this post was last scanned:
# prefer the direct scan entry, then the last index page, then the entry
# that originally caused the post to be created. Short-circuits so later
# associations are only loaded when earlier ones are absent.
sig { override.returns(T.nilable(HttpLogEntry)) }
def scanned_post_log_entry_for_view
  entry = self.scan_log_entry
  entry ||= self.last_index_page
  entry || self.caused_by_entry
end
sig { override.returns(String) }
def self.domain_name_for_view
"E621"

View File

@@ -99,6 +99,11 @@ class Domain::Post::FaPost < Domain::Post
num_favorites
end
# Log entry for the most recent submission scan, falling back to the
# guessed entry when no definite one is recorded. Short-circuits so the
# guess is only computed when needed.
sig { override.returns(T.nilable(HttpLogEntry)) }
def scanned_post_log_entry_for_view
  if (known_entry = self.last_submission_log_entry)
    known_entry
  else
    self.guess_last_submission_log_entry
  end
end
sig { returns(T.nilable(String)) }
def status_for_view
case self.state

View File

@@ -64,12 +64,13 @@ class ReduxApplicationRecord < ActiveRecord::Base
@after_save_deferred_jobs ||= T.let([], T.nilable(T::Array[DeferredJob]))
GoodJob::Bulk.enqueue do
@after_save_deferred_jobs.each do |deferred_job|
deferred_job
.job_class
.set(deferred_job.set_args)
.perform_later(deferred_job.params)
job =
deferred_job
.job_class
.set(deferred_job.set_args)
.perform_later(deferred_job.params)
logger.info(
"[class: #{self.class.name}][id: #{id}][enqueued job: #{deferred_job.job_class.name}]",
"[class: #{self.class.name}][id: #{id}][enqueued job: #{deferred_job.job_class.name}][job_id: #{job.job_id}]",
)
Scraper::Metrics::JobBaseMetrics.observe_job_enqueued(
source_class: self.class,

View File

@@ -16,6 +16,14 @@
</div>
<i class="fa-solid fa-arrow-up-right-from-square text-slate-400"></i>
</div>
<% new_post = Domain::Post::E621Post.find_by(e621_id: @post.e621_id) %>
<% if new_post %>
<div class="py-2">
<%= link_to "Go to new post",
domain_post_path(new_post),
class: "text-white bg-blue-500 hover:bg-blue-800 transition-colors duration-200 px-4 py-2 rounded-md shadow hover:shadow-lg" %>
</div>
<% end %>
<div class="mt-2 flex flex-wrap gap-x-4 text-sm text-slate-600">
<span
><i class="fa-solid fa-star mr-1"></i>Score: <%= @post.score %></span

View File

@@ -3,12 +3,12 @@
<div class="flex flex-row gap-4 w-full mt-2">
<div class="grow text-sm text-slate-500">
<i class="fa-solid fa-magnifying-glass mr-1"></i>
<% if (scanned_hle = post.last_submission_log_entry || post.guess_last_submission_log_entry) %>
<% if (scanned_hle = post.scanned_post_log_entry_for_view) %>
<%= link_to "Scanned #{time_ago_in_words(scanned_hle.requested_at)} ago",
log_entry_path(scanned_hle),
class: "text-blue-600 hover:underline",
target: "_blank",
title: scanned_hle.requested_at.strftime("%Y-%m-%d %H:%M:%S") %>
log_entry_path(scanned_hle),
class: "text-blue-600 hover:underline",
target: "_blank",
title: scanned_hle.requested_at.strftime("%Y-%m-%d %H:%M:%S") %>
<% else %>
Unknown when post scanned
<% end %>

View File

@@ -1,10 +1,17 @@
<% if policy(post).view_file? %>
<section>
<% if log_entry = post.primary_file_for_view&.log_entry %>
<% if (log_entry = post.primary_file_for_view&.log_entry) && log_entry.status_code == 200 %>
<%= render partial: "log_entries/content_container",
locals: {
log_entry: log_entry,
} %>
<% else %>
<section class="flex grow justify-center overflow-clip">
<div class="text-slate-500">
<i class="fa-solid fa-exclamation-triangle"></i>
File error
</div>
</section>
<% end %>
</section>
<%= render partial: "log_entries/file_details_sky_section",

View File

@@ -15,6 +15,10 @@
<i class="fa-solid fa-clock mr-1"></i>
Response Time: <%= log_entry.response_time_ms %>ms
</span>
<span>
<i class="fa-solid fa-signal mr-1"></i>
Status: <span class="<%= log_entry.status_code == 200 ? 'text-green-600' : 'text-red-600' %>"><%= log_entry.status_code %></span>
</span>
</div>
</div>
</section>

View File

@@ -0,0 +1,16 @@
# typed: true
# DO NOT EDIT MANUALLY
# This is an autogenerated file for dynamic methods in `Domain::E621::Task::FixE621PostMissingFiles`.
# Please instead update this file by running `bin/tapioca dsl Domain::E621::Task::FixE621PostMissingFiles`.
class Domain::E621::Task::FixE621PostMissingFiles
# Instance-level logger provided by HasColorLogger (Tapioca DSL stub).
sig { returns(ColorLogger) }
def logger; end
class << self
# Class-level logger provided by HasColorLogger (Tapioca DSL stub).
sig { returns(ColorLogger) }
def logger; end
end
end

View File

@@ -443,6 +443,9 @@ class Domain::Post::E621Post
end
module GeneratedAssociationMethods
sig { params(args: T.untyped, blk: T.untyped).returns(::HttpLogEntry) }
def build_caused_by_entry(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(::Domain::PostFile) }
def build_file(*args, &blk); end
@@ -461,6 +464,24 @@ class Domain::Post::E621Post
sig { params(args: T.untyped, blk: T.untyped).returns(::Domain::User::E621User) }
def build_uploader_user(*args, &blk); end
sig { returns(T.nilable(::HttpLogEntry)) }
def caused_by_entry; end
sig { params(value: T.nilable(::HttpLogEntry)).void }
def caused_by_entry=(value); end
sig { returns(T::Boolean) }
def caused_by_entry_changed?; end
sig { returns(T::Boolean) }
def caused_by_entry_previously_changed?; end
sig { params(args: T.untyped, blk: T.untyped).returns(::HttpLogEntry) }
def create_caused_by_entry(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(::HttpLogEntry) }
def create_caused_by_entry!(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(::Domain::PostFile) }
def create_file(*args, &blk); end
@@ -589,6 +610,9 @@ class Domain::Post::E621Post
sig { params(value: T::Enumerable[::Domain::PostGroupJoin::E621PoolJoin]).void }
def post_group_joins=(value); end
sig { returns(T.nilable(::HttpLogEntry)) }
def reload_caused_by_entry; end
sig { returns(T.nilable(::Domain::PostFile)) }
def reload_file; end
@@ -607,6 +631,9 @@ class Domain::Post::E621Post
sig { returns(T.nilable(::Domain::User::E621User)) }
def reload_uploader_user; end
sig { void }
def reset_caused_by_entry; end
sig { void }
def reset_file; end

View File

@@ -0,0 +1,36 @@
# typed: true
# DO NOT EDIT MANUALLY
# This is an autogenerated file for dynamic methods in `GoodJob::JobsController`.
# Please instead update this file by running `bin/tapioca dsl GoodJob::JobsController`.
class GoodJob::JobsController
# NOTE: Tapioca-generated stub (see file header) — regenerate with
# `bin/tapioca dsl GoodJob::JobsController` instead of hand-editing.
sig { returns(HelperProxy) }
def helpers; end
# Helper modules mixed into this controller's view context.
module HelperMethods
include ::Turbo::DriveHelper
include ::Turbo::FramesHelper
include ::Turbo::IncludesHelper
include ::Turbo::StreamsHelper
include ::ActionView::Helpers::CaptureHelper
include ::ActionView::Helpers::OutputSafetyHelper
include ::ActionView::Helpers::TagHelper
include ::Turbo::Streams::ActionHelper
include ::ActionText::ContentHelper
include ::ActionText::TagHelper
include ::Webpacker::Helper
include ::ActionController::Base::HelperMethods
include ::GoodJob::IconsHelper
include ::GoodJob::ApplicationHelper
include ::GoodJob::ApplicationController::HelperMethods
include ::HelpersInterface
include ::Domain::PostsHelper
include ::GoodJobHelper
end
class HelperProxy < ::ActionView::Base
include HelperMethods
end
end

View File

@@ -41,6 +41,11 @@ RSpec.describe Domain::E621::Job::ScanUserFavsJob do
)
expect(Domain::UserPostFav.count).to eq(5)
post5212363 = Domain::Post::E621Post.find_by(e621_id: 5_212_363)
expect(post5212363).to be_present
expect(post5212363.caused_by_entry).to eq(log_entries[0])
expect(post5212363.last_index_page).to eq(log_entries[0])
expect(post5212363.index_page_ids).to eq([log_entries[0].id])
# Verify StaticFileJob was enqueued for each new post
static_file_jobs =
SpecUtil.enqueued_job_args(Domain::E621::Job::StaticFileJob)