visual fingerprinting

This commit is contained in:
Dylan Knutson
2025-03-11 01:06:58 +00:00
parent ed30a8c5bd
commit 305ddd2cb6
26 changed files with 631 additions and 215 deletions

142
Rakefile
View File

@@ -482,28 +482,6 @@ task sample_migrated_favs: :environment do
puts "new fav count: #{new_user.faved_posts.count}"
end
# Removes the 'migrated_user_favs_at' key from json_attributes on every
# domain_users row of type Domain::User::E621User that currently has a
# non-NULL value for it, using one raw UPDATE statement.
#
# NOTE(review): a per-record alternative (find_in_batches over E621User,
# assigning migrated_user_favs_at = nil and calling save! inside a
# transaction, with a progress bar) was previously kept here as
# commented-out code.
task clear_e621_user_favs_migrated_at: :environment do
  clear_sql = <<~SQL
    UPDATE domain_users
    SET json_attributes = json_attributes - 'migrated_user_favs_at'
    WHERE type = 'Domain::User::E621User'
    AND json_attributes->>'migrated_user_favs_at' IS NOT NULL
  SQL

  puts "clearing migrated_user_favs_at"
  ReduxApplicationRecord.connection.execute(clear_sql)
  puts "done"
end
task create_post_file_fingerprints: :environment do
def migrate_posts_for_user(user)
puts "migrating posts for #{user.to_param}"
@@ -512,30 +490,130 @@ task create_post_file_fingerprints: :environment do
total: user.posts.count,
format: "%t: %c/%C %B %p%% %a %e",
)
user
.posts
.includes(files: :blob)
.find_in_batches(batch_size: 16) do |batch|
.includes(:files)
.find_in_batches(batch_size: 64) do |batch|
ReduxApplicationRecord.transaction do
batch.each do |post|
post.files.each { |file| file.ensure_fingerprint! }
puts "migrated #{post.id} / #{post.to_param} / '#{post.title_for_view}'"
pb.progress = [pb.progress + 1, pb.total].min
end
batch.each { |post| migrate_post(post) }
pb.progress = [pb.progress + 1, pb.total].min
end
end
end
if ENV["user"].present?
# Prints a "migrating ..." line for +post+, then calls migrate_post_file for
# each entry of post.files inside ColorLogger.quiet.
#
# A StandardError raised while handling one file is printed as
# "error: <message>" and the remaining files are still processed.
def migrate_post(post)
  puts "migrating #{post.id} / #{post.to_param} / '#{post.title_for_view}'"

  ColorLogger.quiet do
    post.files.each do |post_file|
      begin
        migrate_post_file(post_file)
      rescue StandardError => error
        puts "error: #{error.message}"
      end
    end
  end
end
# Instantiates Domain::PostFileThumbnailJob and calls #perform on it directly
# with { post_file: post_file }, wrapped in ColorLogger.quiet.
#
# A StandardError raised by the job is printed as "error: <message>" instead
# of propagating to the caller.
def migrate_post_file(post_file)
  thumbnail_job = Domain::PostFileThumbnailJob.new

  ColorLogger.quiet do
    begin
      thumbnail_job.perform({ post_file: post_file })
    rescue StandardError => error
      puts "error: #{error.message}"
    end
  end
end
if ENV["post_file_descending"].present?
total = 49_783_962 # cache this value
pb = ProgressBar.create(total:, format: "%t: %c/%C %B %p%% %a %e")
i = 0
Domain::PostFile
.where(state: "ok")
.includes(:blob)
.find_each(
order: :desc,
batch_size: 32,
start: ENV["start_at"],
) do |post_file|
i += 1
if i % 100 == 0
puts "migrating #{post_file.id} / #{post_file.post.title_for_view}"
end
migrate_post_file(post_file)
pb.progress = [pb.progress + 1, pb.total].min
end
elsif ENV["posts_descending"].present?
# total = Domain::Post.count
total = 66_431_808 # cache this value
pb = ProgressBar.create(total:, format: "%t: %c/%C %B %p%% %a %e")
Domain::Post.find_each(order: :desc) do |post|
migrate_post(post) unless post.is_a?(Domain::Post::InkbunnyPost)
pb.progress = [pb.progress + 1, pb.total].min
end
elsif ENV["user"].present?
for_user = ENV["user"] || raise("need 'user'")
user = DomainController.find_model_from_param(Domain::User, for_user)
raise "user '#{for_user}' not found" unless user
migrate_posts_for_user(user)
elsif ENV["users_descending"].present?
# all users with posts, ordered by post count descending
users = Domain::User::FaUser.order(num_watched_by: :desc).limit(20)
users.find_each(batch_size: 1) { |user| migrate_posts_for_user(user) }
migrated_file = File.open("migrated_files.txt", "a+")
migrated_file.seek(0)
migrated_users = migrated_file.readlines.map(&:strip)
users =
Domain::User::FaUser.order(
Arel.sql("json_attributes->>'num_watched_by' DESC NULLS LAST"),
).pluck(:id)
users.each do |user_id|
user = Domain::User::FaUser.find(user_id)
next if migrated_users.include?(user.to_param)
puts "migrating posts for #{user.to_param} (#{user.num_watched_by} watched by)"
migrate_posts_for_user(user)
migrated_file.write("#{user.to_param}\n")
migrated_file.flush
end
migrated_file.close
else
raise "need 'user' or 'users_descending'"
end
end
# Enqueues a Job::PostFileJob (priority 10) for every Domain::PostFile whose
# state is "pending", in batches of 100.
#
# Before each batch is enqueued, the task polls GoodJob::Job for
# Job::PostFileJob rows with performed_at, scheduled_at and error all nil and
# sleeps 10 seconds at a time while more than 100 such rows exist.
#
# ENV["start_at"] (optional) is passed as the `start:` argument of
# find_in_batches.
task enqueue_pending_post_files: :environment do
  pending_files = Domain::PostFile.where(state: "pending")
  puts "enqueueing #{pending_files.count} pending post files"

  # Counts the GoodJob rows used as the backlog measure.
  backlog_size = lambda do
    GoodJob::Job.where(
      job_class: "Job::PostFileJob",
      performed_at: nil,
      scheduled_at: nil,
      error: nil,
    ).count
  end

  pending_files.find_in_batches(batch_size: 100, start: ENV["start_at"]) do |files|
    # Hold this batch back until the backlog is at or below 100.
    loop do
      backlog = backlog_size.call
      break if backlog <= 100

      puts "queue size: #{backlog}"
      sleep 10
    end

    files.each do |post_file|
      Job::PostFileJob.set(priority: 10).perform_later(post_file: post_file)
    end
  end
end
# Selects Domain::PostFile rows with state "ok" and retry_count 0 whose joined
# log entry has response_sha256 equal to BlobFile::EMPTY_FILE_SHA256, and for
# each one calls state_pending!, save!, then Job::PostFileJob.perform_now.
#
# NOTE(review): despite the "find_" name, this task modifies every matching
# record and runs the job for it -- confirm that is intended before running.
task find_post_files_with_empty_response: :environment do
  ok_unretried = Domain::PostFile.where(state: "ok", retry_count: 0)
  with_log_entry = ok_unretried.joins(:log_entry)
  empty_response_files =
    with_log_entry.where(
      http_log_entries: {
        response_sha256: BlobFile::EMPTY_FILE_SHA256,
      },
    )

  bar_format = "%t: %c/%C %B %p%% %a %e"
  pb = ProgressBar.create(total: empty_response_files.count, format: bar_format)

  empty_response_files.find_each(batch_size: 10) do |post_file|
    # Debug aid, left disabled:
    # puts "post_file: #{post_file.id} / '#{post_file.post.to_param}'"
    post_file.state_pending!
    post_file.save!
    Job::PostFileJob.perform_now(post_file: post_file)

    # Advance the bar by one without going past its total.
    pb.progress = [pb.progress + 1, pb.total].min
  end
end