blob file migration task refactor

Dylan Knutson
2025-07-08 02:28:14 +00:00
parent fe5276d3ef
commit a2460a5327
11 changed files with 639 additions and 268 deletions


@@ -1,118 +1,12 @@
require "find"
namespace :blob_file do
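  # Parallel variant: delegates batching and insertion to the
  # Domain::BlobFile::MigrateBlobEntryToBlobFile service object.
  # batch_size and start_at (a 64-character hex sha256) are read from ENV.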
desc "migrate blob entries in parallel"
task migrate_blob_entries_parallel: %i[environment] do
batch_size = ENV["batch_size"]&.to_i || 16
start_at = ENV["start_at"] || "0" * 64
migrator = Domain::BlobFile::MigrateBlobEntryToBlobFile.new
migrator.run(batch_size: batch_size, start_sha256: start_at)
end
desc "migrate blob files to the new format"
desc "migrate blob entries to blob files"
task migrate_blob_entries: %i[environment] do
batch_size = ENV["batch_size"]&.to_i || 1000
profile = ENV["profile"] == "true" || false
start_at = ENV["start_at"] || "0" * 64
num_migrated = 0
puts "batch_size: #{batch_size}"
start_at = ENV["start_at"] || Tasks::BlobFileMigrationTask::ZERO_SHA256
RubyProf.start if profile
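    # Walks BlobEntry in sha256 order starting at start_at, migrating each
    # batch and printing running counts plus a per-batch rate. Returns the
    # total number of rows migrated.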
    def migrate_impl(batch_size, start_at)
      def n2d(n)
        ActiveSupport::NumberHelper.number_to_delimited(n).rjust(8)
      end
      num_migrated = 0
      num_processed = 0
      start_time = Time.now
      BlobEntry.in_batches(
        of: batch_size,
        start: HexUtil.hex2bin(start_at),
        order: :asc,
        use_ranges: true,
      ) do |batch|
        batch_migrated = insert_blob_entries_batch(batch)
        num_migrated += batch_migrated
        num_processed += batch.size
        rate = batch_migrated.to_f / (Time.now - start_time)
        puts [
          "[migrated: #{n2d(num_migrated)}]",
          "[processed: #{n2d(num_processed)}]",
          "[rate: #{rate.round(1).to_s.rjust(5)}/second]",
          "[last: '#{HexUtil.bin2hex(batch.last.sha256)}']",
        ].join(" ")
        start_time = Time.now
      end
      num_migrated
    end
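    # For one batch: skips sha256s that already have a BlobFile row, then
    # builds and saves a BlobFile for each missing BlobEntry inside a single
    # transaction. Per-row save errors are logged and skipped; a batch-level
    # error is logged and re-raised.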
    def insert_blob_entries_batch(batch)
      num_migrated = 0
      blob_entry_sha256s = batch.pluck(:sha256)
      blob_file_sha256s =
        BlobFile.where(sha256: blob_entry_sha256s).pluck(:sha256)
      missing_sha256s = blob_entry_sha256s - blob_file_sha256s
      BlobFile.transaction do
        BlobEntry
          .where(sha256: missing_sha256s)
          .each do |blob_entry|
            blob_file = BlobFile.initialize_from_blob_entry(blob_entry)
            sha256_hex = HexUtil.bin2hex(blob_file.sha256)
            begin
              blob_file.save!
              num_migrated += 1
            rescue => e
              puts "error saving blob file #{sha256_hex}: #{e}"
            end
          end
      rescue => e
        puts "error migrating blob entry: #{missing_sha256s.map { |sha256| HexUtil.bin2hex(sha256) }}"
        raise e
      end
      num_migrated
    end
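    # Runs the migration on num_threads worker threads and sums their return
    # values. The commented-out lines sketch partitioning the sha256 keyspace
    # per thread; with num_threads = 1 every thread starts at the same start_at.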
    def start_thread(batch_size, start_at)
      Thread.new { migrate_impl(batch_size, start_at) }
    end

    num_threads = 1
    # skip = ((0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF) / num_threads) + 1
    num_migrated =
      (0...num_threads)
        .map do |i|
          # partition the entire sha256 space into num_threads chunks
          # each chunk is 256 / num_threads in size
          # start_at = (skip * i).to_s(16).rjust(32, "0")
          # stop_at = ((skip * (i + 1)) - 1).to_s(16).rjust(32, "0")
          puts "migrate #{start_at}"
          start_thread(batch_size, start_at)
        end
        .map(&:value)
        .sum
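    # When profile=true, stop ruby-prof and write graph, call-stack, and
    # speedscope reports under profiler/blob_file_migrate.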
    begin
      base = "profiler/blob_file_migrate"
      FileUtils.mkdir_p(base) unless File.exist?(base)
      result = RubyProf.stop
      File.open("#{base}/profile.txt", "w") do |f|
        RubyProf::GraphPrinter.new(result).print(f, { min_percent: 1 })
      end
      File.open("#{base}/profile.html", "w") do |f|
        RubyProf::CallStackPrinter.new(result).print(f, { min_percent: 1 })
      end
      File.open("#{base}/profile.rubyprof", "w") do |f|
        RubyProf::SpeedscopePrinter.new(result).print(f, { min_percent: 1 })
      end
      puts "wrote profile to #{base}"
    end if profile
puts "migrated #{num_migrated} total blob entries"
Tasks::BlobFileMigrationTask.new.run(batch_size:, start_sha256: start_at)
end
task verify_fs_files: :environment do