# typed: false
# redux-scraper/spec/lib/tasks/blob_file_migration_task_spec.rb

require "rails_helper"
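
# Tasks::BlobFileMigrationTask copies BlobEntry rows into BlobFile records in
# batches, checkpointing the last-processed sha256 in GlobalState under
# PROGRESS_KEY so that a later run can resume via start_sha256: "last".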
RSpec.describe Tasks::BlobFileMigrationTask do
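  # Capture log output in memory so examples can assert on what was written.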
  let(:log_sink) { StringIO.new }
  let(:migrator) { described_class.new(log_sink: log_sink) }

  describe "#run" do
    context "with no blob entries" do
      it "runs the migration with default settings and logs correctly" do
        result = migrator.run
        expect(result).to eq(0)
        expect(log_sink.string).to include("batch_size: 1000")
        expect(log_sink.string).to include("migrated 0 total blob entries")
      end
    end

    context "with a custom batch size" do
      it "uses the specified batch size in logs" do
        result = migrator.run(batch_size: 500)
        expect(result).to eq(0)
        expect(log_sink.string).to include("batch_size: 500")
      end
    end

    context "with a custom start SHA256" do
      it "accepts a custom start_sha256 parameter" do
        start_sha256 = "ff" * 32
        result = migrator.run(start_sha256: start_sha256)
        expect(result).to eq(0)
        expect(log_sink.string).to include("batch_size: 1000")
        expect(log_sink.string).to include("migrated 0 total blob entries")
      end
    end

    context "with start_sha256='last' and no saved progress" do
      it "starts from the beginning when no progress is saved" do
        result = migrator.run(start_sha256: "last")
        expect(result).to eq(0)
        expect(log_sink.string).to include("no saved progress")
        expect(log_sink.string).to include("migrated 0 total blob entries")
      end
    end

    context "with start_sha256='last' and existing progress" do
      let!(:blob_entries) do
        5.times.map { |i| create(:blob_entry, content: "progress #{i}") }
      end

      before do
        # Simulate saved progress: store the hex SHA256 of the third entry.
        progress_sha256_hex = HexUtil.bin2hex(blob_entries[2].sha256)
        GlobalState.set("blob-file-migration-task", progress_sha256_hex)
      end

      after do
        # Clean up the saved progress.
        GlobalState.find_by(key: "blob-file-migration-task")&.destroy
      end

      it "resumes from saved progress" do
        result = migrator.run(start_sha256: "last")
        # SHA256 order depends on content hashes, so the number of entries
        # sorting after the checkpoint is not deterministic. Assert only that
        # the run completed and logged the checkpoint it resumed from.
        expect(result).to be >= 0
        expected_progress = HexUtil.bin2hex(blob_entries[2].sha256)
        expect(log_sink.string).to include(expected_progress)
      end
    end

    context "with existing blob entries that need migration" do
      let!(:blob_entries) do
        3.times.map { |i| create(:blob_entry, content: "content #{i}") }
      end

      after do
        # Clean up any saved progress.
        GlobalState.find_by(key: "blob-file-migration-task")&.destroy
      end

      it "migrates all blob entries to blob files" do
        expect(BlobFile.count).to eq(0)
        result = migrator.run
        expect(result).to eq(3)
        expect(BlobFile.count).to eq(3)
        expect(log_sink.string).to include("migrated 3 total blob entries")
      end

      it "creates blob files with correct attributes" do
        migrator.run
        blob_entries.each do |blob_entry|
          blob_file = BlobFile.find_by(sha256: blob_entry.sha256)
          expect(blob_file).to be_present
          expect(blob_file.content_type).to eq(blob_entry.content_type)
          expect(blob_file.size_bytes).to eq(blob_entry.size)
          expect(blob_file.content_bytes).to eq(blob_entry.contents)
        end
      end

      it "saves progress to GlobalState after migration" do
        migrator.run
        saved_progress = GlobalState.get("blob-file-migration-task")
        expect(saved_progress).to be_present
        expect(saved_progress.length).to eq(64) # hex-encoded SHA256
        # BlobFile.last is the most recently created row, i.e. the entry
        # migrated last in this run.
        last_migrated_sha256_hex = HexUtil.bin2hex(BlobFile.last.sha256)
        expect(saved_progress).to eq(last_migrated_sha256_hex)
      end

      it "logs progress during migration" do
        migrator.run(batch_size: 2)
        log_output = log_sink.string
        expect(log_output).to include("migrated:")
        expect(log_output).to include("processed:")
        expect(log_output).to include("rate:")
      end
    end

    context "with blob entries that already have blob files" do
      let!(:blob_entry) { create(:blob_entry, content: "existing content") }
      let!(:existing_blob_file) do
        create(
          :blob_file,
          contents: blob_entry.contents,
          content_type: blob_entry.content_type,
        )
      end

      after { GlobalState.find_by(key: "blob-file-migration-task")&.destroy }

      it "does not create duplicate blob files" do
        expect(BlobFile.count).to eq(1)
        result = migrator.run
        expect(result).to eq(0) # no new migrations
        expect(BlobFile.count).to eq(1) # same count
        expect(log_sink.string).to include("migrated 0 total blob entries")
      end

      it "saves the last processed sha256, even when it was already migrated" do
        migrator.run
        final_progress = GlobalState.get("blob-file-migration-task")
        expect(final_progress).to eq(HexUtil.bin2hex(blob_entry.sha256))
      end
    end
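
    # A plausible shape for the skip-if-present check exercised above. This is
    # an illustrative sketch only, not the task's actual implementation:
    #
    #   batch.each do |entry|
    #     next if BlobFile.exists?(sha256: entry.sha256)
    #     # ...create the BlobFile, then checkpoint entry.sha256...
    #   end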

    context "with mixed scenarios" do
      let!(:blob_entry_needing_migration) do
        create(:blob_entry, content: "needs migration")
      end
      let!(:blob_entry_with_file) { create(:blob_entry, content: "has file") }
      let!(:existing_blob_file) do
        create(
          :blob_file,
          contents: blob_entry_with_file.contents,
          content_type: blob_entry_with_file.content_type,
        )
      end

      after { GlobalState.find_by(key: "blob-file-migration-task")&.destroy }

      it "only migrates entries that need migration" do
        expect(BlobFile.count).to eq(1)
        result = migrator.run
        expect(result).to eq(1) # only one new migration
        expect(BlobFile.count).to eq(2) # now has both
        # Verify the new blob file was created correctly.
        new_blob_file =
          BlobFile.find_by(sha256: blob_entry_needing_migration.sha256)
        expect(new_blob_file).to be_present
        expect(new_blob_file.content_bytes).to eq(
          blob_entry_needing_migration.contents,
        )
      end

      it "saves progress for the actually migrated entry" do
        migrator.run
        saved_progress = GlobalState.get("blob-file-migration-task")
        expected_progress = HexUtil.bin2hex(blob_entry_needing_migration.sha256)
        expect(saved_progress).to eq(expected_progress)
      end
    end

    context "with different batch sizes" do
      let!(:blob_entries) do
        5.times.map { |i| create(:blob_entry, content: "batch content #{i}") }
      end

      after { GlobalState.find_by(key: "blob-file-migration-task")&.destroy }

      it "handles a batch size of 1" do
        result = migrator.run(batch_size: 1)
        expect(result).to eq(5)
        expect(BlobFile.count).to eq(5)
      end

      it "handles a batch size larger than the total entry count" do
        result = migrator.run(batch_size: 100)
        expect(result).to eq(5)
        expect(BlobFile.count).to eq(5)
      end

      it "handles a batch size smaller than the total entry count" do
        result = migrator.run(batch_size: 2)
        expect(result).to eq(5)
        expect(BlobFile.count).to eq(5)
        # Verify every entry was migrated.
        blob_entries.each do |blob_entry|
          blob_file = BlobFile.find_by(sha256: blob_entry.sha256)
          expect(blob_file).to be_present
        end
      end

      it "logs progress multiple times with a small batch size" do
        migrator.run(batch_size: 2)
        # Progress is logged per batch, so several "migrated:" lines appear.
        expect(log_sink.string.scan(/migrated:/).count).to be > 1
      end
    end

    context "with idempotent runs" do
      let!(:blob_entries) do
        3.times.map { |i| create(:blob_entry, content: "idempotent #{i}") }
      end

      after { GlobalState.find_by(key: "blob-file-migration-task")&.destroy }

      it "handles duplicate runs gracefully" do
        first_result = migrator.run
        expect(first_result).to eq(3)
        expect(BlobFile.count).to eq(3)

        # Reset the log sink for the second run.
        log_sink.truncate(0)
        log_sink.rewind

        second_result = migrator.run
        expect(second_result).to eq(0) # no new migrations
        expect(BlobFile.count).to eq(3) # same count
        expect(log_sink.string).to include("migrated 0 total blob entries")
      end
    end

    context "with large datasets" do
      let!(:blob_entries) do
        10.times.map { |i| create(:blob_entry, content: "large dataset #{i}") }
      end

      after { GlobalState.find_by(key: "blob-file-migration-task")&.destroy }

      it "migrates larger datasets in batches" do
        result = migrator.run(batch_size: 3)
        expect(result).to eq(10)
        expect(BlobFile.count).to eq(10)
        # Progress logging should have occurred multiple times, one line per
        # batch.
        expect(log_sink.string.scan(/migrated:/).count).to be > 1
      end
    end
  end
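
  # The groups below exercise private helpers directly via #send, pinning down
  # the log formatting that the #run examples only spot-check.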

  describe "#log_progress" do
    it "logs migration progress with numbers" do
      migrator.send(:log_progress, 100, 150, 25.5, "abc123")
      log_output = log_sink.string
      expect(log_output).to include("migrated:")
      expect(log_output).to include("processed:")
      expect(log_output).to include("rate:")
      expect(log_output).to include("last:")
    end

    it "handles nil last_sha256" do
      migrator.send(:log_progress, 100, 150, 25.5, nil)
      log_output = log_sink.string
      expect(log_output).to include("last: nil")
    end

    it "includes formatted numbers" do
      migrator.send(:log_progress, 1000, 2500, 10.5, nil)
      log_output = log_sink.string
      expect(log_output).to include("1,000")
      expect(log_output).to include("2,500")
      expect(log_output).to include("10.5")
    end
  end
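
  # Judging by the expectations below, #format_number right-justifies the
  # delimited number in a field at least 8 characters wide, so successive
  # progress lines line up in columns.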

  describe "#format_number" do
    it "formats numbers with delimiters" do
      result = migrator.send(:format_number, 1_234_567)
      expect(result).to include("1,234,567")
    end

    it "right-justifies numbers" do
      result = migrator.send(:format_number, 123)
      expect(result.length).to be >= 8
    end

    it "handles zero" do
      result = migrator.send(:format_number, 0)
      expect(result).to include("0")
      expect(result.length).to be >= 8
    end

    it "handles large numbers" do
      result = migrator.send(:format_number, 999_999_999)
      expect(result).to include("999,999,999")
    end
  end

  describe "constructor and basic functionality" do
    it "initializes with a log_sink" do
      expect(migrator).to be_a(described_class)
    end

    it "uses the provided log_sink" do
      custom_sink = StringIO.new
      custom_migrator = described_class.new(log_sink: custom_sink)
      custom_migrator.run
      expect(custom_sink.string).to include("batch_size:")
    end

    it "defaults to $stderr when no log_sink is provided" do
      # Only verifies construction succeeds; asserting on the default sink
      # would mean capturing the real $stderr.
      expect { described_class.new }.not_to raise_error
    end
  end

  describe "ZERO_SHA256 constant" do
    it "defines the zero SHA256 constant" do
      expect(described_class::ZERO_SHA256).to eq("00" * 32)
    end
  end

  describe "PROGRESS_KEY constant" do
    it "defines the progress key constant" do
      expect(described_class::PROGRESS_KEY).to eq("blob-file-migration-task")
    end
  end

  describe "integration scenarios" do
    it "handles multiple sequential runs without errors" do
      first_result = migrator.run(batch_size: 100)
      expect(first_result).to eq(0)

      # Reset the log sink for the second run.
      log_sink.truncate(0)
      log_sink.rewind

      second_result = migrator.run(batch_size: 200)
      expect(second_result).to eq(0)
      expect(log_sink.string).to include("batch_size: 200")
    end

    it "formats logs correctly with combined parameters" do
      migrator.run(batch_size: 42, start_sha256: "aa" * 32)
      log_output = log_sink.string
      expect(log_output).to include("batch_size: 42")
      expect(log_output).to include("migrated 0 total blob entries")
    end
  end

  describe "parameter validation" do
    it "accepts valid batch_size values" do
      expect { migrator.run(batch_size: 1) }.not_to raise_error
      expect { migrator.run(batch_size: 1000) }.not_to raise_error
      expect { migrator.run(batch_size: 10_000) }.not_to raise_error
    end

    it "accepts valid start_sha256 values" do
      expect { migrator.run(start_sha256: "00" * 32) }.not_to raise_error
      expect { migrator.run(start_sha256: "ff" * 32) }.not_to raise_error
      expect { migrator.run(start_sha256: "ab" * 32) }.not_to raise_error
    end

    it "accepts 'last' as a start_sha256 value" do
      expect { migrator.run(start_sha256: "last") }.not_to raise_error
    end
  end
end
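
# Typical invocation, for reference. This is illustrative only; any rake or
# console wiring around the task lives outside this spec:
#
#   Tasks::BlobFileMigrationTask
#     .new(log_sink: $stderr)
#     .run(batch_size: 1000, start_sha256: "last")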