fa post importer with basic legacy log matchup

This commit is contained in:
2023-02-09 06:20:40 +00:00
parent cf9122a941
commit f24e361e34
18 changed files with 304 additions and 84 deletions

View File

@@ -84,6 +84,7 @@ gem "rb-bsdiff", path: "../rb-bsdiff"
gem "ruby-prof"
gem "table_print"
gem "addressable"
# gem "concurrent-ruby-ext", require: "concurrent"
# gem 'cli-ui'
# gem "paper_trail"

View File

@@ -105,7 +105,6 @@ GEM
erubi (1.12.0)
globalid (1.1.0)
activesupport (>= 5.0)
hashdiff (1.0.1)
i18n (1.12.0)
concurrent-ruby (~> 1.0)
importmap-rails (1.1.5)
@@ -145,9 +144,6 @@ GEM
nokogiri (1.14.1)
mini_portile2 (~> 2.8.0)
racc (~> 1.4)
paper_trail (14.0.0)
activerecord (>= 6.0)
request_store (~> 1.4)
pg (1.4.5)
pry (0.14.2)
coderay (~> 1.1)
@@ -192,8 +188,6 @@ GEM
regexp_parser (2.6.2)
reline (0.3.2)
io-console (~> 0.5)
request_store (1.5.1)
rack (>= 1.4)
rexml (3.2.5)
ruby-prof (1.4.5)
rubyzip (2.3.2)
@@ -242,14 +236,13 @@ PLATFORMS
ruby
DEPENDENCIES
addressable
bootsnap
capybara
debug
diffy
hashdiff
importmap-rails
jbuilder
paper_trail
pg
pry
pry-stack_explorer

View File

@@ -7,6 +7,7 @@ require_relative "config/application"
Rails.application.load_tasks
require_relative "rake/sst"
require_relative "rake/log_entry"
require_relative "rake/fa_post"
task :log => :environment do
ActiveRecord::Base.logger = Logger.new(STDOUT)

View File

@@ -5,4 +5,12 @@ class HexUtil
# Encodes a binary string as lowercase hexadecimal (two hex digits per byte).
#
# @param bin [String] raw bytes
# @return [String] hex representation of +bin+
def self.bin2hex(bin)
  bin.unpack1("H*")
end
# Formats a byte count as a human-readable string using binary (1024-based)
# units, with one decimal place, e.g. 1536 -> "1.5 KiB".
#
# @param size [Numeric] number of bytes; 0 is rendered as "0.0 B"
# @return [String] formatted size such as "12.3 MiB"
# @raise [Math::DomainError] if +size+ is negative (Math.log rejects it)
def self.humansize(size)
  units = %w[B KiB MiB GiB TiB PiB EiB ZiB]
  return "0.0 B" if size == 0

  exp = (Math.log(size) / Math.log(1024)).to_i
  # Values that would round up and print as "1024.0 <unit>" are promoted to
  # the next unit instead (e.g. 1 MiB - 1 byte prints as "1.0 MiB").
  exp += 1 if size.to_f / 1024 ** exp >= 1024 - 0.05
  # Keep the exponent inside the unit table: very large sizes use the last
  # unit, and fractional sizes never index the table from the end.
  exp = exp.clamp(0, units.size - 1)
  "%.1f %s" % [size.to_f / 1024 ** exp, units[exp]]
end
end

View File

@@ -431,12 +431,7 @@ class LegacyImport::HttpLogEntryBulkImporter
end
# Formats a byte count for display.
#
# Kept as a thin wrapper so existing callers on this class keep working; the
# formatting itself lives in HexUtil.humansize.
#
# @param size [Numeric] number of bytes
# @return [String] whatever HexUtil.humansize returns for +size+
def self.humansize(size)
  HexUtil.humansize(size)
end
end
end

View File

@@ -63,6 +63,7 @@ class BlobEntry < ReduxApplicationRecord
end
end
puts ("[blob entry] built #{file_mime} (#{HexUtil.humansize(record.size)})")
record.created_at = legacy_be.created_at
record.updated_at = legacy_be.updated_at
record

View File

@@ -8,37 +8,88 @@ class Domain::Fa::Post < ReduxApplicationRecord
},
)
validates_presence_of(:fa_id)
# TODO - this class doesn't exist yet
# belongs_to :creator,
# class_name: "::Domain::Fa::User"
# If the file was scraped, this is the blob entry that represents it
belongs_to :file,
class_name: "::BlobEntry",
foreign_key: :file_sha256,
optional: :true
# HttpLogEntry of the request made to scrape this post
belongs_to :http_log_page,
class_name: "::HttpLogEntry",
optional: true
optional: :true,
autosave: true
# Same, but for the scrape of the file itself
belongs_to :http_log_file,
class_name: "::HttpLogEntry",
optional: true
has_many :log_entry_joins,
class_name: "::Domain::Fa::PostHttpLogEntryJoin",
inverse_of: :post,
autosave: true
has_many :log_entries,
through: :log_entry_joins,
source: :entry,
autosave: true
# Parses +file_url_str+ with Addressable.
#
# The earlier stdlib URI() call was removed: its result was discarded, and it
# could raise on URLs that Addressable parses without error.
#
# @return [Addressable::URI, nil] the parsed URI, or nil when no URL is set
def file_uri
  url = file_url_str
  Addressable::URI.parse(url) if url
end
def self.build_from_legacy(legacy)
legacy_be = legacy.blob_entry
new_be = ::BlobEntry.find_or_build_from_legacy(legacy_be) if legacy_be
def self.build_from_legacy(legacy_post)
existing = find_by(fa_id: legacy_post.fa_id)
return existing if existing
record = Domain::Fa::Post.new
record.file = new_be if legacy_be
post = Domain::Fa::Post.new
if legacy_post.blob_entry
legacy_be = legacy_post.blob_entry
# may be able to be found by original blob entry id
original_le = ::Legacy::HttpLogEntry.find_by(blob_entry_id: legacy_be.id)
# yay, check if we need to make a new blob entry
blob_entry = ::BlobEntry.find_or_build_from_legacy(legacy_be)
uri = Addressable::URI.parse(legacy_post.file_url)
uri.scheme ||= "https"
uri.path ||= "/"
log_entry = ::HttpLogEntry.find_by(id: original_le.id) || begin
::HttpLogEntry.build_from_legacy(original_le)
end if original_le
log_entry ||= begin
legacy_hle = ::Legacy::HttpLogEntry.find_by(
host: uri.host,
path: uri.path,
)
legacy_hle && ::HttpLogEntry.find_by(
id: legacy_hle.id,
response_sha256: blob_entry.sha256,
)
end
# couldn't reconstruct from a legacy http log entry, try to guess and make a new one
log_entry ||=
::HttpLogEntry.new({
uri: uri,
status_code: 200,
verb: :get,
response_time_ms: -1,
content_type: blob_entry.content_type,
requested_at: Time.now,
request_headers: ::HttpLogEntryHeader.find_or_create({}),
response_headers: ::HttpLogEntryHeader.find_or_create({}),
response: blob_entry,
})
raise("mismatch") unless log_entry.response == blob_entry
raise("mismatch") unless HexUtil.bin2hex(log_entry.response.sha256) == legacy_be.sha256
post.file = log_entry
end
# TODO: populate creator_id as well - need to build the model
fields_to_copy = [
:creator_name,
:fa_id,
:title,
:category,
@@ -53,10 +104,10 @@ class Domain::Fa::Post < ReduxApplicationRecord
:updated_at,
:created_at,
].each do |field|
record.send(:"#{field}=", legacy.send(field))
post.send(:"#{field}=", legacy_post.send(field))
end
record.file_url_str = legacy.file_url
post.file_url_str = legacy_post.file_url
record
post
end
end

View File

@@ -0,0 +1,8 @@
# Join model stored in the "domain_fa_post_http_log_entry_joins" table.
# Each row references one Domain::Fa::Post (+post+) and one HttpLogEntry
# (+entry+).
class Domain::Fa::PostHttpLogEntryJoin < ReduxApplicationRecord
  self.table_name = "domain_fa_post_http_log_entry_joins"

  # The FA post side of the join.
  belongs_to :post, class_name: "::Domain::Fa::Post"

  # The HTTP log entry side of the join.
  belongs_to :entry, class_name: "::HttpLogEntry"
end

View File

@@ -3,9 +3,12 @@ class HttpLogEntry < ReduxApplicationRecord
belongs_to :response,
foreign_key: :response_sha256,
class_name: "::BlobEntry"
class_name: "::BlobEntry",
autosave: true
belongs_to :request_headers,
class_name: "::HttpLogEntryHeader"
belongs_to :response_headers,
class_name: "::HttpLogEntryHeader"
@@ -16,14 +19,53 @@ class HttpLogEntry < ReduxApplicationRecord
:uri_scheme,
:uri_host,
:uri_path,
:verb,
:status_code,
:response_time_ms,
:content_type,
:requested_at
)
# Builds (without saving) an HttpLogEntry from a legacy log entry.
#
# Reconstruction is only attempted when the legacy entry has no
# +response_body+, no +parent_log_entry_id+, a present +resp_body+ and a
# present +blob_entry+. In that case the response blob is found or built
# from the legacy blob entry and checked against the sha256 encoded in
# +resp_body+.
#
# @param legacy_model [Legacy::HttpLogEntry] the legacy record to convert
# @return [HttpLogEntry, nil] a new unsaved record, or nil when the legacy
#   entry does not meet the conditions above (callers must handle nil)
# @raise [RuntimeError] if the blob entry's sha256 does not match +resp_body+
def self.build_from_legacy(legacy_model)
  response_body = legacy_model.response_body

  can_reconstruct_from_be =
    response_body.nil? &&
    legacy_model.parent_log_entry_id.nil? &&
    legacy_model.resp_body.present? &&
    legacy_model.blob_entry.present?

  # This path used to drop into a `binding.pry` breakpoint, which blocks
  # non-interactive runs. Signal "cannot build" with nil instead.
  return nil unless can_reconstruct_from_be

  blob_entry = ::BlobEntry.find_or_build_from_legacy(legacy_model.blob_entry)
  blob_sha256 = HexUtil.hex2bin(legacy_model.resp_body)
  unless blob_entry.sha256 == blob_sha256
    raise("mismatch for legacy http entry #{legacy_model.id} / legacy blob entry #{legacy_model.blob_entry.id}")
  end

  # Fill in defaults for legacy paths that lack a scheme or a path.
  uri = Addressable::URI.parse(legacy_model.full_path)
  uri.scheme ||= "https"
  uri.path ||= "/"

  ::HttpLogEntry.new({
    verb: legacy_model.verb,
    uri: uri,
    content_type: legacy_model.content_type,
    status_code: legacy_model.status,
    response_time_ms: legacy_model.response_time,
    request_headers: ::HttpLogEntryHeader.find_or_build(legacy_model.req_headers),
    response_headers: ::HttpLogEntryHeader.find_or_build(legacy_model.res_headers),
    response: blob_entry,
    requested_at: legacy_model.requested_at,
    created_at: legacy_model.created_at,
    updated_at: legacy_model.updated_at,
  })
end
def uri=(uri)
uri = URI(uri)
uri = Addressable::URI.parse(uri)
self.uri_scheme = uri.scheme
self.uri_host = uri.host
self.uri_path = uri.path
@@ -32,7 +74,7 @@ class HttpLogEntry < ReduxApplicationRecord
end
# Parses +uri_str+ with Addressable.
#
# The earlier stdlib URI() call was removed: its result was discarded, and it
# could raise on URLs that Addressable parses without error.
#
# @return [Addressable::URI] the parsed form of +uri_str+
def uri
  Addressable::URI.parse(uri_str)
end
def uri_str

View File

@@ -2,7 +2,7 @@ class HttpLogEntryHeader < ReduxApplicationRecord
validates_presence_of(:sha256)
validates :sha256, length: { is: 32 }
def self.build_record(headers:)
def self.build(headers:)
raise("must be a hash") unless headers.is_a?(Hash)
headers = headers.dup
@@ -17,12 +17,15 @@ class HttpLogEntryHeader < ReduxApplicationRecord
)
end
# Looks up a stored header record by the sha256 of the record built from
# +headers+. Falls back to the freshly built record when none is stored.
#
# @param headers [Hash] headers passed through to +build+
# @return [HttpLogEntryHeader] the stored match, or the newly built record
def self.find_or_build(headers)
  candidate = build(headers: headers)
  stored = find_by(sha256: candidate.sha256)
  stored || candidate
end
# Returns a persisted header record for +headers+, saving a new one if
# +find_or_build+ did not return an already persisted record.
#
# The stale lines that called the renamed +build_record+ and repeated the
# find-or-save work were removed.
#
# @param headers [Hash] headers passed through to +find_or_build+
# @return [HttpLogEntryHeader] the persisted record
# @raise whatever +save!+ raises when the new record cannot be saved
def self.find_or_create(headers)
  record = find_or_build(headers)
  record.save! unless record.persisted?
  record
end
def to_bulk_insert_hash

View File

@@ -57,13 +57,14 @@ class Legacy::Fa::Post < LegacyApplicationRecord
serialize :state_error
validates_presence_of :state_error, if: -> { scan_error? || static_error? }
belongs_to :creator, class_name: "Legacy::Fa::User"
belongs_to :blob_entry, class_name: "Legacy::BlobEntry"
belongs_to :creator, class_name: "::Legacy::Fa::User"
belongs_to :blob_entry, class_name: "::Legacy::BlobEntry"
belongs_to :submission_page_log_entry, class_name: "Legacy::Cache::HttpLogEntry"
belongs_to :submission_page_log_entry,
class_name: "::Legacy::HttpLogEntry"
belongs_to :description_ref, nil,
class_name: "Legacy::Fa::PostDescription",
class_name: "::Legacy::Fa::PostDescription",
foreign_key: :description_id,
inverse_of: :fa_post

View File

@@ -361,7 +361,7 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
@response_body_native ||=
self.native_blob_entry&.patched_value ||
::FlatSstEntry.find_by_hex_key(resp_body)&.contents ||
::LogStoreSstEntry.find_by_hex_key(resp_body).patched_value
::LogStoreSstEntry.find_by_hex_key(resp_body)&.patched_value
end
def self.encode_str(str)

View File

@@ -2,10 +2,10 @@ class CreateDomainFaPosts < ActiveRecord::Migration[7.0]
def change
create_table :domain_fa_posts do |t|
t.integer :fa_id
t.index :fa_id, unique: true
# Domain::Fa::User.{id,name}
t.integer :creator_id
t.string :creator_name
# ::Domain::Fa::User.id
t.references :creator
# just start with this for now - can add more fields later
t.string :title
@@ -18,18 +18,27 @@ class CreateDomainFaPosts < ActiveRecord::Migration[7.0]
t.integer :num_favorites
t.integer :num_comments
t.integer :num_views
t.datetime :posted_at
# the primary file associated with the FA post
t.string :file_url_str
t.binary :file_sha256
t.references :file
# relevant http log entries
t.references :http_log_page
t.references :http_log_file
t.timestamps
end
add_foreign_key :domain_fa_posts, :blob_entries, column: :file_sha256, primary_key: :sha256, validate: true
add_foreign_key :domain_fa_posts, :http_log_entries, column: :http_log_page_id, primary_key: :id, validate: true
add_foreign_key :domain_fa_posts, :http_log_entries, column: :http_log_file_id, primary_key: :id, validate: true
# Fa::Post.file -> HttpLogEntry.id
add_foreign_key :domain_fa_posts, :http_log_entries, column: :file_id, primary_key: :id, validate: true
create_table :domain_fa_post_http_log_entry_joins do |t|
t.references :post, null: false
t.references :entry, null: false
t.jsonb :metadata
end
# Fa::PostHttpLogEntryJoin.post_id -> Fa::Post.id
add_foreign_key :domain_fa_post_http_log_entry_joins, :domain_fa_posts, column: :post_id, primary_key: :id, validate: true
# Fa::PostHttpLogEntryJoin.entry_id -> HttpLogEntry.id
add_foreign_key :domain_fa_post_http_log_entry_joins, :http_log_entries, column: :entry_id, primary_key: :id, validate: true
end
end

27
db/schema.rb generated
View File

@@ -26,10 +26,17 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_07_211826) do
t.index ["sha256"], name: "index_blob_entries_on_sha256", unique: true
end
create_table "domain_fa_post_http_log_entry_joins", force: :cascade do |t|
t.bigint "post_id", null: false
t.bigint "entry_id", null: false
t.jsonb "metadata"
t.index ["entry_id"], name: "index_domain_fa_post_http_log_entry_joins_on_entry_id"
t.index ["post_id"], name: "index_domain_fa_post_http_log_entry_joins_on_post_id"
end
create_table "domain_fa_posts", force: :cascade do |t|
t.integer "fa_id"
t.integer "creator_id"
t.string "creator_name"
t.bigint "creator_id"
t.string "title"
t.string "category"
t.string "theme"
@@ -40,14 +47,14 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_07_211826) do
t.integer "num_favorites"
t.integer "num_comments"
t.integer "num_views"
t.datetime "posted_at"
t.string "file_url_str"
t.binary "file_sha256"
t.bigint "http_log_page_id"
t.bigint "http_log_file_id"
t.bigint "file_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["http_log_file_id"], name: "index_domain_fa_posts_on_http_log_file_id"
t.index ["http_log_page_id"], name: "index_domain_fa_posts_on_http_log_page_id"
t.index ["creator_id"], name: "index_domain_fa_posts_on_creator_id"
t.index ["fa_id"], name: "index_domain_fa_posts_on_fa_id", unique: true
t.index ["file_id"], name: "index_domain_fa_posts_on_file_id"
end
create_table "flat_sst_entries", id: false, force: :cascade do |t|
@@ -103,9 +110,9 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_07_211826) do
end
add_foreign_key "blob_entries", "blob_entries", column: "base_sha256", primary_key: "sha256"
add_foreign_key "domain_fa_posts", "blob_entries", column: "file_sha256", primary_key: "sha256"
add_foreign_key "domain_fa_posts", "http_log_entries", column: "http_log_file_id"
add_foreign_key "domain_fa_posts", "http_log_entries", column: "http_log_page_id"
add_foreign_key "domain_fa_post_http_log_entry_joins", "domain_fa_posts", column: "post_id"
add_foreign_key "domain_fa_post_http_log_entry_joins", "http_log_entries", column: "entry_id"
add_foreign_key "domain_fa_posts", "http_log_entries", column: "file_id"
add_foreign_key "http_log_entries", "blob_entries", column: "response_sha256", primary_key: "sha256"
add_foreign_key "http_log_entries", "http_log_entry_headers", column: "request_headers_id"
add_foreign_key "http_log_entries", "http_log_entry_headers", column: "response_headers_id"

View File

@@ -1,14 +1,42 @@
namespace :fa_post do
  desc "Import existing FA posts"
  # Imports legacy FA posts in batches, skipping posts whose fa_id already
  # exists in Domain::Fa::Post.
  #
  # Options, read from task arguments first and then from environment variables:
  #   start_at   - primary key to start from (default 0)
  #   batch_size - legacy posts loaded per batch (default 100)
  #
  # Example: rake "fa_post:import_existing[1000,50]"
  task :import_existing, [:start_at, :batch_size] => [:environment] do |_t, args|
    # :batch_size is declared above so that args[:batch_size] can actually be set.
    batch_size = args[:batch_size]&.to_i || ENV["batch_size"]&.to_i || 100
    start_at = args[:start_at]&.to_i || ENV["start_at"]&.to_i || 0

    start_time = Time.now
    progress = 0

    query = ::Legacy::Fa::Post.includes(:blob_entry, :description_ref)
    query.find_in_batches(start: start_at, batch_size: batch_size) do |batch|
      # Capture before filtering so the log line reports the batch's real last
      # id even when every post in the batch already exists.
      last_id = batch.last&.id

      existing_ids = Set.new(
        ::Domain::Fa::Post.select(:fa_id).
          where(fa_id: batch.map(&:fa_id)).
          pluck(:fa_id)
      )
      pending = batch.reject do |legacy_post|
        existing_ids.include?(legacy_post.fa_id)
      end

      # NOTE(review): ForkFuture is defined elsewhere; presumably it runs the
      # block across 4 forked workers - confirm.
      ForkFuture.parallel_each(4, pending) do |legacy_post|
        retries = 0
        begin
          ReduxApplicationRecord.transaction do
            new_post = ::Domain::Fa::Post.build_from_legacy(legacy_post)
            unless new_post.valid?
              raise("errors on #{legacy_post.id}: #{new_post.errors.full_messages}")
            end
            new_post.save!
          end
        rescue
          # Up to three attempts in total, with a short pause between them;
          # the last failure is re-raised.
          if (retries += 1) < 3
            sleep 0.1
            retry
          end
          raise
        end
      end

      progress += pending.size
      rate = progress.to_f / (Time.now - start_time)
      puts "finish batch, last id #{last_id} - #{progress} - #{rate.round(1)} / second"
    end
  end
end

View File

@@ -1,9 +1,10 @@
class Domain::Fa::PostTest < ActiveSupport::TestCase
test "can build from legacy" do
legacy_post = ::Legacy::Fa::Post.find(1)
new_post = ::Domain::Fa::Post.build_from_legacy(legacy_post)
test "can build from legacy post without a legacy http entry" do
legacy_post = ::Legacy::Fa::Post.find_by!(fa_id: 19177819)
# made a valid, unpersisted record
assert new_post.valid?
new_post = ::Domain::Fa::Post.build_from_legacy(legacy_post)
assert new_post.valid?, new_post.errors.full_messages
refute new_post.persisted?
# the associated file was populated
@@ -12,7 +13,6 @@ class Domain::Fa::PostTest < ActiveSupport::TestCase
assert_equal 19177819, new_post.fa_id
# TODO - post creator model verification
assert_equal "Meesh", new_post.creator_name
assert_equal "MASSIVE ART PACK 6 - Available now!", new_post.title
assert_equal "Artwork (Digital)", new_post.category
assert_equal "All", new_post.theme
@@ -28,6 +28,7 @@ class Domain::Fa::PostTest < ActiveSupport::TestCase
new_post.save!
assert_equal 1, new_post.versions.count
assert new_post.file.id
new_post.num_views = 1000
new_post.save!
@@ -35,5 +36,51 @@ class Domain::Fa::PostTest < ActiveSupport::TestCase
old_post = new_post.versions.last.reify
assert_equal 904, old_post.num_views
# check that it works after reload
new_post.reload
assert new_post.file.present?
end
test "can build from a legacy post with a legacy http entry" do
  # Fixture pair: the legacy post and the legacy log entry have no direct
  # association; they are related only through a shared blob entry.
  legacy_post = ::Legacy::Fa::Post.find(10117853)
  legacy_le = ::Legacy::HttpLogEntry.find(1766)
  expected_sha256 = HexUtil.hex2bin("41F8DAF7772D11F80AFE56B742087A2D1AB372E08B69E1284BE4FEFEC2AD0C7F")
  assert_equal legacy_le.blob_entry_id, legacy_post.blob_entry_id

  # Building yields a valid post that has not been saved yet.
  new_post = ::Domain::Fa::Post.build_from_legacy(legacy_post)
  assert new_post.valid?, new_post.errors.full_messages
  refute new_post.persisted?

  # The file is present, not yet saved, and wraps the expected blob.
  assert new_post.file.present?
  refute new_post.file.persisted?
  assert_equal expected_sha256, new_post.file.response.sha256

  # Attributes that keep the same name on both models.
  %i[verb content_type requested_at updated_at created_at].each do |attr|
    assert_equal legacy_le.send(attr), new_post.file.send(attr)
  end

  # Attributes whose names differ between the legacy and new models.
  assert_equal legacy_le.full_path, new_post.file.uri_str
  assert_equal legacy_le.response_time, new_post.file.response_time_ms
  assert_equal legacy_le.status, new_post.file.status_code
end
test "can manipulate associated log entries" do
  # A post with only an fa_id is valid on its own.
  post = ::Domain::Fa::Post.new(fa_id: 12345)
  assert post.valid?, post.errors.full_messages

  # Assign two freshly built log entries; the post must stay valid and save.
  post.log_entries = Array.new(2) { TestUtil.build_http_log_entry }
  assert post.valid?, post.errors.full_messages
  post.save!
end
end

View File

@@ -30,7 +30,7 @@ class HttpLogEntryTest < ActiveSupport::TestCase
test "uri is correct" do
uri_str = "https://www.example.com/big/path/here?and=query&other=query2#smaz"
uri = URI(uri_str)
uri = Addressable::URI.parse(uri_str)
model = ::HttpLogEntry.new({ uri: uri_str })
assert_equal "https", model.uri_scheme
assert_equal "www.example.com", model.uri_host

View File

@@ -14,6 +14,15 @@ class ActiveSupport::TestCase
Rails.log.error "Deleting all records: #{ReduxApplicationRecord.subclasses.map(&:name)}"
ReduxApplicationRecord.subclasses.each(&:delete_all)
end
# Redirects ActiveRecord logging to STDOUT until end_logging! is called.
#
# The current logger is remembered on the first call only, so repeated calls
# do not overwrite the saved original with the temporary STDOUT logger.
#
# @return [Logger] the STDOUT logger now installed
def start_logging!
  @old_logger = ActiveRecord::Base.logger unless @logging_started
  @logging_started = true
  ActiveRecord::Base.logger = Logger.new(STDOUT)
end

# Restores the logger that was active before start_logging!.
#
# Does nothing when start_logging! was not called first, so a stray call
# does not replace the logger with nil.
#
# @return [Object, nil] the restored logger, or nil when nothing was restored
def end_logging!
  return unless @logging_started

  @logging_started = false
  ActiveRecord::Base.logger = @old_logger
end
end
module TestUtil
@@ -21,6 +30,22 @@ module TestUtil
(0...length).map { (65 + rand(26)).chr }.join
end
# Builds an unsaved HttpLogEntry for tests: a GET of http://example.com/
# with status 200, a random response time between 20 and 100 ms, header
# records from create_http_headers and a blob from build_blob_entry.
#
# @return [HttpLogEntry] a valid, unsaved log entry
# @raise [RuntimeError] with the joined validation messages if the entry is invalid
def self.build_http_log_entry
  attrs = {
    uri: "http://example.com/",
    verb: :get,
    status_code: 200,
    response_time_ms: rand(20..100),
    request_headers: create_http_headers,
    response_headers: create_http_headers,
    response: build_blob_entry,
    content_type: "text/plain",
    requested_at: Time.now,
  }

  ::HttpLogEntry.new(attrs).tap do |entry|
    raise entry.errors.full_messages.join(", ") unless entry.valid?
  end
end
def self.build_blob_entry
BlobEntry.find_or_build("text/plain", random_string(1024))
end