add fa posts and basic paper trail
This commit is contained in:
5
Gemfile
5
Gemfile
@@ -13,7 +13,9 @@ gem "sprockets-rails"
|
||||
# Use sqlite3 as the database for Active Record
|
||||
gem "sqlite3", "~> 1.4"
|
||||
gem "pg"
|
||||
|
||||
gem "pry"
|
||||
gem "pry-stack_explorer"
|
||||
|
||||
# Use the Puma web server [https://github.com/puma/puma]
|
||||
gem "puma", "~> 5.0"
|
||||
@@ -84,3 +86,6 @@ gem "ruby-prof"
|
||||
gem "table_print"
|
||||
# gem "concurrent-ruby-ext", require: "concurrent"
|
||||
# gem 'cli-ui'
|
||||
# gem "paper_trail"
|
||||
# gem "paper_trail-hashdiff"
|
||||
# gem "hashdiff"
|
||||
|
||||
15
Gemfile.lock
15
Gemfile.lock
@@ -79,6 +79,8 @@ GEM
|
||||
addressable (2.8.1)
|
||||
public_suffix (>= 2.0.2, < 6.0)
|
||||
bindex (0.8.1)
|
||||
binding_of_caller (1.0.0)
|
||||
debug_inspector (>= 0.0.1)
|
||||
bootsnap (1.16.0)
|
||||
msgpack (~> 1.2)
|
||||
builder (3.2.4)
|
||||
@@ -98,10 +100,12 @@ GEM
|
||||
debug (1.7.1)
|
||||
irb (>= 1.5.0)
|
||||
reline (>= 0.3.1)
|
||||
debug_inspector (1.1.0)
|
||||
diffy (3.4.2)
|
||||
erubi (1.12.0)
|
||||
globalid (1.1.0)
|
||||
activesupport (>= 5.0)
|
||||
hashdiff (1.0.1)
|
||||
i18n (1.12.0)
|
||||
concurrent-ruby (~> 1.0)
|
||||
importmap-rails (1.1.5)
|
||||
@@ -141,10 +145,16 @@ GEM
|
||||
nokogiri (1.14.1)
|
||||
mini_portile2 (~> 2.8.0)
|
||||
racc (~> 1.4)
|
||||
paper_trail (14.0.0)
|
||||
activerecord (>= 6.0)
|
||||
request_store (~> 1.4)
|
||||
pg (1.4.5)
|
||||
pry (0.14.2)
|
||||
coderay (~> 1.1)
|
||||
method_source (~> 1.0)
|
||||
pry-stack_explorer (0.6.1)
|
||||
binding_of_caller (~> 1.0)
|
||||
pry (~> 0.13)
|
||||
public_suffix (5.0.1)
|
||||
puma (5.6.5)
|
||||
nio4r (~> 2.0)
|
||||
@@ -182,6 +192,8 @@ GEM
|
||||
regexp_parser (2.6.2)
|
||||
reline (0.3.2)
|
||||
io-console (~> 0.5)
|
||||
request_store (1.5.1)
|
||||
rack (>= 1.4)
|
||||
rexml (3.2.5)
|
||||
ruby-prof (1.4.5)
|
||||
rubyzip (2.3.2)
|
||||
@@ -234,10 +246,13 @@ DEPENDENCIES
|
||||
capybara
|
||||
debug
|
||||
diffy
|
||||
hashdiff
|
||||
importmap-rails
|
||||
jbuilder
|
||||
paper_trail
|
||||
pg
|
||||
pry
|
||||
pry-stack_explorer
|
||||
puma (~> 5.0)
|
||||
rails (~> 7.0.4, >= 7.0.4.2)
|
||||
rb-bsdiff!
|
||||
|
||||
37
Rakefile
37
Rakefile
@@ -6,47 +6,12 @@ require_relative "config/application"
|
||||
|
||||
Rails.application.load_tasks
|
||||
require_relative "rake/sst"
|
||||
require_relative "rake/log_entry"
|
||||
|
||||
task :log => :environment do
|
||||
ActiveRecord::Base.logger = Logger.new(STDOUT)
|
||||
end
|
||||
|
||||
task :http_log_entry_bulk_importer, [:batch_size, :cache_size, :start_at, :finish_at] => [:environment] do |t, args|
|
||||
batch_size = args[:batch_size]&.to_i || ENV["batch_size"]&.to_i || 8192
|
||||
cache_size = args[:cache_size]&.to_i || ENV["cache_size"]&.to_i || 6
|
||||
start_at = args[:start_at]&.to_i || ENV["start_at"]&.to_i || 0
|
||||
finish_at = args[:finish_at]&.to_i || ENV["finish_at"]&.to_i || nil
|
||||
|
||||
LegacyImport::HttpLogEntryBulkImporter.
|
||||
new(batch_size, cache_size, start_at, finish_at).
|
||||
run
|
||||
end
|
||||
|
||||
task :legacy_http_log_entry_native_blob_entry => [:environment] do
|
||||
batch_size = ENV["batch_size"]&.to_i || 4000
|
||||
parallelism = ENV["parallelism"]&.to_i || 8
|
||||
|
||||
query = Legacy::HttpLogEntry.
|
||||
where(diff_type: :native).
|
||||
where(native_blob_entry_sha256: nil).
|
||||
where("resp_body is not null")
|
||||
|
||||
start_time = Time.now
|
||||
progress = 0
|
||||
|
||||
query.find_in_batches(batch_size: batch_size, start: 0, finish: 10) do |batch|
|
||||
ForkFuture.parallel_each(parallelism, batch) do |http_log_entry|
|
||||
http_log_entry.update_columns(
|
||||
native_blob_entry_sha256: HexUtil.hex2bin(http_log_entry.resp_body),
|
||||
)
|
||||
end
|
||||
|
||||
progress += batch.size
|
||||
rate = progress.to_f / (Time.now - start_time)
|
||||
puts "finish batch, last id #{batch.last&.id} - #{progress} - #{rate.round(1)} / second"
|
||||
end
|
||||
end
|
||||
|
||||
namespace :test do
|
||||
desc "Test lib source"
|
||||
Rake::TestTask.new(:lib) do |t|
|
||||
|
||||
@@ -3,6 +3,6 @@ class HexUtil
|
||||
[str].pack("H*")
|
||||
end
|
||||
def self.bin2hex(bin)
|
||||
bin.unpack("H*").first.upcase
|
||||
bin.unpack("H*").first
|
||||
end
|
||||
end
|
||||
|
||||
@@ -38,6 +38,43 @@ class BlobEntry < ReduxApplicationRecord
|
||||
self.read_attribute(:contents).size
|
||||
end
|
||||
|
||||
# Returns an existing BlobEntry for the legacy blob entry's file, or builds
# (without saving) a new one from the file's bytes.
#
# When the basename looks like a hex sha256 (64 chars), we first try a cheap
# lookup by that digest and, after reading the file, verify the content hash
# actually matches the name (raises on mismatch).
#
# Raises if the external `file` command fails or the checksum mismatches.
def self.find_or_build_from_legacy(legacy_be)
  file_path = legacy_be.file_path
  file_name = File.basename file_path

  if file_name.length == 64
    file_sha256_assumed = HexUtil.hex2bin(file_name)
    # try to find existing file before going through all this hassle
    entry = BlobEntry.find_by(sha256: file_sha256_assumed)
    return entry if entry
  end

  # SECURITY FIX: run `file` without a shell. The previous backtick form
  # (`file -ib #{file_path}`) interpolated the path into a shell command,
  # allowing injection via hostile file names. The array form of IO.popen
  # execs directly and still sets $? for the exit-status check below.
  file_mime = IO.popen(["file", "-ib", file_path], &:read)
  raise("error running `file` on #{file_path}: #{file_mime}") if $?.exitstatus != 0

  file_mime.chomp!
  file_contents = IO.binread(file_path)
  record = find_or_build(file_mime, file_contents)

  # guess the name is a sha256 hash
  if file_name.length == 64
    if record.sha256 != file_sha256_assumed
      raise("checksum mismatch for #{file_path}: #{HexUtil.bin2hex(record.sha256)} != #{file_name}")
    end
  end

  # preserve the legacy timestamps on the new record
  record.created_at = legacy_be.created_at
  record.updated_at = legacy_be.updated_at
  record
end
|
||||
|
||||
# Returns the persisted BlobEntry whose sha256 matches +contents+, or builds
# (without saving) a fresh record with the given content type.
def self.find_or_build(content_type, contents)
  digest = Digest::SHA256.digest(contents)
  existing = BlobEntry.find_by(sha256: digest)
  existing || build_record(content_type: content_type, sha256: digest, contents: contents)
end
|
||||
|
||||
def self.build_record(content_type:, sha256:, contents:, candidates: [])
|
||||
record = BlobEntry.new(sha256: sha256, content_type: content_type, size: contents.size)
|
||||
|
||||
|
||||
62
app/models/domain/fa/post.rb
Normal file
62
app/models/domain/fa/post.rb
Normal file
@@ -0,0 +1,62 @@
|
||||
# An FA post imported from the legacy scraper, with lite-trail versioning on
# the mapped file_sha256 attribute.
class Domain::Fa::Post < ReduxApplicationRecord
  self.table_name = "domain_fa_posts"

  has_lite_trail(
    schema_version: 1,
    map_attribute: {
      file_sha256: ::SHA256AttributeMapper,
    },
  )

  # If the file was scraped, this is the blob entry that represents it
  belongs_to :file,
    class_name: "::BlobEntry",
    foreign_key: :file_sha256,
    # BUG FIX: was `optional: :true` — a Symbol, not the boolean. A Symbol is
    # truthy so it happened to behave, but the boolean literal is what the
    # option expects.
    optional: true

  # HttpLogEntry of the request made to scrape this post
  belongs_to :http_log_page,
    class_name: "::HttpLogEntry",
    optional: true

  # Same, but for the scrape of the file itself
  belongs_to :http_log_file,
    class_name: "::HttpLogEntry",
    optional: true

  # Parsed URI of the scraped file URL, or nil when none was recorded.
  def file_uri
    URI(self.file_url_str) if self.file_url_str
  end

  # Builds (does not save) a Domain::Fa::Post from a Legacy::Fa::Post,
  # copying the scalar fields and attaching a BlobEntry built from the
  # legacy blob entry when one exists.
  def self.build_from_legacy(legacy)
    legacy_be = legacy.blob_entry
    new_be = ::BlobEntry.find_or_build_from_legacy(legacy_be) if legacy_be

    record = Domain::Fa::Post.new
    record.file = new_be if legacy_be

    # TODO: populate creator_id as well - need to build the model

    # (Previously assigned to an unused `fields_to_copy` local; the array is
    # only iterated, never referenced again.)
    [
      :creator_name,
      :fa_id,
      :title,
      :category,
      :theme,
      :species,
      :gender,
      :keywords,
      :description,
      :num_favorites,
      :num_comments,
      :num_views,
      :updated_at,
      :created_at,
    ].each do |field|
      record.send(:"#{field}=", legacy.send(field))
    end
    record.file_url_str = legacy.file_url

    record
  end
end
|
||||
@@ -22,8 +22,24 @@ class HttpLogEntry < ReduxApplicationRecord
|
||||
:requested_at
|
||||
)
|
||||
|
||||
def full_path
|
||||
"#{scheme}://#{host}#{path}#{query ? "?#{query}" : ""}"
|
||||
def uri=(uri)
|
||||
uri = URI(uri)
|
||||
self.uri_scheme = uri.scheme
|
||||
self.uri_host = uri.host
|
||||
self.uri_path = uri.path
|
||||
self.uri_query = uri.query
|
||||
self.uri_hash = uri.fragment
|
||||
end
|
||||
|
||||
def uri
|
||||
URI(uri_str)
|
||||
end
|
||||
|
||||
# Reassembles the full request URL string from its stored components
# (scheme, host, path, optional query, optional fragment).
def uri_str
  out = "#{uri_scheme}://#{uri_host}#{uri_path}"
  out = "#{out}?#{uri_query}" if uri_query
  out = "#{out}\##{uri_hash}" if uri_hash
  out
end
|
||||
|
||||
def to_bulk_insert_hash
|
||||
@@ -33,6 +49,7 @@ class HttpLogEntry < ReduxApplicationRecord
|
||||
uri_host: self.uri_host,
|
||||
uri_path: self.uri_path,
|
||||
uri_query: self.uri_query,
|
||||
uri_hash: self.uri_hash,
|
||||
verb: self.verb,
|
||||
content_type: self.content_type,
|
||||
status_code: self.status_code,
|
||||
|
||||
@@ -17,6 +17,14 @@ class HttpLogEntryHeader < ReduxApplicationRecord
|
||||
)
|
||||
end
|
||||
|
||||
# Returns the persisted header row for +headers+, creating it if absent.
# Dedupes on the sha256 computed by build_record.
# NOTE(review): not race-safe — two concurrent callers can both miss the
# find_by and one save! will then hit the unique index (assuming sha256 is
# uniquely indexed — confirm against the schema).
def self.find_or_create(headers)
  record = build_record(headers: headers)
  find_by(sha256: record.sha256) || begin
    record.save!
    record
  end
end
|
||||
|
||||
def to_bulk_insert_hash
|
||||
{
|
||||
sha256: sha256,
|
||||
|
||||
@@ -105,7 +105,7 @@ class Legacy::Fa::Post < LegacyApplicationRecord
|
||||
sha256: Digest::SHA256.hexdigest(fa_id.to_s),
|
||||
depth: 4, stride: 2,
|
||||
)
|
||||
File.join SConfig.fa_data_dir, "static", "resized", style.to_s, (hashed_path + file_ext)
|
||||
File.join Legacy::SConfig.fa_data_dir, "static", "resized", style.to_s, (hashed_path + file_ext)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
57
app/models/lite_trail/version.rb
Normal file
57
app/models/lite_trail/version.rb
Normal file
@@ -0,0 +1,57 @@
|
||||
# A single change record written by has_lite_trail callbacks.
# event is one of "create" / "update" / "destroy"; diff is jsonb:
#   update  -> {attr => [before, after]}
#   destroy -> full attribute hash at destruction time
class LiteTrail::Version < ReduxApplicationRecord
  self.table_name = "versions"
  belongs_to :item, polymorphic: true

  # Reconstructs the state of the item as it was just BEFORE this version's
  # event, by walking from this version to the newest and undoing each diff
  # in reverse order. Returns an unsaved model instance; does not modify
  # the persisted item.
  def reify
    versions_arr = item.versions.to_a
    this_idx = versions_arr.find_index(self)
    if this_idx == nil
      raise("item.versions (#{item.item_type}/#{item.item_id}) does not contain self: #{self.id}")
    end
    unapply_versions = versions_arr[this_idx..].reverse

    model = if self.event == "destroy"
      # NOTE(review): assumes the polymorphic `item` is still resolvable for
      # a destroy version — confirm against callers.
      self.item.class.new
    else
      self.item.dup
    end

    unapply_versions.each do |version|
      version.unapply(model)
    end

    model
  end

  # Undoes this version's diff on +model+ in place. Mapped attributes (see
  # has_lite_trail's map_attribute) are converted back from their stored
  # representation via the mapper's map_from.
  def unapply(model)
    mapper_config = item.class.lite_trail_options[:map_attribute] || {}

    if self.event == "create"
      raise("'create' cannot be undone")
    elsif self.event == "update"
      self.diff.each do |attr_name, change|
        attr_before, attr_after = change

        # diff round-trips through jsonb with String keys; mapper_config is
        # keyed by Symbol, so normalize before the lookup.
        attr_name_sym = attr_name.to_sym
        if mapper_config[attr_name_sym]
          attr_before = mapper_config[attr_name_sym].map_from(attr_before) if attr_before
          attr_after = mapper_config[attr_name_sym].map_from(attr_after) if attr_after
        end

        # sanity check - but ignore updated_at due to rounding issues
        if model.send(attr_name_sym) != attr_after
          # BUG FIX: the message previously read the undefined local
          # `item_attributes[attr_name]`, raising NameError instead of the
          # intended diagnostic; report the actual model value.
          raise("expected #{attr_name} to be #{attr_after}, was #{model.send(attr_name_sym)}")
        end if attr_name_sym != :updated_at

        model.send(:"#{attr_name}=", attr_before)
      end
    elsif self.event == "destroy"
      self.diff.each do |attr_name, attr_value|
        # BUG FIX: (a) symbolize for the mapper lookup — jsonb keys are
        # Strings, so the Symbol-keyed mapper_config never matched;
        # (b) restore onto +model+, not +item+ — writing to the destroyed
        # record mutated it and left the reified model empty.
        attr_name_sym = attr_name.to_sym
        if mapper_config[attr_name_sym]
          attr_value = mapper_config[attr_name_sym].map_from(attr_value)
        end
        model.send(:"#{attr_name}=", attr_value)
      end
    end
  end
end
|
||||
@@ -2,4 +2,68 @@ class ReduxApplicationRecord < ActiveRecord::Base
|
||||
# self.primary_abstract_class = true
|
||||
self.abstract_class = true
|
||||
connects_to database: { writing: :redux, reading: :redux }
|
||||
|
||||
# Enables lightweight version tracking on the calling model class.
#
# Declares an ordered `versions` association and installs callbacks that
# write LiteTrail::Version rows:
#   create  -> event row with no diff
#   update  -> diff of saved_changes ({attr => [before, after]})
#   destroy -> full attribute hash at destruction time
#
# map_attribute: optional {attr_name (Symbol) => mapper} hash. Mappers
# convert values to a jsonb-safe form via .map_to (e.g. binary sha256 ->
# hex; see SHA256AttributeMapper) and back via .map_from on reify.
def self.has_lite_trail(schema_version:, map_attribute: nil)
  class_attribute :lite_trail_options
  self.lite_trail_options = {
    schema_version: schema_version,
    map_attribute: map_attribute,
  }

  has_many :versions,
    -> { order(created_at: :asc) },
    class_name: "::LiteTrail::Version",
    as: :item

  after_create do
    ::LiteTrail::Version.create({
      event: "create",
      item: self,
      schema_version: schema_version,
    })
  end

  after_update do
    changes = self.saved_changes
    if changes.any?
      map_attribute.each do |attr_name, mapper|
        # BUG FIX: saved_changes is keyed by attribute-name String, while
        # map_attribute is declared with Symbols — the old Symbol lookup
        # could silently never match, leaving unmapped (e.g. binary) values
        # in the jsonb diff. Index with the String form.
        change = changes[attr_name.to_s]
        if change
          change[0] = mapper.map_to(change[0]) if change[0]
          change[1] = mapper.map_to(change[1]) if change[1]
        end
      end if map_attribute
      ::LiteTrail::Version.create({
        event: "update",
        item: self,
        schema_version: schema_version,
        diff: changes,
      })
    end
  end

  after_destroy do
    attributes = self.attributes
    # BUG FIX: guard against map_attribute being nil — after_update already
    # guarded, but this callback raised NoMethodError on destroy for any
    # model declared without mapped attributes. Also index the AR attribute
    # hash with String keys (its native key type).
    map_attribute.each do |attr_name, mapper|
      key = attr_name.to_s
      if attributes[key]
        attributes[key] = mapper.map_to(attributes[key])
      end
    end if map_attribute

    ::LiteTrail::Version.create({
      event: "destroy",
      item: self,
      schema_version: schema_version,
      diff: attributes,
    })
  end
end
|
||||
end
|
||||
|
||||
# Maps binary sha256 attribute values to/from a hex-string representation so
# they can be stored in the jsonb `versions.diff` column (raw binary is not
# jsonb-safe).
class SHA256AttributeMapper
  # Binary digest -> hex string (storage form).
  def self.map_to(attr_value)
    HexUtil.bin2hex(attr_value)
  end

  # Hex string (storage form) -> binary digest.
  def self.map_from(stored_value)
    # BUG FIX: previously called HexUtil.hex2bin(attr_value) — `attr_value`
    # is undefined in this method, so every call raised NameError.
    HexUtil.hex2bin(stored_value)
  end
end
|
||||
|
||||
@@ -42,6 +42,17 @@ legacy_prod: &legacy_prod
|
||||
database_tasks: false
|
||||
pool: 64
|
||||
|
||||
legacy_prod_readonly: &legacy_prod_readonly
|
||||
adapter: postgresql
|
||||
host: scraper-postgres.local
|
||||
port: 5432
|
||||
database: legacy_prod
|
||||
username: scraper_legacy
|
||||
password: zL7zDRXycLhLFJLQj5Zh # NOTE(review): hardcoded credential committed to VCS — move to ENV / Rails credentials and rotate
|
||||
migrations_paths: db/legacy_migrate
|
||||
database_tasks: false
|
||||
pool: 64
|
||||
|
||||
development:
|
||||
redux:
|
||||
<<: *redux_prod
|
||||
@@ -55,7 +66,7 @@ test:
|
||||
redux:
|
||||
<<: *redux_test
|
||||
legacy:
|
||||
<<: *legacy_prod
|
||||
<<: *legacy_prod_readonly
|
||||
|
||||
production:
|
||||
redux:
|
||||
|
||||
15
db/redux_migrate/20230207210950_create_versions.rb
Normal file
15
db/redux_migrate/20230207210950_create_versions.rb
Normal file
@@ -0,0 +1,15 @@
|
||||
# This migration creates the `versions` table used by the in-house LiteTrail
# versioning (see LiteTrail::Version / has_lite_trail). The paper_trail gem
# itself is commented out in the Gemfile; only its schema shape is borrowed.
|
||||
class CreateVersions < ActiveRecord::Migration[7.0]
|
||||
def change
|
||||
create_table :versions do |t|
|
||||
t.string :item_type, null: false
|
||||
t.bigint :item_id, null: false
|
||||
t.integer :schema_version, null: false
|
||||
t.string :event, null: false
|
||||
t.jsonb :diff
|
||||
t.datetime :created_at
|
||||
end
|
||||
add_index :versions, %i[item_type item_id]
|
||||
end
|
||||
end
|
||||
35
db/redux_migrate/20230207211826_create_domain_fa_posts.rb
Normal file
35
db/redux_migrate/20230207211826_create_domain_fa_posts.rb
Normal file
@@ -0,0 +1,35 @@
|
||||
# Creates the domain_fa_posts table: imported FA posts, keyed to the scraped
# file's BlobEntry (by sha256) and to the HttpLogEntry rows that produced
# the page and the file.
class CreateDomainFaPosts < ActiveRecord::Migration[7.0]
  def change
    create_table :domain_fa_posts do |t|
      # the post's id on the source site
      t.integer :fa_id

      # Domain::Fa::User.{id,name}
      t.integer :creator_id
      t.string :creator_name

      # just start with this for now - can add more fields later
      t.string :title
      t.string :category
      t.string :theme
      t.string :species
      t.string :gender
      t.string :description
      t.jsonb :keywords
      t.integer :num_favorites
      t.integer :num_comments
      t.integer :num_views

      # scraped file: its source URL and its BlobEntry key (binary sha256)
      t.string :file_url_str
      t.binary :file_sha256

      # relevant http log entries
      t.references :http_log_page
      t.references :http_log_file
      t.timestamps
    end

    # FK on a non-id primary key: file_sha256 -> blob_entries.sha256
    add_foreign_key :domain_fa_posts, :blob_entries, column: :file_sha256, primary_key: :sha256, validate: true
    add_foreign_key :domain_fa_posts, :http_log_entries, column: :http_log_page_id, primary_key: :id, validate: true
    add_foreign_key :domain_fa_posts, :http_log_entries, column: :http_log_file_id, primary_key: :id, validate: true
  end
end
|
||||
39
db/schema.rb
generated
39
db/schema.rb
generated
@@ -10,7 +10,7 @@
|
||||
#
|
||||
# It's strongly recommended that you check this file into your version control system.
|
||||
|
||||
ActiveRecord::Schema[7.0].define(version: 2023_02_07_204901) do
|
||||
ActiveRecord::Schema[7.0].define(version: 2023_02_07_211826) do
|
||||
# These are extensions that must be enabled in order to support this database
|
||||
enable_extension "pg_stat_statements"
|
||||
enable_extension "plpgsql"
|
||||
@@ -26,6 +26,30 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_07_204901) do
|
||||
t.index ["sha256"], name: "index_blob_entries_on_sha256", unique: true
|
||||
end
|
||||
|
||||
create_table "domain_fa_posts", force: :cascade do |t|
|
||||
t.integer "fa_id"
|
||||
t.integer "creator_id"
|
||||
t.string "creator_name"
|
||||
t.string "title"
|
||||
t.string "category"
|
||||
t.string "theme"
|
||||
t.string "species"
|
||||
t.string "gender"
|
||||
t.string "description"
|
||||
t.jsonb "keywords"
|
||||
t.integer "num_favorites"
|
||||
t.integer "num_comments"
|
||||
t.integer "num_views"
|
||||
t.string "file_url_str"
|
||||
t.binary "file_sha256"
|
||||
t.bigint "http_log_page_id"
|
||||
t.bigint "http_log_file_id"
|
||||
t.datetime "created_at", null: false
|
||||
t.datetime "updated_at", null: false
|
||||
t.index ["http_log_file_id"], name: "index_domain_fa_posts_on_http_log_file_id"
|
||||
t.index ["http_log_page_id"], name: "index_domain_fa_posts_on_http_log_page_id"
|
||||
end
|
||||
|
||||
create_table "flat_sst_entries", id: false, force: :cascade do |t|
|
||||
t.binary "key", null: false
|
||||
t.binary "contents", null: false
|
||||
@@ -68,7 +92,20 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_07_204901) do
|
||||
t.index ["key"], name: "index_log_store_sst_entries_on_key", unique: true
|
||||
end
|
||||
|
||||
create_table "versions", force: :cascade do |t|
|
||||
t.string "item_type", null: false
|
||||
t.bigint "item_id", null: false
|
||||
t.integer "schema_version", null: false
|
||||
t.string "event", null: false
|
||||
t.jsonb "diff"
|
||||
t.datetime "created_at"
|
||||
t.index ["item_type", "item_id"], name: "index_versions_on_item_type_and_item_id"
|
||||
end
|
||||
|
||||
add_foreign_key "blob_entries", "blob_entries", column: "base_sha256", primary_key: "sha256"
|
||||
add_foreign_key "domain_fa_posts", "blob_entries", column: "file_sha256", primary_key: "sha256"
|
||||
add_foreign_key "domain_fa_posts", "http_log_entries", column: "http_log_file_id"
|
||||
add_foreign_key "domain_fa_posts", "http_log_entries", column: "http_log_page_id"
|
||||
add_foreign_key "http_log_entries", "blob_entries", column: "response_sha256", primary_key: "sha256"
|
||||
add_foreign_key "http_log_entries", "http_log_entry_headers", column: "request_headers_id"
|
||||
add_foreign_key "http_log_entries", "http_log_entry_headers", column: "response_headers_id"
|
||||
|
||||
14
rake/fa_post.rb
Normal file
14
rake/fa_post.rb
Normal file
@@ -0,0 +1,14 @@
|
||||
namespace :fa_post do
  desc "Import existing FA posts"
  # One-shot migration task: copies Legacy::Fa::Post rows into
  # Domain::Fa::Post, skipping fa_ids that already exist. Raises on the
  # first invalid record so a bad import is noticed immediately.
  task :import_existing => [:environment] do
    query = ::Legacy::Fa::Post
    # NOTE(review): `finish: 10` caps the scan at legacy id 10 — looks like
    # a leftover debugging limit; confirm before relying on this for a full
    # import.
    query.find_each(finish: 10) do |legacy_post|
      next if ::Domain::Fa::Post.find_by(fa_id: legacy_post.fa_id)
      new_post = ::Domain::Fa::Post.build_from_legacy(legacy_post)
      unless new_post.valid?
        raise("errors on #{legacy_post.id}: #{new_post.errors.full_messages}")
      end
      new_post.save!
    end
  end
end
|
||||
75
rake/log_entry.rb
Normal file
75
rake/log_entry.rb
Normal file
@@ -0,0 +1,75 @@
|
||||
namespace :log_entry do
  desc "Fix up entries that have '' contents but shouldn't"
  # Currently only PRINTS up to 100 candidate entries (those whose response
  # is the sha256 of the empty string); no fixing happens yet.
  task :fix_up_empty_response_contents => [:environment] do
    query = ::HttpLogEntry.where(response_sha256: Digest::SHA256.digest("")).limit(100)
    query.find_each do |model|
      puts "uri: #{model.uri_str}"
    end
  end

  desc "Find an HttpLogEntry that is missing its response body, ignoring those with a missing blob entry file"
  # Scans legacy entries that should have a body and reports the first batch
  # containing ids that were never imported into the new table, then stops.
  task :find_missing_response_body_ignore_file => [:environment] do
    Legacy::HttpLogEntry.where(
      "resp_body is not null or blob_entry_id is not null"
    ).find_in_batches do |batch|
      id_to_legacy = batch.map { |l| [l.id, l] }.to_h
      legacy_ids = batch.map(&:id)
      # ids already present in the new table
      log_ids = HttpLogEntry.select(:id).where(id: legacy_ids).map(&:id).to_a

      missing_ids = legacy_ids - log_ids

      # ignore the ones which have a missing blob entry
      missing_ids.filter! do |id|
        model = id_to_legacy[id]
        # probe: raises when the legacy blob-entry file is unreadable
        model.response_body
        true
      rescue Legacy::HttpLogEntry::NoBEPathException
        false
      rescue
        # NOTE(review): bare rescue (StandardError) keeps the id on ANY
        # other error — presumably intentional best-effort; confirm.
        true
      end

      if missing_ids.any?
        puts "Found missing: #{missing_ids}"
        break
      end
    end
  end

  desc "Bulk import Legacy::HLE -> ::HLE"
  # Tunables come from task args first, then ENV, then defaults.
  task :http_log_entry_bulk_importer, [:batch_size, :cache_size, :start_at, :finish_at] => [:environment] do |t, args|
    batch_size = args[:batch_size]&.to_i || ENV["batch_size"]&.to_i || 8192
    cache_size = args[:cache_size]&.to_i || ENV["cache_size"]&.to_i || 6
    start_at = args[:start_at]&.to_i || ENV["start_at"]&.to_i || 0
    finish_at = args[:finish_at]&.to_i || ENV["finish_at"]&.to_i || nil

    LegacyImport::HttpLogEntryBulkImporter.
      new(batch_size, cache_size, start_at, finish_at).
      run
  end

  # Backfills the binary native_blob_entry_sha256 column from the hex
  # resp_body column for native-diff legacy entries, in parallel, printing
  # a throughput figure per batch.
  task :legacy_http_log_entry_native_blob_entry => [:environment] do
    batch_size = ENV["batch_size"]&.to_i || 4000
    parallelism = ENV["parallelism"]&.to_i || 8

    query = Legacy::HttpLogEntry.
      where(diff_type: :native).
      where(native_blob_entry_sha256: nil).
      where("resp_body is not null")

    start_time = Time.now
    progress = 0

    # NOTE(review): `start: 0, finish: 10` limits the scan to ids 0..10 —
    # looks like a leftover debugging cap; confirm before a full backfill.
    query.find_in_batches(batch_size: batch_size, start: 0, finish: 10) do |batch|
      ForkFuture.parallel_each(parallelism, batch) do |http_log_entry|
        http_log_entry.update_columns(
          native_blob_entry_sha256: HexUtil.hex2bin(http_log_entry.resp_body),
        )
      end

      progress += batch.size
      rate = progress.to_f / (Time.now - start_time)
      puts "finish batch, last id #{batch.last&.id} - #{progress} - #{rate.round(1)} / second"
    end
  end
end
|
||||
@@ -1,7 +1,23 @@
|
||||
require "test_helper"
|
||||
|
||||
class BlobEntryTest < ActiveSupport::TestCase
|
||||
# test "the truth" do
|
||||
# assert true
|
||||
# end
|
||||
test "building a blob works" do
|
||||
blob = TestUtil.build_blob_entry
|
||||
assert blob.valid?
|
||||
assert blob.save
|
||||
end
|
||||
|
||||
test "can create from a legacy blob entry" do
|
||||
legacy_be = ::Legacy::BlobEntry.find(1370198)
|
||||
new_be = ::BlobEntry.find_or_build_from_legacy(legacy_be)
|
||||
assert new_be.valid?, new_be.errors.full_messages
|
||||
assert new_be.new_record?
|
||||
assert_equal "5ed3a0400ac50f721123c7a8c638da8b19bf563f8e880f9abb36dcb38395bc82", HexUtil.bin2hex(new_be.sha256)
|
||||
assert_equal legacy_be.file_size, new_be.bytes_stored
|
||||
assert_equal 313065, new_be.bytes_stored
|
||||
assert_equal "image/png; charset=binary", new_be.content_type
|
||||
new_be.save!
|
||||
assert_equal legacy_be.updated_at, new_be.updated_at
|
||||
assert_equal legacy_be.created_at, new_be.created_at
|
||||
end
|
||||
end
|
||||
|
||||
39
test/models/domain/fa/post_test.rb
Normal file
39
test/models/domain/fa/post_test.rb
Normal file
@@ -0,0 +1,39 @@
|
||||
class Domain::Fa::PostTest < ActiveSupport::TestCase
|
||||
test "can build from legacy" do
|
||||
legacy_post = ::Legacy::Fa::Post.find(1)
|
||||
new_post = ::Domain::Fa::Post.build_from_legacy(legacy_post)
|
||||
# made a valid, unpersisted record
|
||||
assert new_post.valid?
|
||||
refute new_post.persisted?
|
||||
|
||||
# the associated file was populated
|
||||
assert new_post.file.present?
|
||||
refute new_post.file.persisted?
|
||||
|
||||
assert_equal 19177819, new_post.fa_id
|
||||
# TODO - post creator model verification
|
||||
assert_equal "Meesh", new_post.creator_name
|
||||
assert_equal "MASSIVE ART PACK 6 - Available now!", new_post.title
|
||||
assert_equal "Artwork (Digital)", new_post.category
|
||||
assert_equal "All", new_post.theme
|
||||
assert_equal "Unspecified / Any", new_post.species
|
||||
assert_equal "Any", new_post.gender
|
||||
assert_equal "d.facdn.net", new_post.file_uri.host
|
||||
assert_equal "/art/meesh/1456189705/1456189705.meesh_production5.png", new_post.file_uri.path
|
||||
assert_equal 8, new_post.num_favorites
|
||||
assert_equal 2, new_post.num_comments
|
||||
assert_equal 904, new_post.num_views
|
||||
assert_equal ["meesh", "nsfw", "art", "pack", "adult", "boner", "touching"], new_post.keywords
|
||||
assert_match /with a handful that have special edits/, new_post.description
|
||||
|
||||
new_post.save!
|
||||
assert_equal 1, new_post.versions.count
|
||||
|
||||
new_post.num_views = 1000
|
||||
new_post.save!
|
||||
assert_equal 2, new_post.versions.count
|
||||
|
||||
old_post = new_post.versions.last.reify
|
||||
assert_equal 904, old_post.num_views
|
||||
end
|
||||
end
|
||||
@@ -1,7 +1,43 @@
|
||||
require "test_helper"
|
||||
|
||||
class HttpLogEntryTest < ActiveSupport::TestCase
|
||||
# test "the truth" do
|
||||
# assert true
|
||||
# end
|
||||
def make_required_params
|
||||
{
|
||||
request_headers_id: TestUtil.create_http_headers.id,
|
||||
response_headers_id: TestUtil.create_http_headers.id,
|
||||
response_sha256: TestUtil.create_blob_entry.sha256,
|
||||
response_time_ms: 125,
|
||||
content_type: "test/text",
|
||||
verb: :get,
|
||||
requested_at: Time.now,
|
||||
uri_scheme: "https",
|
||||
uri_host: "example.com",
|
||||
uri_path: "/foo/bar",
|
||||
status_code: 200,
|
||||
}
|
||||
end
|
||||
|
||||
test "requires required params" do
|
||||
model = ::HttpLogEntry.new(make_required_params)
|
||||
assert model.valid?, model.errors.full_messages
|
||||
|
||||
make_required_params.keys.each do |target_key|
|
||||
params_without_key = make_required_params
|
||||
params_without_key.delete(target_key)
|
||||
refute ::HttpLogEntry.new(params_without_key).valid?
|
||||
end
|
||||
end
|
||||
|
||||
test "uri is correct" do
|
||||
uri_str = "https://www.example.com/big/path/here?and=query&other=query2#smaz"
|
||||
uri = URI(uri_str)
|
||||
model = ::HttpLogEntry.new({ uri: uri_str })
|
||||
assert_equal "https", model.uri_scheme
|
||||
assert_equal "www.example.com", model.uri_host
|
||||
assert_equal "/big/path/here", model.uri_path
|
||||
assert_equal "and=query&other=query2", model.uri_query
|
||||
assert_equal "smaz", model.uri_hash
|
||||
assert_equal uri, model.uri
|
||||
assert_equal uri_str, model.uri_str
|
||||
end
|
||||
end
|
||||
|
||||
@@ -10,4 +10,26 @@ class ActiveSupport::TestCase
|
||||
fixtures :all
|
||||
|
||||
# Add more helper methods to be used by all tests here...
|
||||
# After parallel test workers finish, wipe every Redux-side table so the
# next run starts from a clean slate.
parallelize_teardown do
  # BUG FIX: `Rails.log` is not a method (NoMethodError at teardown); the
  # logger is exposed as `Rails.logger`.
  Rails.logger.error "Deleting all records: #{ReduxApplicationRecord.subclasses.map(&:name)}"
  ReduxApplicationRecord.subclasses.each(&:delete_all)
end
|
||||
end
|
||||
|
||||
# Shared helpers for constructing common records in tests.
module TestUtil
  # Random uppercase-ASCII (A-Z) string of the given length.
  def self.random_string(length)
    Array.new(length) { (65 + rand(26)).chr }.join
  end

  # Unsaved BlobEntry holding 1 KiB of random text.
  def self.build_blob_entry
    BlobEntry.find_or_build("text/plain", random_string(1024))
  end

  # Persisted BlobEntry holding random text.
  def self.create_blob_entry
    entry = build_blob_entry
    entry.save!
    entry
  end

  # Persisted header row with a unique random header value.
  def self.create_http_headers
    ::HttpLogEntryHeader.find_or_create({ test_header_key: "test header value #{random_string(16)}" })
  end
end
|
||||
|
||||
Reference in New Issue
Block a user