# redux-scraper/app/models/domain/e621/post.rb
class Domain::E621::Post < ReduxApplicationRecord
  self.table_name = "domain_e621_posts"

  has_lite_trail(schema_version: 1, separate_versions_table: true)

  include HasIndexedPost
  include Discard::Model
  self.discard_column = :deleted_at
  default_scope -> { kept }

  # see state_detail for scan_error/file_error
  enum :state, %i[ok scan_error file_error]
  enum :rating, %i[s q e]

  validates_presence_of(:e621_id, :state)
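
  # New records always start with a state and empty JSON/array columns so
  # callers never have to nil-check them.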
  after_initialize do
    self.state ||= :ok
    self.state_detail ||= {}
    self.flags_array ||= []
    self.pools_array ||= []
    self.sources_array ||= []
    self.artists_array ||= []
    self.tags_array ||= []
  end

  has_many :taggings, class_name: "Domain::E621::Tagging", inverse_of: :post
  has_many :tags, class_name: "Domain::E621::Tag", through: :taggings

  # If the file was scraped, this is the blob entry that represents it
  belongs_to :file,
             class_name: "::HttpLogEntry",
             optional: true,
             autosave: true
  belongs_to :parent_e621,
             class_name: "Domain::E621::Post",
             foreign_key: :e621_id,
             optional: true
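
  # Legacy post IDs exempted from mismatch handling during import; the constant
  # is not referenced in this file, so it is presumably consumed by the importer.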
  SKIP_MISMATCH_LEGACY_IDS = Set.new([836_414, 1_070_178])
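
  # Returns the existing post for legacy_model.e621_id when present; otherwise
  # builds a new post from the legacy record and, where possible, reconstructs
  # the HttpLogEntry / blob pair for the scraped file.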
  def self.find_or_build_from_legacy(legacy_model)
    model = self.find_by(e621_id: legacy_model.e621_id)
    return model if model

    model =
      self.new(
        {
          state: :ok,
          file_url_str: legacy_model.file_url,
          rating: legacy_model.rating,
          sources_array: legacy_model.sources,
          tags_array: legacy_model.tags.map(&:value),
          artists_array: legacy_model.artists || [],
        },
      )
    if legacy_model.e621_status != "active"
      model.flags_array << legacy_model.e621_status
      model.flags_array.uniq!
    end
    %i[e621_id md5 description score created_at].each do |attr|
      model.send(:"#{attr}=", legacy_model.send(attr))
    end
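
    # First, look for an HttpLogEntry already recorded for the file URL
    # (i.e. the file has already been scraped by the current pipeline).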
    http_log_entries =
      ::HttpLogEntry.where(
        uri_host: model.file_uri.host,
        uri_path: model.file_uri.path,
      )
    http_log_entry = http_log_entries.first
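
    # Fall back to the legacy HTTP log / blob data when nothing has been
    # scraped yet but the legacy record still carries a blob.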
    if !http_log_entry && legacy_model.blob_entry
      legacy_hles =
        ::Legacy::HttpLogEntry.where(
          host: model.file_uri.host,
          path: model.file_uri.path,
        )
      legacy_hle = legacy_hles.first
      if legacy_hle
        http_log_entry = ::HttpLogEntry.build_from_legacy(legacy_hle)
      else
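        # No legacy request log either; synthesize a placeholder entry. The
        # sentinel values (response_time_ms: -1, performed_by: "legacy") mark
        # it as reconstructed rather than actually requested.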
        http_log_entry =
          ::HttpLogEntry.new(
            {
              uri: model.file_uri || raise,
              status_code: 200,
              verb: "get",
              response_time_ms: -1,
              requested_at: Time.now,
              request_headers: ::HttpLogEntryHeader.empty,
              response_headers: ::HttpLogEntryHeader.empty,
              performed_by: "legacy",
            },
          )
      end
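
      # Attach the legacy blob as the response body unless one is already set.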
      http_log_entry.response ||=
        ::BlobEntryP.find_or_build_from_legacy(legacy_model.blob_entry)
      blob_entry = http_log_entry.response
      if blob_entry && http_log_entry
        http_log_entry.content_type ||= blob_entry.content_type
      else
        # unable to construct http & blob entries, skip
        File.write(
          Rails.root.join("tmp/e621_legacy_post_importer_failures"),
          "#{model.e621_id} - (no hle) - unable to reconstruct http / blob entry\n",
          mode: "a",
        )
        http_log_entry = nil
      end
    end
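
    # If we did end up with a log entry, verify that the stored blob matches
    # the post's md5; on mismatch, record the failure and drop the file.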
    if http_log_entry
      blob_entry = http_log_entry.response
      if model.md5 != Digest::MD5.hexdigest(blob_entry.contents)
        File.write(
          Rails.root.join("tmp/e621_legacy_post_importer_failures"),
          "#{model.e621_id} - #{http_log_entry.status_code} - expected #{model.md5} != actual #{Digest::MD5.hexdigest(blob_entry.contents)}\n",
          mode: "a",
        )
        http_log_entry = nil
      end
    end

    model.file = http_log_entry
    model
  end
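
  # Parsed Addressable::URI for the scraped file URL, or nil when no URL has
  # been recorded.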
  def file_uri
    Addressable::URI.parse(self.file_url_str) if self.file_url_str.present?
  end
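
  # The upstream e621 "updated_at" timestamp is stored as an ISO8601 string in
  # the state_detail JSON rather than in a dedicated column.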
  def e621_updated_at
    str = state_detail["e621_updated_at"]
    Time.parse(str) if str
  end

  def e621_updated_at=(time)
    time = Time.parse(time) if time.is_a?(String)
    state_detail["e621_updated_at"] = time.iso8601
  end
end