# redux-scraper/app/models/domain/fa/post.rb
# typed: strict
class Domain::Fa::Post < ReduxApplicationRecord
  self.table_name = "domain_fa_posts"

  include HasIndexedPost
  include Pundit::Authorization
  include AttrJsonRecordAliases

  enum :state,
       [
         :ok, # so far so good, post may not yet be scanned or have file downloaded
         :removed, # post has been removed
         :scan_error, # error scanning post page (see state_detail)
         :file_error, # error downloading post file (see state_detail)
       ]
  validates_inclusion_of(:state, in: self.states.keys)
  validates_presence_of(:fa_id, :state)
  after_initialize { self.state ||= :ok }
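
  # Rails enums generate predicate/bang helpers and scopes; illustrative usage:
  #   post.scan_error!    # sets and saves state = "scan_error"
  #   post.removed?       # => false
  #   Domain::Fa::Post.ok # scope over posts in the :ok state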

  belongs_to :creator,
             class_name: "::Domain::Fa::User",
             inverse_of: :posts,
             optional: true,
             autosave: true

  # If the file was scraped, this is the blob entry that represents it
  belongs_to :file, class_name: "::HttpLogEntry", optional: true, autosave: true

  has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :post
  has_many :faved_by,
           class_name: "::Domain::Fa::User",
           through: :fav_post_joins,
           source: :user

  has_one :disco,
          class_name: "::Domain::Fa::PostFactor",
          inverse_of: :post,
          foreign_key: :post_id,
          dependent: :destroy

  attr_json :title, :string
  attr_json :category, :string
  attr_json :theme, :string
  attr_json :species, :string
  attr_json :gender, :string
  attr_json :description, :string
  attr_json :keywords, :string, array: true, default: []
  attr_json :num_favorites, :integer
  attr_json :num_comments, :integer
  attr_json :num_views, :integer
  attr_json :posted_at, :datetime
  attr_json :scanned_at, :datetime
  attr_json :last_user_page_id, :integer
  attr_json :last_submission_page_id, :integer
  attr_json :first_browse_page_id, :integer
  attr_json :first_gallery_page_id, :integer
  attr_json :first_seen_entry_id, :integer
  attr_json :scan_file_error, :string

  belongs_to :last_user_page, class_name: "HttpLogEntry", optional: true
  belongs_to :last_submission_page, class_name: "HttpLogEntry", optional: true
  belongs_to :first_browse_page, class_name: "HttpLogEntry", optional: true
  belongs_to :first_gallery_page, class_name: "HttpLogEntry", optional: true
  belongs_to :first_seen_entry, class_name: "HttpLogEntry", optional: true
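
  # Note (an assumption, not confirmed by this file): the last_* / first_* ids
  # above are attr_json attributes rather than real foreign-key columns, so
  # AttrJsonRecordAliases presumably bridges them to the belongs_to
  # associations of the same names. Illustrative reads (values made up):
  #   post.keywords             # => ["dragon", "digital"]
  #   post.last_submission_page # => HttpLogEntry or nil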

  sig { params(id: Integer).returns(T.nilable(Domain::Fa::Post)) }
  def self.find_by_fa_id(id)
    where(fa_id: id).first
  end

  sig { params(id: Integer).returns(Domain::Fa::Post) }
  def self.find_by_fa_id!(id)
    where(fa_id: id).first!
  end
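
  # Illustrative usage (IDs made up):
  #   Domain::Fa::Post.find_by_fa_id(12345)  # => post or nil
  #   Domain::Fa::Post.find_by_fa_id!(12345) # raises ActiveRecord::RecordNotFound when absent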

  sig { returns(String) }
  def to_param
    self.fa_id.to_s
  end

  sig { returns(T.nilable(Addressable::URI)) }
  def file_uri
    Addressable::URI.parse(self.file_url_str) if self.file_url_str
  end

  sig { params(uri: T.nilable(T.any(String, Addressable::URI))).void }
  def file_uri=(uri)
    if uri
      uri = Addressable::URI.parse(uri)
      uri.scheme = "https" if uri.scheme.blank?
      self.file_url_str = uri.to_s
    else
      self.file_url_str = nil
    end
  end
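
  # The setter accepts protocol-relative URLs and fills in https for them.
  # Illustrative (hypothetical URL):
  #   post.file_uri = "//d.furaffinity.net/art/someone/123/file.png"
  #   post.file_url_str # => "https://d.furaffinity.net/art/someone/123/file.png"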

  # Repoints `file` at the HttpLogEntry that matches `file_url_str` when the
  # two have drifted apart, logging what (if anything) changed.
  sig { void }
  def fix_file_by_uri!
    parts = []
    parts << "[fa_id: #{self.fa_id}]"
    file_uri = self.file&.uri
    file_url_str = self.file_url_str
    if file_url_str.present? && file_uri.present? &&
         (file_uri.to_s != file_url_str)
      old_file = self.file
      new_file = HttpLogEntry.find_by_uri_host_path(file_url_str)
      if old_file == new_file
        parts << "[no change][invariant]"
      elsif new_file
        self.file = new_file
        parts << "[old: #{old_file&.id} / #{old_file&.uri}]"
        parts << "[new: #{self.file&.id} / #{self.file&.uri}]"
        self.save!
      else
        parts << "[new_file_not_found: #{file_url_str}]"
      end
    else
      parts << "[no change]"
      parts << "[file: #{self.file&.id} / #{self.file&.uri}]"
      parts << "[file_url_str: #{self.file_url_str}]"
    end
    logger.info parts.join(" ")
  end
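
  # Emits a single structured log line, along the lines of (values invented):
  #   [fa_id: 123] [old: 4 / https://example.net/a.png] [new: 9 / https://example.net/b.png]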

  sig { returns(T::Boolean) }
  def scanned?
    self.file_url_str.present?
  end

  sig { returns(T.nilable(ActiveSupport::TimeWithZone)) }
  def scanned_at
    # At some point `scanned_at` started being populated directly, to avoid a
    # lookup of the post's `last_submission_page` log entry; fall back to that
    # entry's created_at for posts scanned before then.
    super || last_submission_page&.created_at
  end

  sig { returns(T.nilable(ActiveSupport::TimeWithZone)) }
  def guess_posted_at
    pa = posted_at
    return pa if pa

    # Otherwise, re-parse the most recent cached submission page for a date.
    contents = guess_last_submission_page&.response_bytes
    if contents
      parser = Domain::Fa::Parser::Page.new(contents)
      parser.submission.posted_date if parser.probably_submission?
    end
  end
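
  # In short: when posted_at was never parsed, the newest cached /view/<fa_id>
  # page is re-parsed, and its date is trusted only if the page still looks
  # like a submission page.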

  sig { returns(T.nilable(HttpLogEntry)) }
  def guess_last_submission_page
    last_submission_page ||
      HttpLogEntry
        .where(
          uri_host: "www.furaffinity.net",
          uri_path: ["/view/#{self.fa_id}/", "/view/#{self.fa_id}"],
          uri_query: nil,
          status_code: 200,
        )
        .order(created_at: :desc)
        .first
  end
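
  # Both path spellings are queried because, presumably, logged URLs appear
  # with and without the trailing slash.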

  sig { returns(T.nilable(String)) }
  def description
    content = super
    return nil if content.blank?

    # Hack: descriptions arrive with a boilerplate prefix (an empty line, the
    # artist link, and a "<br><br>" separator); strip it when present.
    lines = content.lines.map(&:strip)
    if lines.length > 3 && lines[0] == "" &&
         lines[1]&.start_with?("<a href=") && lines[2] == "<br><br>"
      return (lines[3..] || []).join("\n")
    end
    content
  end
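
  # Illustrative before/after (hypothetical description HTML): the prefix lines
  #   "", "<a href=\"/user/someone\">someone</a>", "<br><br>"
  # are stripped, returning only the lines that follow them.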

  sig { returns(T::Boolean) }
  def have_file?
    self.file_id.present?
  end

  sig do
    params(
      submission: T.untyped,
      first_seen_log_entry: T.nilable(HttpLogEntry),
    ).returns(Domain::Fa::Post)
  end
  def self.find_or_initialize_by_submission_parser(
    submission,
    first_seen_log_entry: nil
  )
    creator =
      Domain::Fa::User.find_or_create_by!(
        { url_name: submission.artist_url_name },
      ) { |user| user.name = submission.artist }

    Domain::Fa::Post.find_or_initialize_by(fa_id: submission.id) do |post|
      post.creator = creator
      post.title = submission.title
      post.first_seen_entry = first_seen_log_entry
    end
  end
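
  # Illustrative usage (assuming `submission` is the parser's submission
  # object, responding to #id, #title, #artist, and #artist_url_name):
  #   parser = Domain::Fa::Parser::Page.new(html)
  #   post =
  #     Domain::Fa::Post.find_or_initialize_by_submission_parser(
  #       parser.submission,
  #       first_seen_log_entry: log_entry,
  #     )
  #   post.save!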
end