# redux-scraper/app/models/domain/fa/post.rb
# typed: strict
class Domain::Fa::Post < ReduxApplicationRecord
  self.table_name = "domain_fa_posts"

  include HasIndexedPost
  include Pundit::Authorization
  include AttrJsonRecordAliases

  enum :state,
       [
         :ok, # so far so good, post may not yet be scanned or have file downloaded
         :removed, # post has been removed
         :scan_error, # error scanning post page (see state_detail)
         :file_error, # error downloading post file (see state_detail)
       ]
  validates_inclusion_of(:state, in: self.states.keys)
  validates_presence_of(:fa_id, :state)
  after_initialize { self.state ||= :ok }
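
  # Rails enums generate predicate/bang helpers and scopes; illustrative usage:
  #   post.scan_error!    # sets and saves state = "scan_error"
  #   post.removed?       # => false
  #   Domain::Fa::Post.ok # scope over posts in the :ok state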

  belongs_to :creator,
             class_name: "::Domain::Fa::User",
             inverse_of: :posts,
             optional: true,
             autosave: true

  # If the file was scraped, this is the blob entry that represents it
  belongs_to :file, class_name: "::HttpLogEntry", optional: true, autosave: true

  has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :post
  has_many :faved_by,
           class_name: "::Domain::Fa::User",
           through: :fav_post_joins,
           source: :user

  has_one :disco,
          class_name: "::Domain::Fa::PostFactor",
          inverse_of: :post,
          foreign_key: :post_id,
          dependent: :destroy

  attr_json :title, :string
  attr_json :category, :string
  attr_json :theme, :string
  attr_json :species, :string
  attr_json :gender, :string
  attr_json :description, :string
  attr_json :keywords, :string, array: true, default: []
  attr_json :num_favorites, :integer
  attr_json :num_comments, :integer
  attr_json :num_views, :integer
  attr_json :posted_at, :datetime
  attr_json :scanned_at, :datetime
  attr_json :last_user_page_id, :integer
  attr_json :last_submission_page_id, :integer
  attr_json :first_browse_page_id, :integer
  attr_json :first_gallery_page_id, :integer
  attr_json :first_seen_entry_id, :integer
  attr_json :scan_file_error, :string

  belongs_to :last_user_page, class_name: "HttpLogEntry", optional: true
  belongs_to :last_submission_page, class_name: "HttpLogEntry", optional: true
  belongs_to :first_browse_page, class_name: "HttpLogEntry", optional: true
  belongs_to :first_gallery_page, class_name: "HttpLogEntry", optional: true
  belongs_to :first_seen_entry, class_name: "HttpLogEntry", optional: true
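
  # Note (an assumption, not confirmed by this file): the last_* / first_* ids
  # above are attr_json attributes rather than real foreign-key columns, so
  # AttrJsonRecordAliases presumably bridges them to the belongs_to
  # associations of the same names. Illustrative reads (values made up):
  #   post.keywords             # => ["dragon", "digital"]
  #   post.last_submission_page # => HttpLogEntry or nil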

  sig { params(id: Integer).returns(T.nilable(Domain::Fa::Post)) }
  def self.find_by_fa_id(id)
    where(fa_id: id).first
  end

  sig { params(id: Integer).returns(Domain::Fa::Post) }
  def self.find_by_fa_id!(id)
    where(fa_id: id).first!
  end
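
  # Illustrative usage (IDs made up):
  #   Domain::Fa::Post.find_by_fa_id(12345)  # => post or nil
  #   Domain::Fa::Post.find_by_fa_id!(12345) # raises ActiveRecord::RecordNotFound when absent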

  sig { returns(String) }
  def to_param
    self.fa_id.to_s
  end

  sig { returns(T.nilable(Addressable::URI)) }
  def file_uri
    Addressable::URI.parse(self.file_url_str) if self.file_url_str
  end

  sig { params(uri: T.nilable(T.any(String, Addressable::URI))).void }
  def file_uri=(uri)
    if uri
      uri = Addressable::URI.parse(uri)
      uri.scheme = "https" if uri.scheme.blank?
      self.file_url_str = uri.to_s
    else
      self.file_url_str = nil
    end
  end
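
  # The setter accepts protocol-relative URLs and fills in https for them.
  # Illustrative (hypothetical URL):
  #   post.file_uri = "//d.furaffinity.net/art/someone/123/file.png"
  #   post.file_url_str # => "https://d.furaffinity.net/art/someone/123/file.png"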

  # Repoints `file` at the HttpLogEntry that matches `file_url_str` when the
  # two have drifted apart, logging what (if anything) changed.
  sig { void }
  def fix_file_by_uri!
    parts = []
    parts << "[fa_id: #{self.fa_id}]"
    file_uri = self.file&.uri
    file_url_str = self.file_url_str
    if file_url_str.present? && file_uri.present? &&
         (file_uri.to_s != file_url_str)
      old_file = self.file
      new_file = HttpLogEntry.find_by_uri_host_path(file_url_str)
      if old_file == new_file
        parts << "[no change][invariant]"
      elsif new_file
        self.file = new_file
        parts << "[old: #{old_file&.id} / #{old_file&.uri}]"
        parts << "[new: #{self.file&.id} / #{self.file&.uri}]"
        self.save!
      else
        parts << "[new_file_not_found: #{file_url_str}]"
      end
    else
      parts << "[no change]"
      parts << "[file: #{self.file&.id} / #{self.file&.uri}]"
      parts << "[file_url_str: #{self.file_url_str}]"
    end
    logger.info parts.join(" ")
  end
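
  # Emits a single structured log line, along the lines of (values invented):
  #   [fa_id: 123] [old: 4 / https://example.net/a.png] [new: 9 / https://example.net/b.png]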

  sig { returns(T::Boolean) }
  def scanned?
    self.file_url_str.present?
  end

  sig { returns(T.nilable(ActiveSupport::TimeWithZone)) }
  def scanned_at
    # At some point `scanned_at` started being populated directly, to avoid a
    # lookup of the post's `last_submission_page` log entry; fall back to that
    # entry's created_at for posts scanned before then.
    super || last_submission_page&.created_at
  end

  sig { returns(T.nilable(ActiveSupport::TimeWithZone)) }
  def guess_posted_at
    pa = posted_at
    return pa if pa

    # Otherwise, re-parse the most recent cached submission page for a date.
    contents = guess_last_submission_page&.response_bytes
    if contents
      parser = Domain::Fa::Parser::Page.new(contents)
      parser.submission.posted_date if parser.probably_submission?
    end
  end
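
  # In short: when posted_at was never parsed, the newest cached /view/<fa_id>
  # page is re-parsed, and its date is trusted only if the page still looks
  # like a submission page.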

  sig { returns(T.nilable(HttpLogEntry)) }
  def guess_last_submission_page
    last_submission_page ||
      HttpLogEntry
        .where(
          uri_host: "www.furaffinity.net",
          uri_path: ["/view/#{self.fa_id}/", "/view/#{self.fa_id}"],
          uri_query: nil,
          status_code: 200,
        )
        .order(created_at: :desc)
        .first
  end
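
  # Both path spellings are queried because, presumably, logged URLs appear
  # with and without the trailing slash.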

  sig { returns(T.nilable(String)) }
  def description
    content = super
    return nil if content.blank?

    # Hack: descriptions arrive with a boilerplate prefix (an empty line, the
    # artist link, and a "<br><br>" separator); strip it when present.
    lines = content.lines.map(&:strip)
    if lines.length > 3 && lines[0] == "" &&
         lines[1]&.start_with?("<a href=") && lines[2] == "<br><br>"
      return (lines[3..] || []).join("\n")
    end
    content
  end
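
  # Illustrative before/after (hypothetical description HTML): the prefix lines
  #   "", "<a href=\"/user/someone\">someone</a>", "<br><br>"
  # are stripped, returning only the lines that follow them.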

  sig { returns(T::Boolean) }
  def have_file?
    self.file_id.present?
  end

  sig do
    params(
      submission: T.untyped,
      first_seen_log_entry: T.nilable(HttpLogEntry),
    ).returns(Domain::Fa::Post)
  end
  def self.find_or_initialize_by_submission_parser(
    submission,
    first_seen_log_entry: nil
  )
    creator =
      Domain::Fa::User.find_or_create_by!(
        { url_name: submission.artist_url_name },
      ) { |user| user.name = submission.artist }

    Domain::Fa::Post.find_or_initialize_by(fa_id: submission.id) do |post|
      post.creator = creator
      post.title = submission.title
      post.first_seen_entry = first_seen_log_entry
    end
  end
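
  # Illustrative usage (assuming `submission` is the parser's submission
  # object, responding to #id, #title, #artist, and #artist_url_name):
  #   parser = Domain::Fa::Parser::Page.new(html)
  #   post =
  #     Domain::Fa::Post.find_or_initialize_by_submission_parser(
  #       parser.submission,
  #       first_seen_log_entry: log_entry,
  #     )
  #   post.save!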
end