# redux-scraper/app/jobs/domain/inkbunny/job/update_posts_job.rb
# typed: strict
class Domain::Inkbunny::Job::UpdatePostsJob < Domain::Inkbunny::Job::Base
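  # Deep-updates a batch of Inkbunny posts via the api_submissions endpoint:
  # refreshes each post's metadata, pools, files, and creator avatar, then
  # fans out follow-up jobs for unseen pool neighbors and stale pools.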
  sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
  def perform(args)
    ib_post_ids = T.cast(args[:ib_post_ids], T.nilable(T::Array[Integer]))
    missing_pool_post_ib_ids = T::Set[Integer].new
    pools_to_update = T::Set[Domain::PostGroup::InkbunnyPool].new
    if ib_post_ids.blank?
      logger.error("empty ib_post_ids")
      return
    end
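    # api_submissions takes a comma-separated id list, so work through the
    # batch 100 ids at a time to keep each request to a manageable size.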
    ib_post_ids.each_slice(100) do |ib_post_ids_chunk|
      process_ib_post_ids(
        ib_post_ids_chunk,
        missing_pool_post_ib_ids,
        pools_to_update,
      )
    end
    # Enqueue update jobs for missing posts
    unless missing_pool_post_ib_ids.empty?
      missing_pool_post_ib_ids.each_slice(1000) do |ib_post_ids_chunk|
        logger.info "enqueuing update jobs for missing posts: #{ib_post_ids_chunk.join(", ")}"
        defer_job(
          Domain::Inkbunny::Job::UpdatePostsJob,
          { ib_post_ids: ib_post_ids_chunk },
        )
      end
    end
    # Enqueue update jobs for pools
    pools_to_update.each do |pool|
      defer_job(Domain::Inkbunny::Job::UpdatePoolJob, { pool: pool })
    end
  end
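  # Builds the api_submissions.php URL for a chunk of ids, requesting
  # descriptions, writing bodies, and pool membership in a single call.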
  sig { params(ib_post_ids_chunk: T::Array[Integer]).returns(String) }
  def build_api_submissions_url(ib_post_ids_chunk)
    ib_post_ids_list = ib_post_ids_chunk.join(",")
    "https://inkbunny.net/api_submissions.php?" \
      "submission_ids=#{ib_post_ids_list}" \
      "&show_description=yes&show_writing=yes&show_pools=yes"
  end
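  # Fetches one chunk of submissions and deep-updates each post, wrapping
  # every submission in its own transaction so one bad record does not
  # roll back the rest of the chunk.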
  sig do
    params(
      ib_post_ids_chunk: T::Array[Integer],
      missing_pool_post_ib_ids: T::Set[Integer],
      pools_to_update: T::Set[Domain::PostGroup::InkbunnyPool],
    ).void
  end
  def process_ib_post_ids(
    ib_post_ids_chunk,
    missing_pool_post_ib_ids,
    pools_to_update
  )
    url = build_api_submissions_url(ib_post_ids_chunk)
    response = http_client.get(url)
    logger.tagged(make_arg_tag(response.log_entry, name: "submissions_hle")) do
      if response.status_code != 200
        fatal_error("api_submissions failed: #{response.status_code}")
      end
      api_submissions_json = JSON.parse(response.body)
      submission_jsons = api_submissions_json["submissions"]
      logger.info("api_submissions page has #{submission_jsons.size} posts")
      submission_jsons.each do |submission_json|
        Domain::Post::InkbunnyPost.transaction do
          deep_update_post_from_submission_json(
            submission_json,
            response.log_entry,
            missing_pool_post_ib_ids,
            pools_to_update,
          )
        end
      end
    end
  end
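  # Applies the full submission payload to a single post: core metadata,
  # pool membership, the creator's avatar, and the files[] array. Posts we
  # have never seen are first created via the search-page processor so both
  # code paths share the same upsert logic.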
  sig do
    params(
      submission_json: T::Hash[String, T.untyped],
      log_entry: HttpLogEntry,
      missing_pool_post_ib_ids: T::Set[Integer],
      pools_to_update: T::Set[Domain::PostGroup::InkbunnyPool],
    ).void
  end
  def deep_update_post_from_submission_json(
    submission_json,
    log_entry,
    missing_pool_post_ib_ids,
    pools_to_update
  )
    ib_post_id = submission_json["submission_id"]
    post =
      Domain::Post::InkbunnyPost.includes(:pools).find_by(ib_id: ib_post_id)
    if post.blank?
      processor = Domain::Inkbunny::Job::ApiSearchPageProcessor.new
      processor.upsert_post_from_submission_json!(
        submission_json,
        caused_by_entry: log_entry,
      )
      post = T.must(Domain::Post::InkbunnyPost.find_by(ib_id: ib_post_id))
    end
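    # Scalar fields come straight from the payload; the counter fields are
    # normalized with &.to_i since the API may return them as strings.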
    logger.tagged(make_arg_tag(post)) do
      logger.info "begin deep update post"
      post.deep_updated_at = Time.zone.now
      post.description = submission_json["description"]
      post.writing = submission_json["writing"]
      post.rating = submission_json["rating"]
      post.submission_type = submission_json["submission_type"]
      post.num_views = submission_json["views"]
      post.num_files = submission_json["pagecount"]
      post.num_favs = submission_json["favorites_count"]&.to_i
      post.num_comments = submission_json["comments_count"]&.to_i
      if (last_updated = submission_json["last_file_update_datetime"])
        post.last_file_updated_at = Time.zone.parse(last_updated)
      end
      post.keywords = submission_json["keywords"]
      post.deep_update_log_entry = log_entry
      if (pools_json = submission_json["pools"])
        update_submission_pools(
          post,
          pools_json,
          missing_pool_post_ib_ids,
          pools_to_update,
        )
      end
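      # Refresh the creator's avatar when the payload carries an icon URL
      # that differs from the one we already have on file.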
if submission_json["user_icon_url_large"]
user = T.must(post.creator)
user.save!
logger.tagged(make_arg_tag(user)) do
avatar = user.avatar
avatar_url_str = submission_json["user_icon_url_large"]
if !avatar || avatar.url_str != avatar_url_str
avatar = user.avatars.build
avatar.url_str = avatar_url_str
user.deep_update_log_entry = log_entry
logger.info "avatar url changed, enqueuing avatar download"
avatar.enqueue_job_after_save(
Domain::Inkbunny::Job::UserAvatarJob,
{ avatar: avatar, caused_by_entry: log_entry },
)
end
user.save!
end
end
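      # Reconcile the payload's files[] against our stored files, keyed by
      # Inkbunny file id.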
      post_files_by_ib_id =
        T.cast(
          post.files.index_by(&:ib_id),
          T::Hash[Integer, Domain::PostFile::InkbunnyPostFile],
        )
      file_jsons = submission_json["files"] || fatal_error("no files[] array")
      post.save!
      file_jsons.each_with_index do |file_json, index|
        ib_file_id = file_json["file_id"]&.to_i
        logger.tagged(
          make_tag("ib_file_id", ib_file_id),
          make_tag("file_idx", index),
        ) do
          if ib_file_id.blank?
            logger.error("file has no ib_id, skipping")
            next
          end
          md5_initial = file_json["initial_file_md5"]
          logger.error("file has no md5") if md5_initial.blank?
          file = post_files_by_ib_id[ib_file_id]
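          # Four cases for an existing file: md5 changed (fatal), already
          # complete (skip), md5 arrived late (promote to ok and re-enqueue
          # the download), or anything else (invariant violation).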
          if file.present? && file.md5_initial.present? &&
               (file.md5_initial != md5_initial)
            fatal_error(
              format_tags(
                make_tag("old md5", file.md5_initial),
                make_tag("new md5", md5_initial),
                "md5_initial changed",
              ),
            )
          elsif file.present? && file.md5_initial.present?
            logger.info(format_tags("file already exists, skipping"))
            next
          elsif file.present? && file.md5_initial.blank? && md5_initial.present?
            logger.warn(
              format_tags(
                make_tag("md5", md5_initial),
                "file got its first (delayed) md5_initial",
              ),
            )
            file.md5_initial = md5_initial
            file.state_ok!
          elsif file.present?
            fatal_error("invariant")
          end
          # We create all files, even those with null MD5 sums (which also do
          # not have a valid download URL), so that post.files.count will be
          # accurate and match pagecount.
          file ||=
            post.files.create do |f|
              md5_initial.present? ? f.state_ok! : f.state_terminal_error!
              f.ib_id = ib_file_id
              f.ib_created_at = Time.zone.parse(file_json["create_datetime"])
              f.file_order = file_json["submission_file_order"]&.to_i
              f.ib_detail_raw = file_json
              f.file_name = file_json["file_name"]
              f.url_str = file_json["file_url_full"]
              f.md5_initial = md5_initial
              f.md5_full = file_json["full_file_md5"]
              f.md5s = {
                initial_file_md5: md5_initial,
                full_file_md5: file_json["full_file_md5"],
                large_file_md5: file_json["large_file_md5"],
                small_file_md5: file_json["small_file_md5"],
                thumbnail_md5: file_json["thumbnail_md5"],
              }
            end
          if file.state_terminal_error?
            logger.error(
              format_tags("file is in terminal error state, skipping enqueue"),
            )
            next
          end
          if file.invalid?
            fatal_error "file is invalid: #{format_tags_arr(file.errors.full_messages)}"
          end
          logger.info format_tags("file ready, enqueuing download")
          defer_job(
            Domain::Inkbunny::Job::StaticFileJob,
            { file: file },
            { priority: 1 },
          )
        end
      end
    end
  end
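  # Syncs a post's pool memberships. Each pools[] entry names the
  # neighboring submissions in the pool, so placeholder posts are created
  # for neighbors we have not scraped yet; their ids and any pools still
  # lacking a deep update are collected into the caller's sets so perform
  # can enqueue the follow-up jobs.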
  sig do
    params(
      post: Domain::Post::InkbunnyPost,
      pools_json: T::Array[T::Hash[String, T.untyped]],
      missing_pool_post_ib_ids: T::Set[Integer],
      pools_to_update: T::Set[Domain::PostGroup::InkbunnyPool],
    ).void
  end
  def update_submission_pools(
    post,
    pools_json,
    missing_pool_post_ib_ids,
    pools_to_update
  )
    pools_json.each do |pool_json|
      left_post, right_post =
        %w[
          submission_left_submission_id
          submission_right_submission_id
        ].map do |key|
          ib_id = pool_json[key]&.to_i
          next nil if ib_id.blank?
          neighbor =
            Domain::Post::InkbunnyPost.find_or_initialize_by(
              ib_id: ib_id,
            ) { |p| p.creator = post.creator }
          if neighbor.new_record?
            missing_pool_post_ib_ids.add(ib_id)
            neighbor.save!
          end
          neighbor
        end
      pool =
        Domain::PostGroup::InkbunnyPool.find_or_initialize_by(
          ib_id: pool_json["pool_id"],
        ) { |new_pool| new_pool.owner = post.creator }
      pools_to_update.add(pool) if pool.deep_update_log_entry_id.blank?
      pool.count = pool_json["count"]&.to_i
      pool.name = pool_json["name"]
      pool.description = pool_json["description"]
      pool.save!
      pool_join = post.post_group_joins.find_or_initialize_by(group: pool)
      pool_join.left_post = left_post
      pool_join.right_post = right_post
      pool_join.save!
    end
  end
end