# redux-scraper/app/jobs/domain/inkbunny/job/update_posts_job.rb
# typed: strict
class Domain::Inkbunny::Job::UpdatePostsJob < Domain::Inkbunny::Job::Base
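  # Deep-updates a batch of Inkbunny posts via the api_submissions endpoint:
  # refreshes each post's metadata, pools, files, and creator avatar, then
  # fans out follow-up jobs for unseen pool neighbors and stale pools.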
  sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
  def perform(args)
    ib_post_ids = T.cast(args[:ib_post_ids], T.nilable(T::Array[Integer]))
    missing_pool_post_ib_ids = T::Set[Integer].new
    pools_to_update = T::Set[Domain::PostGroup::InkbunnyPool].new
    if ib_post_ids.blank?
      logger.error("empty ib_post_ids")
      return
    end
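    # api_submissions takes a comma-separated id list, so work through the
    # batch 100 ids at a time to keep each request to a manageable size.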
    ib_post_ids.each_slice(100) do |ib_post_ids_chunk|
      process_ib_post_ids(
        ib_post_ids_chunk,
        missing_pool_post_ib_ids,
        pools_to_update,
      )
    end
    # Enqueue update jobs for missing posts
    unless missing_pool_post_ib_ids.empty?
      missing_pool_post_ib_ids.each_slice(1000) do |ib_post_ids_chunk|
        logger.info "enqueuing update jobs for missing posts: #{ib_post_ids_chunk.join(", ")}"
        defer_job(
          Domain::Inkbunny::Job::UpdatePostsJob,
          { ib_post_ids: ib_post_ids_chunk },
        )
      end
    end
    # Enqueue update jobs for pools
    pools_to_update.each do |pool|
      defer_job(Domain::Inkbunny::Job::UpdatePoolJob, { pool: pool })
    end
  end
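  # Builds the api_submissions.php URL for a chunk of ids, requesting
  # descriptions, writing bodies, and pool membership in a single call.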
  sig { params(ib_post_ids_chunk: T::Array[Integer]).returns(String) }
  def build_api_submissions_url(ib_post_ids_chunk)
    ib_post_ids_list = ib_post_ids_chunk.join(",")
    "https://inkbunny.net/api_submissions.php?" \
      "submission_ids=#{ib_post_ids_list}" \
      "&show_description=yes&show_writing=yes&show_pools=yes"
  end
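  # Fetches one chunk of submissions and deep-updates each post, wrapping
  # every submission in its own transaction so one bad record does not
  # roll back the rest of the chunk.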
  sig do
    params(
      ib_post_ids_chunk: T::Array[Integer],
      missing_pool_post_ib_ids: T::Set[Integer],
      pools_to_update: T::Set[Domain::PostGroup::InkbunnyPool],
    ).void
  end
  def process_ib_post_ids(
    ib_post_ids_chunk,
    missing_pool_post_ib_ids,
    pools_to_update
  )
    url = build_api_submissions_url(ib_post_ids_chunk)
    response = http_client.get(url)
    logger.tagged(make_arg_tag(response.log_entry, name: "submissions_hle")) do
      if response.status_code != 200
        fatal_error("api_submissions failed: #{response.status_code}")
      end
      api_submissions_json = JSON.parse(response.body)
      submission_jsons = api_submissions_json["submissions"]
      logger.info("api_submissions page has #{submission_jsons.size} posts")
      submission_jsons.each do |submission_json|
        Domain::Post::InkbunnyPost.transaction do
          deep_update_post_from_submission_json(
            submission_json,
            response.log_entry,
            missing_pool_post_ib_ids,
            pools_to_update,
          )
        end
      end
    end
  end
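  # Applies the full submission payload to a single post: core metadata,
  # pool membership, the creator's avatar, and the files[] array. Posts we
  # have never seen are first created via the search-page processor so both
  # code paths share the same upsert logic.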
  sig do
    params(
      submission_json: T::Hash[String, T.untyped],
      log_entry: HttpLogEntry,
      missing_pool_post_ib_ids: T::Set[Integer],
      pools_to_update: T::Set[Domain::PostGroup::InkbunnyPool],
    ).void
  end
  def deep_update_post_from_submission_json(
    submission_json,
    log_entry,
    missing_pool_post_ib_ids,
    pools_to_update
  )
    ib_post_id = submission_json["submission_id"]
    post =
      Domain::Post::InkbunnyPost.includes(:pools).find_by(ib_id: ib_post_id)
    if post.blank?
      processor = Domain::Inkbunny::Job::ApiSearchPageProcessor.new
      processor.upsert_post_from_submission_json!(
        submission_json,
        caused_by_entry: log_entry,
      )
      post = T.must(Domain::Post::InkbunnyPost.find_by(ib_id: ib_post_id))
    end
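    # Scalar fields come straight from the payload; the counter fields are
    # normalized with &.to_i since the API may return them as strings.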
    logger.tagged(make_arg_tag(post)) do
      logger.info "begin deep update post"
      post.deep_updated_at = Time.zone.now
      post.description = submission_json["description"]
      post.writing = submission_json["writing"]
      post.rating = submission_json["rating"]
      post.submission_type = submission_json["submission_type"]
      post.num_views = submission_json["views"]
      post.num_files = submission_json["pagecount"]
      post.num_favs = submission_json["favorites_count"]&.to_i
      post.num_comments = submission_json["comments_count"]&.to_i
      if (last_updated = submission_json["last_file_update_datetime"])
        post.last_file_updated_at = Time.zone.parse(last_updated)
      end
      post.keywords = submission_json["keywords"]
      post.deep_update_log_entry = log_entry
      if (pools_json = submission_json["pools"])
        update_submission_pools(
          post,
          pools_json,
          missing_pool_post_ib_ids,
          pools_to_update,
        )
      end
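      # Refresh the creator's avatar when the payload carries an icon URL
      # that differs from the one we already have on file.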
if submission_json["user_icon_url_large"]
user = T.must(post.creator)
user.save!
logger.tagged(make_arg_tag(user)) do
avatar = user.avatar
avatar_url_str = submission_json["user_icon_url_large"]
if !avatar || avatar.url_str != avatar_url_str
avatar = user.avatars.build
avatar.url_str = avatar_url_str
user.deep_update_log_entry = log_entry
logger.info "avatar url changed, enqueuing avatar download"
avatar.enqueue_job_after_save(
Domain::Inkbunny::Job::UserAvatarJob,
{ avatar: avatar, caused_by_entry: log_entry },
)
end
user.save!
end
end
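      # Reconcile the payload's files[] against our stored files, keyed by
      # Inkbunny file id.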
      post_files_by_ib_id =
        T.cast(
          post.files.index_by(&:ib_id),
          T::Hash[Integer, Domain::PostFile::InkbunnyPostFile],
        )
      file_jsons = submission_json["files"] || fatal_error("no files[] array")
      post.save!
      file_jsons.each_with_index do |file_json, index|
        ib_file_id = file_json["file_id"]&.to_i
        logger.tagged(
          make_tag("ib_file_id", ib_file_id),
          make_tag("file_idx", index),
        ) do
          if ib_file_id.blank?
            logger.error("file has no ib_id, skipping")
            next
          end
          md5_initial = file_json["initial_file_md5"]
          logger.error("file has no md5") if md5_initial.blank?
          file = post_files_by_ib_id[ib_file_id]
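          # Four cases for an existing file: md5 changed (fatal), already
          # complete (skip), md5 arrived late (promote to ok and re-enqueue
          # the download), or anything else (invariant violation).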
          if file.present? && file.md5_initial.present? &&
               (file.md5_initial != md5_initial)
            fatal_error(
              format_tags(
                make_tag("old md5", file.md5_initial),
                make_tag("new md5", md5_initial),
                "md5_initial changed",
              ),
            )
          elsif file.present? && file.md5_initial.present?
            logger.info(format_tags("file already exists, skipping"))
            next
          elsif file.present? && file.md5_initial.blank? && md5_initial.present?
            logger.warn(
              format_tags(
                make_tag("md5", md5_initial),
                "file got its first (delayed) md5_initial",
              ),
            )
            file.md5_initial = md5_initial
            file.state_ok!
          elsif file.present?
            fatal_error("invariant")
          end
          # We create all files, even those with null MD5 sums (which also do
          # not have a valid download URL), so that post.files.count will be
          # accurate and match pagecount.
          file ||=
            post.files.create do |f|
              md5_initial.present? ? f.state_ok! : f.state_terminal_error!
              f.ib_id = ib_file_id
              f.ib_created_at = Time.zone.parse(file_json["create_datetime"])
              f.file_order = file_json["submission_file_order"]&.to_i
              f.ib_detail_raw = file_json
              f.file_name = file_json["file_name"]
              f.url_str = file_json["file_url_full"]
              f.md5_initial = md5_initial
              f.md5_full = file_json["full_file_md5"]
              f.md5s = {
                initial_file_md5: md5_initial,
                full_file_md5: file_json["full_file_md5"],
                large_file_md5: file_json["large_file_md5"],
                small_file_md5: file_json["small_file_md5"],
                thumbnail_md5: file_json["thumbnail_md5"],
              }
            end
          if file.state_terminal_error?
            logger.error(
              format_tags("file is in terminal error state, skipping enqueue"),
            )
            next
          end
          if file.invalid?
            fatal_error "file is invalid: #{format_tags_arr(file.errors.full_messages)}"
          end
          logger.info format_tags("file ready, enqueuing download")
          defer_job(
            Domain::Inkbunny::Job::StaticFileJob,
            { file: file },
            { priority: 1 },
          )
        end
      end
    end
  end
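  # Syncs a post's pool memberships. Each pools[] entry names the
  # neighboring submissions in the pool, so placeholder posts are created
  # for neighbors we have not scraped yet; their ids and any pools still
  # lacking a deep update are collected into the caller's sets so perform
  # can enqueue the follow-up jobs.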
  sig do
    params(
      post: Domain::Post::InkbunnyPost,
      pools_json: T::Array[T::Hash[String, T.untyped]],
      missing_pool_post_ib_ids: T::Set[Integer],
      pools_to_update: T::Set[Domain::PostGroup::InkbunnyPool],
    ).void
  end
  def update_submission_pools(
    post,
    pools_json,
    missing_pool_post_ib_ids,
    pools_to_update
  )
    pools_json.each do |pool_json|
      left_post, right_post =
        %w[
          submission_left_submission_id
          submission_right_submission_id
        ].map do |key|
          ib_id = pool_json[key]&.to_i
          next nil if ib_id.blank?
          neighbor =
            Domain::Post::InkbunnyPost.find_or_initialize_by(
              ib_id: ib_id,
            ) { |p| p.creator = post.creator }
          if neighbor.new_record?
            missing_pool_post_ib_ids.add(ib_id)
            neighbor.save!
          end
          neighbor
        end
      pool =
        Domain::PostGroup::InkbunnyPool.find_or_initialize_by(
          ib_id: pool_json["pool_id"],
        ) { |new_pool| new_pool.owner = post.creator }
      pools_to_update.add(pool) if pool.deep_update_log_entry_id.blank?
      pool.count = pool_json["count"]&.to_i
      pool.name = pool_json["name"]
      pool.description = pool_json["description"]
      pool.save!
      pool_join = post.post_group_joins.find_or_initialize_by(group: pool)
      pool_join.left_post = left_post
      pool_join.right_post = right_post
      pool_join.save!
    end
  end
end