Files
redux-scraper/app/jobs/domain/sofurry/job/scan_gallery_job.rb
2025-07-01 21:02:23 +00:00

265 lines
7.5 KiB
Ruby

# typed: strict
class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
  # Crawls a SoFurry user's gallery: discovers folders for each media type
  # from the HTML browse pages, pages through the JSON browse API for
  # submissions, upserts local Post/folder records and folder-join rows,
  # then enqueues per-post and per-user follow-up scan jobs.
  self.default_priority = -20

  Folder = Domain::Sofurry::GalleryPageParser::Folder
  ShortGalleryEntry = Domain::Sofurry::GalleryPageParser::ShortGalleryEntry

  # Folders and posts discovered on a single HTML gallery browse page.
  class GallerySummary < T::Struct
    const :folders, T::Set[Folder]
    const :posts, T::Set[ShortGalleryEntry]
  end

  # One submission as reported by the JSON browse API.
  # NOTE: equality (ActsAsComparable) includes gallery_log_entry, which is a
  # fresh HttpLogEntry per HTTP response — two fetches of the same post are
  # NOT equal, so Set-based dedup must rely on not re-fetching pages.
  class GalleryEntry < T::Struct
    include T::Struct::ActsAsComparable
    const :sofurry_id, Integer
    const :title, String
    const :content_level, Integer
    const :media_type, Domain::Sofurry::GalleryPageParser::MediaType
    const :tags, T::Array[String]
    const :description, T.nilable(String)
    const :posted_at, Time
    const :gallery_log_entry, T.nilable(HttpLogEntry)
  end

  sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
  def perform(args)
    user = user_from_args!
    logger.push_tags(make_arg_tag(user))
    logger.info(
      format_tags(make_arg_tag(user.gallery_scan, name: "gallery_scan")),
    )
    scan_gallery(user) if force_scan? || user.gallery_scan.due?
  ensure
    # Persist whatever state was mutated even if the scan aborts early.
    user.save! if user
  end

  private

  sig { params(user: Domain::User::SofurryUser).returns(T.untyped) }
  def scan_gallery(user)
    user_id = T.must(user.sofurry_id)
    posts = T.let(Set.new, T::Set[GalleryEntry])
    media_types = Domain::Sofurry::GalleryPageParser::MediaType.values
    media_types_with_posts =
      T.let(Set.new, T::Set[Domain::Sofurry::GalleryPageParser::MediaType])

    media_types.each do |media_type|
      gallery_summary = gather_gallery_summary(user, media_type)
      gallery_posts = gallery_summary.posts

      # Index this media type's already-known folders by their SoFurry id.
      existing_folders =
        user
          .folders
          .filter { |folder| folder.media_type == media_type.serialize }
          .filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] }
          .to_h
      gallery_summary.folders.each do |folder|
        if (existing = existing_folders[folder.id])
          existing.name = folder.name
        else
          user.folders.build(
            {
              sofurry_id: folder.id,
              name: folder.name,
              media_type: media_type.serialize,
            },
          )
        end
      end

      # BUGFIX: this check previously lived inside the folders loop, so a
      # media type with posts but no folders was never queued for the API
      # scan below, and the "skip" line was logged once per folder.
      if gallery_posts.any?
        media_types_with_posts << media_type
      else
        logger.info(
          format_tags(make_tag("skip media type", media_type.serialize)),
        )
      end
    end

    existing_posts =
      user
        .posts
        .filter_map { |post| (sfid = post.sofurry_id) && [sfid, post] }
        .to_h

    # Scan each due folder individually so we can rebuild its join rows.
    posts_by_folder =
      T.let(
        {},
        T::Hash[Domain::PostGroup::SofurryFolder, T::Set[GalleryEntry]],
      )
    user.folders.each do |folder|
      next unless folder.folder_scan.due?
      folder_posts =
        scan_gallery_posts(
          user_id,
          T.must(folder.media_type_enum),
          existing_posts.keys.to_set,
          folder_id: folder.sofurry_id,
        )
      posts_by_folder[folder] = folder_posts
      posts.merge(folder_posts)
      folder.scanned_folder_at = Time.now
    end

    logger.info(
      format_tags(
        make_tag(
          "media types with posts",
          media_types_with_posts.map(&:serialize),
        ),
      ),
    )
    media_types_with_posts.each do |media_type|
      posts.merge(
        scan_gallery_posts(user_id, media_type, existing_posts.keys.to_set),
      )
    end

    # Fold every scraped entry into its (possibly new) Post model.
    posts.each do |post|
      post_model =
        existing_posts[post.sofurry_id] ||
          user.posts.build({ sofurry_id: post.sofurry_id })
      post_model.title = post.title
      post_model.tags_array = post.tags
      post_model.content_level = post.content_level
      post_model.description = post.description
      post_model.posted_at = post.posted_at.in_time_zone("UTC")
      post_model.media_type = post.media_type.serialize
      post_model.last_gallery_log_entry = post.gallery_log_entry
    end

    ReduxApplicationRecord.transaction do
      user.scanned_gallery_at = Time.now
      user.save!
      posts_by_sofurry_id =
        user.posts.map { |post| [post.sofurry_id, post] }.to_h
      folder_join_rows =
        posts_by_folder.flat_map do |folder, folder_posts|
          folder_posts.map do |post|
            {
              type: "Domain::PostGroupJoin::SofurryFolderJoin",
              group_id: T.must(folder.id),
              post_id: T.must(posts_by_sofurry_id[post.sofurry_id]&.id),
            }
          end
        end
      # BUGFIX: upsert_all raises ArgumentError on an empty attribute list
      # (which happens whenever no folder scan was due).
      if folder_join_rows.any?
        Domain::PostGroupJoin::SofurryFolderJoin.upsert_all(
          folder_join_rows,
          unique_by: %i[group_id post_id],
        )
      end
    end

    bulk_enqueue_jobs do
      user.posts.each do |post|
        if post.state_pending?
          defer_job(Domain::Sofurry::Job::ScanPostJob, { post: })
        end
      end
      if user.page_scan.due?
        defer_job(Domain::Sofurry::Job::ScanUserJob, { user: })
      end
    end
  end

  # Fetches the HTML browse page for one media type and parses out the
  # user's folders and front-page posts. Also backfills user.username when
  # it is not yet known. Raises via fatal_error on a non-200 response.
  sig do
    params(
      user: Domain::User::SofurryUser,
      media_type: Domain::Sofurry::GalleryPageParser::MediaType,
    ).returns(GallerySummary)
  end
  def gather_gallery_summary(user, media_type)
    user_sofurry_id = T.must(user.sofurry_id)
    response =
      http_client.get(
        "https://www.sofurry.com/browse/user/#{media_type.serialize}" +
          "?uid=#{user_sofurry_id}&#{media_type.serialize}-display=45",
      )
    if response.status_code != 200
      fatal_error("failed to get gallery: #{response.status_code}")
    end
    parser =
      Domain::Sofurry::GalleryPageParser.new(
        response.body,
        user_id: user_sofurry_id,
      )
    user.username ||= parser.username
    GallerySummary.new(
      folders: parser.folders.to_set,
      posts: parser.posts.to_set,
    )
  end

  # Pages through the JSON browse API (whole gallery, or one folder when
  # folder_id is given) and returns every entry seen. Stops early once a
  # page contains only ids already present in existing_post_ids, or a short
  # (< 30 item) page indicates the final page.
  sig do
    params(
      user_id: Integer,
      media_type: Domain::Sofurry::GalleryPageParser::MediaType,
      existing_post_ids: T::Set[Integer],
      folder_id: T.nilable(Integer),
    ).returns(T::Set[GalleryEntry])
  end
  def scan_gallery_posts(user_id, media_type, existing_post_ids, folder_id: nil)
    posts = T.let(Set.new, T::Set[GalleryEntry])
    seen_ids = T.let(Set.new, T::Set[Integer])
    page = 1
    loop do
      url =
        if folder_id
          # BUGFIX: the folder endpoint previously omitted the page
          # parameter, so every iteration re-fetched page 1 and a folder
          # with a full first page of unseen posts paginated forever.
          "https://api2.sofurry.com/browse/folder/#{media_type.serialize}" +
            "?by=#{user_id}&folder=#{folder_id}" +
            "&#{media_type.serialize}-page=#{page}&format=json"
        else
          "https://api2.sofurry.com/browse/user/#{media_type.serialize}" +
            "?uid=#{user_id}&#{media_type.serialize}-page=#{page}&format=json"
        end
      response = http_client.get(url)
      if response.status_code != 200
        fatal_error("failed to get gallery: #{response.status_code}")
      end
      posts_json =
        T.cast(
          JSON.parse(response.body)["items"],
          T::Array[T::Hash[String, T.untyped]],
        )
      # to_i never returns nil, so no compact step is needed here.
      post_ids =
        posts_json.map { |post_json| post_json["id"].to_i }.to_set
      # Defensive stop: if the endpoint repeats a page (nothing we have not
      # already seen this scan), bail out rather than loop forever.
      break if page > 1 && post_ids.subset?(seen_ids)
      seen_ids.merge(post_ids)
      posts.merge(
        posts_json.map do |post_json|
          GalleryEntry.new(
            # Unconditional to_i: sofurry_id is a non-nilable Integer, so a
            # nil here must not leak through (&.to_i previously could).
            sofurry_id: post_json["id"].to_i,
            title: post_json["title"].to_s,
            # to_s guards against a missing "tags" key crashing the scan.
            tags: post_json["tags"].to_s.split(", ").map(&:strip),
            description: post_json["description"],
            content_level: post_json["contentLevel"].to_i,
            posted_at: Time.at(post_json["postTime"].to_i),
            media_type: media_type,
            gallery_log_entry: response.log_entry,
          )
        end,
      )
      # All ids on this page already known locally => nothing older is new.
      break if post_ids.subset?(existing_post_ids)
      # A short page (the API serves 30 per page) is the last page.
      break if posts_json.length < 30
      page += 1
    end
    posts
  end
end