redux-scraper/app/jobs/domain/fa/job/base.rb

# typed: strict
class Domain::Fa::Job::Base < Scraper::JobBase
  abstract!

  discard_on ActiveJob::DeserializationError

  include HasBulkEnqueueJobs

  sig { override.returns(Symbol) }
  def self.http_factory_method
    :get_fa_http_client
  end

  protected
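
  # True when the job was enqueued with skip_enqueue_found_links: true, which
  # suppresses the link fan-out performed by enqueue_jobs_from_found_links.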
  sig { returns(T::Boolean) }
  def skip_enqueue_found_links?
    !!arguments[0][:skip_enqueue_found_links]
  end
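
  # Resolve the job's arguments to a Domain::Post::FaPost: accepts a FaPost
  # directly, a legacy Domain::Fa::Post (matched by fa_id), or a bare fa_id.
  # With build_post: true, an unsaved record may be returned for a new fa_id.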
  sig { params(build_post: T::Boolean).returns(Domain::Post::FaPost) }
  def post_from_args!(build_post: false)
    args = arguments[0]
    post = args[:post]
    if post.is_a?(Domain::Post::FaPost)
      return post
    elsif post.is_a?(Domain::Fa::Post)
      return Domain::Post::FaPost.find_by!(fa_id: post.fa_id)
    elsif fa_id = args[:fa_id]
      if build_post
        Domain::Post::FaPost.find_or_initialize_by(fa_id: fa_id)
      else
        Domain::Post::FaPost.find_by!(fa_id: fa_id)
      end
    else
      fatal_error(
        "arg 'post' must be a Domain::Post::FaPost or Domain::Fa::Post, or 'fa_id' must be provided",
      )
    end
  end
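
  # Resolve the job's arguments to a Domain::UserAvatar: either an explicit
  # avatar, or the avatar of the given Domain::User::FaUser / Domain::Fa::User.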
  sig { returns(Domain::UserAvatar) }
  def avatar_from_args!
    args = arguments[0]
    avatar = args[:avatar]
    user = args[:user]
    if avatar.is_a?(Domain::UserAvatar)
      return avatar
    elsif user.is_a?(Domain::User::FaUser)
      return T.must(user.avatar)
    elsif user.is_a?(Domain::Fa::User)
      user = Domain::User::FaUser.find_by(url_name: user.url_name)
      return T.must(user&.avatar)
    else
      fatal_error(
        "arg 'avatar' must be a Domain::UserAvatar, or 'user' must be a Domain::User::FaUser or Domain::Fa::User",
      )
    end
  end
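
  # Resolve the job's arguments to a Domain::User::FaUser. When only a
  # url_name is given and create_if_missing is true, a stub user is saved and
  # a UserPageJob is deferred to backfill its details.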
  sig { params(create_if_missing: T::Boolean).returns(Domain::User::FaUser) }
  def user_from_args!(create_if_missing: true)
    args = arguments[0]
    user = args[:user]
    if user.is_a?(Domain::User::FaUser)
      user
    elsif user.is_a?(Domain::Fa::User)
      Domain::User::FaUser.find_by!(url_name: user.url_name)
    elsif url_name = args[:url_name]
      if create_if_missing
        user =
          Domain::User::FaUser.find_or_initialize_by(url_name:) do |user|
            user.name = url_name
          end
        if user.new_record?
          user.save!
          defer_job(
            Domain::Fa::Job::UserPageJob,
            { user:, caused_by_entry: causing_log_entry },
          )
        end
        user
      else
        Domain::User::FaUser.find_by!(url_name:)
      end
    else
      fatal_error(
        "arg 'user' must be a Domain::User::FaUser or Domain::Fa::User, or url_name must be provided",
      )
    end
  end
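
  # Whether this user's favorites should be scanned now. An overdue scan
  # always passes; an up-to-date user passes only under force_scan?.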
  sig { params(user: Domain::User::FaUser).returns(T::Boolean) }
  def user_due_for_favs_scan?(user)
    unless user.favs_scan.due?
      if force_scan?
        logger.warn(
          "scanned favs #{user.favs_scan.ago_in_words.bold} ago - force scanning",
        )
        return true
      else
        logger.warn(
          "scanned favs #{user.favs_scan.ago_in_words.bold} ago - skipping",
        )
        return false
      end
    end
    true
  end
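
  # Result of scanning a single listing page: all posts parsed from the page,
  # plus the subset that was new to the database.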
  class ListingPageScanStats < T::Struct
    include T::Struct::ActsAsComparable

    const :new_posts, T::Array[Domain::Post::FaPost]
    const :all_posts, T::Array[Domain::Post::FaPost]
  end
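
  # The kinds of listing pages this scraper walks: the site-wide browse feed,
  # a user's gallery folder, or a user's favorites pages.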
  module ListingPageType
    extend T::Sig

    class BrowsePage < T::Struct
      const :page_number, Integer
    end

    class GalleryPage < T::Struct
      const :page_number, Integer
      const :folder, String
    end

    class FavsPage < T::Struct
      const :page_number, T.nilable(String)
      const :user, Domain::User::FaUser
    end

    Type = T.type_alias { T.any(BrowsePage, GalleryPage, FavsPage) }

    sig { params(page_type: Type).returns(String) }
    def self.describe(page_type)
      case page_type
      when BrowsePage
        "browse"
      when GalleryPage
        "folder '#{page_type.folder}'"
      when FavsPage
        "favs"
      end
    end
  end
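
  # Upsert a FaPost for every submission on a parsed listing page, record
  # which browse/gallery page first surfaced it, and defer ScanPostJob /
  # ScanFileJob / user scans for anything new, unhealthy, or missing a file.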
  sig do
    params(
      page_type: ListingPageType::Type,
      page_parser: Domain::Fa::Parser::Page,
      for_user: T.nilable(Domain::User::FaUser),
    ).returns(ListingPageScanStats)
  end
  def update_and_enqueue_posts_from_listings_page(
    page_type,
    page_parser:,
    for_user: nil
  )
    fatal_error("not a listing page") unless page_parser.probably_listings_page?

    all_posts = T.let([], T::Array[Domain::Post::FaPost])
    new_posts = T.let([], T::Array[Domain::Post::FaPost])
    posts_to_save = T.let([], T::Array[Domain::Post::FaPost])

    page_parser.submissions_parsed.each do |submission|
      post =
        Domain::Post::FaPost.find_or_initialize_by_submission_parser(
          submission,
          first_seen_log_entry: last_log_entry,
        )

      case page_type
      when ListingPageType::BrowsePage
        post.first_browse_page ||= last_log_entry
      when ListingPageType::GalleryPage
        post.first_gallery_page ||= last_log_entry
      end

      all_posts << post
      new_posts << post if post.new_record?

      if post.new_record? || !post.state_ok? || post.file.blank? ||
           post.file&.state_terminal_error?
        post.state_ok!
        posts_to_save << post
        defer_job(Domain::Fa::Job::ScanPostJob, { post: })
      end

      if (post_file = post.file) && post_file.url_str.present? &&
           post_file.log_entry.nil? && !post_file.state_terminal_error?
        defer_job(Domain::Fa::Job::ScanFileJob, { post_file: })
      end

      if creator = post.creator
        creator.state_ok!
        creator.save!
        enqueue_user_scan(creator)
      end
    end

    if for_user && (user_page = page_parser.user_page) &&
         (url = user_page.profile_thumb_url)
      enqueue_user_avatar(for_user, url)
    end

    posts_to_save.each(&:save!)

    logger.info(
      format_tags(
        make_tag("page_number", page_type.page_number),
        make_tag("page_type", ListingPageType.describe(page_type)),
        make_tag("all_posts.count", all_posts.count),
        make_tag("new_posts.count", new_posts.count),
      ),
    )

    ListingPageScanStats.new(new_posts:, all_posts:)
  end
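
  # Defer whichever per-user scan jobs (page, gallery, follows, favs) are due
  # for this user, passing url_name instead of the record when the user is
  # not yet persisted.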
  sig { params(user: Domain::User::FaUser).void }
  def enqueue_user_scan(user)
    logger.tagged(make_arg_tag(user)) do
      args =
        if user.persisted?
          { user: user }
        else
          unless user.url_name
            logger.warn(
              format_tags("user has no url_name or id, skipping enqueue_user_scan"),
            )
            return
          end
          { url_name: user.url_name }
        end

      if user.page_scan.due? && defer_job(Domain::Fa::Job::UserPageJob, args)
        logger.info(
          format_tags(
            "enqueue user page job",
            make_tag("last page scan", user.page_scan.ago_in_words),
          ),
        )
      end

      if user.gallery_scan.due? &&
           defer_job(Domain::Fa::Job::UserGalleryJob, args)
        logger.info(
          format_tags(
            "enqueue user gallery job",
            make_tag("last gallery scan", user.gallery_scan.ago_in_words),
          ),
        )
      end

      if user.follows_scan.due? &&
           defer_job(Domain::Fa::Job::UserFollowsJob, args)
        logger.info(
          format_tags(
            "enqueue user follows job",
            make_tag("last follows scan", user.follows_scan.ago_in_words),
          ),
        )
      end

      if user.favs_scan.due? && defer_job(Domain::Fa::Job::FavsJob, args)
        logger.info(
          format_tags(
            "enqueue user favs job",
            make_tag("last favs scan", user.favs_scan.ago_in_words),
          ),
        )
      end
    end
  end
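
  # Fetch the posts for the given fa_ids, creating (and deferring a scan of)
  # any that do not exist yet; results come back in input order.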
  sig do
    params(fa_ids: T::Array[Integer]).returns(T::Array[Domain::Post::FaPost])
  end
  def find_or_create_posts_by_fa_ids(fa_ids)
    posts = Domain::Post::FaPost.where(fa_id: fa_ids).to_a
    missing_post_fa_ids = fa_ids - posts.map(&:fa_id)

    ReduxApplicationRecord.transaction do
      missing_post_fa_ids.each do |fa_id|
        post = Domain::Post::FaPost.create!(fa_id: fa_id)
        defer_job(Domain::Fa::Job::ScanPostJob, { post: post })
        posts << post
      end
    end

    posts_by_fa_id = posts.index_by(&:fa_id)
    # return post models in the same order as the input
    fa_ids.map { |fa_id| T.must(posts_by_fa_id[fa_id]) }
  end
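
  # Map "recent users" parsed from a user page onto FaUser records, creating
  # stubs (and deferring a UserPageJob) for any unknown url_names.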
  sig do
    params(
      recent_users: T::Array[Domain::Fa::Parser::UserPageHelper::RecentUser],
    ).returns(T::Array[Domain::User::FaUser])
  end
  def find_or_create_users_by_recent_users(recent_users)
    users =
      Domain::User::FaUser.where(url_name: recent_users.map(&:url_name)).to_a
    missing_recent_users =
      recent_users.reject do |recent_user|
        users.any? { |u| u.url_name == recent_user.url_name }
      end

    ReduxApplicationRecord.transaction do
      missing_recent_users.each do |recent_user|
        user =
          Domain::User::FaUser.create!(
            url_name: recent_user.url_name,
            name: recent_user.name,
          )
        defer_job(Domain::Fa::Job::UserPageJob, { user: user })
        users << user
      end
    end

    users_by_url_name =
      T.cast(users.index_by(&:url_name), T::Hash[String, Domain::User::FaUser])

    # return user models in the same order as the input
    recent_users.map do |recent_user|
      T.must(users_by_url_name[recent_user.url_name])
    end
  end
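
  # Copy profile fields from a parsed user page onto the FaUser record, note
  # the HTTP log entry they came from, and enqueue an avatar fetch.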
  sig do
    params(
      user: Domain::User::FaUser,
      page: Domain::Fa::Parser::Page,
      response: Scraper::HttpClient::Response,
    ).void
  end
  def update_user_fields_from_page(user, page, response)
    user_page = page.user_page
    user.name = user_page.name
    user.registered_at = user_page.registered_since
    user.num_pageviews = user_page.num_pageviews
    user.num_submissions = user_page.num_submissions
    user.num_comments_recieved = user_page.num_comments_recieved
    user.num_comments_given = user_page.num_comments_given
    user.num_journals = user_page.num_journals
    user.num_favorites = user_page.num_favorites
    user.profile_html =
      user_page.profile_html.encode("UTF-8", invalid: :replace, undef: :replace)
    user.last_user_page_id = response.log_entry.id
    user.scanned_page_at = Time.current
    user.save!

    if url = user_page.profile_thumb_url
      enqueue_user_avatar(user, url)
    end
  end
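
  # Validate an avatar URL (it must embed the user's own url_name), then
  # create or update the UserAvatar row and defer a UserAvatarJob if anything
  # changed.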
  sig { params(user: Domain::User::FaUser, avatar_url_str: String).void }
  def enqueue_user_avatar(user, avatar_url_str)
    match = avatar_url_str.match(%r{/([^/]+)\.gif})
    if match.nil?
      logger.warn(
        format_tags("invalid avatar url", make_tag("url", avatar_url_str)),
      )
      return
    end

    url_name_from_url = match[1]
    if user.url_name != url_name_from_url
      logger.warn(
        format_tags(
          "avatar url does not match user",
          make_tag("url", avatar_url_str),
          make_tag("url_name from url", url_name_from_url),
          make_tag("user url_name", user.url_name),
        ),
      )
      return
    end

    uri = Addressable::URI.parse(avatar_url_str)
    uri.scheme ||= "https"

    avatar = user.avatar
    if avatar.nil? || (avatar.url_str.present? && avatar.url_str != uri.to_s)
      logger.info(format_tags("creating new avatar", make_tag("url", uri.to_s)))
      avatar = user.avatars.build(url_str: uri.to_s)
    elsif avatar.url_str.blank?
      logger.info(format_tags("updating avatar", make_tag("url", uri.to_s)))
      avatar.url_str = uri.to_s
    end

    if avatar.changed?
      avatar.state_pending!
      avatar.save!
      defer_job(Domain::Fa::Job::UserAvatarJob, { avatar: })
      user.association(:avatar).reload
    end
  end

  FoundLink = Scraper::LinkFinder::FoundLink
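
  # Scan a completed response's HTML for FA user and post links, then create
  # records and defer scans for anything new or due. Errors are logged and
  # swallowed so link fan-out never fails the parent job.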
  sig do
    params(log_entry: HttpLogEntry, suppress_jobs: T::Array[T.untyped]).void
  end
  def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
    return if skip_enqueue_found_links?

    start_time = Time.now
    unless PERMITTED_CONTENT_TYPES.any? { |ct|
             ct.match(log_entry.content_type)
           }
      raise("unsupported content type: #{log_entry.content_type}")
    end

    document = log_entry.response&.contents || return
    link_finder = Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
    link_finder.logger.level = :error
    links = link_finder.find_links
    job_defs = []

    url_names =
      links.filter_map do |link|
        link.is_a?(FoundLink::FaUser) ? link.url_name : nil
      end
    url_name_to_fa_user =
      T.let(
        Domain::User::FaUser.where(url_name: url_names).index_by(&:url_name),
        T::Hash[String, Domain::User::FaUser],
      )

    fa_ids =
      links.filter_map do |link|
        link.is_a?(FoundLink::FaPost) ? link.fa_id : nil
      end
    fa_id_to_fa_post =
      T.cast(
        Domain::Post::FaPost.where(fa_id: fa_ids).index_by(&:fa_id),
        T::Hash[Integer, Domain::Post::FaPost],
      )

    links
      .filter_map do |link|
        link if link.is_a?(FoundLink::FaUser) || link.is_a?(FoundLink::FaPost)
      end
      .each do |link|
        case link
        when FoundLink::FaUser
          url_name = link.url_name
          user =
            url_name_to_fa_user[url_name] ||
              Domain::User::FaUser.create!(url_name:) do |user|
                user.name ||= url_name
              end
          enqueue_user_scan(user)
        when FoundLink::FaPost
          fa_id = link.fa_id
          post =
            fa_id_to_fa_post[fa_id] ||
              Domain::Post::FaPost.build(fa_id:) do |post|
                post.first_seen_entry_id = log_entry.id
              end
          if post.new_record?
            post.save!
            defer_job(Domain::Fa::Job::ScanPostJob, { post: })
          end
        end
      end

    job_defs.uniq!
    job_defs.reject! do |job_def|
      suppress_jobs.any? do |suppress|
        suppress == job_def.slice(*suppress.keys)
      end
    end

    job_defs.each do |job_def|
      job_class = job_def[:job]
      params = job_def[:params]
      desc = job_def[:desc]
      logger.debug(
        [
          "link finder -",
          job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
          desc,
        ].join(" "),
      )
      defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
    end

    duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
    logger.info(
      "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
    )
  rescue StandardError => e
    logger.error(
      "link finder - error enqueuing jobs: #{e.class.name} - #{e.message}",
    )
  end
end
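
# A minimal usage sketch, not a class from this repo: the job name, perform
# signature, and parse_listing_page step below are hypothetical stand-ins.
# Only user_from_args!, user_due_for_favs_scan?, ListingPageType::FavsPage,
# and update_and_enqueue_posts_from_listings_page come from Base above.
#
#   class Domain::Fa::Job::ExampleFavsJob < Domain::Fa::Job::Base
#     def perform(args)
#       user = user_from_args!              # accepts { user: } or { url_name: }
#       return unless user_due_for_favs_scan?(user)
#
#       page_parser = parse_listing_page    # hypothetical fetch + parse step
#       update_and_enqueue_posts_from_listings_page(
#         ListingPageType::FavsPage.new(page_number: nil, user: user),
#         page_parser: page_parser,
#         for_user: user,
#       )
#     end
#   end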