505 lines
14 KiB
Ruby
505 lines
14 KiB
Ruby
# typed: strict
|
|
class Domain::Fa::Job::Base < Scraper::JobBase
|
|
abstract!
|
|
|
|
discard_on ActiveJob::DeserializationError
|
|
include HasBulkEnqueueJobs
|
|
|
|
sig { override.returns(Symbol) }
# Names the factory method used to build this job's HTTP client.
# Overrides the hook on Scraper::JobBase so all FA jobs share the
# FA-specific client (presumably resolved by the framework — the
# consumer of this symbol is not visible in this file).
def self.http_factory_method
  :get_fa_http_client
end
|
|
|
|
protected
|
|
|
|
sig { returns(T::Boolean) }
# True when the job was enqueued with skip_enqueue_found_links, which
# short-circuits enqueue_jobs_from_found_links.
def skip_enqueue_found_links?
  first_args = arguments[0]
  first_args[:skip_enqueue_found_links] ? true : false
end
|
|
|
|
sig { params(build_post: T::Boolean).returns(Domain::Post::FaPost) }
# Resolves the job's target post from its first argument hash.
#
# Accepted forms:
#   args[:post]  - a Domain::Post::FaPost (returned as-is) or a legacy
#                  Domain::Fa::Post (resolved via its fa_id)
#   args[:fa_id] - an FA submission id; looked up, or initialized (possibly
#                  unsaved) when build_post: true
#
# Raises (via fatal_error) when none of the accepted forms is present.
def post_from_args!(build_post: false)
  args = arguments[0]
  post = args[:post]
  if post.is_a?(Domain::Post::FaPost)
    return post
  elsif post.is_a?(Domain::Fa::Post)
    return Domain::Post::FaPost.find_by!(fa_id: post.fa_id)
  elsif fa_id = args[:fa_id]
    if build_post
      # May return an unsaved record; callers decide when to persist.
      Domain::Post::FaPost.find_or_initialize_by(fa_id: fa_id)
    else
      Domain::Post::FaPost.find_by!(fa_id: fa_id)
    end
  else
    # Fixed message: it previously claimed only "a Domain::Post::FaPost or
    # an Integer", omitting the Domain::Fa::Post and :fa_id forms accepted
    # by the branches above.
    fatal_error(
      "arg 'post' must be a Domain::Post::FaPost or Domain::Fa::Post, " \
        "or 'fa_id' must be provided",
    )
  end
end
|
|
|
|
sig { returns(Domain::UserAvatar) }
# Resolves the job's target avatar from its first argument hash: either an
# explicit args[:avatar], or the avatar belonging to args[:user] (accepting
# both Domain::User::FaUser and legacy Domain::Fa::User, the latter resolved
# by url_name). Fails via fatal_error when neither form is present.
def avatar_from_args!
  args = arguments[0]

  explicit_avatar = args[:avatar]
  return explicit_avatar if explicit_avatar.is_a?(Domain::UserAvatar)

  arg_user = args[:user]
  return T.must(arg_user.avatar) if arg_user.is_a?(Domain::User::FaUser)

  if arg_user.is_a?(Domain::Fa::User)
    fa_user = Domain::User::FaUser.find_by(url_name: arg_user.url_name)
    # T.must raises if the user lookup or its avatar is nil.
    return T.must(fa_user&.avatar)
  end

  fatal_error(
    "arg 'avatar' must be a Domain::UserAvatar or user must be a Domain::Fa::User",
  )
end
|
|
|
|
sig { params(create_if_missing: T::Boolean).returns(Domain::User::FaUser) }
# Resolves the job's target user from its first argument hash.
#
# Accepted forms:
#   args[:user]     - a Domain::User::FaUser (returned as-is) or a legacy
#                     Domain::Fa::User (resolved by url_name)
#   args[:url_name] - looked up, or created when create_if_missing: true
#                     (a freshly created user also gets a UserPageJob deferred)
#
# Fails via fatal_error when neither form is present.
def user_from_args!(create_if_missing: true)
  args = arguments[0]
  arg_user = args[:user]
  url_name = args[:url_name]

  if arg_user.is_a?(Domain::User::FaUser)
    arg_user
  elsif arg_user.is_a?(Domain::Fa::User)
    Domain::User::FaUser.find_by!(url_name: arg_user.url_name)
  elsif url_name
    if create_if_missing
      record =
        Domain::User::FaUser.find_or_initialize_by(url_name:) do |u|
          # Placeholder name until a real page scan fills it in.
          u.name = url_name
        end

      if record.new_record?
        record.save!
        defer_job(
          Domain::Fa::Job::UserPageJob,
          { user: record, caused_by_entry: causing_log_entry },
        )
      end

      record
    else
      Domain::User::FaUser.find_by!(url_name:)
    end
  else
    fatal_error(
      "arg 'user' must be a Domain::User::FaUser or Domain::Fa::User, or url_name must be provided",
    )
  end
end
|
|
|
|
sig { params(user: Domain::User::FaUser).returns(T::Boolean) }
# Whether the user's favs should be scanned now. When the scan isn't due yet,
# force_scan? can override (with a warning either way).
def user_due_for_favs_scan?(user)
  return true if user.favs_scan.due?

  if force_scan?
    logger.warn(
      "scanned favs #{user.favs_scan.ago_in_words.bold} ago - force scanning",
    )
    true
  else
    logger.warn(
      "scanned favs #{user.favs_scan.ago_in_words.bold} ago - skipping",
    )
    false
  end
end
|
|
|
|
# Result of scanning one listings page: the posts seen on the page and the
# subset that were new records at scan time.
class ListingPageScanStats < T::Struct
  include T::Struct::ActsAsComparable

  # Posts that were new_record? when first built from the page.
  const :new_posts, T::Array[Domain::Post::FaPost]
  # Every post parsed from the page (new and pre-existing).
  const :all_posts, T::Array[Domain::Post::FaPost]
end
|
|
|
|
# Discriminated union describing which kind of listings page is being
# scanned (site-wide browse, a user's gallery folder, or a user's favs).
module ListingPageType
  extend T::Sig

  # A page of the site-wide /browse listing.
  class BrowsePage < T::Struct
    extend T::Sig
    const :page_number, Integer
  end

  # A page of a user's gallery, within a named folder.
  class GalleryPage < T::Struct
    const :page_number, Integer
    const :folder, String
  end

  # A page of a user's favorites. Note: page_number is a nilable String
  # here (favs paging presumably uses an opaque cursor, unlike the Integer
  # page numbers above — confirm against the favs job).
  class FavsPage < T::Struct
    const :page_number, T.nilable(String)
    const :user, Domain::User::FaUser
  end

  Type = T.type_alias { T.any(BrowsePage, GalleryPage, FavsPage) }

  sig { params(page_type: Type).returns(String) }
  # Short human-readable label for log output.
  # NOTE(review): the case has no else branch; if Type ever gains a new
  # member this silently returns nil despite the String return sig.
  def self.describe(page_type)
    case page_type
    when BrowsePage
      "browse"
    when GalleryPage
      "folder '#{page_type.folder}'"
    when FavsPage
      "favs"
    end
  end
end
|
|
|
|
sig do
  params(
    page_type: ListingPageType::Type,
    page_parser: Domain::Fa::Parser::Page,
    for_user: T.nilable(Domain::User::FaUser),
  ).returns(ListingPageScanStats)
end
# Upserts a FaPost for every submission parsed from a listings page,
# defers follow-up scan jobs (post scan, file scan, creator scans), and
# returns counts of all/new posts for the page.
#
# Side-effect ordering is significant: posts are handed to defer_job
# while possibly unsaved, then persisted in bulk at the end
# (posts_to_save.each(&:save!)) — presumably defer_job only enqueues at
# job completion, after the saves; TODO confirm against JobBase.
def update_and_enqueue_posts_from_listings_page(
  page_type,
  page_parser:,
  for_user: nil
)
  fatal_error("not a listing page") unless page_parser.probably_listings_page?

  all_posts = T.let([], T::Array[Domain::Post::FaPost])
  new_posts = T.let([], T::Array[Domain::Post::FaPost])
  posts_to_save = T.let([], T::Array[Domain::Post::FaPost])

  page_parser.submissions_parsed.each do |submission|
    post =
      Domain::Post::FaPost.find_or_initialize_by_submission_parser(
        submission,
        first_seen_log_entry: last_log_entry,
      )

    # Record the first listing page the post was seen on; ||= never
    # overwrites an earlier sighting. FavsPage intentionally(?) records
    # nothing here — no first_favs_page equivalent is visible.
    case page_type
    when ListingPageType::BrowsePage
      post.first_browse_page ||= last_log_entry
    when ListingPageType::GalleryPage
      post.first_gallery_page ||= last_log_entry
    end

    all_posts << post
    new_posts << post if post.new_record?

    # Re-scan when the post is new, unhealthy, has no file yet, or its
    # file previously hit a terminal error.
    if post.new_record? || !post.state_ok? || post.file.blank? ||
         post.file&.state_terminal_error?
      post.state_ok!
      posts_to_save << post
      defer_job(Domain::Fa::Job::ScanPostJob, { post: })
    end

    # Fetch the file only if we know its URL but have never fetched it
    # (no log entry) and it hasn't terminally failed.
    if (post_file = post.file) && post_file.url_str.present? &&
         post_file.log_entry.nil? && !post_file.state_terminal_error?
      defer_job(Domain::Fa::Job::ScanFileJob, { post_file: })
    end

    # Creator is saved immediately (not batched with posts_to_save).
    if creator = post.creator
      creator.state_ok!
      creator.save!
      enqueue_user_scan(creator)
    end
  end

  # Listing pages for a user also expose their avatar thumbnail.
  if for_user && (user_page = page_parser.user_page) &&
       (url = user_page.profile_thumb_url)
    enqueue_user_avatar(for_user, url)
  end

  posts_to_save.each(&:save!)

  logger.info(
    format_tags(
      make_tag("page_number", page_type.page_number),
      make_tag("page_type", ListingPageType.describe(page_type)),
      make_tag("all_posts.count", all_posts.count),
      make_tag("new_posts.count", new_posts.count),
    ),
  )

  ListingPageScanStats.new(new_posts:, all_posts:)
end
|
|
|
|
sig { params(user: Domain::User::FaUser).void }
# Defers every user scan job (page / gallery / follows / favs) whose scan
# tracker reports due?. Persisted users are referenced directly; unsaved
# users are referenced by url_name (or skipped entirely when they have
# neither an id nor a url_name).
def enqueue_user_scan(user)
  logger.tagged(make_arg_tag(user)) do
    args =
      if user.persisted?
        { user: user }
      else
        unless user.url_name
          logger.warn(
            format_tags("url_name or id, skipping enqueue_user_scan"),
          )
          return
        end

        { url_name: user.url_name }
      end

    # One stanza per scan kind; logging only happens when defer_job
    # accepts the job (it participates in the && condition).
    scan_jobs = [
      [user.page_scan, Domain::Fa::Job::UserPageJob, "page"],
      [user.gallery_scan, Domain::Fa::Job::UserGalleryJob, "gallery"],
      [user.follows_scan, Domain::Fa::Job::UserFollowsJob, "follows"],
      [user.favs_scan, Domain::Fa::Job::FavsJob, "favs"],
    ]

    scan_jobs.each do |scan, job_class, label|
      next unless scan.due? && defer_job(job_class, args)

      logger.info(
        format_tags(
          "enqueue user #{label} job",
          make_tag("last #{label} scan", scan.ago_in_words),
        ),
      )
    end
  end
end
|
|
|
|
sig do
  params(fa_ids: T::Array[Integer]).returns(T::Array[Domain::Post::FaPost])
end
# Loads the posts for the given FA ids, creating (and deferring a scan job
# for) any that don't exist yet. Returns posts in the same order as fa_ids.
def find_or_create_posts_by_fa_ids(fa_ids)
  found = Domain::Post::FaPost.where(fa_id: fa_ids).to_a
  missing_ids = fa_ids - found.map(&:fa_id)

  ReduxApplicationRecord.transaction do
    missing_ids.each do |missing_id|
      created = Domain::Post::FaPost.create!(fa_id: missing_id)
      defer_job(Domain::Fa::Job::ScanPostJob, { post: created })
      found << created
    end
  end

  by_fa_id = found.index_by(&:fa_id)
  fa_ids.map { |fa_id| by_fa_id[fa_id] }
end
|
|
|
|
sig do
  params(
    recent_users: T::Array[Domain::Fa::Parser::UserPageHelper::RecentUser],
  ).returns(T::Array[Domain::User::FaUser])
end
# Maps parsed "recent user" entries to FaUser records, creating (and
# deferring a UserPageJob for) any that don't exist yet.
# Returns user models in the same order as the input.
def find_or_create_users_by_recent_users(recent_users)
  users =
    Domain::User::FaUser.where(url_name: recent_users.map(&:url_name)).to_a

  # Hash membership test instead of the previous users.any? linear scan per
  # entry (was O(n*m) over input size x result size).
  existing_by_url_name = users.index_by(&:url_name)

  # uniq(&:url_name): duplicate entries in the input would otherwise issue
  # two create! calls for the same url_name.
  missing_recent_users =
    recent_users
      .uniq(&:url_name)
      .reject { |recent_user| existing_by_url_name.key?(recent_user.url_name) }

  ReduxApplicationRecord.transaction do
    missing_recent_users.each do |recent_user|
      user =
        Domain::User::FaUser.create!(
          url_name: recent_user.url_name,
          name: recent_user.name,
        )
      defer_job(Domain::Fa::Job::UserPageJob, { user: user })
      users << user
    end
  end

  users_by_url_name =
    T.cast(users.index_by(&:url_name), T::Hash[String, Domain::User::FaUser])

  # return user models in the same order as the input
  recent_users.map do |recent_user|
    T.must(users_by_url_name[recent_user.url_name])
  end
end
|
|
|
|
sig do
  params(
    user: Domain::User::FaUser,
    page: Domain::Fa::Parser::Page,
    response: Scraper::HttpClient::Response,
  ).void
end
# Copies the parsed user-page fields onto the user record, stamps the scan
# (last_user_page_id / scanned_page_at), saves, and enqueues an avatar
# update when the page exposes a profile thumbnail.
def update_user_fields_from_page(user, page, response)
  user_page = page.user_page
  user.name = user_page.name
  user.registered_at = user_page.registered_since
  user.num_pageviews = user_page.num_pageviews
  user.num_submissions = user_page.num_submissions
  # "recieved" [sic] — matches the model attribute name; do not "fix"
  # the spelling here without a migration.
  user.num_comments_recieved = user_page.num_comments_recieved
  user.num_comments_given = user_page.num_comments_given
  user.num_journals = user_page.num_journals
  user.num_favorites = user_page.num_favorites
  # Scraped HTML may contain invalid byte sequences; replace rather than
  # raise on encode.
  user.profile_html =
    user_page.profile_html.encode("UTF-8", invalid: :replace, undef: :replace)
  user.last_user_page_id = response.log_entry.id
  user.scanned_page_at = Time.current
  user.save!

  if url = user_page.profile_thumb_url
    enqueue_user_avatar(user, url)
  end
end
|
|
|
|
sig { params(user: Domain::User::FaUser, avatar_url_str: String).void }
# Creates or updates the user's avatar record from a scraped avatar URL and
# defers a UserAvatarJob when anything changed. The URL is validated two
# ways before any write: it must look like ".../<name>.gif", and <name>
# must match the user's url_name (FA avatar URLs embed the owner's name —
# a mismatch means the URL belongs to someone else).
def enqueue_user_avatar(user, avatar_url_str)
  match = avatar_url_str.match(%r{/([^/]+)\.gif})
  if match.nil?
    logger.warn(
      format_tags("invalid avatar url", make_tag("url", avatar_url_str)),
    )
    return
  end

  expected_url_name = match[1]
  if user.url_name != expected_url_name
    logger.warn(
      format_tags(
        "invalid avatar url",
        make_tag("url", avatar_url_str),
        make_tag("expected", expected_url_name),
        make_tag("actual", user.url_name),
      ),
    )
    return
  end

  # Scraped URLs may be protocol-relative; default to https.
  uri = Addressable::URI.parse(avatar_url_str)
  uri.scheme ||= "https"

  avatar = user.avatar
  # Three cases: no avatar yet, or URL changed -> build a fresh record on
  # the avatars association; existing avatar with a blank URL -> fill it
  # in place; otherwise (same URL) -> fall through with no changes.
  if avatar.nil? || (avatar.url_str.present? && avatar.url_str != uri.to_s)
    logger.info(format_tags("creating new avatar", make_tag("url", uri.to_s)))
    avatar = user.avatars.build(url_str: uri.to_s)
  elsif avatar.url_str.blank?
    logger.info(format_tags("updating avatar", make_tag("url", uri.to_s)))
    avatar.url_str = uri.to_s
  end

  if avatar.changed?
    avatar.state_pending!
    avatar.save!
    defer_job(Domain::Fa::Job::UserAvatarJob, { avatar: })
    # Refresh the has-one so later readers of user.avatar see the new row.
    user.association(:avatar).reload
  end
end
|
|
|
|
FoundLink = Scraper::LinkFinder::FoundLink
|
|
|
|
sig do
  params(log_entry: HttpLogEntry, suppress_jobs: T::Array[T.untyped]).void
end
# Scans a fetched page for FA user/post links and enqueues scan jobs for
# each: users go through enqueue_user_scan (creating the record if new),
# posts get a ScanPostJob when newly created. Users/posts already in the
# DB are preloaded in bulk to avoid per-link queries.
#
# NOTE(review): job_defs is initialized but nothing is ever appended to it
# — jobs are deferred directly inside the links loop instead. As a result
# the suppress_jobs filter below is a no-op, the per-job debug logging
# never runs, caused_by_entry is never attached, and the final info line
# always reports 0 jobs. Either job_defs collection was removed without
# cleaning up this tail, or the loop was meant to build job_defs entries —
# confirm intent before fixing.
#
# NOTE(review): the trailing rescue StandardError swallows every error
# (including the unsupported-content-type raise above) and only logs it.
def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
  return if skip_enqueue_found_links?

  start_time = Time.now
  unless PERMITTED_CONTENT_TYPES.any? { |ct|
           ct.match(log_entry.content_type)
         }
    raise("unsupported content type: #{log_entry.content_type}")
  end
  # Bail out silently when there is no response body to scan.
  document = log_entry.response&.contents || return
  link_finder = Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
  link_finder.logger.level = :error
  links = link_finder.find_links
  job_defs = []

  # Bulk-preload known users referenced by the found links.
  url_names =
    links.filter_map do |link|
      link.is_a?(FoundLink::FaUser) ? link.url_name : nil
    end
  url_name_to_fa_user =
    T.let(
      Domain::User::FaUser.where(url_name: url_names).index_by(&:url_name),
      T::Hash[String, Domain::User::FaUser],
    )

  # Bulk-preload known posts referenced by the found links.
  fa_ids =
    links.filter_map do |link|
      link.is_a?(FoundLink::FaPost) ? link.fa_id : nil
    end
  fa_id_to_fa_post =
    T.cast(
      Domain::Post::FaPost.where(fa_id: fa_ids).index_by(&:fa_id),
      T::Hash[Integer, Domain::Post::FaPost],
    )

  links
    .filter_map do |link|
      if link.is_a?(FoundLink::FaUser) || link.is_a?(FoundLink::FaPost)
        link
      else
        nil
      end
    end
    .each do |link|
      case link
      when FoundLink::FaUser
        url_name = link.url_name
        user =
          url_name_to_fa_user[url_name] ||
            Domain::User::FaUser.create!(url_name:) do |user|
              user.name ||= url_name
            end
        enqueue_user_scan(user)
      when FoundLink::FaPost
        fa_id = link.fa_id
        post =
          fa_id_to_fa_post[fa_id] ||
            Domain::Post::FaPost.build(fa_id:) do |post|
              post.first_seen_entry_id = log_entry.id
            end
        if post.new_record?
          post.save!
          defer_job(Domain::Fa::Job::ScanPostJob, { post: })
        end
      end
    end

  # Dead tail — job_defs is always empty here (see NOTE above).
  job_defs.uniq!
  job_defs.reject! do |job_def|
    suppress_jobs.any? do |suppress|
      suppress == job_def.slice(*suppress.keys)
    end
  end
  job_defs.each do |job_def|
    job_class = job_def[:job]
    params = job_def[:params]
    desc = job_def[:desc]
    logger.debug(
      [
        "link finder -",
        job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
        desc,
      ].join(" "),
    )

    defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
  end
  duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
  logger.info(
    "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
  )
rescue StandardError => e
  logger.error(
    "link finder - error enqueuing jobs: #{e.class.name} - #{e.message}",
  )
end
|
|
end
|