# typed: strict

class Domain::Fa::Job::Base < Scraper::JobBase
  abstract!

  discard_on ActiveJob::DeserializationError

  include HasBulkEnqueueJobs

  sig { override.returns(Symbol) }
  def self.http_factory_method
    :get_fa_http_client
  end
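
  # Subclasses fetch pages with the HTTP client produced by
  # `http_factory_method`. A minimal hypothetical subclass sketch (the
  # `http_client` accessor and the fetch path are assumptions, not defined in
  # this file):
  #
  #   class Domain::Fa::Job::ExampleJob < Domain::Fa::Job::Base
  #     def perform(args)
  #       user = user_from_args!
  #       response = http_client.get("/user/#{user.url_name}/")
  #       update_user_from_user_page(user, response)
  #     end
  #   end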

  protected

  # URL names that FA renders for broken or placeholder user links; these
  # must never be treated as real users.
  BUGGY_USER_URL_NAMES =
    T.let(["click here", "..", ".", "<i class="], T::Array[String])

  sig { params(user: Domain::User::FaUser).returns(T::Boolean) }
  def buggy_user?(user)
    if BUGGY_USER_URL_NAMES.include?(user.url_name)
      logger.error(
        format_tags("buggy user", make_tag("url_name", user.url_name)),
      )
      return true
    end

    false
  end

  sig { returns(T::Boolean) }
  def skip_enqueue_found_links?
    !!arguments[0][:skip_enqueue_found_links]
  end
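
  # The `*_from_args!` helpers below let subclasses accept either a model
  # instance or a bare identifier in the job arguments. Hypothetical
  # enqueue-site sketch (ids and names assumed):
  #
  #   Domain::Fa::Job::ScanPostJob.perform_later({ fa_id: 12_345 })
  #   Domain::Fa::Job::UserPageJob.perform_later({ url_name: "somecat" })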

  sig { params(build_post: T::Boolean).returns(Domain::Post::FaPost) }
  def post_from_args!(build_post: false)
    args = arguments[0]
    post = args[:post]
    if post.is_a?(Domain::Post::FaPost)
      post
    elsif (fa_id = args[:fa_id])
      if build_post
        Domain::Post::FaPost.find_or_initialize_by(fa_id: fa_id)
      else
        Domain::Post::FaPost.find_by!(fa_id: fa_id)
      end
    else
      fatal_error("arg 'post' must be a Domain::Post::FaPost, or 'fa_id' must be provided")
    end
  end

  sig { returns(Domain::UserAvatar) }
  def avatar_from_args!
    args = arguments[0]
    avatar = args[:avatar]
    user = args[:user]
    if avatar.is_a?(Domain::UserAvatar)
      avatar
    elsif user.is_a?(Domain::User::FaUser)
      T.must(user.avatar)
    else
      fatal_error(
        "arg 'avatar' must be a Domain::UserAvatar or user must be a Domain::User::FaUser",
      )
    end
  end

  sig { params(create_if_missing: T::Boolean).returns(Domain::User::FaUser) }
  def user_from_args!(create_if_missing: true)
    args = arguments[0]
    user = args[:user]
    if user.is_a?(Domain::User::FaUser)
      user
    elsif (url_name = args[:url_name])
      if create_if_missing
        user =
          Domain::User::FaUser.find_or_initialize_by(url_name:) do |new_user|
            new_user.name = url_name
          end

        if user.new_record?
          user.save!
          defer_job(
            Domain::Fa::Job::UserPageJob,
            { user:, caused_by_entry: causing_log_entry },
          )
        end

        user
      else
        Domain::User::FaUser.find_by!(url_name:)
      end
    else
      fatal_error(
        "arg 'user' must be a Domain::User::FaUser, or url_name must be provided",
      )
    end
  end
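
  # Side effect worth noting: when only a url_name is supplied and the user
  # is new, the record is created immediately and a full page scan is
  # deferred. Hypothetical usage (argument values assumed):
  #
  #   user = user_from_args!   # args: { url_name: "somecat" }
  #   user.persisted?          # => true; UserPageJob deferred if newly created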

  sig { params(user: Domain::User::FaUser).returns(T::Boolean) }
  def user_due_for_favs_scan?(user)
    unless user.favs_scan.due?
      if force_scan?
        logger.warn(
          "scanned favs #{user.favs_scan.ago_in_words.bold} ago - force scanning",
        )
        return true
      else
        logger.warn(
          "scanned favs #{user.favs_scan.ago_in_words.bold} ago - skipping",
        )
        return false
      end
    end

    true
  end

  class ListingPageScanStats < T::Struct
    include T::Struct::ActsAsComparable

    const :new_posts, T::Array[Domain::Post::FaPost]
    const :all_posts, T::Array[Domain::Post::FaPost]
  end

  module ListingPageType
    extend T::Sig

    class BrowsePage < T::Struct
      const :page_number, Integer
    end

    class GalleryPage < T::Struct
      const :page_number, Integer
      const :folder, String
    end

    class FavsPage < T::Struct
      # favs pages paginate with an opaque cursor rather than a numeric index
      const :page_number, T.nilable(String)
      const :user, Domain::User::FaUser
    end

    Type = T.type_alias { T.any(BrowsePage, GalleryPage, FavsPage) }

    sig { params(page_type: Type).returns(String) }
    def self.describe(page_type)
      case page_type
      when BrowsePage
        "browse"
      when GalleryPage
        "folder '#{page_type.folder}'"
      when FavsPage
        "favs"
      else
        T.absurd(page_type)
      end
    end
  end
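
  # For example (values assumed):
  #
  #   ListingPageType.describe(
  #     ListingPageType::GalleryPage.new(page_number: 1, folder: "scraps"),
  #   )
  #   # => "folder 'scraps'"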

  sig do
    params(
      page_type: ListingPageType::Type,
      page_parser: Domain::Fa::Parser::Page,
      for_user: T.nilable(Domain::User::FaUser),
    ).returns(ListingPageScanStats)
  end
  def update_and_enqueue_posts_from_listings_page(
    page_type,
    page_parser:,
    for_user: nil
  )
    fatal_error("not a listing page") unless page_parser.probably_listings_page?

    all_posts = T.let([], T::Array[Domain::Post::FaPost])
    new_posts = T.let([], T::Array[Domain::Post::FaPost])
    posts_to_save = T.let([], T::Array[Domain::Post::FaPost])

    page_parser.submissions_parsed.each do |submission|
      post =
        Domain::Post::FaPost.find_or_initialize_by_submission_parser(
          submission,
          first_seen_log_entry: last_log_entry,
        )

      case page_type
      when ListingPageType::BrowsePage
        post.first_browse_page ||= last_log_entry
      when ListingPageType::GalleryPage
        post.first_gallery_page ||= last_log_entry
      end

      all_posts << post
      new_posts << post if post.new_record?

      # (re)scan the post if it is new, errored, or its file never resolved
      if post.new_record? || !post.state_ok? || post.file.blank? ||
           post.file&.state_terminal_error?
        post.state_ok!
        posts_to_save << post
        defer_job(Domain::Fa::Job::ScanPostJob, { post: })
      end

      if (post_file = post.file) && post_file.url_str.present? &&
           post_file.log_entry.nil? && !post_file.state_terminal_error?
        defer_job(Domain::Fa::Job::ScanFileJob, { post_file: })
      end

      if (creator = post.creator)
        creator.state_ok!
        creator.save!
        enqueue_user_scan(creator, at_most_one_scan: true)
      end
    end

    if for_user && (user_page = page_parser.user_page) &&
         (url = user_page.profile_thumb_url)
      enqueue_user_avatar(for_user, url)
    end

    posts_to_save.each(&:save!)

    logger.info(
      format_tags(
        make_tag("page_number", page_type.page_number),
        make_tag("page_type", ListingPageType.describe(page_type)),
        make_tag("all_posts.count", all_posts.count),
        make_tag("new_posts.count", new_posts.count),
      ),
    )

    ListingPageScanStats.new(new_posts:, all_posts:)
  end
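
  # Hypothetical call-site sketch (parser and user values assumed):
  #
  #   stats = update_and_enqueue_posts_from_listings_page(
  #     ListingPageType::FavsPage.new(page_number: nil, user: user),
  #     page_parser: page,
  #     for_user: user,
  #   )
  #   logger.info("#{stats.new_posts.count} of #{stats.all_posts.count} posts were new")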

  sig { params(user: Domain::User::FaUser, at_most_one_scan: T::Boolean).void }
  def enqueue_user_scan(user, at_most_one_scan:)
    skip_page_enqueue = !!ENV["SKIP_PAGE_ENQUEUE"]
    skip_favs_enqueue = !!ENV["SKIP_FAVS_ENQUEUE"]
    skip_follows_enqueue = !!ENV["SKIP_FOLLOWS_ENQUEUE"]
    skip_gallery_enqueue = !!ENV["SKIP_GALLERY_ENQUEUE"]

    logger.tagged(make_arg_tag(user)) do
      args =
        if user.persisted?
          { user: user }
        else
          unless user.url_name
            logger.warn(
              format_tags("user has no url_name, skipping enqueue_user_scan"),
            )
            return
          end

          { url_name: user.url_name }
        end

      if (
           user.page_scan.at.nil? || (!skip_page_enqueue && user.page_scan.due?)
         ) && defer_job(Domain::Fa::Job::UserPageJob, args)
        logger.info(
          format_tags(
            "enqueue user page job",
            make_tag("last page scan", user.page_scan.ago_in_words),
          ),
        )
      end

      # don't enqueue any other jobs if the user page hasn't been scanned yet
      return if at_most_one_scan && user.page_scan.due?

      if (
           user.favs_scan.at.nil? || (!skip_favs_enqueue && user.favs_scan.due?)
         ) && defer_job(Domain::Fa::Job::FavsJob, args)
        logger.info(
          format_tags(
            "enqueue user favs job",
            make_tag("last favs scan", user.favs_scan.ago_in_words),
          ),
        )
      end

      return if at_most_one_scan && user.favs_scan.due?

      if (
           user.follows_scan.at.nil? ||
             (!skip_follows_enqueue && user.follows_scan.due?)
         ) && defer_job(Domain::Fa::Job::UserFollowsJob, args)
        logger.info(
          format_tags(
            "enqueue user follows job",
            make_tag("last follows scan", user.follows_scan.ago_in_words),
          ),
        )
      end

      return if at_most_one_scan && user.follows_scan.due?

      if (
           user.gallery_scan.at.nil? ||
             (!skip_gallery_enqueue && user.gallery_scan.due?)
         ) && defer_job(Domain::Fa::Job::UserGalleryJob, args)
        logger.info(
          format_tags(
            "enqueue user gallery job",
            make_tag("last gallery scan", user.gallery_scan.ago_in_words),
          ),
        )
      end
    end
  end
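
  # The SKIP_*_ENQUEUE environment variables act as per-scan-type kill
  # switches, e.g. (worker invocation assumed; any code path that reaches
  # enqueue_user_scan behaves the same):
  #
  #   SKIP_FAVS_ENQUEUE=1 SKIP_FOLLOWS_ENQUEUE=1 bin/rails jobs:work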

  sig do
    params(fa_ids: T::Array[Integer]).returns(T::Array[Domain::Post::FaPost])
  end
  def find_or_create_posts_by_fa_ids(fa_ids)
    posts = Domain::Post::FaPost.where(fa_id: fa_ids).to_a
    missing_post_fa_ids = fa_ids - posts.map(&:fa_id)

    ReduxApplicationRecord.transaction do
      missing_post_fa_ids.each do |fa_id|
        post = Domain::Post::FaPost.create!(fa_id: fa_id)
        defer_job(Domain::Fa::Job::ScanPostJob, { post: post })
        posts << post
      end
    end

    # return post models in the same order as the input fa_ids
    posts_by_fa_id = posts.index_by(&:fa_id)
    fa_ids.map { |fa_id| posts_by_fa_id.fetch(fa_id) }
  end
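
  # Hypothetical usage (ids assumed): order is preserved, and a ScanPostJob
  # is deferred for any fa_id that was not already in the database.
  #
  #   posts = find_or_create_posts_by_fa_ids([12_345, 67_890])
  #   posts.map(&:fa_id) # => [12_345, 67_890]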

  sig do
    params(
      recent_users: T::Array[Domain::Fa::Parser::UserPageHelper::RecentUser],
    ).returns(T::Array[Domain::User::FaUser])
  end
  def find_or_create_users_by_recent_users(recent_users)
    users =
      Domain::User::FaUser.where(url_name: recent_users.map(&:url_name)).to_a

    missing_recent_users =
      recent_users.reject do |recent_user|
        users.any? { |u| u.url_name == recent_user.url_name }
      end

    ReduxApplicationRecord.transaction do
      missing_recent_users.each do |recent_user|
        user =
          Domain::User::FaUser.create!(
            url_name: recent_user.url_name,
            name: recent_user.name,
          )
        defer_job(Domain::Fa::Job::UserPageJob, { user: user })
        users << user
      end
    end

    users_by_url_name =
      T.cast(users.index_by(&:url_name), T::Hash[String, Domain::User::FaUser])

    # return user models in the same order as the input
    recent_users.map do |recent_user|
      T.must(users_by_url_name[recent_user.url_name])
    end
  end
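
  # Hypothetical usage with parsed recent-user entries (accessor name
  # assumed):
  #
  #   recent = page.user_page.recent_users
  #   users = find_or_create_users_by_recent_users(recent)
  #   users.map(&:url_name) == recent.map(&:url_name) # => true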

  sig do
    params(
      user: Domain::User::FaUser,
      response: Scraper::HttpClient::Response,
    ).returns(T.nilable(Domain::Fa::Parser::Page))
  end
  def update_user_from_user_page(user, response)
    user.scanned_page_at = Time.current
    user.last_user_page_log_entry = response.log_entry
    return nil if user_disabled_or_not_found?(user, response)

    page = Domain::Fa::Parser::Page.from_log_entry(response.log_entry)
    return nil unless page.probably_user_page?

    user_page = page.user_page
    user.state_ok!
    user.name = user_page.name
    user.registered_at = user_page.registered_since&.in_time_zone("UTC")
    user.num_pageviews = user_page.num_pageviews
    user.num_submissions = user_page.num_submissions
    # "recieved" (sic) matches the existing attribute spelling
    user.num_comments_recieved = user_page.num_comments_recieved
    user.num_comments_given = user_page.num_comments_given
    user.num_journals = user_page.num_journals
    user.num_favorites = user_page.num_favorites
    user.num_watched_by = user_page.num_watched_by
    user.num_watching = user_page.num_watching
    user.account_status = user_page.account_status&.to_s
    user.profile_html =
      user_page.profile_html.encode("UTF-8", invalid: :replace, undef: :replace)
    if (url = user_page.profile_thumb_url)
      enqueue_user_avatar(user, url)
    end

    page
  end

  sig { params(user: Domain::User::FaUser, avatar_url_str: String).void }
  def enqueue_user_avatar(user, avatar_url_str)
    # FA avatar URLs end in "<url_name>.gif"; use that to sanity-check the
    # URL against the user it is being attached to
    match = avatar_url_str.match(%r{/([^/]+)\.gif})
    if match.nil?
      logger.warn(
        format_tags("invalid avatar url", make_tag("url", avatar_url_str)),
      )
      return
    end

    expected_url_name = match[1]
    if user.url_name != expected_url_name
      logger.warn(
        format_tags(
          "invalid avatar url",
          make_tag("url", avatar_url_str),
          make_tag("expected", expected_url_name),
          make_tag("actual", user.url_name),
        ),
      )
      return
    end

    uri = Addressable::URI.parse(avatar_url_str)
    uri.scheme ||= "https"

    avatar = user.avatar
    if avatar.nil? || (avatar.url_str.present? && avatar.url_str != uri.to_s)
      logger.info(format_tags("creating new avatar", make_tag("url", uri.to_s)))
      avatar = user.avatars.build(url_str: uri.to_s)
    elsif avatar.url_str.blank?
      logger.info(format_tags("updating avatar", make_tag("url", uri.to_s)))
      avatar.url_str = uri.to_s
    end

    if avatar.changed?
      avatar.state_pending!
      avatar.save!
      defer_job(Domain::Fa::Job::UserAvatarJob, { avatar: })
      user.association(:avatar).reload
    end
  end
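
  # Example of the expected avatar URL shape (host and timestamp segment
  # assumed); protocol-relative URLs are normalized to https:
  #
  #   enqueue_user_avatar(user, "//a.furaffinity.net/1700000000/somecat.gif")
  #   # builds or updates the avatar record and defers a UserAvatarJob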

  FoundLink = Scraper::LinkFinder::FoundLink

  sig { params(log_entry: HttpLogEntry).void }
  def enqueue_jobs_from_found_links(log_entry)
    return if skip_enqueue_found_links?

    logger.tagged("link-finder") do
      start_time = Time.now
      unless PERMITTED_CONTENT_TYPES.any? { |ct|
               ct.match(log_entry.content_type)
             }
        raise("unsupported content type: #{log_entry.content_type}")
      end
      document = log_entry.response_bytes || return
      link_finder =
        Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
      link_finder.logger.level = :error
      links = link_finder.find_links

      # batch-load known users and posts up front to avoid per-link queries
      url_names =
        links.filter_map do |link|
          link.is_a?(FoundLink::FaUser) ? link.url_name : nil
        end
      url_name_to_fa_user =
        T.let(
          Domain::User::FaUser.where(url_name: url_names).index_by(&:url_name),
          T::Hash[String, Domain::User::FaUser],
        )

      fa_ids =
        links.filter_map do |link|
          link.is_a?(FoundLink::FaPost) ? link.fa_id : nil
        end
      fa_id_to_fa_post =
        T.cast(
          Domain::Post::FaPost.where(fa_id: fa_ids).index_by(&:fa_id),
          T::Hash[Integer, Domain::Post::FaPost],
        )

      links.each do |link|
        case link
        when FoundLink::FaUser
          url_name = link.url_name
          user =
            url_name_to_fa_user[url_name] ||
              Domain::User::FaUser.create!(url_name:) do |new_user|
                new_user.name ||= url_name
              end
          enqueue_user_scan(user, at_most_one_scan: true)
        when FoundLink::FaPost
          fa_id = link.fa_id
          post =
            fa_id_to_fa_post[fa_id] ||
              Domain::Post::FaPost.build(fa_id:) do |new_post|
                new_post.first_seen_entry_id = log_entry.id
              end
          if post.new_record?
            post.save!
            defer_job(Domain::Fa::Job::ScanPostJob, { post: })
          end
        end
      end

      duration_ms = (1000 * (Time.now - start_time)).to_i
      logger.info(format_tags(make_tag("duration", "#{duration_ms} ms")))
    end
  rescue StandardError => e
    logger.error(
      format_tags(
        make_tag("error.class", e.class.name),
        make_tag("error.message", e.message),
      ),
    )
  end
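
  # Sketch of the intended flow (HttpLogEntry value assumed): after a page
  # fetch has been logged, mine the response for FA user/post links, create
  # missing records, and defer scans.
  #
  #   enqueue_jobs_from_found_links(response.log_entry)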

  DISABLED_PAGE_PATTERNS =
    T.let(
      [
        /User ".+" has voluntarily disabled access/,
        /User .+ has voluntarily disabled access/,
        /The page you are trying to reach is currently pending deletion/,
      ],
      T::Array[Regexp],
    )

  NOT_FOUND_PAGE_PATTERNS =
    T.let(
      [
        /User ".+" was not found in our database\./,
        /The username ".+" could not be found\./,
        %r{This user cannot be found\.<br/><br/>},
      ],
      T::Array[Regexp],
    )

  module DisabledOrNotFoundResult
    class Stop < T::Struct
      include T::Struct::ActsAsComparable

      const :message, String
    end

    class Ok < T::Struct
      include T::Struct::ActsAsComparable

      const :page, Domain::Fa::Parser::Page
    end
  end

  sig do
    params(
      user: Domain::User::FaUser,
      response: Scraper::HttpClient::Response,
    ).returns(T::Boolean)
  end
  def user_disabled_or_not_found?(user, response)
    # HTTP 400 is returned when the user is not found
    if response.status_code != 200 && response.status_code != 400
      fatal_error(
        "http #{response.status_code}, log entry #{response.log_entry.id}",
      )
    end

    suppress_user_jobs =
      Kernel.lambda do |u|
        suppress_deferred_job(Domain::Fa::Job::UserPageJob, { user: u })
        suppress_deferred_job(Domain::Fa::Job::FavsJob, { user: u })
        suppress_deferred_job(Domain::Fa::Job::UserGalleryJob, { user: u })
        suppress_deferred_job(Domain::Fa::Job::UserFollowsJob, { user: u })
      end

    if DISABLED_PAGE_PATTERNS.any? { |pattern| response.body =~ pattern }
      user.state_account_disabled!
      user.is_disabled = true
      suppress_user_jobs.call(user)
      true
    elsif NOT_FOUND_PAGE_PATTERNS.any? { |pattern| response.body =~ pattern }
      user.state_error!
      suppress_user_jobs.call(user)
      true
    else
      false
    end
  end
end