user page job improvement for skipping gallery scans

This commit is contained in:
Dylan Knutson
2025-03-02 19:28:47 +00:00
parent ac50c47865
commit e49fe33dc6
10 changed files with 2249 additions and 21 deletions

View File

@@ -35,3 +35,4 @@
- [ ] tumblr domain icon
- [ ] Do PCA on user factors table to display a 2D plot of users
- [ ] Use links found in descriptions to indicate re-scanning a post? (e.g. for comic next/prev links)
- [ ] fix for IDs that have a dot in them - e.g. https://refurrer.com/users/fa@jakke.

View File

@@ -11,6 +11,7 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
logger.info("scanning post")
if force_scan? || (post.state_ok? && !post.scanned_at.present?) ||
# check for !state_ok? on the file maybe?
post.file&.state_file_error?
ReduxApplicationRecord.transaction { scan_post(post) }
end
@@ -90,6 +91,7 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
end
# save before any changes so post has an id for any files
post.state_ok!
post.save!
post.title = submission.title

View File

@@ -28,7 +28,7 @@ class Domain::Fa::Job::UserPageJob < Domain::Fa::Job::Base
end
if user.state_ok? && user_page
check_skip_gallery_scan(user)
check_skip_gallery_scan(user, user_page)
check_skip_favs_scan(user, user_page)
check_skip_followed_users_scan(user, user_page)
check_skip_followed_by_users_scan(user, user_page)
@@ -46,13 +46,87 @@ class Domain::Fa::Job::UserPageJob < Domain::Fa::Job::Base
private
sig { params(user: Domain::User::FaUser).void }
def check_skip_gallery_scan(user)
# Uses the freshly parsed user page to decide whether a full gallery scan is
# still needed for `user`, and makes sure posts shown in the page's recent
# gallery section exist locally and are (re)scanned where necessary.
#
# user      - the FA user whose page was fetched; `scanned_gallery_at` is
#             assigned on it when the gallery scan is considered unnecessary.
# user_page - parsed user page; supplies `num_submissions` and
#             `recent_gallery_fa_ids`.
#
# Returns nothing. Returns early, before touching `user` or any post, when the
# page does not report a submission count.
#
# NOTE(review): `user` is only assigned to here, never saved - presumably the
# caller persists it; confirm.
sig do
params(
user: Domain::User::FaUser,
user_page: Domain::Fa::Parser::UserPageHelper,
).void
end
def check_skip_gallery_scan(user, user_page)
# if the user has no submissions, we don't need to scan their gallery
# NOTE(review): the next line looks like the pre-change condition kept by the
# diff view; the check on `user_page_num_submissions` below appears to
# supersede it - confirm against the real file.
if user.num_submissions == 0
user_page_num_submissions = user_page.num_submissions
# Without a submission count nothing below can be decided, so bail out.
if user_page_num_submissions.nil?
logger.warn(format_tags("user page num_submissions is nil"))
return
end
if user_page_num_submissions == 0
logger.info(format_tags("skipping gallery scan, 0 submissions"))
user.scanned_gallery_at = Time.current
end
# create any submissions on the recent gallery page that are not yet known
recent_gallery_fa_ids = user_page.recent_gallery_fa_ids
# Posts of this user whose fa_id appears in the recent gallery section.
known_posts = user.posts.where(fa_id: recent_gallery_fa_ids).to_a
unknown_recent_gallery_fa_ids =
recent_gallery_fa_ids - known_posts.map(&:fa_id)
# NOTE(review): `known_posts` was already restricted to
# `recent_gallery_fa_ids` by the query above, so this filter looks like a
# no-op - confirm.
known_recent_gallery_posts =
known_posts.filter { |post| recent_gallery_fa_ids.include?(post.fa_id) }
if unknown_recent_gallery_fa_ids.any?
unknown_recent_gallery_fa_ids.each do |fa_id|
# The lookup is by fa_id alone (not scoped to this user), so an existing
# post with this fa_id is reused and has its creator set to `user`.
post = Domain::Post::FaPost.find_or_initialize_by(fa_id:)
post.creator = user
# Registered before `save!`; presumably the ScanPostJob is enqueued once
# the save succeeds - confirm against `enqueue_job_after_save`.
post.enqueue_job_after_save(
Domain::Fa::Job::ScanPostJob,
{ post:, caused_by_entry: causing_log_entry },
)
post.save!
end
end
# recheck any known posts to see if they should be rescanned
known_recent_gallery_posts.each do |post|
post_file = post.file
# Rescan when the post itself is not ok, has no file, or its file is not ok.
if !post.state_ok? || post_file.blank? || !post_file.state_ok?
# we just saw it, so it must be ok to enqueue
logger.tagged(make_arg_tag(post)) do
logger.info(
"post is in recent gallery section, force enqueue ScanPostJob",
)
end
# The post is visible on the live user page, so reset it to ok before
# forcing the rescan.
post.state_ok!
post.save!
defer_job(Domain::Fa::Job::ScanPostJob, { post:, force_scan: true })
end
# Independently of the branch above, re-enqueue the file download when the
# file has a URL and matches one of the conditions below.
# NOTE(review): if file states are mutually exclusive, `state_pending?`
# already implies `!state_terminal_error?`, so the first disjunct looks
# redundant and the group is true for every non-terminal-error file -
# confirm this is the intended condition.
if post_file && post_file.url_str.present? &&
(
post_file.state_pending? || post_file.log_entry.blank? ||
!post_file.state_terminal_error?
)
logger.tagged(make_arg_tag(post_file)) do
logger.info(
"post is in recent gallery section, force enqueue ScanFileJob",
)
end
defer_job(Domain::Fa::Job::ScanFileJob, { post_file: })
end
end
# if we know at least as many submissions as are indicated on the user page,
# then we can skip the gallery scan
# This count is taken after the recent-gallery posts above were saved with
# `creator = user`, so presumably it includes them - confirm.
if user_page_num_submissions <= user.posts.count
logger.info(
format_tags(
make_tag("user_page_submissions", user_page_num_submissions),
make_tag("known_submissions", user.posts.count),
"skipping gallery scan, all submissions known",
),
)
user.scanned_gallery_at = Time.current
end
end
sig do

View File

@@ -23,6 +23,7 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
@num_journals = T.let(nil, T.nilable(Integer))
@num_favorites = T.let(nil, T.nilable(Integer))
@recent_favs = T.let(nil, T.nilable(T::Array[Integer]))
@recent_gallery_fa_ids = T.let(nil, T.nilable(T::Array[Integer]))
@recent_watchers = T.let(nil, T.nilable(T::Array[RecentUser]))
@recent_watching = T.let(nil, T.nilable(T::Array[RecentUser]))
@statistics = T.let(nil, T.nilable(Nokogiri::XML::Element))
@@ -232,6 +233,25 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
end
end
# FA submission ids linked from the "#gallery-latest-submissions" section of
# the user page, in document order. The result is memoized after the first
# call.
#
# Returns an empty array when the section is absent from the page.
# Raises RuntimeError ("invalid url: ...") when a link in the section does not
# contain a `/view/<digits>` path.
# Only VERSION_2 pages are handled; any other page version is passed to
# `unimplemented_version!`.
sig { returns(T::Array[Integer]) }
def recent_gallery_fa_ids
  memoized = @recent_gallery_fa_ids
  return memoized if memoized

  @recent_gallery_fa_ids =
    case @page_version
    when VERSION_2
      section = @elem.css("#gallery-latest-submissions").first
      links = section&.css("figure a")
      if links
        links.map do |link|
          href = link["href"]
          fa_id = %r{/view/(\d+)}.match(href)&.[](1)&.to_i
          raise("invalid url: #{href}") if fa_id.nil?
          fa_id
        end
      else
        []
      end
    else
      unimplemented_version!
    end
end
class JSONSubmissionData < T::ImmutableStruct
include T::Struct::ActsAsComparable

View File

@@ -7,13 +7,7 @@ class ApplicationPolicy
sig { returns(T.nilable(User)) }
attr_reader :user
sig do
returns(
T.nilable(
T.any(ReduxApplicationRecord, T.class_of(ReduxApplicationRecord)),
),
)
end
sig { returns(T.untyped) }
attr_reader :record
sig { returns(String) }
@@ -33,15 +27,7 @@ class ApplicationPolicy
"#{self.class.name}::#{Digest::SHA256.hexdigest(method_values_string)[0..16]}"
end
sig do
params(
user: T.nilable(User),
record:
T.nilable(
T.any(ReduxApplicationRecord, T.class_of(ReduxApplicationRecord)),
),
).void
end
sig { params(user: T.nilable(User), record: T.untyped).void }
def initialize(user, record)
@user = user
@record = record

View File

@@ -6,7 +6,7 @@
<span class="badge bg-primary">
<i class="fa-solid fa-file me-1"></i><%= post_file.class.name %> #<%= post_file.id %>
</span>
<%= link_to domain_post_path(post),
<%= link_to Rails.application.routes.url_helpers.domain_post_path(post),
class: "badge bg-primary",
target: "_blank" do %>
<i class="fa-solid fa-image me-1"></i><%= post.class.name %> #<%= post.id %>

View File

@@ -410,6 +410,299 @@ describe Domain::Fa::Job::UserPageJob do
end
end
context "a user with recent gallery submissions" do
let(:user) { create(:domain_user_fa_user, url_name: "kutua") }
# Scenario: the mocked user page reports three submissions in total, but only
# one of them (fa_id 60_073_062) appears in the recent gallery section.
context "has one recent gallery submission, and user page indicates three total submissions" do
# HTTP mock served for the user page request made by the job.
let(:client_mock_config) do
[
{
uri: "https://www.furaffinity.net/user/kutua/",
status_code: 200,
content_type: "text/html",
contents:
SpecUtil.read_fixture_file(
# one recent gallery submission, and three submissions indicated on user page
"domain/fa/user_page/user_page_kutua_one_recent_three_total_gallery.html",
),
},
]
end
it "updates the num_submissions count" do
expect do
perform_now({ user: })
user.reload
end.to change { user.num_submissions }.from(nil).to(3)
end
# Nothing is known locally: after the job only the recent submission exists
# (1 known < 3 reported), so a full gallery scan is still expected.
context "no submissions are yet known" do
it "creates the recent submission" do
perform_now({ user: })
expect(user.posts.count).to eq(1)
post = user.posts.first
expect(post.fa_id).to eq(60_073_062)
end
it "does not mark the gallery as scanned" do
expect do
perform_now({ user: })
user.reload
end.not_to change { user.scanned_gallery_at }
end
it "enqueues a ScanPostJob for the recent submission" do
perform_now({ user: })
expect(
SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob),
).to match(
[
hash_including(
post: user.posts.first,
caused_by_entry: @log_entries[0],
),
],
)
end
it "enqueues a UserGalleryJob for the user" do
perform_now({ user: })
expect(
SpecUtil.enqueued_job_args(Domain::Fa::Job::UserGalleryJob),
).to match([hash_including(user:, caused_by_entry: @log_entries[0])])
end
end
# One post is already known but is not in the recent gallery section:
# 1 known + 1 newly created = 2 < 3 reported, so the gallery is not marked
# as scanned.
context "one unlisted submission is known" do
let!(:unseen_post_1) do
create(
:domain_post_fa_post,
fa_id: 12_345,
creator: user,
title: "Not In The Gallery",
)
end
# Parameterized by `state`: the state assigned to the unlisted post before
# the job runs.
shared_examples "unlisted submission in state" do |state|
context "and is in the '#{state}' state" do
before do
unseen_post_1.state = state
unseen_post_1.save!
end
it "enqueues a ScanPostJob only for the new gallery submission" do
perform_now({ user: })
expect(
SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob),
).to match(
[
hash_including(
post: user.posts.find_by(fa_id: 60_073_062),
caused_by_entry: @log_entries[0],
),
],
)
end
it "enqueues a UserGalleryJob for the user" do
perform_now({ user: })
expect(
SpecUtil.enqueued_job_args(Domain::Fa::Job::UserGalleryJob),
).to match(
[hash_including(user:, caused_by_entry: @log_entries[0])],
)
end
it "does not mark the gallery as scanned" do
perform_now({ user: })
user.reload
expect(user.scanned_gallery_at).to be_nil
end
end
end
# we do not know anything about the unseen submissions, so do
# not re-enqueue them - only enqueue those in recent gallery section
# Runs the shared examples once for every state defined on FaPost.
Domain::Post::FaPost.states.keys.each do |state|
include_examples "unlisted submission in state", state
end
end
# Two posts are already known and neither is in the recent gallery section:
# 2 known + 1 newly created = 3, which matches the 3 reported on the page,
# so the gallery is marked as scanned.
context "all two unlisted submissions are known" do
let!(:unseen_post_1) do
create(
:domain_post_fa_post,
fa_id: 12_345,
creator: user,
title: "Not In The Gallery 1",
)
end
let!(:unseen_post_2) do
create(
:domain_post_fa_post,
fa_id: 12_346,
creator: user,
title: "Not In The Gallery 2",
)
end
it "enqueues a ScanPostJob for the recent submission" do
perform_now({ user: })
expect(
SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob),
).to match(
[hash_including(post: user.posts.find_by(fa_id: 60_073_062))],
)
end
it "marks the gallery as scanned" do
expect do
perform_now({ user: })
user.reload
end.to change { user.scanned_gallery_at }.to be_within(3.seconds).of(
Time.current,
)
end
end
end
# Scenario: the mocked user page reports exactly one submission, and that
# submission (fa_id 60_073_062) is the one shown in the recent gallery
# section. Once that post is known locally, every reported submission is
# known and the gallery is expected to be marked as scanned.
context "has one recent gallery submission, and user page indicates one total submission" do
  # HTTP mock served for the user page request made by the job.
  let(:client_mock_config) do
    [
      {
        # one recent gallery submission, and one submission indicated on user page
        uri: "https://www.furaffinity.net/user/kutua/",
        status_code: 200,
        content_type: "text/html",
        contents:
          SpecUtil.read_fixture_file(
            "domain/fa/user_page/user_page_kutua_one_recent_one_total_gallery.html",
          ),
      },
    ]
  end

  context "the submission is not yet known" do
    it "updates the num_submissions count" do
      expect do
        perform_now({ user: })
        user.reload
      end.to change { user.num_submissions }.from(nil).to(1)
    end

    it "creates the submission" do
      expect do
        perform_now({ user: })
        user.reload
      end.to change { user.posts.count }.from(0).to(1)
      post = user.posts.first
      # only know the creator and fa_id when seen from the user page
      expect(post.fa_id).to eq(60_073_062)
      expect(post.title).to be_nil
      expect(post.description).to be_nil
      expect(post.file).to be_nil
    end

    it "marks the gallery as scanned" do
      perform_now({ user: })
      # reload so the assertion checks the persisted value, matching the
      # other scanned_gallery_at examples in this file
      user.reload
      expect(user.scanned_gallery_at).to be_within(3.seconds).of(
        Time.current,
      )
    end

    it "enqueues a ScanPostJob" do
      perform_now({ user: })
      expect(
        SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob),
      ).to match(
        [
          hash_including(
            post: user.posts.first,
            caused_by_entry: @log_entries[0],
          ),
        ],
      )
    end
  end

  context "and recent gallery submissions are known" do
    # The post shown in the recent gallery section already exists locally.
    let!(:post) do
      create(
        :domain_post_fa_post,
        fa_id: 60_073_062,
        state: "ok",
        creator: user,
      )
    end

    # Parameterized by `state`: a non-ok state assigned to the known post.
    shared_examples "force enqueues a ScanPostJob when post is in state" do |state|
      it "force enqueues a ScanPostJob when a post is in the '#{state}' state" do
        post.state = state
        post.save!
        perform_now({ user: })
        expect(
          SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob),
        ).to match([hash_including(post:, force_scan: true)])
      end
    end

    # Parameterized by `state`: a non-ok state assigned to the post's file.
    # A file that is not ok causes the *post* to be rescanned, so the
    # assertion (and therefore the example name) is about ScanPostJob.
    shared_examples "force enqueues a ScanPostJob when post file is in state" do |state|
      it "force enqueues a ScanPostJob when a post's file is in the '#{state}' state" do
        file = create(:domain_post_file, post:)
        file.state = state
        file.save!
        perform_now({ user: })
        expect(
          SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob),
        ).to match([hash_including(post:, force_scan: true)])
      end
    end

    %w[removed scan_error file_error].each do |state|
      include_examples(
        "force enqueues a ScanPostJob when post is in state",
        state,
      )
    end

    %w[file_error retryable_error terminal_error removed].each do |state|
      include_examples(
        "force enqueues a ScanPostJob when post file is in state",
        state,
      )
    end

    it "force enqueues a ScanFileJob when a post's file is in the 'pending' state and has a url" do
      post.state_ok!
      post.save!
      file =
        create(
          :domain_post_file,
          post:,
          url_str: "https://example.com/file.png",
        )
      file.state_pending!
      file.save!
      perform_now({ user: })
      expect(
        SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanFileJob),
      ).to match([hash_including(post_file: file)])
    end

    it "marks the gallery as scanned" do
      perform_now({ user: })
      # reload so the assertion checks the persisted value, matching the
      # other scanned_gallery_at examples in this file
      user.reload
      expect(user.scanned_gallery_at).to be_within(3.seconds).of(
        Time.current,
      )
    end
  end
end
end
context "the user has no recent favories" do
let(:client_mock_config) do
[

View File

@@ -513,6 +513,32 @@ describe Domain::Fa::Parser::Page do
assert_equal 14, up.num_watching
end
# Checks UserPageHelper#recent_gallery_fa_ids against stored user-page
# fixtures: one page with a recent gallery submission, one with none.
context "recent gallery submission parsing" do
  # example description => [fixture path relative to Rails.root, expected ids]
  {
    "works when the user has recent gallery submissions" => [
      "test/fixtures/files/domain/fa/user_page/user_page_kutua_one_recent_one_total_gallery.html",
      [60_073_062],
    ],
    "works when the user has no submissions" => [
      "test/fixtures/files/domain/fa/user_page/user_page_angelpawqt.html",
      [],
    ],
  }.each do |description, (fixture_path, expected_fa_ids)|
    it description do
      fixture = Rails.root.join(fixture_path)
      parser = get_parser_at(fixture)
      assert_page_type parser, :probably_user_page?
      user_page = parser.user_page
      expect(user_page.recent_gallery_fa_ids).to eq(expected_fa_ids)
    end
  end
end
def get_parser(file, require_logged_in: true)
path = File.join("domain/fa/parser/redux", file)
get_parser_at(path, require_logged_in:)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long