fill gaps in fa browse page jobs

This commit is contained in:
Dylan Knutson
2023-05-02 13:35:47 -07:00
parent aedc9f4fcd
commit dfa6202b5e
5 changed files with 1409 additions and 6 deletions

View File

@@ -40,11 +40,24 @@ class Domain::Fa::Job::Base < Scraper::JobBase
enqueue_posts_pri:,
enqueue_page_scan: true,
enqueue_gallery_scan: true,
page_desc: nil
page_desc: nil,
fill_id_gaps: false
)
fatal_error("not a listings page") unless page.probably_listings_page?
submissions = page.submissions_parsed
fa_ids_to_manually_enqueue = []
if fill_id_gaps && submissions.any?
fa_ids = submissions.map(&:id)
max_fa_id, min_fa_id = fa_ids.max, fa_ids.min
# sanity check so we don't enqueue too many post jobs
if max_fa_id - min_fa_id <= 250
(min_fa_id..max_fa_id).each do |fa_id|
fa_ids_to_manually_enqueue << fa_id unless fa_ids.include?(fa_id)
end
end
end
page_desc = if page_desc
"page #{page_desc.to_s.bold}"
else
@@ -80,6 +93,15 @@ class Domain::Fa::Job::Base < Scraper::JobBase
end
end
fa_ids_to_manually_enqueue.each do |fa_id|
# when filling gaps, only enqueue if the post wasn't found
post = Domain::Fa::Post.find_or_initialize_by(fa_id: fa_id)
if post.new_record?
post.save!
enqueue_post_scan(post, caused_by_entry, enqueue_posts_pri)
end
end
logger.info "#{page_desc} has #{submissions.count.to_s.bold} posts, " +
"#{listing_page_stats.new_seen.to_s.bold} new"

View File

@@ -41,6 +41,7 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
:browse_page, page, log_entry,
enqueue_posts_pri: :high,
page_desc: "Browse@#{@page_number}",
fill_id_gaps: true,
)
@total_num_new_posts_seen += listing_page_stats.new_seen

View File

@@ -1,4 +1,4 @@
# Gather and record all the users that follow a user (not who this user follows)
# Gather and record all the users that this user follows
# This will be used to create an index of follower -> followed
# of a specific user, for recommender training
class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base

View File

@@ -90,7 +90,8 @@ describe Domain::Fa::Job::BrowsePageJob do
it "enqueues one" do
expect do
described_class.perform_later({})
ret = described_class.perform_later({})
expect(ret).not_to be(Exception)
end.to change { GoodJob::Job.count }.by(1)
end
@@ -155,7 +156,8 @@ describe Domain::Fa::Job::BrowsePageJob do
it "creates a new post" do
expect {
described_class.perform_now({})
ret = described_class.perform_now({})
expect(ret).not_to be(Exception)
}.to change {
Domain::Fa::Post.count
}.by(1)
@@ -165,7 +167,7 @@ describe Domain::Fa::Job::BrowsePageJob do
expect {
described_class.perform_now({})
}.to change {
Domain::Fa::Post.count
Domain::Fa::User.count
}.by(1)
end
@@ -182,6 +184,65 @@ describe Domain::Fa::Job::BrowsePageJob do
end
end
# Scenario: the browse listing skips an fa_id between the lowest and highest
# submission it shows. The job is expected to create a bare post record for
# the skipped id and enqueue a scan for it alongside the listed posts.
context "with gaps between posts" do
  include_context "user and post getters"

  # Mocked HTTP traffic: the first browse page is served from the
  # "two submissions" fixture, and page 2 from the "no submissions" fixture
  # (caused by the first request).
  let! :log_entries do
    SpecUtil.init_http_client_mock(
      http_client_mock, [
        {
          uri: "https://www.furaffinity.net/browse/",
          status_code: 200,
          content_type: "text/html",
          contents: SpecUtil.read_fixture_file("domain/fa/job/browse_page_two_submissions.html"),
          caused_by_entry_idx: nil,
        },
        {
          uri: "https://www.furaffinity.net/browse/2/",
          status_code: 200,
          content_type: "text/html",
          contents: SpecUtil.read_fixture_file("domain/fa/job/browse_page_no_submissions.html"),
          caused_by_entry_idx: 0,
        },
      ]
    )
  end

  it "enqueues scan post jobs" do
    # `be_a` checks the type of the returned value. `be(Exception)` would only
    # compare identity with the Exception class object itself, so it would
    # pass even when an exception instance is returned.
    # NOTE(review): presumably perform_now can return a handled exception
    # instead of raising it - confirm against the job base class.
    expect(described_class.perform_now({})).to_not be_a(Exception)

    post1 = Domain::Fa::Post.find_by(fa_id: 51509268)
    post2 = Domain::Fa::Post.find_by(fa_id: 51509267)
    post3 = Domain::Fa::Post.find_by(fa_id: 51509266)

    # post1 and post3 carry a creator; post2 (the id in between) is expected
    # to exist only as a placeholder with no creator or title.
    expect(post1).to_not be_nil
    expect(post1.creator).to eq(user.call)

    expect(post2).to_not be_nil
    expect(post2.creator).to be_nil
    expect(post2.title).to be_nil

    expect(post3).to_not be_nil
    expect(post3.creator).to eq(user.call)

    # `match` against an array is order-sensitive: the scan for the
    # gap-filled post2 is expected after the scans for post1 and post3.
    expect(SpecUtil.enqueued_jobs(Domain::Fa::Job::ScanPostJob)).to match(
      [
        including(args: [{
          post: post1,
          caused_by_entry: log_entries[0],
        }]),
        including(args: [{
          post: post3,
          caused_by_entry: log_entries[0],
        }]),
        including(args: [{
          post: post2,
          caused_by_entry: log_entries[0],
        }]),
      ]
    )
  end
end
context "with one seen post" do
include_context "user and post getters"
include_context "create user and post"
@@ -200,7 +261,9 @@ describe Domain::Fa::Job::BrowsePageJob do
end
context "and nothing yet scanned" do
before { described_class.perform_now({}) }
before {
  # Run the job before each example and fail early if it hands back an
  # exception instance. `be_a` checks the type; `be(Exception)` would only
  # compare identity with the Exception class object and always pass here.
  expect(described_class.perform_now({})).to_not be_a(Exception)
}
include_examples "enqueue post scan", true
include_examples "enqueue file scan", false
include_examples "enqueue user page scan", true

File diff suppressed because one or more lines are too long