backfill job handles url_names correctly

This commit is contained in:
Dylan Knutson
2025-01-13 20:20:23 +00:00
parent f14c73a152
commit 73ff4ee472
2 changed files with 29 additions and 15 deletions

View File

@@ -15,9 +15,10 @@ class FaBackfillFavs
def initialize(start_at: nil, limit: nil, batch_size: nil)
@start_at = start_at
@limit = limit
@batch_size = T.let(batch_size || 100, Integer)
@batch_size = T.let(batch_size || 32, Integer)
@unique_users_seen = T.let(Set.new, T::Set[String])
@total_favs_seen = T.let(0, Integer)
@total_log_entries_processed = T.let(0, Integer)
end
sig { void }
@@ -30,16 +31,15 @@ class FaBackfillFavs
sig { void }
def collect_favs_from_logs
scope =
HttpLogEntry.where(uri_host: "www.furaffinity.net").where(
"uri_path LIKE ?",
"/favorites/%",
)
total_processed = 0
HttpLogEntry
.includes(:response)
.where(uri_host: "www.furaffinity.net")
.where("uri_path LIKE ?", "/favorites/%")
scope.find_in_batches(
batch_size: @batch_size,
start: @start_at,
order: :desc,
) do |entries|
user_favs =
T.let(
@@ -54,13 +54,25 @@ class FaBackfillFavs
contents = T.let(response.contents, T.nilable(String))
next unless contents
page = Domain::Fa::Parser::Page.new(contents)
page =
begin
Domain::Fa::Parser::Page.new(contents, require_logged_in: false)
rescue Domain::Fa::Parser::NotLoggedInError
next
end
uri_path = T.let(entry.uri_path, T.nilable(String))
next unless uri_path
url_name = uri_path.split("/").last
# url is like /favorites/iiszed/1183854116/next
# extract url_name from the path with regex
url_name = uri_path.match(%r{/favorites/([^\/]+)})&.[](1)
next unless url_name
@total_log_entries_processed += 1
logger.info(
"[entry #{entry.id}] [processed #{@total_log_entries_processed}] favs for user #{url_name} - #{entry.uri_path}",
)
favs = T.must(user_favs[url_name])
page.submissions_parsed.each do |submission|
@@ -68,16 +80,18 @@ class FaBackfillFavs
favs.add(submission.id)
end
break if @limit && @total_log_entries_processed >= @limit
end
process_users_favs(user_favs)
total_processed += entries.length
logger.info("Processed #{total_processed} HttpLogEntries")
@total_log_entries_processed += entries.length
logger.info("Processed #{@total_log_entries_processed} HttpLogEntries")
logger.info(
"Unique users seen: #{@unique_users_seen.length}, Total favs seen: #{@total_favs_seen}",
)
if @limit && total_processed >= @limit
if @limit && @total_log_entries_processed >= @limit
logger.info("Reached limit of #{@limit} HttpLogEntries, stopping")
break
end
@@ -117,12 +131,12 @@ class FaBackfillFavs
user_favs.each do |url_name, post_fa_ids|
user_id =
T.must_because(url_name_to_user_id[url_name]) do
"url_name #{url_name} not found in url_name_to_user_id"
"url_name '#{url_name.inspect}' not found in url_name_to_user_id"
end
post_fa_ids.each do |post_fa_id|
post_id =
T.must_because(fa_id_to_post_id[post_fa_id]) do
"post fa_id #{post_fa_id} not found in fa_id_to_post_id"
"post fa_id '#{post_fa_id.inspect}' not found in fa_id_to_post_id"
end
user_id_post_id_pairs << [user_id, post_id]
end

View File

@@ -42,7 +42,7 @@ describe FaBackfillFavs do
# Create HttpLogEntry records for each user's favorites page
HttpLogEntry.create!(
uri_host: "www.furaffinity.net",
uri_path: "/favorites/iiszed",
uri_path: "/favorites/iiszed/1183854116/next",
uri_scheme: "https",
verb: :get,
performed_by: "direct",