redux-scraper/app/lib/fa_backfill_favs.rb

# typed: strict
# frozen_string_literal: true

# Backfills FurAffinity favorites by re-parsing archived /favorites/* pages
# stored as HttpLogEntry responses and upserting the results into
# Domain::Fa::Fav.
class FaBackfillFavs
  include HasColorLogger
  extend T::Sig

  sig do
    params(
      start_at: T.nilable(Integer),
      limit: T.nilable(Integer),
      batch_size: T.nilable(Integer),
    ).void
  end
  def initialize(start_at: nil, limit: nil, batch_size: nil)
    @start_at = start_at
    @limit = limit
    @batch_size = T.let(batch_size || 32, Integer)
    @unique_users_seen = T.let(Set.new, T::Set[String])
    @total_favs_seen = T.let(0, Integer)
    @total_log_entries_processed = T.let(0, Integer)
  end

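  # Entry point: scan the archived favorites pages and upsert the favs found.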
  sig { void }
  def run
    collect_favs_from_logs
  end

  private

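  # Walks archived www.furaffinity.net /favorites/* HttpLogEntry records in
  # batches, parses each stored response, and accumulates a
  # url_name => Set[fa_id] mapping per batch before handing it to
  # #process_users_favs.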
  sig { void }
  def collect_favs_from_logs
    # All archived FA favorites pages, with their stored responses eager-loaded.
    scope =
      HttpLogEntry
        .includes(:response)
        .where(uri_host: "www.furaffinity.net")
        .where("uri_path LIKE ?", "/favorites/%")

    scope.find_in_batches(
      batch_size: @batch_size,
      start: @start_at,
      order: :desc,
    ) do |entries|
      # Favs accumulated for this batch, keyed by the user's url_name.
      user_favs =
        T.let(
          Hash.new { |h, k| h[k] = Set.new },
          T::Hash[String, T::Set[Integer]],
        )

      entries.each do |entry|
        response = T.let(entry.response, T.nilable(BlobFile))
        next unless response

        contents = T.let(response.content_bytes, T.nilable(String))
        next unless contents

        page =
          begin
            Domain::Fa::Parser::Page.new(contents, require_logged_in: false)
          rescue Domain::Fa::Parser::NotLoggedInError
            next
          end

        uri_path = T.let(entry.uri_path, T.nilable(String))
        next unless uri_path

        # Paths look like /favorites/iiszed/1183854116/next; extract the
        # url_name segment with a regex.
        url_name = uri_path.match(%r{/favorites/([^\/]+)})&.[](1)
        next unless url_name

        @total_log_entries_processed += 1
        logger.info(
          "[entry #{entry.id}] [processed #{@total_log_entries_processed}] favs for user #{url_name} - #{entry.uri_path}",
        )

        favs = T.must(user_favs[url_name])
        page.submissions_parsed.each do |submission|
          fa_id = submission.id
          next unless fa_id
          favs.add(fa_id)
        end

        break if @limit && @total_log_entries_processed >= @limit
      end

      process_users_favs(user_favs)

      # @total_log_entries_processed is already incremented once per parsed
      # entry above; adding entries.length here as well would double-count.
      logger.info("Processed #{@total_log_entries_processed} HttpLogEntries")
      logger.info(
        "Unique users seen: #{@unique_users_seen.length}, Total favs seen: #{@total_favs_seen}",
      )

      if @limit && @total_log_entries_processed >= @limit
        logger.info("Reached limit of #{@limit} HttpLogEntries, stopping")
        break
      end
    end
  end

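  # Resolves url_names to Domain::Fa::User ids and fa_ids to Domain::Fa::Post
  # ids, then bulk-upserts the resulting (user_id, post_id) pairs into
  # Domain::Fa::Fav.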
  sig { params(user_favs: T::Hash[String, T::Set[Integer]]).void }
  def process_users_favs(user_favs)
    url_name_to_user_id =
      T.let(
        Domain::Fa::User
          .where(url_name: user_favs.keys)
          .pluck(:url_name, :id)
          .to_h,
        T::Hash[String, Integer],
      )

    @unique_users_seen.merge(user_favs.keys)
    @total_favs_seen +=
      user_favs.values.reduce(0) { |acc, set| acc + set.length }

    fa_ids = user_favs.values.reduce(Set.new) { |acc, set| acc.merge(set) }.to_a

    # Look up post ids in slices of 50,000 fa_ids at a time to keep the
    # IN-clause queries bounded.
    fa_id_to_post_id = T.let({}, T::Hash[Integer, Integer])
    fa_ids.each_slice(50_000) do |slice|
      slice_fa_id_to_post_id =
        T.let(
          Domain::Fa::Post.where(fa_id: slice).pluck(:fa_id, :id).to_h,
          T::Hash[Integer, Integer],
        )
      fa_id_to_post_id.merge!(slice_fa_id_to_post_id)
    end

    # Build (user_id, post_id) pairs; an unknown user or post is a hard error.
    user_id_post_id_pairs = T.let([], T::Array[[Integer, Integer]])
    user_favs.each do |url_name, post_fa_ids|
      user_id =
        T.must_because(url_name_to_user_id[url_name]) do
          "url_name #{url_name.inspect} not found in url_name_to_user_id"
        end
      post_fa_ids.each do |post_fa_id|
        post_id =
          T.must_because(fa_id_to_post_id[post_fa_id]) do
            "post fa_id #{post_fa_id.inspect} not found in fa_id_to_post_id"
          end
        user_id_post_id_pairs << [user_id, post_id]
      end
    end

    # Upsert in chunks; the unique index makes re-runs idempotent.
    user_id_post_id_pairs.each_slice(30_000) do |pairs|
      Domain::Fa::Fav.upsert_all(
        pairs.map { |user_id, post_id| { user_id:, post_id: } },
        unique_by: :index_domain_fa_favs_on_user_id_and_post_id,
      )
    end
  end
end
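
# Usage sketch (not part of the original file): this class is assumed to be
# driven ad hoc from a Rails console or `bin/rails runner`, e.g.
#
#   # Backfill everything, 64 log entries per batch.
#   FaBackfillFavs.new(batch_size: 64).run
#
#   # Bounded smoke test, resuming from a specific HttpLogEntry id (forwarded
#   # to find_in_batches' :start option). `some_entry_id` is a placeholder,
#   # not a real id.
#   FaBackfillFavs.new(start_at: some_entry_id, limit: 500).run
#
# The upsert_all call relies on the unique index
# index_domain_fa_favs_on_user_id_and_post_id, which is what makes repeated
# runs idempotent.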