favs backfill job

Author: Dylan Knutson
Date: 2025-01-13 03:23:24 +00:00
Parent: f96c5d853e
Commit: 26196534fd
7 changed files with 2463 additions and 8 deletions

app/lib/fa_backfill_favs.rb (new file, 138 lines)

@@ -0,0 +1,138 @@
# typed: strict
# frozen_string_literal: true

class FaBackfillFavs
  include HasColorLogger
  extend T::Sig

  sig do
    params(
      start_at: T.nilable(Integer),
      limit: T.nilable(Integer),
      batch_size: T.nilable(Integer),
    ).void
  end
  def initialize(start_at: nil, limit: nil, batch_size: nil)
    @start_at = start_at
    @limit = limit
    @batch_size = T.let(batch_size || 100, Integer)
    @unique_users_seen = T.let(Set.new, T::Set[String])
    @total_favs_seen = T.let(0, Integer)
  end

  sig { void }
  def run
    collect_favs_from_logs
  end

  private

  sig { void }
  def collect_favs_from_logs
    # Scan every logged HTTP response for a furaffinity.net favorites page.
    scope =
      HttpLogEntry.where(uri_host: "www.furaffinity.net").where(
        "uri_path LIKE ?",
        "/favorites/%",
      )
    total_processed = 0
    scope.find_in_batches(
      batch_size: @batch_size,
      start: @start_at,
    ) do |entries|
      # url_name => set of favorited submission fa_ids seen in this batch
      user_favs =
        T.let(
          Hash.new { |h, k| h[k] = Set.new },
          T::Hash[String, T::Set[Integer]],
        )
      entries.each do |entry|
        response = T.let(entry.response, T.nilable(BlobEntry))
        next unless response
        contents = T.let(response.contents, T.nilable(String))
        next unless contents
        page = Domain::Fa::Parser::Page.new(contents)
        uri_path = T.let(entry.uri_path, T.nilable(String))
        next unless uri_path
        # the user's url_name is the path segment after "/favorites/"
        url_name = uri_path.split("/").last
        next unless url_name
        favs = T.must(user_favs[url_name])
        page.submissions_parsed.each do |submission|
          next unless submission.id
          favs.add(submission.id)
        end
      end
      process_users_favs(user_favs)
      total_processed += entries.length
      logger.info("Processed #{total_processed} HttpLogEntries")
      logger.info(
        "Unique users seen: #{@unique_users_seen.length}, Total favs seen: #{@total_favs_seen}",
      )
      if @limit && total_processed >= @limit
        logger.info("Reached limit of #{@limit} HttpLogEntries, stopping")
        break
      end
    end
  end

  sig { params(user_favs: T::Hash[String, T::Set[Integer]]).void }
  def process_users_favs(user_favs)
    url_name_to_user_id =
      T.let(
        Domain::Fa::User
          .where(url_name: user_favs.keys)
          .pluck(:url_name, :id)
          .to_h,
        T::Hash[String, Integer],
      )
    @unique_users_seen.merge(user_favs.keys)
    @total_favs_seen +=
      user_favs.values.reduce(0) { |acc, set| acc + set.length }
    fa_ids = user_favs.values.reduce(Set.new) { |acc, set| acc.merge(set) }.to_a
    # resolve fa_ids to post ids, querying at most 50,000 fa_ids at a time
    fa_id_to_post_id = T.let({}, T::Hash[Integer, Integer])
    fa_ids.each_slice(50_000) do |slice|
      slice_fa_id_to_post_id =
        T.let(
          Domain::Fa::Post.where(fa_id: slice).pluck(:fa_id, :id).to_h,
          T::Hash[Integer, Integer],
        )
      fa_id_to_post_id.merge!(slice_fa_id_to_post_id)
    end
    user_id_post_id_pairs = T.let([], T::Array[[Integer, Integer]])
    user_favs.each do |url_name, post_fa_ids|
      user_id =
        T.must_because(url_name_to_user_id[url_name]) do
          "url_name #{url_name} not found in url_name_to_user_id"
        end
      post_fa_ids.each do |post_fa_id|
        post_id =
          T.must_because(fa_id_to_post_id[post_fa_id]) do
            "post fa_id #{post_fa_id} not found in fa_id_to_post_id"
          end
        user_id_post_id_pairs << [user_id, post_id]
      end
    end
    user_id_post_id_pairs.each_slice(30_000) do |pairs|
      Domain::Fa::Fav.upsert_all(
        pairs.map { |user_id, post_id| { user_id:, post_id: } },
        unique_by: :index_domain_fa_favs_on_user_id_and_post_id,
      )
    end
  end
end
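For reference, a minimal sketch of driving the backfiller directly from a Rails console; the argument values below are illustrative, not defaults:

    # Resume from HttpLogEntry id 1_000, stop after roughly 10_000 entries
    # (the limit is checked after each batch), and load 500 log entries per
    # batch. All three arguments are optional; batch_size falls back to 100.
    FaBackfillFavs.new(start_at: 1_000, limit: 10_000, batch_size: 500).run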


@@ -139,14 +139,6 @@ namespace :fa do
    worker.write_factors
  end

  desc "calculate post favorite factors for similar posts"
  task calc_similar_posts: %i[set_logger_stdout environment] do
    epochs = (ENV["epochs"] || 20).to_i
    worker = Domain::Fa::PostFactorCalculator.new(epochs)
    worker.fit
    worker.write_factors
  end

  desc "dump user follows to stdout, formatted for mf-train"
  task dump_user_follows: %i[set_logger_stdout environment] do
    relation = Domain::Fa::Follow.all
@@ -299,4 +291,13 @@ namespace :fa do
    exporter.run
    exporter.end_profiling! if profile
  end

  desc "Backfill favs by scanning historical HTTP logs for favorites pages"
  task backfill_favs: :environment do
    FaBackfillFavs.new(
      start_at: ENV["start_at"]&.to_i,
      limit: ENV["limit"]&.to_i,
      batch_size: ENV["batch_size"]&.to_i,
    ).run
  end
end
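The task forwards environment variables to the constructor, so a run can be bounded or resumed from the shell. A hypothetical invocation (all three variables are optional; the values are illustrative):

    # rake-style key=value arguments become ENV entries visible to the task
    bin/rails fa:backfill_favs start_at=1000 limit=50000 batch_size=500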

sorbet/rbi/dsl/fa_backfill_favs.rbi (generated, new file, 16 lines)

@@ -0,0 +1,16 @@
# typed: true

# DO NOT EDIT MANUALLY
# This is an autogenerated file for dynamic methods in `FaBackfillFavs`.
# Please instead update this file by running `bin/tapioca dsl FaBackfillFavs`.


class FaBackfillFavs
  sig { returns(ColorLogger) }
  def logger; end

  class << self
    sig { returns(ColorLogger) }
    def logger; end
  end
end


@@ -0,0 +1,139 @@
# typed: false
require "rails_helper"

describe FaBackfillFavs do
  let(:backfiller) { FaBackfillFavs.new }

  let(:iiszed_html) do
    File.read("test/fixtures/files/domain/fa/favs_backfill/iiszed_1.html")
  end
  let(:renamonzeo_html) do
    File.read("test/fixtures/files/domain/fa/favs_backfill/renamonzeo_1.html")
  end
  let(:stickiest_html) do
    File.read("test/fixtures/files/domain/fa/favs_backfill/stickiest_1.html")
  end

  let!(:iiszed_user) do
    Domain::Fa::User.create!(url_name: "iiszed", name: "I_Is_Zed")
  end
  let!(:renamonzeo_user) do
    Domain::Fa::User.create!(url_name: "renamonzeo", name: "Renamonzeo")
  end
  let!(:stickiest_user) do
    Domain::Fa::User.create!(url_name: "stickiest", name: "stickiest")
  end

  before do
    # Create all posts found in the HTML
    [iiszed_html, renamonzeo_html, stickiest_html].each do |html|
      page = Domain::Fa::Parser::Page.new(html)
      submissions = page.submissions_parsed
      submissions.each do |submission|
        Domain::Fa::Post.create!(
          Domain::Fa::Post.hash_from_submission_parser_helper(submission),
        )
      end
    end

    # Create empty headers record for reuse
    empty_headers = HttpLogEntryHeader.find_or_create(headers: {})

    # Create HttpLogEntry records for each user's favorites page
    HttpLogEntry.create!(
      uri_host: "www.furaffinity.net",
      uri_path: "/favorites/iiszed",
      uri_scheme: "https",
      verb: :get,
      performed_by: "direct",
      status_code: 200,
      response_time_ms: 100,
      content_type: "text/html",
      requested_at: Time.current,
      request_headers: empty_headers,
      response_headers: empty_headers,
      response:
        BlobEntry.create!(
          contents: iiszed_html,
          content_type: "text/html",
          sha256: Digest::SHA256.digest(iiszed_html),
          size: iiszed_html.bytesize,
        ),
    )
    HttpLogEntry.create!(
      uri_host: "www.furaffinity.net",
      uri_path: "/favorites/renamonzeo",
      uri_scheme: "https",
      verb: :get,
      performed_by: "direct",
      status_code: 200,
      response_time_ms: 100,
      content_type: "text/html",
      requested_at: Time.current,
      request_headers: empty_headers,
      response_headers: empty_headers,
      response:
        BlobEntry.create!(
          contents: renamonzeo_html,
          content_type: "text/html",
          sha256: Digest::SHA256.digest(renamonzeo_html),
          size: renamonzeo_html.bytesize,
        ),
    )
    HttpLogEntry.create!(
      uri_host: "www.furaffinity.net",
      uri_path: "/favorites/stickiest",
      uri_scheme: "https",
      verb: :get,
      performed_by: "direct",
      status_code: 200,
      response_time_ms: 100,
      content_type: "text/html",
      requested_at: Time.current,
      request_headers: empty_headers,
      response_headers: empty_headers,
      response:
        BlobEntry.create!(
          contents: stickiest_html,
          content_type: "text/html",
          sha256: Digest::SHA256.digest(stickiest_html),
          size: stickiest_html.bytesize,
        ),
    )
  end

  it "processes favorites from log entries" do
    backfiller.run

    expect(
      Domain::Fa::Fav.where(
        user: iiszed_user,
        post: Domain::Fa::Post.find_by!(fa_id: 48_999_001),
      ),
    ).not_to be_empty
    expect(
      Domain::Fa::Fav.where(
        user: iiszed_user,
        post: Domain::Fa::Post.find_by!(fa_id: 48_996_138),
      ),
    ).not_to be_empty
    expect(
      Domain::Fa::Fav.where(
        user: iiszed_user,
        post: Domain::Fa::Post.find_by!(fa_id: 48_993_021),
      ),
    ).not_to be_empty

    # Check the total favorite count per user: one fav per <figure>
    # element in each fixture page
    expect(Domain::Fa::Fav.where(user: iiszed_user).count).to eq(48)
    expect(Domain::Fa::Fav.where(user: renamonzeo_user).count).to eq(48)

    # Verify no favs were created for stickiest (empty favorites page)
    expect(Domain::Fa::Fav.where(user: stickiest_user).count).to eq(0)
  end
end
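Because the backfiller writes through upsert_all keyed on the user_id/post_id unique index, re-running it over the same logs should not create duplicate rows. A hypothetical extra example (not part of this commit) that would pin that behavior down:

    it "does not duplicate favs when run twice" do
      backfiller.run
      expect { FaBackfillFavs.new.run }.not_to change { Domain::Fa::Fav.count }
    end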

Diffs for the remaining 3 files were suppressed because one or more lines are too long (presumably the three fixture HTML pages referenced by the spec).