favs backfill job
This commit is contained in:
138
app/lib/fa_backfill_favs.rb
Normal file
138
app/lib/fa_backfill_favs.rb
Normal file
@@ -0,0 +1,138 @@
|
||||
# typed: strict
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Backfills Domain::Fa::Fav rows from already-stored HTTP responses.
#
# Scans HttpLogEntry rows for host www.furaffinity.net whose path starts with
# "/favorites/", parses each stored response body with
# Domain::Fa::Parser::Page, and upserts one Domain::Fa::Fav per
# (user, submission) pair found on the page.
class FaBackfillFavs
  include HasColorLogger
  extend T::Sig

  # start_at:   passed to find_in_batches as `start:` (nil = from the beginning).
  # limit:      stop once at least this many HttpLogEntries were processed. It
  #             is checked after each batch, so the final batch may overshoot.
  # batch_size: number of HttpLogEntries loaded per batch (defaults to 100).
  sig do
    params(
      start_at: T.nilable(Integer),
      limit: T.nilable(Integer),
      batch_size: T.nilable(Integer),
    ).void
  end
  def initialize(start_at: nil, limit: nil, batch_size: nil)
    @start_at = T.let(start_at, T.nilable(Integer))
    @limit = T.let(limit, T.nilable(Integer))
    @batch_size = T.let(batch_size || 100, Integer)
    # Running totals, only used for progress logging.
    @unique_users_seen = T.let(Set.new, T::Set[String])
    @total_favs_seen = T.let(0, Integer)
  end

  # Runs the backfill over all matching log entries.
  sig { void }
  def run
    collect_favs_from_logs
  end

  private

  # Walks the matching HttpLogEntries batch by batch, persists the favs found
  # in each batch, and logs progress after every batch.
  sig { void }
  def collect_favs_from_logs
    scope =
      HttpLogEntry.where(uri_host: "www.furaffinity.net").where(
        "uri_path LIKE ?",
        "/favorites/%",
      )

    total_processed = 0

    scope.find_in_batches(
      batch_size: @batch_size,
      start: @start_at,
    ) do |entries|
      process_users_favs(favs_by_url_name(entries))

      total_processed += entries.length
      logger.info("Processed #{total_processed} HttpLogEntries")
      logger.info(
        "Unique users seen: #{@unique_users_seen.length}, Total favs seen: #{@total_favs_seen}",
      )
      if @limit && total_processed >= @limit
        logger.info("Reached limit of #{@limit} HttpLogEntries, stopping")
        break
      end
    end
  end

  # Parses every entry of one batch and groups the submission fa_ids found on
  # each page by the url_name of the user whose favorites page it is.
  #
  # Entries without a usable path, response, or response body are skipped. A
  # user whose page lists no submissions still gets an (empty) entry.
  sig do
    params(entries: T::Array[HttpLogEntry]).returns(
      T::Hash[String, T::Set[Integer]],
    )
  end
  def favs_by_url_name(entries)
    user_favs =
      T.let(
        Hash.new { |h, k| h[k] = Set.new },
        T::Hash[String, T::Set[Integer]],
      )

    entries.each do |entry|
      # Cheap path check first, so unusable entries never reach the parser.
      uri_path = T.let(entry.uri_path, T.nilable(String))
      next unless uri_path

      url_name = url_name_from_path(uri_path)
      next unless url_name

      response = T.let(entry.response, T.nilable(BlobEntry))
      next unless response

      contents = T.let(response.contents, T.nilable(String))
      next unless contents

      page = Domain::Fa::Parser::Page.new(contents)
      favs = T.must(user_favs[url_name])

      page.submissions_parsed.each do |submission|
        fa_id = submission.id
        favs.add(fa_id) if fa_id
      end
    end

    user_favs
  end

  # Extracts the user's url_name from a favorites path.
  #
  # The url_name is the segment directly after "favorites", not the last
  # segment: paths with further segments (e.g. "/favorites/<user>/<id>/next")
  # end in something that is not a user name.
  #
  # Returns nil when that segment is missing or blank.
  sig { params(uri_path: String).returns(T.nilable(String)) }
  def url_name_from_path(uri_path)
    # "/favorites/<user>/..." splits into ["", "favorites", "<user>", ...]
    url_name = uri_path.split("/")[2]
    return nil if url_name.nil? || url_name.empty?

    url_name
  end

  # Resolves url_names and fa_ids to database ids and upserts the resulting
  # (user_id, post_id) pairs into Domain::Fa::Fav.
  #
  # Raises (via T.must_because) when a url_name has no Domain::Fa::User row or
  # an fa_id has no Domain::Fa::Post row.
  sig { params(user_favs: T::Hash[String, T::Set[Integer]]).void }
  def process_users_favs(user_favs)
    # Nothing parsed from this batch: skip the lookups entirely.
    return if user_favs.empty?

    url_name_to_user_id =
      T.let(
        Domain::Fa::User
          .where(url_name: user_favs.keys)
          .pluck(:url_name, :id)
          .to_h,
        T::Hash[String, Integer],
      )

    @unique_users_seen.merge(user_favs.keys)
    @total_favs_seen += user_favs.values.sum(&:length)

    fa_ids = user_favs.values.reduce(Set.new) { |acc, set| acc.merge(set) }.to_a

    # look up at most 50_000 fa_ids per query
    fa_id_to_post_id = T.let({}, T::Hash[Integer, Integer])
    fa_ids.each_slice(50_000) do |slice|
      slice_fa_id_to_post_id =
        T.let(
          Domain::Fa::Post.where(fa_id: slice).pluck(:fa_id, :id).to_h,
          T::Hash[Integer, Integer],
        )
      fa_id_to_post_id.merge!(slice_fa_id_to_post_id)
    end

    user_id_post_id_pairs = T.let([], T::Array[[Integer, Integer]])

    user_favs.each do |url_name, post_fa_ids|
      user_id =
        T.must_because(url_name_to_user_id[url_name]) do
          "url_name #{url_name} not found in url_name_to_user_id"
        end
      post_fa_ids.each do |post_fa_id|
        post_id =
          T.must_because(fa_id_to_post_id[post_fa_id]) do
            "post fa_id #{post_fa_id} not found in fa_id_to_post_id"
          end
        user_id_post_id_pairs << [user_id, post_id]
      end
    end

    # upsert at most 30_000 rows per statement
    user_id_post_id_pairs.each_slice(30_000) do |pairs|
      Domain::Fa::Fav.upsert_all(
        pairs.map { |user_id, post_id| { user_id:, post_id: } },
        unique_by: :index_domain_fa_favs_on_user_id_and_post_id,
      )
    end
  end
end
|
||||
17
rake/fa.rake
17
rake/fa.rake
@@ -139,14 +139,6 @@ namespace :fa do
|
||||
worker.write_factors
|
||||
end
|
||||
|
||||
desc "calculate post favorite factors for similar posts"
task calc_similar_posts: %i[set_logger_stdout environment] do
  # Epoch count handed to PostFactorCalculator; set via `epochs=N`, default 20.
  epoch_count = ENV.fetch("epochs", 20).to_i

  # Fit first, then persist the factors from the same calculator instance.
  Domain::Fa::PostFactorCalculator.new(epoch_count).tap(&:fit).write_factors
end
|
||||
|
||||
desc "dump user follows to stdout, formatted for mf-train"
|
||||
task dump_user_follows: %i[set_logger_stdout environment] do
|
||||
relation = Domain::Fa::Follow.all
|
||||
@@ -299,4 +291,13 @@ namespace :fa do
|
||||
exporter.run
|
||||
exporter.end_profiling! if profile
|
||||
end
|
||||
|
||||
# Usage: rake fa:backfill_favs [start_at=N] [limit=N] [batch_size=N]
#
# Each option is read from the environment and converted with `to_i`; an unset
# variable is passed as nil so FaBackfillFavs applies its own default.
#
# NOTE(review): depends on set_logger_stdout like the sibling fa tasks, since
# the job reports progress only through `logger.info` — presumably that task
# routes logger output to stdout; confirm.
desc "Backfill favs by scanning historical HTTP logs for favorites pages"
task backfill_favs: %i[set_logger_stdout environment] do
  FaBackfillFavs.new(
    start_at: ENV["start_at"]&.to_i,
    limit: ENV["limit"]&.to_i,
    batch_size: ENV["batch_size"]&.to_i,
  ).run
end
|
||||
end
|
||||
|
||||
16
sorbet/rbi/dsl/fa_backfill_favs.rbi
generated
Normal file
16
sorbet/rbi/dsl/fa_backfill_favs.rbi
generated
Normal file
@@ -0,0 +1,16 @@
|
||||
# typed: true
|
||||
|
||||
# DO NOT EDIT MANUALLY
|
||||
# This is an autogenerated file for dynamic methods in `FaBackfillFavs`.
|
||||
# Please instead update this file by running `bin/tapioca dsl FaBackfillFavs`.
|
||||
|
||||
|
||||
class FaBackfillFavs
|
||||
sig { returns(ColorLogger) }
|
||||
def logger; end
|
||||
|
||||
class << self
|
||||
sig { returns(ColorLogger) }
|
||||
def logger; end
|
||||
end
|
||||
end
|
||||
139
spec/lib/fa_backfill_favs_spec.rb
Normal file
139
spec/lib/fa_backfill_favs_spec.rb
Normal file
@@ -0,0 +1,139 @@
|
||||
# typed: false
|
||||
require "rails_helper"
|
||||
|
||||
describe FaBackfillFavs do
  let(:backfiller) { FaBackfillFavs.new }

  # Saved favorites pages, one per user below.
  let(:iiszed_html) do
    File.read("test/fixtures/files/domain/fa/favs_backfill/iiszed_1.html")
  end
  let(:renamonzeo_html) do
    File.read("test/fixtures/files/domain/fa/favs_backfill/renamonzeo_1.html")
  end
  let(:stickiest_html) do
    File.read("test/fixtures/files/domain/fa/favs_backfill/stickiest_1.html")
  end

  let!(:iiszed_user) do
    Domain::Fa::User.create!(url_name: "iiszed", name: "I_Is_Zed")
  end
  let!(:renamonzeo_user) do
    Domain::Fa::User.create!(url_name: "renamonzeo", name: "Renamonzeo")
  end
  let!(:stickiest_user) do
    Domain::Fa::User.create!(url_name: "stickiest", name: "stickiest")
  end

  # Stores `html` as the logged response of a GET for `uri_path`.
  def create_favorites_log_entry(uri_path, html, headers)
    blob =
      BlobEntry.create!(
        contents: html,
        content_type: "text/html",
        sha256: Digest::SHA256.digest(html),
        size: html.bytesize,
      )

    HttpLogEntry.create!(
      uri_host: "www.furaffinity.net",
      uri_path: uri_path,
      uri_scheme: "https",
      verb: :get,
      performed_by: "direct",
      status_code: 200,
      response_time_ms: 100,
      content_type: "text/html",
      requested_at: Time.current,
      request_headers: headers,
      response_headers: headers,
      response: blob,
    )
  end

  before do
    # Every submission listed on any fixture page needs a Post row up front.
    [iiszed_html, renamonzeo_html, stickiest_html]
      .flat_map { |html| Domain::Fa::Parser::Page.new(html).submissions_parsed }
      .each do |submission|
        Domain::Fa::Post.create!(
          Domain::Fa::Post.hash_from_submission_parser_helper(submission),
        )
      end

    # One shared, empty headers record for all log entries.
    empty_headers = HttpLogEntryHeader.find_or_create(headers: {})

    # One logged favorites page per user.
    create_favorites_log_entry("/favorites/iiszed", iiszed_html, empty_headers)
    create_favorites_log_entry(
      "/favorites/renamonzeo",
      renamonzeo_html,
      empty_headers,
    )
    create_favorites_log_entry(
      "/favorites/stickiest",
      stickiest_html,
      empty_headers,
    )
  end

  it "processes favorites from log entries" do
    backfiller.run

    # Spot-check a few submissions from iiszed's page.
    [48_999_001, 48_996_138, 48_993_021].each do |fa_id|
      post = Domain::Fa::Post.find_by!(fa_id: fa_id)
      expect(
        Domain::Fa::Fav.where(user: iiszed_user, post: post),
      ).not_to be_empty
    end

    # Each of these pages lists 48 figures.
    expect(Domain::Fa::Fav.where(user: iiszed_user).count).to eq(48)
    expect(Domain::Fa::Fav.where(user: renamonzeo_user).count).to eq(48)

    # stickiest's favorites page is empty, so no favs are recorded.
    expect(Domain::Fa::Fav.where(user: stickiest_user).count).to eq(0)
  end
end
|
||||
740
test/fixtures/files/domain/fa/favs_backfill/iiszed_1.html
vendored
Normal file
740
test/fixtures/files/domain/fa/favs_backfill/iiszed_1.html
vendored
Normal file
File diff suppressed because one or more lines are too long
742
test/fixtures/files/domain/fa/favs_backfill/renamonzeo_1.html
vendored
Normal file
742
test/fixtures/files/domain/fa/favs_backfill/renamonzeo_1.html
vendored
Normal file
File diff suppressed because one or more lines are too long
679
test/fixtures/files/domain/fa/favs_backfill/stickiest_1.html
vendored
Normal file
679
test/fixtures/files/domain/fa/favs_backfill/stickiest_1.html
vendored
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user