fix sqlite exporter

Dylan Knutson
2024-12-21 09:08:55 +00:00
parent 6922c07b8c
commit 2ce6dc7b96
25 changed files with 486 additions and 537 deletions

View File

@@ -34,6 +34,7 @@ RUN \
ca-certificates \
curl \
gnupg \
iputils-ping \
libblas-dev \
libdb-dev \
libffi-dev \
@@ -48,7 +49,7 @@ RUN \
libvips42 \
libyaml-dev \
patch \
iputils-ping \
postgresql-client \
rustc \
uuid-dev \
zlib1g-dev

View File

@@ -39,8 +39,8 @@ services:
PGADMIN_DEFAULT_EMAIL: admin@example.com
PGADMIN_DEFAULT_PASSWORD: password
PGADMIN_LISTEN_PORT: 8080
PGADMIN_CONFIG_SERVER_MODE: False
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: False
PGADMIN_CONFIG_SERVER_MODE: "False"
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: "False"
volumes:
postgres-data:

View File

@@ -11,3 +11,4 @@ install_extension dbaeumer.vscode-eslint
install_extension aliariff.vscode-erb-beautify
install_extension bradlc.vscode-tailwindcss
install_extension KoichiSasada.vscode-rdbg
install_extension qwtel.sqlite-viewer

View File

@@ -23,5 +23,6 @@
"other": "on",
"comments": "off",
"strings": "on"
}
},
"sqliteViewer.maxFileSize": 1024
}

View File

@@ -1,5 +1,4 @@
class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
include HasMeasureDuration
include HasBulkEnqueueJobs
USERS_PER_FULL_PAGE = Rails.env.test? ? 9 : 190
@@ -39,7 +38,7 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
if new_favs.empty?
@user.scanned_favs_at = Time.now
@user.save!
logger.info "no new favs, stopping favs scan early"
logger.info "[no new favs] [stopping scan]"
return
end
end
@@ -49,45 +48,32 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
@page_number += 1
end
to_add = nil
to_remove = nil
measure(
proc do |jobs|
"add #{to_add.size.to_s.bold} favs, " +
"remove #{to_remove.size.to_s.bold} favs"
end
) do
to_remove = existing_faved_ids - @seen_post_ids
to_add = @seen_post_ids - existing_faved_ids
end
to_remove = existing_faved_ids - @seen_post_ids
to_add = @seen_post_ids - existing_faved_ids
logger.info "[calc change favs] [add #{to_add.size.to_s.bold}] [remove #{to_remove.size.to_s.bold}]"
measure(
proc do
"updated favs list to #{@user.fav_post_joins.count.to_s.bold} posts"
ReduxApplicationRecord.transaction do
if to_remove.any?
@user.fav_post_joins.where(post_id: to_remove).delete_all
end
) do
ReduxApplicationRecord.transaction do
if to_remove.any?
@user.fav_post_joins.where(post_id: to_remove).delete_all
slice_size =
if to_add.size <= 2500
100
else
1000
end
slice_size =
if to_add.size <= 2500
100
else
1000
end
if to_add.any?
to_add.each_slice(slice_size) do |slice|
@user.fav_post_joins.insert_all!(slice.map { |id| { post_id: id } })
end
if to_add.any?
to_add.each_slice(slice_size) do |slice|
@user.fav_post_joins.insert_all!(slice.map { |id| { post_id: id } })
end
@user.scanned_favs_at = Time.now
@user.save!
end
@user.scanned_favs_at = Time.now
@user.save!
end
logger.info "[updated favs list] [posts: #{@user.fav_post_joins.count.to_s.bold}]"
end
private
@@ -126,49 +112,47 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
@total_items_seen += submissions.length
posts_to_create_hashes = []
measure(
proc do
"page #{@page_number.to_s.bold} - " +
"#{submissions.length.to_s.bold} posts on page, " +
"created #{posts_to_create_hashes.size.to_s.bold}"
end
) do
existing_fa_id_to_post_id =
Domain::Fa::Post
.where(fa_id: submissions.map(&:id))
.pluck(:fa_id, :id)
.to_h
existing_fa_id_to_post_id =
Domain::Fa::Post
.where(fa_id: submissions.map(&:id))
.pluck(:fa_id, :id)
.to_h
posts_to_create_hashes =
submissions
.reject { |submission| existing_fa_id_to_post_id[submission.id] }
.map do |submission|
Domain::Fa::Post.hash_from_submission_parser_helper(
submission,
first_seen_log_entry: response.log_entry
)
end
posts_to_create_hashes =
submissions
.reject { |submission| existing_fa_id_to_post_id[submission.id] }
.map do |submission|
Domain::Fa::Post.hash_from_submission_parser_helper(
submission,
first_seen_log_entry: response.log_entry
)
end
created_post_ids = []
created_post_ids =
Domain::Fa::Post
.insert_all!(posts_to_create_hashes, returning: %i[id fa_id])
.map { |row| row["id"] } unless posts_to_create_hashes.empty?
created_post_ids = []
created_post_ids =
Domain::Fa::Post
.insert_all!(posts_to_create_hashes, returning: %i[id fa_id])
.map { |row| row["id"] } unless posts_to_create_hashes.empty?
enqueue_new_post_scan_jobs(
posts_to_create_hashes.map { |hash| hash[:fa_id] }
)
enqueue_new_post_scan_jobs(
posts_to_create_hashes.map { |hash| hash[:fa_id] }
)
@last_page_post_ids = Set.new
created_post_ids.each do |id|
@seen_post_ids.add(id)
@last_page_post_ids.add(id)
end
existing_fa_id_to_post_id.values.each do |id|
@seen_post_ids.add(id)
@last_page_post_ids.add(id)
end
@last_page_post_ids = Set.new
created_post_ids.each do |id|
@seen_post_ids.add(id)
@last_page_post_ids.add(id)
end
existing_fa_id_to_post_id.values.each do |id|
@seen_post_ids.add(id)
@last_page_post_ids.add(id)
end
logger.info [
"[page #{@page_number.to_s.bold}]",
"[posts: #{submissions.length.to_s.bold}]",
"[created: #{posts_to_create_hashes.size.to_s.bold}]"
].join(" ")
ret
end

View File

@@ -35,35 +35,25 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
@page_number += 1
end
to_add = nil
to_remove = nil
measure(
proc do |jobs|
"add #{to_add.size.to_s.bold} follows, " +
"remove #{to_remove.size.to_s.bold} follows"
existing_followed_ids = Set.new(@user.follower_joins.pluck(:followed_id))
to_remove = existing_followed_ids - @scanned_followed_ids
to_add = @scanned_followed_ids - existing_followed_ids
logger.info "[calc changed follows] [add #{to_add.size.to_s.bold}] [remove #{to_remove.size.to_s.bold}]"
ReduxApplicationRecord.transaction do
if to_remove.any?
@user.follower_joins.where(followed_id: to_remove).delete_all
end
) do
existing_followed_ids = Set.new(@user.follower_joins.pluck(:followed_id))
to_remove = existing_followed_ids - @scanned_followed_ids
to_add = @scanned_followed_ids - existing_followed_ids
if to_add.any?
@user.follower_joins.insert_all!(
to_add.map { |id| { followed_id: id } }
)
end
@user.scanned_follows_at = Time.now
@user.save!
end
measure(
proc { "updated follows list to #{@user.follows.count.to_s.bold} users" }
) do
ReduxApplicationRecord.transaction do
if to_remove.any?
@user.follower_joins.where(followed_id: to_remove).delete_all
end
if to_add.any?
@user.follower_joins.insert_all!(
to_add.map { |id| { followed_id: id } }
)
end
@user.scanned_follows_at = Time.now
@user.save!
end
end
logger.info "[updated follows list] [users: #{@user.follows.count.to_s.bold}]"
if @created_user
logger.info("user was new record, enqueue page scan job")
@@ -114,49 +104,45 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
@last_in_user_list = user_list.last.url_name
@total_follows_seen += user_list.length
users_to_create_hashes = []
followed_user_ids =
measure(
proc do
"page #{@page_number.to_s.bold} - " +
"#{user_list.length.to_s.bold} users on page, " +
"created #{users_to_create_hashes.size.to_s.bold}"
existing_url_name_to_id =
Domain::Fa::User
.where(url_name: user_list.map(&:url_name))
.pluck(:id, :url_name)
.map { |id, url_name| [url_name, id] }
.to_h
users_to_create_hashes =
user_list
.reject { |user| existing_url_name_to_id[user.url_name] }
.map do |user|
{
url_name: user.url_name,
name: user.name,
state_detail: {
"first_seen_entry" => response.log_entry.id
}
}
end
) do
existing_url_name_to_id =
Domain::Fa::User
.where(url_name: user_list.map(&:url_name))
.pluck(:id, :url_name)
.map { |id, url_name| [url_name, id] }
.to_h
users_to_create_hashes =
user_list
.reject { |user| existing_url_name_to_id[user.url_name] }
.map do |user|
{
url_name: user.url_name,
name: user.name,
state_detail: {
"first_seen_entry" => response.log_entry.id
}
}
end
created_user_ids =
Domain::Fa::User
.upsert_all(
users_to_create_hashes,
unique_by: :url_name,
update_only: :url_name,
returning: %i[id url_name]
)
.map { |row| row["id"] } unless users_to_create_hashes.empty?
created_user_ids =
Domain::Fa::User
.upsert_all(
users_to_create_hashes,
unique_by: :url_name,
update_only: :url_name,
returning: %i[id url_name]
)
.map { |row| row["id"] } unless users_to_create_hashes.empty?
enqueue_new_user_pagescan_jobs(users_to_create_hashes)
(created_user_ids || []) + existing_url_name_to_id.values
end
logger.info [
"[page #{@page_number.to_s.bold}]",
"[users: #{user_list.length.to_s.bold}]",
"[created: #{users_to_create_hashes.size.to_s.bold}]"
].join(" ")
enqueue_new_user_pagescan_jobs(users_to_create_hashes)
followed_user_ids =
(created_user_ids || []) + existing_url_name_to_id.values
followed_user_ids.each { |user_id| @scanned_followed_ids.add(user_id) }
ret

View File

@@ -40,7 +40,11 @@ class Domain::Fa::PostEnqueuer
"enqueuing #{to_enqueue.to_s.bold} more posts - #{already_enqueued.to_s.bold} already enqueued"
)
rows =
measure(proc { |p| "gather #{p.length.to_s.bold} posts to enqueue" }) do
measure(
proc do |p|
p && "gathered #{p.length.to_s.bold} posts" || "gathering posts..."
end
) do
to_enqueue
.times
.map do

View File

@@ -8,14 +8,18 @@ class Domain::Fa::PostFactorCalculator
end
def fit
logger.info "loading fav rows..."
limit = 100_000_000
dataset =
measure(proc { |r| "loaded #{r.length.to_s.bold} favs" }) do
Domain::Fa::Fav
.all
.pluck(:user_id, :post_id)
.map { |user_id, post_id| { user_id: user_id, item_id: post_id } }
end
measure(
->(r) do
r && "loaded #{r.length.to_s.bold} favs" ||
"loading up to #{limit} favs"
end
) { Domain::Fa::Fav.all.limit(limit).pluck(:user_id, :post_id).to_a }
measure("convert to hash") do
dataset.map! { |user_id, post_id| { user_id: user_id, item_id: post_id } }
end
measure("fit #{dataset.length.to_s.bold} favs") do
@recommender.fit(dataset)
@@ -23,7 +27,7 @@ class Domain::Fa::PostFactorCalculator
end
def write_factors
measure("#{"for_favorite".bold} - done") do
measure("write factors") do
write_factors_col(:item_ids, :item_factors, :for_favorite)
end
end

View File

@@ -1,42 +1,103 @@
class Domain::Fa::SqliteExporter
include HasMeasureDuration
TABLES = {
fa_users: {
model: Domain::Fa::User,
columns: [
%w[id int primary key],
%w[url_name text],
%w[name text],
%w[artist_type text],
%w[mood text],
%w[num_pageviews int],
%w[num_submissions int],
%w[num_comments_recieved int],
%w[num_comments_given int],
%w[num_journals int],
%w[num_favorites int],
%w[registered_at text]
],
indexes: [{ on: "id", unique: true }, { on: "url_name", unique: true }],
batch_size: 512,
# format registered_at column
each_row: ->(row) { row[11] = row[11]&.iso8601 }
},
fa_follows: {
model: Domain::Fa::Follow,
columns: [%w[follower_id int], %w[followed_id int]],
indexes: [{ on: %w[follower_id followed_id], unique: true }],
fk: {
follower_id: %w[fa_users id],
followed_id: %w[fa_users id]
},
batch_size: 4096
},
fa_favs: {
model: Domain::Fa::Fav,
columns: [%w[user_id int], %w[post_id int]],
indexes: [{ on: %w[user_id post_id], unique: true }],
fk: {
user_id: %w[fa_users id],
post_id: %w[fa_posts id]
},
batch_size: 4096
},
fa_posts: {
model: Domain::Fa::Post.where("file_url_str is not null"),
columns: [
%w[id int],
%w[fa_id int],
%w[creator_id int],
%w[title text],
%w[category text],
%w[theme text],
%w[species text],
%w[gender text],
%w[file_url_str text],
%w[num_views int],
%w[num_comments int],
%w[num_favorites int],
%w[posted_at text]
],
batch_size: 4096,
indexes: [{ on: "id", unique: true }, { on: "fa_id", unique: true }],
fk: {
creator_id: %w[fa_users id]
},
# format posted_at column
each_row: ->(row) { row[12] = row[12]&.iso8601 }
}
}
def initialize(db, sample, tables)
@db = db
@sample = sample
@tables = tables
@tables = tables.include?(:all) ? TABLES.keys : tables
@tables.each do |table|
raise("unknown table: #{table}") unless TABLES.key?(table)
end
end
def run
measure("created tables") { migrate }
measure("create tables") { migrate }
measure("drop indexes") { drop_indexes }
if dump_table?(:users)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa users" }) do
dump_fa_users
@tables.each do |table|
config = TABLES[table]
measure(
proc do |num|
num && "dumped #{table}, #{num} rows" || "dumping #{table}..."
end
) do
dump_table_common(
table: table,
model: config[:model],
columns: config[:columns],
batch_size: config[:batch_size],
each_row: config[:each_row]
)
end
end
if dump_table?(:follows)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa follows" }) do
dump_fa_follows
end
end
if dump_table?(:favs)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa favs" }) do
dump_fa_favs
end
end
if dump_table?(:posts)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa posts" }) do
dump_fa_posts
end
end
measure("created indexes") { create_indexes }
ensure
@db.close
end
@@ -63,123 +124,45 @@ class Domain::Fa::SqliteExporter
private
def migrate
@db.execute_batch2 <<-SQL
create table if not exists fa_users (
id int primary key,
url_name text,
name text,
num_favorites int,
registered_at text
);
TABLES.each do |table, config|
columns = config[:columns]
fk = config[:fk] || {}
create table if not exists fa_follows (
id int primary key,
follower_id int,
followed_id int
);
columns_and_fks = [
columns.map { |name, *rest| "#{name} #{rest.join(" ")}" }.join(",\n"),
fk.map do |name, foreign|
foreign_table, foreign_column = foreign
"foreign key (#{name}) references #{foreign_table}(#{foreign_column})"
end
].flatten.join(",\n")
create table if not exists fa_favs (
id int primary key,
user_id int,
post_id int
);
create table if not exists fa_posts (
id int primary key,
fa_id int,
creator_id int,
title text,
num_views int,
num_comments int,
num_favorites int,
posted_at text
);
SQL
end
INDEXES = [
["fa_users", "url_name", true],
["fa_follows", "follower_id", false],
["fa_follows", "followed_id", false],
["fa_favs", "user_id", false],
["fa_favs", "post_id", false],
["fa_posts", "creator_id", false],
["fa_posts", "fa_id", true]
]
def create_indexes
@db.execute_batch2(INDEXES.map { |table, col, unique| <<-SQL }.join("\n"))
create #{unique ? "unique" : ""} index if not exists #{col}_on_#{table}
on #{table} (#{col});
sql = <<-SQL
create table if not exists #{table} (
#{columns_and_fks}
);
SQL
end
# logger.info(sql)
@db.execute_batch2(sql)
def drop_indexes
@db.execute_batch2(INDEXES.map { |table, col, unique| <<-SQL }.join("\n"))
drop index if exists #{col}_on_#{table};
SQL
end
def dump_fa_users
dump_table_common(
model: Domain::Fa::User,
table: "fa_users",
columns: %w[id url_name name num_favorites registered_at],
batch_size: 512
) do |batch|
# format registered_at
batch.each { |row| row[4] = row[4]&.iso8601 }
end
end
def dump_fa_follows
dump_table_common(
model: Domain::Fa::Follow,
table: "fa_follows",
columns: %w[id follower_id followed_id],
batch_size: 4096
)
end
def dump_fa_favs
dump_table_common(
model: Domain::Fa::Fav,
table: "fa_favs",
columns: %w[id user_id post_id],
batch_size: 4096
)
end
def dump_fa_posts
dump_table_common(
model: Domain::Fa::Post.where("file_url_str is not null"),
table: "fa_posts",
columns: %w[
id
fa_id
title
creator_id
num_views
num_comments
num_favorites
posted_at
],
batch_size: 4096
) do |batch|
# format posted_at
batch.each { |row| row[7] = row[7]&.iso8601 }
config[:indexes].each do |index|
unique = index[:unique] ? "unique" : ""
cols = [index[:on]].flatten
col_names = cols.join("_")
sql = <<-SQL
create #{unique} index if not exists #{col_names}_on_#{table}
on #{table} (#{cols.join(", ")});
SQL
logger.info(sql)
@db.execute_batch2(sql)
end
end
end
# ====== common infra ====== #
def dump_table_common(model:, table:, columns:, batch_size:)
def dump_table_common(model:, table:, columns:, batch_size:, each_row: nil)
num_models = 0
start_id = max_id(table, "id")
start_id += 1 if start_id
logger.info(
"dumping #{table.bold}, start at #{(start_id || "first").to_s.bold}..."
)
logger.info("[#{table.to_s.bold}] [batch size: #{batch_size.to_s.bold}]")
inserter = create_inserter(batch_size, table, columns)
load_duration = 0.0
@@ -189,40 +172,34 @@ class Domain::Fa::SqliteExporter
dump_start = Time.now
load_start = Time.now
@db.transaction
pluck_rows(model, columns, batch_size: batch_size) do |batch|
@db.transaction do
load_duration += Time.now - load_start
pluck_rows(
model,
columns,
start_id: start_id,
batch_size: batch_size
) do |rows|
load_duration += Time.now - load_start
map_start = Time.now
batch.each { |row| each_row.call(row) } if each_row
map_duration += Time.now - map_start
map_start = Time.now
yield rows if block_given?
map_duration += Time.now - map_start
insert_start = Time.now
inserter.insert(batch)
insert_duration += Time.now - insert_start
insert_start = Time.now
inserter.insert(rows)
insert_duration += Time.now - insert_start
num_models += rows.size
load_start = Time.now
num_models += batch.size
load_start = Time.now
end
end
dump_duration = Time.now - dump_start
logger.info(
"time spent on #{table.bold} " +
"(#{(num_models / dump_duration).round(0).to_s.bold}/sec): " +
"#{load_duration.round(2).to_s.bold} sec loading, " +
"#{map_duration.round(2).to_s.bold} sec mapping, " +
"#{insert_duration.round(2).to_s.bold} sec inserting"
"[#{table.to_s.bold}] " +
"[#{(num_models / dump_duration).round(0).to_s.bold}/sec] " +
"[load: #{load_duration.round(2).to_s.bold} sec] " +
"[map: #{map_duration.round(2).to_s.bold} sec] " +
"[insert: #{insert_duration.round(2).to_s.bold} sec]"
)
num_models
ensure
inserter.close if inserter
@db.commit
end
def create_inserter(bulk_size, table, columns)
@@ -235,20 +212,20 @@ class Domain::Fa::SqliteExporter
def initialize(db, bulk_size, table, columns)
@db = db
@bulk_size = bulk_size
@bulk_size = [bulk_size, 999 / columns.size].min
@table = table
@columns = columns
binds = "(" + (["?"] * columns.size).join(", ") + ")"
binds = "(" + (["?"] * @columns.size).join(", ") + ")"
@single = @db.prepare <<-SQL
insert into #{table} (#{columns.join(", ")})
values #{binds}
insert into #{@table} (#{@columns.map(&:first).join(", ")})
values #{binds} on conflict do nothing
SQL
@bulk = @db.prepare <<-SQL
insert into #{table} (#{columns.join(", ")})
values #{([binds] * bulk_size).join(", ")}
insert into #{@table} (#{@columns.map(&:first).join(", ")})
values #{([binds] * @bulk_size).join(", ")} on conflict do nothing
SQL
end
@@ -303,19 +280,15 @@ class Domain::Fa::SqliteExporter
end
end
def pluck_rows(relation, cols, start_id:, batch_size:)
def pluck_rows(relation, cols, batch_size:)
num_batches = 0
num_models = 0
start_time = Time.now
models_in_measure = 0
relation = relation.all unless relation.is_a?(ActiveRecord::Relation)
relation = relation.where("id >= ?", start_id) if start_id
relation.pluck_in_batches(
*cols.map(&:to_sym),
batch_size: batch_size
) do |batch|
relation.in_batches(of: batch_size) do |batch|
batch = batch.pluck(*cols.map(&:first).map(&:to_sym)).to_a
yield batch
num_models += batch.size
@@ -335,16 +308,4 @@ class Domain::Fa::SqliteExporter
puts ""
end
def max_id(table, column)
@db.get_first_value <<-SQL
select max(#{column}) from #{table}
SQL
end
def dump_table?(table)
ret = @tables.include?(:all) || @tables.include?(table)
logger.info("skipping #{table.to_s.bold}...") if !ret
ret
end
end
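A usage sketch for the rewritten exporter (not part of the diff): with the table-driven TABLES config above, an export can be driven directly from Ruby. The nil second argument stands in for the sample option, whose accepted values this commit does not show, and the output filename is arbitrary. Note that the Inserter caps its bulk size at 999 / columns.size, which lines up with SQLite's historical default limit of 999 bound parameters per prepared statement.

require "sqlite3"

# Hypothetical direct invocation; the table keys come from TABLES above,
# and nil for `sample` is an assumption, not confirmed by this commit.
db = SQLite3::Database.new("fa_export.sqlite3")
exporter = Domain::Fa::SqliteExporter.new(db, nil, %i[fa_users fa_favs])
exporter.run # creates tables, drops indexes, bulk-inserts rows in batches, then rebuilds indexes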

View File

@@ -24,9 +24,12 @@ class Domain::Fa::UserEnqueuer
"enqueuing #{to_enqueue.to_s.bold} more users - #{already_enqueued.to_s.bold} already enqueued"
)
rows =
measure(proc { |p| "gather #{p.length.to_s.bold} users to enqueue" }) do
to_enqueue.times.map { @user_iterator.next }
end
measure(
proc do |p|
p && "gathered #{p.length.to_s.bold} users to enqueue" ||
"gathering users..."
end
) { to_enqueue.times.map { @user_iterator.next } }
measure("enqueue jobs") do
rows.each do |user|
types = []

View File

@@ -8,9 +8,12 @@ class Domain::Fa::UserFactorCalculator
end
def fit
logger.info "loading follow rows..."
dataset =
measure(proc { |r| "loaded #{r.length.to_s.bold} follows" }) do
measure(
proc do |r|
r && "loaded #{r.length.to_s.bold} follows" || "loading follows"
end
) do
Domain::Fa::Follow
.all
.pluck(:follower_id, :followed_id)
@@ -50,7 +53,7 @@ class Domain::Fa::UserFactorCalculator
.each_slice(20_000) do |chunk|
total += chunk.size
measure(
" -> wrote chunk of #{chunk.size.to_s.bold} - (#{total.to_s.bold} total)"
" -> write #{chunk.size.to_s.bold} factors - (#{total.to_s.bold} total)"
) do
Domain::Fa::UserFactor.upsert_all(
chunk,

View File

@@ -6,6 +6,8 @@ module HasMeasureDuration
def measure(title)
now = Time.now
title_str = title.respond_to?(:call) && title.call(nil) || title
logger.info "[start] #{title_str}"
ret = yield
duration = Time.now - now
if duration >= 1.hour
@@ -17,8 +19,8 @@ module HasMeasureDuration
else
duration_str = "#{(duration * 1000).round(0).to_s.bold} ms"
end
title = title.call(ret, duration) if title.respond_to?(:call)
logger.info "#{title} - #{duration_str}"
title_str = title.call(ret, duration) if title.respond_to?(:call)
logger.info "[finish] #{title_str} - #{duration_str}"
ret
end
end
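For reference, a minimal sketch of the callable-title form of measure that this change enables (assumptions: the including class exposes a logger, ActiveSupport is loaded for the duration helpers, and the project's String#bold extension is available). The title is called once with nil before the block runs to produce the "[start]" line, and again with the block's result for the "[finish]" line, so it should tolerate a nil argument.

require "logger"

class ExampleTask
  include HasMeasureDuration

  def logger
    @logger ||= Logger.new($stdout) # assumed; the mixin only needs some logger
  end

  def run
    # Logs "[start] loading rows..." up front and
    # "[finish] loaded 3 rows - N ms" once the block returns.
    measure(proc { |rows| rows ? "loaded #{rows.size} rows" : "loading rows..." }) do
      (1..3).to_a # stand-in for an expensive query
    end
  end
end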

View File

@@ -116,7 +116,7 @@ class Scraper::HttpClient
begin
response_blob_file.save unless response_blob_file.persisted?
rescue => e
puts "error saving blob file #{response_blob_file.sha256_hex}: #{e}"
puts "error saving blob file #{HexUtil.bin2hex(response_blob_file.sha256)}: #{e}"
end
rescue StandardError
retries += 1

View File

@@ -47,13 +47,20 @@ class BlobFile < ReduxApplicationRecord
end
def self.find_or_initialize_from_blob_entry(blob_entry)
blob_file =
BlobFile.find_or_initialize_by(sha256: blob_entry.sha256) do |blob_file|
blob_file.content_type = blob_entry.content_type
blob_file.content_bytes = blob_entry.contents
blob_file.created_at = blob_entry.created_at
end
blob_file
BlobFile.find_or_initialize_by(sha256: blob_entry.sha256) do |blob_file|
blob_file.content_type = blob_entry.content_type
blob_file.content_bytes = blob_entry.contents
blob_file.created_at = blob_entry.created_at
end
end
def self.initialize_from_blob_entry(blob_entry)
BlobFile.new(
sha256: blob_entry.sha256,
content_type: blob_entry.content_type,
content_bytes: blob_entry.contents,
created_at: blob_entry.created_at
)
end
def content_bytes=(content_bytes)

View File

@@ -1,5 +1,6 @@
class Domain::Fa::Fav < ReduxApplicationRecord
self.table_name = "domain_fa_favs"
self.primary_keys = :user_id, :post_id
belongs_to :user, class_name: "::Domain::Fa::User"
belongs_to :post, class_name: "::Domain::Fa::Post"
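Since the model now declares self.primary_keys (plural), the composite_primary_keys gem is implied; assuming that, a fav row is addressed by its (user_id, post_id) pair rather than a surrogate id. A sketch, where user and post are placeholder records:

# Sketch only; relies on the composite key declared above.
fav = Domain::Fa::Fav.find_or_create_by!(user_id: user.id, post_id: post.id)
fav.destroy! # deleted by the composite (user_id, post_id) key; no id column involved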

View File

@@ -40,11 +40,13 @@
<% if similar %>
<% similar.each do |factor| %>
<% post = factor.post %>
<div class='flex flex-row py-1 px-2 border-b-2 last:border-b-0'>
<span class='text-md italic'><%= link_to post.title, domain_fa_post_path(post.fa_id), class: 'underline' %></span> -
<%= render "domain/fa/users/inline_link", user: post.creator %>
<div class='flex flex-row justify-between items-center py-1 px-2 border-b-2 last:border-b-0'>
<div class='flex flex-row items-center'>
<span class='text-md italic'><%= link_to post.title, domain_fa_post_path(post.fa_id), class: 'underline' %></span>
<span class='italic ml-2'>by <%= render "domain/fa/users/inline_link", user: post.creator %></span>
</div>
<span class='text-sm text-slate-500 ml-2'>
(distance: <%= number_with_precision(factor.neighbor_distance, precision: 3) %>)
(distance: <%= number_with_precision(factor.neighbor_distance, precision: 5) %>)
</span>
</div>
<% end %>

View File

@@ -1,5 +1,6 @@
class CreateDomainFaFollows < ActiveRecord::Migration[7.0]
def change
# pk removed in 20230523162724_remove_pk_from_follows.rb
create_table :domain_fa_follows do |t|
t.references :follower, null: false
t.references :followed, null: false

View File

@@ -1,5 +1,6 @@
class CreateDomainFaFavs < ActiveRecord::Migration[7.0]
def change
# pk removed in 20241220174922_remove_pk_from_domain_fa_favs.rb
create_table :domain_fa_favs do |t|
t.references :user, null: false
t.references :post, null: false

View File

@@ -0,0 +1,12 @@
class RemovePkFromDomainFaFavs < ActiveRecord::Migration[7.0]
def up
add_index :domain_fa_favs, %i[user_id post_id], unique: true
remove_column :domain_fa_favs, :id, :bigint, before: :user_id
end
def down
add_column :domain_fa_favs, :id, :bigint, primary_key: true, first: true
add_index :domain_fa_favs, :id
remove_index :domain_fa_favs, %i[user_id post_id]
end
end

db/schema.rb (generated): 232
View File

@@ -10,25 +10,15 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
ActiveRecord::Schema[7.0].define(version: 2024_12_20_174922) do
# These are extensions that must be enabled in order to support this database
enable_extension "pg_prewarm"
enable_extension "pg_stat_statements"
enable_extension "pg_trgm"
enable_extension "pgcrypto"
enable_extension "plpgsql"
enable_extension "vector"
create_table "blob_entries", id: false, force: :cascade do |t|
t.binary "sha256", null: false
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
t.binary "contents", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["sha256"], name: "index_blob_entries_on_sha256", unique: true
end
create_table "blob_entries_p", id: false, force: :cascade do |t|
t.binary "sha256", null: false
t.binary "base_sha256"
@@ -39,8 +29,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_on_sha256", unique: true
end
create_table "blob_entries_p_00", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_00", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -49,8 +38,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_00_on_sha256", unique: true
end
create_table "blob_entries_p_01", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_01", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -59,8 +47,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_01_on_sha256", unique: true
end
create_table "blob_entries_p_02", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_02", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -69,8 +56,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_02_on_sha256", unique: true
end
create_table "blob_entries_p_03", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_03", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -79,8 +65,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_03_on_sha256", unique: true
end
create_table "blob_entries_p_04", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_04", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -89,8 +74,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_04_on_sha256", unique: true
end
create_table "blob_entries_p_05", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_05", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -99,8 +83,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_05_on_sha256", unique: true
end
create_table "blob_entries_p_06", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_06", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -109,8 +92,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_06_on_sha256", unique: true
end
create_table "blob_entries_p_07", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_07", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -119,8 +101,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_07_on_sha256", unique: true
end
create_table "blob_entries_p_08", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_08", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -129,8 +110,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_08_on_sha256", unique: true
end
create_table "blob_entries_p_09", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_09", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -139,8 +119,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_09_on_sha256", unique: true
end
create_table "blob_entries_p_10", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_10", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -149,8 +128,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_10_on_sha256", unique: true
end
create_table "blob_entries_p_11", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_11", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -159,8 +137,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_11_on_sha256", unique: true
end
create_table "blob_entries_p_12", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_12", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -169,8 +146,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_12_on_sha256", unique: true
end
create_table "blob_entries_p_13", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_13", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -179,8 +155,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_13_on_sha256", unique: true
end
create_table "blob_entries_p_14", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_14", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -189,8 +164,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_14_on_sha256", unique: true
end
create_table "blob_entries_p_15", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_15", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -199,8 +173,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_15_on_sha256", unique: true
end
create_table "blob_entries_p_16", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_16", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -209,8 +182,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_16_on_sha256", unique: true
end
create_table "blob_entries_p_17", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_17", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -219,8 +191,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_17_on_sha256", unique: true
end
create_table "blob_entries_p_18", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_18", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -229,8 +200,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_18_on_sha256", unique: true
end
create_table "blob_entries_p_19", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_19", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -239,8 +209,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_19_on_sha256", unique: true
end
create_table "blob_entries_p_20", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_20", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -249,8 +218,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_20_on_sha256", unique: true
end
create_table "blob_entries_p_21", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_21", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -259,8 +227,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_21_on_sha256", unique: true
end
create_table "blob_entries_p_22", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_22", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -269,8 +236,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_22_on_sha256", unique: true
end
create_table "blob_entries_p_23", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_23", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -279,8 +245,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_23_on_sha256", unique: true
end
create_table "blob_entries_p_24", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_24", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -289,8 +254,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_24_on_sha256", unique: true
end
create_table "blob_entries_p_25", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_25", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -299,8 +263,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_25_on_sha256", unique: true
end
create_table "blob_entries_p_26", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_26", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -309,8 +272,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_26_on_sha256", unique: true
end
create_table "blob_entries_p_27", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_27", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -319,8 +281,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_27_on_sha256", unique: true
end
create_table "blob_entries_p_28", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_28", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -329,8 +290,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_28_on_sha256", unique: true
end
create_table "blob_entries_p_29", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_29", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -339,8 +299,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_29_on_sha256", unique: true
end
create_table "blob_entries_p_30", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_30", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -349,8 +308,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_30_on_sha256", unique: true
end
create_table "blob_entries_p_31", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_31", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -359,8 +317,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_31_on_sha256", unique: true
end
create_table "blob_entries_p_32", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_32", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -369,8 +326,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_32_on_sha256", unique: true
end
create_table "blob_entries_p_33", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_33", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -379,8 +335,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_33_on_sha256", unique: true
end
create_table "blob_entries_p_34", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_34", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -389,8 +344,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_34_on_sha256", unique: true
end
create_table "blob_entries_p_35", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_35", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -399,8 +353,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_35_on_sha256", unique: true
end
create_table "blob_entries_p_36", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_36", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -409,8 +362,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_36_on_sha256", unique: true
end
create_table "blob_entries_p_37", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_37", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -419,8 +371,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_37_on_sha256", unique: true
end
create_table "blob_entries_p_38", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_38", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -429,8 +380,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_38_on_sha256", unique: true
end
create_table "blob_entries_p_39", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_39", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -439,8 +389,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_39_on_sha256", unique: true
end
create_table "blob_entries_p_40", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_40", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -449,8 +398,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_40_on_sha256", unique: true
end
create_table "blob_entries_p_41", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_41", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -459,8 +407,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_41_on_sha256", unique: true
end
create_table "blob_entries_p_42", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_42", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -469,8 +416,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_42_on_sha256", unique: true
end
create_table "blob_entries_p_43", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_43", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -479,8 +425,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_43_on_sha256", unique: true
end
create_table "blob_entries_p_44", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_44", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -489,8 +434,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_44_on_sha256", unique: true
end
create_table "blob_entries_p_45", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_45", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -499,8 +443,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_45_on_sha256", unique: true
end
create_table "blob_entries_p_46", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_46", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -509,8 +452,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_46_on_sha256", unique: true
end
create_table "blob_entries_p_47", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_47", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -519,8 +461,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_47_on_sha256", unique: true
end
create_table "blob_entries_p_48", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_48", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -529,8 +470,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_48_on_sha256", unique: true
end
create_table "blob_entries_p_49", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_49", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -539,8 +479,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_49_on_sha256", unique: true
end
create_table "blob_entries_p_50", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_50", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -549,8 +488,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_50_on_sha256", unique: true
end
create_table "blob_entries_p_51", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_51", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -559,8 +497,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_51_on_sha256", unique: true
end
create_table "blob_entries_p_52", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_52", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -569,8 +506,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_52_on_sha256", unique: true
end
create_table "blob_entries_p_53", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_53", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -579,8 +515,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_53_on_sha256", unique: true
end
create_table "blob_entries_p_54", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_54", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -589,8 +524,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_54_on_sha256", unique: true
end
create_table "blob_entries_p_55", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_55", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -599,8 +533,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_55_on_sha256", unique: true
end
create_table "blob_entries_p_56", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_56", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -609,8 +542,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_56_on_sha256", unique: true
end
create_table "blob_entries_p_57", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_57", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -619,8 +551,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_57_on_sha256", unique: true
end
create_table "blob_entries_p_58", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_58", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -629,8 +560,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_58_on_sha256", unique: true
end
create_table "blob_entries_p_59", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_59", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -639,8 +569,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_59_on_sha256", unique: true
end
create_table "blob_entries_p_60", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_60", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -649,8 +578,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_60_on_sha256", unique: true
end
create_table "blob_entries_p_61", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_61", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -659,8 +587,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_61_on_sha256", unique: true
end
create_table "blob_entries_p_62", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_62", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -669,8 +596,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["sha256"], name: "index_blob_entries_p_62_on_sha256", unique: true
end
create_table "blob_entries_p_63", id: false, force: :cascade do |t|
t.binary "sha256", null: false
create_table "blob_entries_p_63", primary_key: "sha256", id: :binary, force: :cascade do |t|
t.binary "base_sha256"
t.string "content_type", null: false
t.integer "size", null: false
@@ -1343,7 +1269,9 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.datetime "updated_at"
t.string "signature"
t.jsonb "args"
t.index ["priority", "run_at"], name: "delayed_jobs_priority"
t.index ["priority", "run_at"], name: "delayed_jobs_priority_run_at_idx"
t.index ["queue"], name: "delayed_jobs_queue_idx"
t.index ["signature"], name: "delayed_jobs_signature_idx", unique: true
end
create_table "domain_e621_post_versions", force: :cascade do |t|
@@ -1402,10 +1330,10 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["name"], name: "index_domain_e621_tags_on_name", unique: true
end
create_table "domain_fa_favs", force: :cascade do |t|
create_table "domain_fa_favs", id: false, force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "post_id", null: false
t.index ["post_id"], name: "index_domain_fa_favs_on_post_id"
t.index ["user_id", "post_id"], name: "index_domain_fa_favs_on_user_id_and_post_id", unique: true
t.index ["user_id"], name: "index_domain_fa_favs_on_user_id"
end
@@ -1447,7 +1375,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.jsonb "state_detail"
t.jsonb "log_entry_detail"
t.index ["creator_id"], name: "index_domain_fa_posts_on_creator_id"
t.index ["fa_id"], name: "index_domain_fa_posts_on_fa_id", unique: true
t.index ["fa_id", "id"], name: "index_domain_fa_posts_on_fa_id", unique: true
t.index ["file_id"], name: "index_domain_fa_posts_on_file_id"
end
@@ -1507,7 +1435,9 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.jsonb "state_detail"
t.datetime "scanned_follows_at"
t.datetime "scanned_favs_at"
t.index ["name"], name: "domain_fa_users_name_idx", opclass: :gist_trgm_ops, using: :gist
t.index ["name"], name: "index_domain_fa_users_on_name", unique: true
t.index ["url_name"], name: "domain_fa_users_url_name_idx", opclass: :gist_trgm_ops, using: :gist
t.index ["url_name"], name: "index_domain_fa_users_on_url_name", unique: true
end
@@ -1603,6 +1533,13 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.datetime "updated_at", null: false
end
create_table "domain_inkbunny_user_avatars", force: :cascade do |t|
t.bigint "user_id", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["user_id"], name: "index_domain_inkbunny_user_avatars_on_user_id"
end
create_table "domain_inkbunny_users", force: :cascade do |t|
t.integer "state", null: false
t.json "state_detail"
@@ -1658,7 +1595,6 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.integer "state"
t.json "state_detail"
t.json "raw_data"
t.integer "tw_id"
t.string "name", null: false
t.string "nick"
t.string "description"
@@ -1669,6 +1605,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.datetime "scanned_timeline_at"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.bigint "tw_id"
t.index ["name"], name: "index_domain_twitter_users_on_name", unique: true
t.index ["tw_id"], name: "index_domain_twitter_users_on_tw_id", unique: true
end
@@ -1838,9 +1775,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
t.index ["item_type", "item_id"], name: "index_versions_on_item_type_and_item_id"
end
add_foreign_key "blob_entries", "blob_entries", column: "base_sha256", primary_key: "sha256"
add_foreign_key "domain_e621_post_versions", "domain_e621_posts", column: "item_id"
add_foreign_key "domain_fa_favs", "domain_fa_posts", column: "post_id"
add_foreign_key "domain_fa_favs", "domain_fa_users", column: "user_id"
add_foreign_key "domain_fa_follows", "domain_fa_users", column: "followed_id"
add_foreign_key "domain_fa_follows", "domain_fa_users", column: "follower_id"
@@ -1860,8 +1795,7 @@ ActiveRecord::Schema[7.0].define(version: 2024_12_19_201430) do
add_foreign_key "domain_inkbunny_pool_joins", "domain_inkbunny_posts", column: "post_id"
add_foreign_key "domain_twitter_medias", "domain_twitter_tweets", column: "tweet_id"
add_foreign_key "domain_twitter_medias", "http_log_entries", column: "file_id"
add_foreign_key "domain_twitter_tweets", "domain_twitter_users", column: "author_id"
add_foreign_key "domain_twitter_user_versions", "domain_twitter_users", column: "item_id"
add_foreign_key "domain_twitter_tweets", "domain_twitter_users", column: "author_id", primary_key: "tw_id", name: "on_author_id"
add_foreign_key "http_log_entries", "http_log_entries", column: "caused_by_id"
add_foreign_key "http_log_entries", "http_log_entry_headers", column: "request_headers_id"
add_foreign_key "http_log_entries", "http_log_entry_headers", column: "response_headers_id"

View File

@@ -12,3 +12,6 @@ format-changed:
format-all:
find . -name '*.rb' -type f -exec sh -c 'stree format "$1" > "$1.tmp" && mv "$1.tmp" "$1"' sh {} \;
psql-dump-domain-fa-favs:
@psql -P pager=off -c 'select user_id, post_id, 1 from domain_fa_favs limit 10000000;' -d redux_prod -h 10.166.33.171 -U scraper_redux -t -A -F ' '

View File

@@ -5,65 +5,71 @@ namespace :blob_file do
task migrate_blob_entries: %i[environment] do
batch_size = ENV["batch_size"]&.to_i || 1000
profile = ENV["profile"] == "true" || false
start_at = ENV["start_at"] || "0" * 64
num_migrated = 0
puts "batch_size: #{batch_size}"
RubyProf.start if profile
def migrate_impl(batch_size, start_at, stop_at)
def migrate_impl(batch_size, start_at)
num_migrated = 0
num_processed = 0
start_time = Time.now
BlobEntryP
.where("sha256 NOT IN (SELECT sha256 FROM blob_files)")
.includes(:base)
.find_in_batches(
batch_size: batch_size,
start: HexUtil.hex2bin(start_at),
finish: HexUtil.hex2bin(stop_at)
) do |batch|
batch_migrated = insert_blob_entries_batch(batch)
num_migrated += batch_migrated
rate = batch_migrated.to_f / (Time.now - start_time)
puts "migrated #{batch_migrated} @ #{rate.round(1)}/second blob entries [last: #{HexUtil.bin2hex(batch.last.sha256)}]"
start_time = Time.now
end
BlobEntryP.in_batches(
of: batch_size,
start: HexUtil.hex2bin(start_at),
order: :asc
) do |batch|
batch_migrated = insert_blob_entries_batch(batch)
num_migrated += batch_migrated
num_processed += batch.size
rate = batch_migrated.to_f / (Time.now - start_time)
puts "migrated #{batch_migrated}, processed #{num_processed} @ #{rate.round(1)}/second [last: #{HexUtil.bin2hex(batch.last.sha256)}]"
start_time = Time.now
end
num_migrated
end
def insert_blob_entries_batch(batch)
num_migrated = 0
blob_entry_sha256s = batch.pluck(:sha256)
blob_file_sha256s =
BlobFile.where(sha256: blob_entry_sha256s).pluck(:sha256)
missing_sha256s = blob_entry_sha256s - blob_file_sha256s
BlobFile.transaction do
batch.each do |blob_entry|
blob_file = BlobFile.find_or_initialize_from_blob_entry(blob_entry)
sha256_hex = HexUtil.bin2hex(blob_file.sha256)
begin
unless blob_file.persisted?
BlobEntryP
.where(sha256: missing_sha256s)
.each do |blob_entry|
blob_file = BlobFile.initialize_from_blob_entry(blob_entry)
sha256_hex = HexUtil.bin2hex(blob_file.sha256)
begin
blob_file.save!
num_migrated += 1
rescue => e
puts "error saving blob file #{sha256_hex}: #{e}"
end
rescue => e
puts "error saving blob file #{sha256_hex}: #{e}"
end
end
end
num_migrated
end
def start_thread(batch_size, start_at, stop_at)
Thread.new { migrate_impl(batch_size, start_at, stop_at) }
def start_thread(batch_size, start_at)
Thread.new { migrate_impl(batch_size, start_at) }
end
num_ractors = 4
skip = (0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF / num_ractors) + 1
num_threads = 1
# skip = ((0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF) / num_threads) + 1
num_migrated =
(0...num_ractors)
(0...num_threads)
.map do |i|
# partition the entire sha256 space into num_ractors chunks
# each chunk is 256 / num_ractors in size
start_at = (skip * i).to_s(16).rjust(32, "0")
stop_at = ((skip * (i + 1)) - 1).to_s(16).rjust(32, "0")
puts "migrate #{start_at} -> #{stop_at}"
start_thread(batch_size, start_at, stop_at)
# partition the entire sha256 space into num_threads chunks
# each chunk is 256 / num_threads in size
# start_at = (skip * i).to_s(16).rjust(32, "0")
# stop_at = ((skip * (i + 1)) - 1).to_s(16).rjust(32, "0")
puts "migrate #{start_at}"
start_thread(batch_size, start_at)
end
.map(&:value)
.sum

View File

@@ -147,6 +147,34 @@ namespace :fa do
worker.write_factors
end
desc "dump user follows to stdout, formatted for mf-train"
task dump_user_follows: %i[set_logger_stdout environment] do
relation = Domain::Fa::Follow.all
total = relation.count
$stderr.puts "dumping #{total} follows..."
# print progress bar to stderr
progress =
ProgressBar.create(
total: total,
format: "%t: %c/%C %B %p%% %a %e",
output: $stderr,
throttle_rate: 0.2
)
# buffer stdout writes for throughput; flushed explicitly after the loop
$stdout.sync = false
relation.in_batches(of: 100_000) do |relation|
values = relation.pluck(:follower_id, :followed_id)
values.each do |follower_id, followed_id|
$stdout.puts "#{follower_id} #{followed_id} 1"
end
progress.progress += values.size
end
$stdout.flush
end
desc "Import existing FA posts"
task :import_existing, [:start_at] => [:environment] do |t, args|
batch_size = args[:batch_size]&.to_i || ENV["batch_size"]&.to_i
@@ -273,7 +301,9 @@ namespace :fa do
tables =
ENV["tables"] ||
raise("'tables' required (all, users, follows, favs, posts)")
raise(
"'tables' required (all, #{Domain::Fa::SqliteExporter::TABLES.keys.join(", ")})"
)
tables = tables.split(",").map(&:to_sym)
db = SQLite3::Database.new(outfile)
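A hedged sketch of what the new dump_user_follows task above streams (grounded in the task body): one "follower_id followed_id 1" triple per line on stdout, with the progress bar on stderr, so the output can be redirected straight into an mf-train input file. The same loop can be driven from a console when the rake wrapper is not wanted; the filename below is arbitrary.

File.open("follows.txt", "w") do |out|
  Domain::Fa::Follow.in_batches(of: 100_000) do |batch|
    batch.pluck(:follower_id, :followed_id).each do |follower_id, followed_id|
      out.puts "#{follower_id} #{followed_id} 1"
    end
  end
end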

View File

@@ -0,0 +1,2 @@
describe Domain::Fa::SqliteExporter do
end
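The new spec file is still empty; as a possible first example (a sketch, assuming RSpec-style matchers and that the sqlite3 gem is loaded by the spec helper; nil stands in for the sample argument), the constructor's unknown-table guard can be exercised without touching the database models:

describe Domain::Fa::SqliteExporter do
  it "rejects table names outside TABLES" do
    db = SQLite3::Database.new(":memory:")
    expect {
      Domain::Fa::SqliteExporter.new(db, nil, [:not_a_table])
    }.to raise_error(/unknown table/)
  end
end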

View File

@@ -62,7 +62,7 @@ class BlobFileTest < ActiveSupport::TestCase
test "from an initialized BlobEntryP" do
blob_entry = TestUtil.build_blob_entry
blob_file = BlobFile.find_or_initialize_from_blob_entry(blob_entry)
blob_file = BlobFile.initialize_from_blob_entry(blob_entry)
assert blob_file.save
assert_equal blob_file.content_bytes, blob_entry.contents
assert_equal blob_file.content_type, blob_entry.content_type