migrate fa posts to json_attributes

This commit is contained in:
Dylan Knutson
2025-02-02 03:43:19 +00:00
parent c4f0a73cfd
commit bcd845759e
35 changed files with 4193 additions and 369 deletions

View File

@@ -62,7 +62,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
sh -c 'echo "deb [signed-by=/usr/share/postgresql-common/pgdg/apt.postgresql.org.asc] https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \
apt update && \
apt-get install --no-install-recommends --no-install-suggests -y \
postgresql-client-17
postgresql-client-15
# Install & configure delta diff tool
RUN wget -O- https://github.com/dandavison/delta/releases/download/0.18.2/git-delta_0.18.2_amd64.deb > /tmp/git-delta.deb && \

View File

@@ -1,7 +1,7 @@
FROM postgres:17
FROM postgres:15
RUN apt-get update && apt-get install -y \
postgresql-17-pgvector \
postgresql-15-pgvector \
&& rm -rf /var/lib/apt/lists/*
COPY create-tablespaces.bash /docker-entrypoint-initdb.d/00-create-tablespaces.bash

View File

@@ -10,14 +10,19 @@ class Domain::Fa::Job::Base < Scraper::JobBase
sig { params(args: T.untyped).void }
def initialize(*args)
super(*T.unsafe(args))
@force_scan = T.let(false, T::Boolean)
@user = T.let(nil, T.nilable(Domain::Fa::User))
@created_user = T.let(false, T::Boolean)
@posts_enqueued_for_scan = T.let(Set.new, T::Set[Integer])
@force_scan = T.let(false, T::Boolean)
end
protected
sig { returns(T::Boolean) }
def force_scan?
!!arguments[0][:force_scan]
end
sig do
params(
args: T.untyped,
@@ -211,9 +216,9 @@ class Domain::Fa::Job::Base < Scraper::JobBase
end
def update_and_save_post_from_listings_page(job_type, post, submission)
if job_type == :browse_page
post.log_entry_detail["first_browse_page_id"] ||= causing_log_entry&.id
post.first_browse_page_id ||= causing_log_entry&.id
elsif job_type == :gallery_page
post.log_entry_detail["first_gallery_page_id"] ||= causing_log_entry&.id
post.first_gallery_page_id ||= causing_log_entry&.id
else
fatal_error("unhandled job_type: #{job_type}")
end
@@ -221,8 +226,6 @@ class Domain::Fa::Job::Base < Scraper::JobBase
post.creator ||=
Domain::Fa::User.find_or_build_from_submission_parser(submission)
post.title = submission.title || fatal_error("blank title")
post.thumbnail_uri =
submission.thumb_path || fatal_error("blank thumb_path")
post.save!
end

View File

@@ -149,30 +149,26 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
.pluck(:fa_id, :id)
.to_h
posts_to_create_hashes =
submissions
.reject { |submission| existing_fa_id_to_post_id[submission.id] }
.map do |submission|
Domain::Fa::Post.hash_from_submission_parser_helper(
submission,
created_posts =
submissions.map do |submission_parser_helper|
post =
Domain::Fa::Post.find_or_initialize_by_submission_parser(
submission_parser_helper,
first_seen_log_entry: response.log_entry,
)
end
post.enqueue_job_after_save(
Domain::Fa::Job::ScanPostJob,
{ post:, caused_by_entry: causing_log_entry },
)
post
end
created_post_ids = []
created_post_ids =
Domain::Fa::Post
.insert_all!(posts_to_create_hashes, returning: %i[id fa_id])
.map { |row| row["id"] } unless posts_to_create_hashes.empty?
enqueue_new_post_scan_jobs(
posts_to_create_hashes.map { |hash| hash[:fa_id] },
)
bulk_enqueue_jobs { created_posts.each(&:save!) }
@last_page_post_ids = Set.new
created_post_ids.each do |id|
@seen_post_ids.add(id)
@last_page_post_ids.add(id)
created_posts.each do |post|
@seen_post_ids.add(T.must(post.id))
@last_page_post_ids.add(T.must(post.id))
end
existing_fa_id_to_post_id.values.each do |id|
@seen_post_ids.add(id)
@@ -187,13 +183,4 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
ret
end
sig { params(fa_ids: T::Array[Integer]).void }
def enqueue_new_post_scan_jobs(fa_ids)
bulk_enqueue_jobs do
fa_ids.each do |fa_id|
defer_job(Domain::Fa::Job::ScanPostJob, { fa_id: fa_id })
end
end
end
end
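The enqueue path changes shape in this file: instead of insert_all! followed by a separate fa_id-based pass that deferred a ScanPostJob per fa_id, each post now registers its follow-up job before save, and all saves run inside one batching block. A condensed sketch of the new flow (the batching semantics of bulk_enqueue_jobs are assumed from its name, not confirmed by this diff):

created_posts.each do |post|
  post.enqueue_job_after_save(
    Domain::Fa::Job::ScanPostJob,
    { post:, caused_by_entry: causing_log_entry },
  )
end

# save! fires each registered job; the surrounding block batches the enqueues
bulk_enqueue_jobs { created_posts.each(&:save!) }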

View File

@@ -52,7 +52,7 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::Base
if is_unresolvable_host
logger.error("host is #{file_uri_host}, which will not resolve")
post.state = :file_error
post.state_detail = { file_error: "#{file_uri_host} is unresolvable" }
post.scan_file_error = "unresolvable host"
post.save!
return
end
@@ -66,17 +66,11 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::Base
response = http_client.get(post.file_uri.to_s)
if response.status_code == 404
post.state_detail["404_count"] ||= 0
post.state_detail["404_count"] += 1
fof_count = (post.state_detail["404_count"] || 0)
if fof_count > 1
post.state = :file_error
post.state_detail["file_error"] = "too many 404s"
post.save!
logger.error "too many 404s (#{fof_count.to_s.bold}), aborting"
return
end
post.state = :file_error
post.scan_file_error = "404"
post.save!
logger.error "404, aborting"
return
end
if response.status_code != 200

View File

@@ -2,15 +2,8 @@
class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
queue_as :fa_post
sig { params(args: T.untyped).void }
def initialize(*args)
super
end
sig { override.params(args: T::Hash[Symbol, T.untyped]).void }
def perform(args)
@force_scan = !!args[:force_scan]
post =
T.cast(
args[:post] ||
@@ -23,16 +16,9 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
logger.prefix =
proc { "[fa_id #{post.fa_id.to_s.bold} / #{post.state&.bold}]" }
if post.state == "ok" && !post.scanned? || @force_scan
if (post.state_detail["404_count"] || 0) > 2
logger.info "too many 404s, skipping"
return
end
scan_post(post) if (post.state == "ok" && !post.scanned?) || force_scan?
scan_post(post)
end
if (post.state == "ok" && post.file_uri && post.file.nil?) || @force_scan
if (post.state == "ok" && post.file_uri && post.file.nil?) || force_scan?
logger.info("enqueue file job (#{self.priority})")
defer_job(
Domain::Fa::Job::ScanFileJob,
@@ -60,13 +46,19 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
def scan_post(post)
response =
http_client.get("https://www.furaffinity.net/view/#{post.fa_id}/")
if response.status_code == 404
post.state = :scan_error
post.save!
return
end
if response.status_code != 200
fatal_error(
"error scanning fa_id #{post.fa_id}: #{response.status_code}, log entry #{response.log_entry.id}",
)
end
page = Domain::Fa::Parser::Page.new(response.body)
page = Domain::Fa::Parser::Page.new(response.body, require_logged_in: false)
if page.submission_not_found?
logger.error("post was removed")
post.state = :removed
@@ -74,6 +66,8 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
return
end
fatal_error("submission page is not logged in") unless page.logged_in?
unless page.probably_submission?
if response.body =~
/The page you are trying to reach is currently pending deletion/
@@ -103,7 +97,7 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
invalid: :replace,
undef: :replace,
)
post.keywords = submission.keywords_array
post.keywords = submission.keywords_array || []
post.file_uri = submission.full_res_img
post.theme = submission.theme
post.species = submission.species
@@ -112,8 +106,10 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
post.num_comments = submission.num_comments
post.num_views = submission.num_views
post.posted_at = submission.posted_date
post.scanned_at = Time.now
post.scanned_at = DateTime.current
post.save!
end
end

app/lib/domain.rb Normal file
View File

@@ -0,0 +1,4 @@
module Domain
module E621
end
end

View File

@@ -1,5 +1,7 @@
# typed: true
class Domain::Fa::Parser::Base
extend T::Sig
attr_reader :page_version
def unimplemented_version!

View File

@@ -63,6 +63,7 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
logged_in_user_elem ? true : false
end
sig { returns(T::Array[Domain::Fa::Parser::ListedSubmissionParserHelper]) }
def submissions_parsed
@submissions_hashes ||=
submission_elems.map do |elem|

View File

@@ -0,0 +1,130 @@
# typed: strict
module JsonAttrHelpers
extend T::Sig
extend T::Helpers
requires_ancestor { ActiveRecord::Migration }
class JsonAttrMigrateContext
extend T::Sig
extend T::Helpers
sig { returns(T::Array[[String, Symbol, Symbol]]) }
attr_reader :migrate_columns
sig { returns(T::Array[[String, String, String]]) }
attr_reader :migrate_json_columns
sig { params(table_name: String, migration: ActiveRecord::Migration).void }
def initialize(table_name, migration)
@table_name = table_name
@migration = migration
@migrate_columns = T.let([], T::Array[[String, Symbol, Symbol]])
@migrate_json_columns = T.let([], T::Array[[String, String, String]])
end
sig do
params(
column_name: String,
old_field_name: String,
new_field_name: T.nilable(String),
).void
end
def migrate_json_column(column_name, old_field_name, new_field_name = nil)
new_field_name ||= old_field_name
@migrate_json_columns << [column_name, old_field_name, new_field_name]
end
sig do
params(
column_name: String,
rails_column_type: Symbol,
db_column_type: Symbol,
).void
end
def migrate_column(column_name, rails_column_type, db_column_type)
@migrate_columns << [column_name, rails_column_type, db_column_type]
end
end
sig do
params(
table_name: String,
blk: T.proc.bind(JsonAttrMigrateContext).void,
).void
end
def migrate_json_table(table_name, &blk)
T.bind(self, ActiveRecord::Migration)
context = JsonAttrMigrateContext.new(table_name, self)
context.instance_eval(&blk)
reversible do |dir|
dir.up do
native_columns =
context
.migrate_columns
.map do |column_name, rails_column_type, db_column_type|
"'#{column_name}', COALESCE(to_jsonb(#{column_name}), 'null'::jsonb)"
end
json_columns =
context
.migrate_json_columns
.map do |column_name, old_field_name, new_field_name|
"'#{new_field_name}', COALESCE(#{column_name}->>'#{old_field_name}', 'null')::jsonb"
end
built_json_attrs = native_columns + json_columns
execute <<~SQL
UPDATE #{table_name}
SET json_attributes = jsonb_strip_nulls(
json_attributes ||
jsonb_build_object(#{built_json_attrs.join(", ")})
)
SQL
end
dir.down do
set_clauses = []
context
.migrate_json_columns
.each do |column_name, old_field_name, new_field_name|
execute <<~SQL.strip
UPDATE #{table_name}
SET #{column_name} = jsonb_set(#{column_name}, '{#{old_field_name}}', json_attributes->'#{new_field_name}')
WHERE json_attributes->>'#{new_field_name}' IS NOT NULL
SQL
end
context
.migrate_columns
.each do |column_name, rails_column_type, db_column_type|
set_clauses << <<~SQL.strip
#{column_name} = (json_attributes->>'#{column_name}')::#{db_column_type}
SQL
end
execute <<~SQL
UPDATE #{table_name}
SET #{set_clauses.join(",\n")}
SQL
end
end
context
.migrate_json_columns
.map(&:first)
.uniq
.each do |column_name|
rename_column table_name, column_name, "#{column_name}_backup"
end
context
.migrate_columns
.each do |column_name, rails_column_type, db_column_type|
rename_column table_name, column_name, "#{column_name}_backup"
end
end
end
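For orientation, a minimal usage sketch of the helper above (table and field names are hypothetical, not from this commit): native columns are copied into json_attributes under their own name, fields of an existing jsonb column are copied across (optionally renamed), and every migrated source column is renamed to *_backup rather than dropped.

class MigrateWidgetsToJsonAttrs < ActiveRecord::Migration[7.2]
  include JsonAttrHelpers

  def change
    migrate_json_table("widgets") do
      # widgets.label -> json_attributes->'label'; column kept as label_backup
      migrate_column("label", :string, :text)
      # widgets.details->'legacy_key' -> json_attributes->'new_key'
      migrate_json_column("details", "legacy_key", "new_key")
    end
  end
end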

View File

@@ -1,51 +1,96 @@
# typed: strict
class UnquotedArelSqlLiteral < Arel::Nodes::SqlLiteral
extend T::Sig
sig { params(name_in_to_s: String, name_in_to_sql: String).void }
def initialize(name_in_to_s, name_in_to_sql)
super(name_in_to_s)
@name_in_to_sql = name_in_to_sql
end
sig { returns(String) }
def in_sql
@name_in_to_sql
end
end
require "arel/visitors/to_sql"
class Arel::Visitors::ToSql
extend T::Sig
alias_method :quote_column_name_original, :quote_column_name
sig { params(value: T.untyped).returns(String) }
def quote_column_name(value)
return value.in_sql if UnquotedArelSqlLiteral === value
quote_column_name_original(value)
class UnquotedArelSqlLiteral < Arel::Nodes::SqlLiteral
extend T::Sig
sig { params(name_in_to_s: String, name_in_to_sql: String).void }
def initialize(name_in_to_s, name_in_to_sql)
super(name_in_to_s)
@name_in_to_sql = name_in_to_sql
end
sig { returns(String) }
def in_sql
@name_in_to_sql
end
end
prepend(
Module.new do
extend T::Sig
sig { params(value: T.untyped).returns(String) }
def quote_column_name(value)
return value.in_sql if UnquotedArelSqlLiteral === value
super(value)
end
end,
)
end
module AttrJsonRecordAliases
extend T::Sig
extend T::Helpers
extend ActiveSupport::Concern
requires_ancestor { ActiveRecord::Base }
abstract!
included do
include AttrJson::Record
included { include AttrJson::Record }
requires_ancestor { ActiveRecord::Base }
sig { params(name: T.untyped, value: T.untyped).returns(T.untyped) }
def write_attribute(name, value)
ret = super(name, value)
registry = T.unsafe(self.class).attr_json_registry
if attribute_def = registry[name.to_sym]
public_send(attribute_def.container_attribute)[
attribute_def.store_key
] = read_attribute(name)
end
ret
sig { abstract.returns(T.class_of(ActiveRecord::Base)) }
def class
end
sig { params(attributes: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
def update_json_columns(attributes)
klass = self.class
updated_json = klass.connection.quote(JSON.dump(attributes))
klass.connection.execute(<<~SQL)
UPDATE #{klass.quoted_table_name}
SET json_attributes = json_attributes || #{updated_json}
WHERE #{_query_constraints_hash.map { |k, v| "#{klass.connection.quote_column_name(k)} = #{klass.connection.quote(v)}" }.join(" AND ")}
SQL
end
sig { params(name: T.untyped, value: T.untyped).returns(T.untyped) }
def write_attribute(name, value)
klass = self.class
ret = super(name, value)
registry =
T.cast(
T.unsafe(klass).attr_json_registry,
AttrJson::AttributeDefinition::Registry,
)
if attribute_def = registry[name.to_sym]
public_send(attribute_def.container_attribute)[
attribute_def.store_key
] = read_attribute(name)
end
ret
end
module ClassMethods
extend T::Sig
extend T::Helpers
requires_ancestor { T.class_of(ActiveRecord::Base) }
sig { params(attr_name: Symbol).void }
def json_attributes_scope(attr_name)
scope :"where_#{attr_name}",
->(expr, *binds) do
where("json_attributes->>'#{attr_name}' #{expr}", binds)
end
scope :"order_#{attr_name}",
->(dir) do
unless [:asc, :desc, nil].include?(dir)
raise("invalid direction: #{dir}")
end
order(Arel.sql "json_attributes->>'#{attr_name}' #{dir}")
end
end
sig do
@@ -55,17 +100,19 @@ module AttrJsonRecordAliases
options: T.untyped,
).void
end
def self.attr_json(name, type, **options)
T.bind(self, T.class_of(ActiveRecord::Base))
super(name, type, **options)
def attr_json(name, type, **options)
super(name, type, **options)
json_attributes_scope(name)
self.attribute_aliases =
self.attribute_aliases.merge(
name.to_s =>
UnquotedArelSqlLiteral.new(
Arel::Visitors::ToSql::UnquotedArelSqlLiteral.new(
name.to_s,
"json_attributes->>'#{name}'",
),
)
end
end
mixes_in_class_methods(ClassMethods)
end
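A sketch of what an including model gains (model and values are illustrative; the scopes and attribute alias are the ones defined above):

class Example < ReduxApplicationRecord # hypothetical model
  include AttrJsonRecordAliases
  attr_json :title, :string # also defines where_title / order_title scopes
end

Example.where_title("ILIKE ?", "%slayer%") # json_attributes->>'title' ILIKE ...
Example.order_title(:desc) # ORDER BY json_attributes->>'title' DESC
Example.where(title: "Slayer") # the alias rewrites title to json_attributes->>'title'

# Raw jsonb merge that bypasses callbacks and validations:
Example.find(1).update_json_columns(title: "New Title")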

View File

@@ -22,7 +22,7 @@ class Domain::E621::Post < ReduxApplicationRecord
validates_presence_of(:e621_id, :state)
after_initialize do
self.state = :ok unless self.state.present?
self.state_detail ||= {}
# self.state_detail ||= {}
self.flags_array ||= []
self.pools_array ||= []
self.sources_array ||= []
@@ -70,7 +70,6 @@ class Domain::E621::Post < ReduxApplicationRecord
attr_json :scan_error, :string
attr_json :file_error, FileError.to_type
json_attributes_scope :last_index_page_id
belongs_to :index_page_http_log_entry,
class_name: "HttpLogEntry",
optional: true,
@@ -104,16 +103,4 @@ class Domain::E621::Post < ReduxApplicationRecord
def file_uri
Addressable::URI.parse(self.file_url_str) if self.file_url_str.present?
end
# sig { returns(T.nilable(Time)) }
# def e621_updated_at
# str = state_detail["e621_updated_at"]
# Time.parse(str) if str
# end
# sig { params(time: T.any(Time, String)).void }
# def e621_updated_at=(time)
# time = Time.parse(time) if time.is_a?(String)
# state_detail["e621_updated_at"] = time.iso8601
# end
end

View File

@@ -4,6 +4,7 @@ class Domain::Fa::Post < ReduxApplicationRecord
include HasIndexedPost
include Pundit::Authorization
include AttrJsonRecordAliases
enum :state,
[
@@ -11,17 +12,9 @@ class Domain::Fa::Post < ReduxApplicationRecord
:removed, # post has been removed
:scan_error, # error scanning post page (see state_detail)
:file_error, # error downloading post file (see state_detail)
]
],
default: :ok
validates_inclusion_of(:state, in: self.states.keys)
after_initialize do
self.state_detail ||= {}
if self.state_detail == "d9.facdn.net is unresolvable"
self.state_detail = { file_error: "d9.facdn.net is unresolvable" }
end
self.log_entry_detail ||= {}
self.state ||= :ok
end
validates_presence_of(:fa_id, :state)
belongs_to :creator,
@@ -46,6 +39,33 @@ class Domain::Fa::Post < ReduxApplicationRecord
foreign_key: :post_id,
dependent: :destroy
attr_json :title, :string
attr_json :category, :string
attr_json :theme, :string
attr_json :species, :string
attr_json :gender, :string
attr_json :description, :string
attr_json :keywords, :string, array: true, default: []
attr_json :num_favorites, :integer
attr_json :num_comments, :integer
attr_json :num_views, :integer
attr_json :posted_at, :datetime
attr_json :scanned_at, :datetime
attr_json :last_user_page_id, :integer
attr_json :last_submission_page_id, :integer
attr_json :first_browse_page_id, :integer
attr_json :first_gallery_page_id, :integer
attr_json :first_seen_entry_id, :integer
attr_json :scan_file_error, :string
belongs_to :last_user_page, class_name: "HttpLogEntry", optional: true
belongs_to :last_submission_page, class_name: "HttpLogEntry", optional: true
belongs_to :first_browse_page, class_name: "HttpLogEntry", optional: true
belongs_to :first_gallery_page, class_name: "HttpLogEntry", optional: true
belongs_to :first_seen_entry, class_name: "HttpLogEntry", optional: true
sig { returns(String) }
def to_param
self.fa_id.to_s
@@ -67,70 +87,51 @@ class Domain::Fa::Post < ReduxApplicationRecord
end
end
sig { returns(T.nilable(Addressable::URI)) }
def thumbnail_uri
if self.state_detail["thumbnail_url_str"]
Addressable::URI.parse(self.state_detail["thumbnail_url_str"])
else
nil
end
end
sig { params(uri: T.nilable(T.any(String, Addressable::URI))).void }
def thumbnail_uri=(uri)
if uri
uri = Addressable::URI.parse(uri)
uri.scheme = "https" if uri.scheme.blank?
self.state_detail["thumbnail_url_str"] = uri.to_s
else
self.state_detail["thumbnail_url_str"] = nil
end
end
sig { returns(T::Boolean) }
def scanned?
self.file_url_str.present?
end
sig { returns(T.nilable(Time)) }
sig { returns(T.nilable(ActiveSupport::TimeWithZone)) }
def scanned_at
# at some point, `scanned_at` was populated to avoid having to look up the
# post's `last_submission_page` log entry, but we fall back to that
# if scanned_at isn't populated yet
if state_detail["scanned_at"]
Time.at(state_detail["scanned_at"])
else
last_submission_page&.created_at
end
# # at some point, `scanned_at` was populated to avoid having to look up the
# # post's `last_submission_page` log entry, but we fall back to that
# # if scanned_at isn't populated yet
# if state_detail["scanned_at"]
# Time.at(state_detail["scanned_at"])
# else
# last_submission_page&.created_at
# end
super || last_submission_page&.created_at
end
sig { params(time: T.nilable(Time)).void }
def scanned_at=(time)
self.state_detail["scanned_at"] = time&.to_i
end
# sig { params(time: T.nilable(Time)).void }
# def scanned_at=(time)
# self.state_detail["scanned_at"] = time&.to_i
# end
sig { override.returns(T.nilable(ActiveSupport::TimeWithZone)) }
def posted_at
pa = super
return pa if pa
begin
contents = guess_last_submission_page&.response&.contents
if contents
parser = Domain::Fa::Parser::Page.new(contents)
parser.submission.posted_date if parser.probably_submission?
end
end
end
# sig { override.returns(T.nilable(ActiveSupport::TimeWithZone)) }
# def posted_at
# pa = super
# return pa if pa
# begin
# contents = guess_last_submission_page&.response&.contents
# if contents
# parser = Domain::Fa::Parser::Page.new(contents)
# parser.submission.posted_date if parser.probably_submission?
# end
# end
# end
sig { params(log_entry: T.nilable(HttpLogEntry)).void }
def last_submission_page=(log_entry)
self.log_entry_detail["last_submission_page_id"] = log_entry&.id
end
# sig { params(log_entry: T.nilable(HttpLogEntry)).void }
# def last_submission_page=(log_entry)
# self.last_submission_page_id = log_entry&.id
# end
sig { returns(T.nilable(HttpLogEntry)) }
def last_submission_page
HttpLogEntry.find_by(id: self.log_entry_detail["last_submission_page_id"])
end
# sig { returns(T.nilable(HttpLogEntry)) }
# def last_submission_page
# HttpLogEntry.find_by(id: self.last_submission_page_id)
# end
sig { returns(T.nilable(HttpLogEntry)) }
def guess_last_submission_page
@@ -174,9 +175,9 @@ class Domain::Fa::Post < ReduxApplicationRecord
params(
submission: T.untyped,
first_seen_log_entry: T.nilable(HttpLogEntry),
).returns(T::Hash[String, T.untyped])
).returns(Domain::Fa::Post)
end
def self.hash_from_submission_parser_helper(
def self.find_or_initialize_by_submission_parser(
submission,
first_seen_log_entry: nil
)
@@ -185,14 +186,10 @@ class Domain::Fa::Post < ReduxApplicationRecord
{ url_name: submission.artist_url_name },
) { |user| user.name = submission.artist }
{
fa_id: submission.id,
creator_id: creator.id,
title: submission.title,
state_detail: {
"first_seen_entry" => first_seen_log_entry&.id,
"thumbnail_url_str" => submission.thumb_path,
},
}
Domain::Fa::Post.find_or_initialize_by(fa_id: submission.id) do |post|
post.creator = creator
post.title = submission.title
post.first_seen_entry = first_seen_log_entry
end
end
end
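With AttrJsonRecordAliases included, reads and writes on the attributes above go through the json_attributes container rather than dedicated columns. A rough sketch (values illustrative):

post = Domain::Fa::Post.new(fa_id: 1)
post.title = "Slayer" # stored in the json_attributes container
post.json_attributes["title"] # => "Slayer"

# Scopes defined per attribute via json_attributes_scope:
Domain::Fa::Post.where_num_views("> ?", 100)
Domain::Fa::Post.order_posted_at(:desc)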

View File

@@ -29,7 +29,9 @@ class ReduxApplicationRecord < ActiveRecord::Base
# clean up the json_attributes field in console output
sig { params(attr_name: T.any(Symbol, String)).returns(T.untyped) }
def attribute_for_inspect(attr_name)
if TRUNCATE_FIELDS.include?(attr_name.to_s)
if attr_name.to_s.ends_with?("_backup")
return "(backup value)"
elsif TRUNCATE_FIELDS.include?(attr_name.to_s)
str_value = read_attribute(attr_name).inspect
str_value = "#{str_value[0, 50]}..." if str_value.length > 50
return str_value
@@ -39,23 +41,10 @@ class ReduxApplicationRecord < ActiveRecord::Base
sig { returns(T::Array[String]) }
def attributes_for_inspect
super.reject { |attr_name| attr_name.start_with?("json_attributes-") }
end
sig { params(attr_name: Symbol).void }
def self.json_attributes_scope(attr_name)
scope :"where_#{attr_name}",
->(expr, *binds) do
where("json_attributes->>'#{attr_name}' #{expr}", binds)
end
scope :"order_#{attr_name}",
->(dir) do
unless [:asc, :desc, nil].include?(dir)
raise("invalid direction: #{dir}")
end
order(Arel.sql "json_attributes->>'#{attr_name}' #{dir}")
end
super.reject do |attr_name|
attr_name.start_with?("json_attributes-") #||
# attr_name.ends_with?("_backup")
end
end
sig do

View File

@@ -6,10 +6,17 @@ json.user do
json.log_entry_detail @user.log_entry_detail
json.user_page_log_entry do
json.partial! "log_entries/show", locals: { log_entry: HttpLogEntry.find(@user.log_entry_detail["last_user_page_id"]) }
end if @user.log_entry_detail["last_user_page_id"]
if @user.log_entry_detail["last_user_page_id"]
json.user_page_log_entry do
json.partial! "log_entries/show",
locals: {
log_entry:
HttpLogEntry.find(
@user.log_entry_detail["last_user_page_id"],
),
}
end
end
json.user_stats do
json.num_pageviews @user.num_pageviews
@@ -20,10 +27,6 @@ json.user do
json.num_favorites @user.num_favorites
end
# json.gallery_page_log_entry do
# json.partial! "log_entries/show", locals: { log_entry: HttpLogEntry.find(@user.log_entry_detail["last_gallery_page_id"]) }
# end if @user.log_entry_detail["last_gallery_page_id"]
json.created_at @user.created_at
json.updated_at @user.updated_at
end

View File

@@ -47,12 +47,21 @@
</span>
</div>
<% if policy(@post).view_scraper_metadata? %>
<% if hle = guess_scanned_http_log_entry(@post) %>
<% scanned_at = @post.scanned_at %>
<% scanned_hle = @post.last_submission_page || guess_scanned_http_log_entry(@post) %>
<% scanned_at ||= scanned_hle&.requested_at %>
<% if scanned_at %>
<div class="mt-2 text-sm text-slate-500">
<%= link_to "Scanned #{time_ago_in_words(hle.requested_at)} ago",
log_entry_path(hle),
class: "text-blue-600 hover:underline",
target: "_blank" %>
<div class="mt-2 text-sm text-slate-500">
<% if scanned_hle %>
<%= link_to "Scanned #{time_ago_in_words(scanned_at)} ago",
log_entry_path(scanned_hle),
class: "text-blue-600 hover:underline",
target: "_blank" %>
<% else %>
<span> Scanned <%= time_ago_in_words(scanned_at) %> ago </span>
<% end %>
</div>
</div>
<% else %>
<div class="mt-2 text-sm text-slate-500">Unknown when post scanned</div>

View File

@@ -46,10 +46,13 @@
<% end %>
<span
class="badge bg-light text-dark"
title="<%= time_ago_in_words(post.posted_at) %> ago"
title="<%= post.posted_at ? time_ago_in_words(post.posted_at) : "(posted at unknown)" %> ago"
>
<i class="fa-regular fa-clock me-1"></i
><%= post.posted_at.strftime("%Y-%m-%d %H:%M:%S") %>
<i class="fa-regular fa-clock me-1"></i><%= if post.posted_at
post.posted_at.strftime("%Y-%m-%d %H:%M:%S")
else
"(posted at unknown)"
end %>
</span>
</div>
</div>

View File

@@ -56,7 +56,7 @@ Rails.application.configure do
config.active_support.disallowed_deprecation_warnings = []
# Raise an error on page load if there are pending migrations.
config.active_record.migration_error = :page_load
# config.active_record.migration_error = :page_load
# Highlight code that triggered database queries in logs.
config.active_record.verbose_query_logs = true

View File

@@ -1,75 +1,25 @@
# typed: strict
require_relative "../../app/lib/json_attr_helpers"
class MigrateE621ColumnsToJsonAttrs < ActiveRecord::Migration[7.2]
extend T::Sig
include JsonAttrHelpers
sig { void }
def change
migrate_column("tags_array", :jsonb, :jsonb)
migrate_column("flags_array", :jsonb, :jsonb)
migrate_column("pools_array", :jsonb, :jsonb)
migrate_column("sources_array", :jsonb, :jsonb)
migrate_column("artists_array", :jsonb, :jsonb)
migrate_state_detail("e621_updated_at")
migrate_state_detail("last_index_page_id")
migrate_state_detail("index_page_ids")
migrate_state_detail("prev_md5s")
migrate_state_detail("caused_by_entry_id")
migrate_state_detail("scan_log_entry_id")
migrate_state_detail("scan_error")
migrate_state_detail("file_error")
end
sig { params(column_name: String).void }
def migrate_state_detail(column_name)
reversible do |dir|
dir.up do
execute <<~SQL
UPDATE domain_e621_posts
SET
json_attributes = jsonb_set(json_attributes, '{#{column_name}}', state_detail->'#{column_name}')
-- , state_detail = jsonb_set(state_detail, '{#{column_name}}', 'null'::jsonb)
WHERE state_detail->'#{column_name}' IS NOT NULL
SQL
# remove_column :domain_e621_posts, :state_detail
end
dir.down do
# add_column :domain_e621_posts, column_name, rails_column_type
execute <<~SQL
UPDATE domain_e621_posts
SET
state_detail = jsonb_set(state_detail, '{#{column_name}}', json_attributes->'#{column_name}')
-- , json_attributes = jsonb_set(json_attributes, '{#{column_name}}', 'null'::jsonb)
WHERE json_attributes->>'#{column_name}' IS NOT NULL
SQL
end
end
end
sig do
params(
column_name: String,
rails_column_type: Symbol,
db_column_type: Symbol,
).void
end
def migrate_column(column_name, rails_column_type, db_column_type)
reversible do |dir|
dir.up do
execute <<~SQL
UPDATE domain_e621_posts
SET json_attributes = jsonb_set(json_attributes, '{#{column_name}}', to_json(#{column_name})::jsonb)
WHERE #{column_name} IS NOT NULL
SQL
remove_column :domain_e621_posts, column_name
end
dir.down do
add_column :domain_e621_posts, column_name, rails_column_type
execute <<~SQL
UPDATE domain_e621_posts
SET #{column_name} = (json_attributes->>'#{column_name}')::text::#{db_column_type}
WHERE json_attributes->>'#{column_name}' IS NOT NULL
SQL
end
migrate_json_table("domain_e621_posts") do
migrate_column("tags_array", :jsonb, :jsonb)
migrate_column("flags_array", :jsonb, :jsonb)
migrate_column("pools_array", :jsonb, :jsonb)
migrate_column("sources_array", :jsonb, :jsonb)
migrate_column("artists_array", :jsonb, :jsonb)
migrate_json_column("state_detail", "e621_updated_at")
migrate_json_column("state_detail", "last_index_page_id")
migrate_json_column("state_detail", "index_page_ids")
migrate_json_column("state_detail", "prev_md5s")
migrate_json_column("state_detail", "caused_by_entry_id")
migrate_json_column("state_detail", "scan_log_entry_id")
migrate_json_column("state_detail", "scan_error")
migrate_json_column("state_detail", "file_error")
end
end
end

View File

@@ -0,0 +1,5 @@
class AddJsonAttributesToFaPosts < ActiveRecord::Migration[7.2]
def change
add_column :domain_fa_posts, :json_attributes, :jsonb, default: {}
end
end

View File

@@ -0,0 +1,43 @@
# typed: strict
require_relative "../../app/lib/json_attr_helpers"
class MigrateFaPostColumnsToJsonAttrs < ActiveRecord::Migration[7.2]
extend T::Sig
include JsonAttrHelpers
sig { void }
def change
# scanned_at is a timestamp, but it was stored as an int in state_detail,
# so it needs special handling before the generic migration below
reversible { |dir| dir.up { execute <<~SQL } }
UPDATE domain_fa_posts
SET json_attributes = jsonb_set(
json_attributes,
'{scanned_at}',
to_jsonb(to_char(to_timestamp((state_detail->'scanned_at')::int), 'YYYY-MM-DD HH24:MI:SS TZH:TZM')::text)
)
WHERE state_detail->'scanned_at' IS NOT NULL
SQL
migrate_json_table("domain_fa_posts") do
migrate_column("title", :string, :text)
migrate_column("category", :string, :text)
migrate_column("theme", :string, :text)
migrate_column("species", :string, :text)
migrate_column("gender", :string, :text)
migrate_column("description", :string, :text)
migrate_column("keywords", :jsonb, :jsonb)
migrate_column("num_favorites", :integer, :integer)
migrate_column("num_comments", :integer, :integer)
migrate_column("num_views", :integer, :integer)
migrate_column("posted_at", :datetime, :timestamp)
migrate_json_column("log_entry_detail", "last_submission_page_id")
migrate_json_column("log_entry_detail", "first_browse_page_id")
migrate_json_column("log_entry_detail", "first_gallery_page_id")
migrate_json_column(
"log_entry_detail",
"first_seen_entry",
"first_seen_entry_id",
)
end
end
end
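For reference, the migrate_json_table block above expands (roughly) to a single UPDATE; the following is hand-written from the helper, not captured from a real run, and abridged to a few representative keys:

execute <<~SQL
  UPDATE domain_fa_posts
  SET json_attributes = jsonb_strip_nulls(
    json_attributes || jsonb_build_object(
      'title', COALESCE(to_jsonb(title), 'null'::jsonb),
      'num_views', COALESCE(to_jsonb(num_views), 'null'::jsonb),
      -- ...one pair per migrate_column...
      'last_submission_page_id',
      COALESCE(log_entry_detail->'last_submission_page_id', 'null'::jsonb)
    )
  )
SQL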

View File

@@ -2142,24 +2142,25 @@ CREATE TABLE public.domain_fa_posts (
id bigint NOT NULL,
fa_id integer,
creator_id bigint,
title character varying,
category character varying,
theme character varying,
species character varying,
gender character varying,
description character varying,
keywords jsonb,
num_favorites integer,
num_comments integer,
num_views integer,
posted_at timestamp(6) without time zone,
file_url_str character varying,
file_id bigint,
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL,
state integer,
state_detail jsonb,
log_entry_detail jsonb
log_entry_detail_backup jsonb,
json_attributes jsonb DEFAULT '{}'::jsonb,
posted_at_backup timestamp(6) without time zone,
num_views_backup integer,
num_comments_backup integer,
num_favorites_backup integer,
keywords_backup jsonb,
description_backup character varying,
gender_backup character varying,
species_backup character varying,
theme_backup character varying,
category_backup character varying,
title_backup character varying
);
@@ -7844,6 +7845,8 @@ ALTER TABLE ONLY public.domain_twitter_tweets
SET search_path TO "$user", public;
INSERT INTO "schema_migrations" (version) VALUES
('20250131060105'),
('20250131055824'),
('20250129174128'),
('20250129173329'),
('20250129061805'),

View File

@@ -752,9 +752,6 @@ class Domain::E621::Post
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def order(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def order_last_index_page_id(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def preload(*args, &blk); end
@@ -810,9 +807,6 @@ class Domain::E621::Post
sig { params(args: T.untyped).returns(PrivateAssociationRelation) }
def where(*args); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def where_last_index_page_id(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateAssociationRelation) }
def with(*args, &blk); end
@@ -3092,9 +3086,6 @@ class Domain::E621::Post
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def order(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def order_last_index_page_id(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def preload(*args, &blk); end
@@ -3150,9 +3141,6 @@ class Domain::E621::Post
sig { params(args: T.untyped).returns(PrivateRelation) }
def where(*args); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def where_last_index_page_id(*args, &blk); end
sig { params(args: T.untyped, blk: T.untyped).returns(PrivateRelation) }
def with(*args, &blk); end

File diff suppressed because it is too large

View File

@@ -2,6 +2,7 @@
# frozen_string_literal: true
require "./app/lib/has_color_logger"
require "./app/lib/domain"
require "./app/lib/domain/e621/tag_util"
require "./spec/helpers/debug_helpers"
require "./spec/helpers/http_client_mock_helpers"

View File

@@ -3,8 +3,6 @@ FactoryBot.define do
factory :domain_fa_post, class: "Domain::Fa::Post" do
sequence(:fa_id) { |n| n }
state { :ok }
state_detail { {} }
log_entry_detail { {} }
trait :with_creator do
association :creator, factory: :domain_fa_user

View File

@@ -28,6 +28,12 @@ describe Domain::Fa::Job::FavsJob do
[including({ url_name: "zzreg" })],
)
end
it "does not create any new posts" do
expect do
perform_now({ url_name: "zzreg" }, should_raise: true)
end.not_to change(Domain::Fa::Post, :count)
end
end
context "site indicates no favs" do
@@ -128,6 +134,37 @@ describe Domain::Fa::Job::FavsJob do
expect(user.scanned_favs_at).to be_within(1.second).of(Time.now)
end
it "enqueues post scans" do
expect do perform_now({ url_name: "zzreg" }) end.to change(
Domain::Fa::Post,
:count,
).by(5)
expect(SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanPostJob)).to match(
array_including(
{
post: Domain::Fa::Post.find_by(fa_id: 52_106_426),
caused_by_entry: @log_entries[0],
},
{
post: Domain::Fa::Post.find_by(fa_id: 36_755_337),
caused_by_entry: @log_entries[0],
},
{
post: Domain::Fa::Post.find_by(fa_id: 40_769_488),
caused_by_entry: @log_entries[0],
},
{
post: Domain::Fa::Post.find_by(fa_id: 20_808_448),
caused_by_entry: @log_entries[0],
},
{
post: Domain::Fa::Post.find_by(fa_id: 20_585_829),
caused_by_entry: @log_entries[0],
},
),
)
end
context "the user model already has favs recorded" do
let(:old_post) { Domain::Fa::Post.create(fa_id: 12_345, creator: user) }
before { user.fav_posts << old_post }

View File

@@ -0,0 +1,212 @@
# typed: false
require "rails_helper"
describe Domain::Fa::Job::ScanPostJob do
let(:http_client_mock) { instance_double("::Scraper::HttpClient") }
before do
Scraper::ClientFactory.http_client_mock = http_client_mock
@log_entries =
HttpClientMockHelpers.init_http_client_mock(
http_client_mock,
client_mock_config,
)
end
let(:post_404_fixture) do
SpecUtil.read_fixture_file("domain/fa/job/submission_404_page.html")
end
shared_context "post 59_714_213 already exists" do
let!(:post) { Domain::Fa::Post.create!(fa_id: 59_714_213) }
end
shared_context "post with no keywords fixture" do
let(:post_fixture) do
SpecUtil.read_fixture_file("domain/fa/job/submission_page_59714213.html")
end
let(:client_mock_config) do
[
{
uri: "https://www.furaffinity.net/view/59714213/",
status_code: 200,
content_type: "text/html",
contents: post_fixture,
},
]
end
end
shared_context "post with keywords fixture" do
let(:post_fixture) do
SpecUtil.read_fixture_file("domain/fa/job/submission_page_59723907.html")
end
let(:client_mock_config) do
[
{
uri: "https://www.furaffinity.net/view/59723907/",
status_code: 200,
content_type: "text/html",
contents: post_fixture,
},
]
end
end
shared_context "creator is creeps" do
let!(:creator) do
Domain::Fa::User.create!(name: "-creeps", url_name: "-creeps")
end
end
shared_context "creator is lizardlars" do
let!(:creator) do
Domain::Fa::User.create!(name: "lizardlars", url_name: "lizardlars")
end
end
context "when scanning a post" do
include_context "post with no keywords fixture"
include_context "creator is creeps"
context "and the post does not exist" do
it "creates a new post with correct attributes" do
perform_now({ fa_id: 59_714_213 })
post = Domain::Fa::Post.find_by(fa_id: 59_714_213)
expect(post).not_to be_nil
expect(post.state).to eq("ok")
expect(post.last_submission_page).to eq(@log_entries.first)
expect(post.title).to eq("Slayer")
expect(post.creator).not_to be_nil
expect(post.creator.url_name).to eq("-creeps")
expect(post.creator.name).to eq("-creeps")
expect(post.description).to include("My newest oil painting")
expect(post.category).to eq("All")
expect(post.theme).to eq("All")
expect(post.species).to eq("Unspecified / Any")
expect(post.gender).to eq("Any")
expect(post.num_favorites).to eq(0)
expect(post.num_comments).to eq(0)
expect(post.num_views).to eq(3)
expect(post.file_uri.to_s).to eq(
"https://d.furaffinity.net/art/-creeps/1738343855/1738343855.-creeps_slayerlr.jpg",
)
expect(post.posted_at).to be_within(1.second).of(
Time.zone.parse("Jan 31, 2025 12:17 PM"),
)
expect(post.scanned_at).to be_within(1.second).of(DateTime.current)
end
end
context "and the post exists but has not been scanned yet" do
include_context "post 59_714_213 already exists"
it "updates the post attributes" do
post.update!(title: "old title")
expect do
perform_now({ fa_id: 59_714_213 })
post.reload
end.to change(post, :title)
.from("old title")
.to("Slayer")
.and(change(post, :scanned?).from(false).to(true))
.and(not_change(post, :state))
end
end
it "enqueues a file scan job" do
perform_now({ fa_id: 59_714_213 })
post = Domain::Fa::Post.find_by(fa_id: 59_714_213)
expect(SpecUtil.enqueued_job_args(Domain::Fa::Job::ScanFileJob)).to match(
array_including({ post: post, caused_by_entry: @log_entries.first }),
)
end
end
context "when scanning an already scanned post" do
include_context "post 59_714_213 already exists"
include_context "creator is creeps"
let(:client_mock_config) { [] }
before do
post.file_url_str = "https://example.com/image.jpg"
post.save!
expect(post.scanned?).to be(true)
end
it "does not perform a new scan" do
expect do
perform_now({ post: post })
post.reload
end.to not_change(post, :scanned?)
end
context "and force_scan is true" do
include_context "post with no keywords fixture"
it "performs a new scan" do
old_scanned_at = 1.day.ago
post.update!(state: "ok", scanned_at: old_scanned_at)
expect do
perform_now({ post: post, force_scan: true })
post.reload
end.to change(post, :file_url_str)
.from("https://example.com/image.jpg")
.to(
"https://d.furaffinity.net/art/-creeps/1738343855/1738343855.-creeps_slayerlr.jpg",
)
.and(
change(post, :scanned_at).from(
be_within(1.second).of(old_scanned_at),
).to(be_within(1.second).of(DateTime.current)),
)
end
end
end
context "the post has keywords" do
include_context "post with keywords fixture"
include_context "creator is lizardlars"
it "updates the post attributes" do
fa_id = 59_723_907
perform_now({ fa_id: fa_id })
post = Domain::Fa::Post.find_by(fa_id: fa_id)
expect(post.state).to eq("ok")
expect(post.title).to eq(
"Oh No! Stuck With Her Step Bro | Chapter 2 | Page 8",
)
expect(post.creator).to eq(creator)
expect(post.posted_at).to be_within(1.second).of(
Time.zone.parse("Feb 1, 2025 07:15 AM"),
)
expect(post.keywords).to match(array_including("wolfgang", "kiss"))
expect(post.scanned_at).to be_within(1.second).of(DateTime.current)
end
end
context "when post is removed" do
include_context "post 59_714_213 already exists"
let(:client_mock_config) do
[
{
uri: "https://www.furaffinity.net/view/59714213/",
status_code: 200,
content_type: "text/html",
contents: post_404_fixture,
},
]
end
it "marks the post as removed" do
perform_now({ post: post })
post.reload
expect(post.state).to eq("removed")
end
end
end

View File

@@ -30,9 +30,9 @@ describe FaBackfillFavs do
page = Domain::Fa::Parser::Page.new(html)
submissions = page.submissions_parsed
submissions.each do |submission|
Domain::Fa::Post.create!(
Domain::Fa::Post.hash_from_submission_parser_helper(submission),
)
Domain::Fa::Post.find_or_initialize_by_submission_parser(
submission,
).save!
end
end

View File

@@ -47,13 +47,13 @@ RSpec.describe Domain::Fa::Post do
it "accepts nil value" do
expect { post.scanned_at = nil }.not_to raise_error
expect(post.state_detail["scanned_at"]).to be_nil
expect(post.scanned_at).to be_nil
end
it "accepts Time value" do
time = Time.current
post.scanned_at = time
expect(post.state_detail["scanned_at"]).to eq(time.to_i)
expect(post.scanned_at).to be_within(1.second).of(time)
end
end
@@ -111,20 +111,6 @@ RSpec.describe Domain::Fa::Post do
end
end
describe "#thumbnail_uri" do
let(:post) { build(:domain_fa_post) }
it "returns nil when thumbnail_url_str is not present" do
expect(post.thumbnail_uri).to be_nil
end
it "returns parsed URI when thumbnail_url_str is present" do
post.thumbnail_uri = "http://example.com/thumb.jpg"
expect(post.thumbnail_uri).to be_a(Addressable::URI)
expect(post.thumbnail_uri.to_s).to eq("http://example.com/thumb.jpg")
end
end
describe "#scanned?" do
let(:post) { build(:domain_fa_post) }
@@ -225,7 +211,7 @@ RSpec.describe Domain::Fa::Post do
it "gets the posted_at from the postable if its nil before save" do
initial_time = 1.day.ago
post.update_column(:posted_at, initial_time)
post.update_json_columns(posted_at: initial_time)
post.indexed_post.update_column(:posted_at, nil)
post.reload

View File

@@ -21,6 +21,7 @@ require "./spec/support/matchers/html_matchers"
require "./spec/support/matchers/job_matchers"
require "rspec/sorbet"
RSpec::Sorbet.allow_doubles!
RSpec::Matchers.define_negated_matcher :not_change, :change
RSpec.configure do |config|
config.include PerformJobHelpers

View File

@@ -0,0 +1,26 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- -->
<html>
<head>
<title>System Error</title>
<link href="/themes/beta/img/favicon.ico"/>
<link type="text/css" rel="stylesheet" href="/themes/beta/css/ui_theme_dark.css?u=2025011001" />
</head>
<body>
<section style="margin: 30px auto; max-width: 800px;">
<div class="section-header">
<h2>System Error</h2>
</div>
<div class="section-body">
The submission you are trying to find is not in our database. <br>
<div class="alignright"><a class="button standard" href="javascript:history.go(-1)">Click here to go back</a></div>
</div>
</section>
</body>
</html>

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<title>System Message -- Fur Affinity [dot] net</title>
</head>
<body>
<div id="error-page">
<h2>System Message</h2>
<p>The page you are trying to reach is currently pending deletion.</p>
</div>
</body>
</html>