Dylan Knutson
2024-12-17 23:09:06 +00:00
parent 2ee31f4e74
commit 3eb9be47bc
158 changed files with 4621 additions and 3702 deletions

@@ -12,3 +12,5 @@ rbenv rehash
npm install yarn
bin/rails yarn:install
yarn
yarn add --dev prettier @prettier/plugin-ruby
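
The rest of this commit appears to be the mechanical result of running the newly added formatter over the Ruby sources. A minimal sketch of such a run, assuming Prettier 2.x auto-discovers @prettier/plugin-ruby from node_modules (the exact command, file globs, and any .prettierrc settings are assumptions and are not recorded in this diff):

# hypothetical invocation, not part of the commit itself
yarn prettier --write 'app/**/*.rb' 'lib/**/*.rb'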

@@ -10,5 +10,5 @@
},
"[erb]": {
"editor.defaultFormatter": "aliariff.vscode-erb-beautify"
}
},
}

@@ -22,7 +22,7 @@ class ApplicationController < ActionController::Base
"9c38727f-f11d-41de-b775-0effd86d520c" => "xjal",
"e38c568f-a24d-4f26-87f0-dfcd898a359d" => "fyacin",
"41fa1144-d4cd-11ed-afa1-0242ac120002" => "soft_fox_lad",
"9b3cf444-5913-4efb-9935-bf26501232ff" => "syfaro",
"9b3cf444-5913-4efb-9935-bf26501232ff" => "syfaro"
}
def validate_api_token

@@ -1,8 +1,5 @@
class BlobsController < ApplicationController
skip_before_action :validate_api_token,
only: %i[
contents
]
skip_before_action :validate_api_token, only: %i[contents]
def show
sha256 = HexUtil.hex2bin(params[:sha256])
@@ -25,20 +22,26 @@ class BlobsController < ApplicationController
# images, videos, etc
blob_entry = BlobEntryP.find(HexUtil.hex2bin(sha256))
if helpers.is_send_data_content_type?(blob_entry.content_type)
if !thumb.blank? && helpers.is_thumbable_content_type?(blob_entry.content_type)
if !thumb.blank? &&
helpers.is_thumbable_content_type?(blob_entry.content_type)
filename = "thumb-#{thumb}-#{sha256}"
filename = filename[..File.extname(filename).length]
filename += ".jpeg"
width, height = thumb_params(thumb)
image = Vips::Image.thumbnail_buffer(blob_entry.contents, width, height: height)
image =
Vips::Image.thumbnail_buffer(
blob_entry.contents,
width,
height: height
)
resized_image_contents = image.jpegsave_buffer
send_data(
resized_image_contents,
type: "image/jpg",
disposition: "inline",
filename: filename,
filename: filename
)
else
ext = helpers.ext_for_content_type(blob_entry.content_type)
@@ -47,16 +50,17 @@ class BlobsController < ApplicationController
blob_entry.contents,
type: blob_entry.content_type,
disposition: "inline",
filename: "data#{ext}",
filename: "data#{ext}"
)
end
elsif blob_entry.content_type =~ /text\/plain/
elsif blob_entry.content_type =~ %r{text/plain}
render plain: blob_entry.contents
elsif blob_entry.content_type.starts_with? "text/html"
render html: blob_entry.contents.html_safe
elsif blob_entry.content_type.starts_with? "application/json"
pretty_json = JSON.pretty_generate(JSON.parse blob_entry.contents)
render html: "<html><body><pre>#{pretty_json}</pre></body></html>".html_safe
render html:
"<html><body><pre>#{pretty_json}</pre></body></html>".html_safe
else
render plain: "no renderer for #{blob_entry.content_type}"
end

@@ -1,14 +1,8 @@
class Domain::Fa::ApiController < ApplicationController
skip_before_action :verify_authenticity_token,
only: %i[
enqueue_objects
object_statuses
]
only: %i[enqueue_objects object_statuses]
skip_before_action :validate_api_token,
only: %i[
search_user_names
]
skip_before_action :validate_api_token, only: %i[search_user_names]
def search_user_names
name = params[:name]
@@ -25,26 +19,26 @@ class Domain::Fa::ApiController < ApplicationController
fa_ids = (params[:fa_ids] || []).map(&:to_i)
url_names = (params[:url_names] || [])
jobs_async = GoodJob::Job.
select(:id, :queue_name, :serialized_params).
where(queue_name: "manual", finished_at: nil).
where(
[
"(serialized_params->'exception_executions' = '{}')",
"(serialized_params->'exception_executions' is null)",
].join(" OR ")
).load_async
jobs_async =
GoodJob::Job
.select(:id, :queue_name, :serialized_params)
.where(queue_name: "manual", finished_at: nil)
.where(
[
"(serialized_params->'exception_executions' = '{}')",
"(serialized_params->'exception_executions' is null)"
].join(" OR ")
)
.load_async
users_async = Domain::Fa::User.
where(url_name: url_names).
load_async
users_async = Domain::Fa::User.where(url_name: url_names).load_async
fa_id_to_post = Domain::Fa::Post.
includes(:file).
where(fa_id: fa_ids).
map do |post|
[post.fa_id, post]
end.to_h
fa_id_to_post =
Domain::Fa::Post
.includes(:file)
.where(fa_id: fa_ids)
.map { |post| [post.fa_id, post] }
.to_h
posts_response = {}
users_response = {}
@@ -56,7 +50,7 @@ class Domain::Fa::ApiController < ApplicationController
terminal_state: false,
seen_at: time_ago_or_never(post&.created_at),
scanned_at: "never",
downloaded_at: "never",
downloaded_at: "never"
}
if post
@@ -64,7 +58,9 @@ class Domain::Fa::ApiController < ApplicationController
post_response[:scanned_at] = time_ago_or_never(post.scanned_at)
if post.file.present?
post_response[:downloaded_at] = time_ago_or_never(post.file.created_at)
post_response[:downloaded_at] = time_ago_or_never(
post.file.created_at
)
post_response[:state] = "have_file"
post_response[:terminal_state] = true
elsif post.scanned?
@@ -79,9 +75,7 @@ class Domain::Fa::ApiController < ApplicationController
posts_response[fa_id] = post_response
end
url_name_to_user = users_async.map do |user|
[user.url_name, user]
end.to_h
url_name_to_user = users_async.map { |user| [user.url_name, user] }.to_h
url_names.each do |url_name|
user = url_name_to_user[url_name]
@@ -90,7 +84,7 @@ class Domain::Fa::ApiController < ApplicationController
user_response = {
created_at: time_ago_or_never(user.created_at),
scanned_gallery_at: time_ago_or_never(user.scanned_gallery_at),
scanned_page_at: time_ago_or_never(user.scanned_page_at),
scanned_page_at: time_ago_or_never(user.scanned_page_at)
}
states = []
states << "page" unless user.due_for_page_scan?
@@ -103,41 +97,41 @@ class Domain::Fa::ApiController < ApplicationController
user_response[:terminal_state] = true
end
else
user_response = {
state: "not_seen",
terminal_state: false,
}
user_response = { state: "not_seen", terminal_state: false }
end
users_response[url_name] = user_response
end
queue_depths = Hash.new do |hash, key|
hash[key] = 0
end
queue_depths = Hash.new { |hash, key| hash[key] = 0 }
jobs_async.each do |job|
queue_depths[job.serialized_params["job_class"]] += 1
end
queue_depths = queue_depths.map do |key, value|
[key.
delete_prefix("Domain::Fa::Job::").
split("::").
last.
underscore.
delete_suffix("_job").
gsub("_", " "),
value]
end.to_h
queue_depths =
queue_depths
.map do |key, value|
[
key
.delete_prefix("Domain::Fa::Job::")
.split("::")
.last
.underscore
.delete_suffix("_job")
.gsub("_", " "),
value
]
end
.to_h
render json: {
posts: posts_response,
users: users_response,
queues: {
total_depth: queue_depths.values.sum,
depths: queue_depths,
},
}
posts: posts_response,
users: users_response,
queues: {
total_depth: queue_depths.values.sum,
depths: queue_depths
}
}
end
def enqueue_objects
@@ -147,13 +141,18 @@ class Domain::Fa::ApiController < ApplicationController
url_names = (params[:url_names] || [])
url_names_to_enqueue = Set.new(params[:url_names_to_enqueue] || [])
fa_id_to_post = Domain::Fa::Post.includes(:file).where(fa_id: fa_ids).map do |post|
[post.fa_id, post]
end.to_h
fa_id_to_post =
Domain::Fa::Post
.includes(:file)
.where(fa_id: fa_ids)
.map { |post| [post.fa_id, post] }
.to_h
url_name_to_user = Domain::Fa::User.where(url_name: url_names).map do |user|
[user.url_name, user]
end.to_h
url_name_to_user =
Domain::Fa::User
.where(url_name: url_names)
.map { |user| [user.url_name, user] }
.to_h
fa_ids.each do |fa_id|
post = fa_id_to_post[fa_id]
@@ -168,11 +167,11 @@ class Domain::Fa::ApiController < ApplicationController
enqueue_deferred!
render json: {
post_scans: @enqueue_counts[Domain::Fa::Job::ScanPostJob],
post_files: @enqueue_counts[Domain::Fa::Job::ScanFileJob],
user_pages: @enqueue_counts[Domain::Fa::Job::UserPageJob],
user_galleries: @enqueue_counts[Domain::Fa::Job::UserGalleryJob],
}
post_scans: @enqueue_counts[Domain::Fa::Job::ScanPostJob],
post_files: @enqueue_counts[Domain::Fa::Job::ScanFileJob],
user_pages: @enqueue_counts[Domain::Fa::Job::UserPageJob],
user_galleries: @enqueue_counts[Domain::Fa::Job::UserGalleryJob]
}
end
def similar_users
@@ -181,9 +180,10 @@ class Domain::Fa::ApiController < ApplicationController
user = Domain::Fa::User.find_by(url_name: url_name)
if user.nil?
render status: 404, json: {
render status: 404,
json: {
error: "user '#{url_name}' not found",
error_type: "user_not_found",
error_type: "user_not_found"
}
return
end
@@ -191,10 +191,12 @@ class Domain::Fa::ApiController < ApplicationController
all_similar_users = helpers.similar_users_by_followed(user, limit: 10)
if all_similar_users.nil?
render status: 500, json: {
error: "user '#{url_name}' has not had recommendations computed yet",
error_type: "recs_not_computed",
}
render status: 500,
json: {
error:
"user '#{url_name}' has not had recommendations computed yet",
error_type: "recs_not_computed"
}
return
end
@@ -202,83 +204,98 @@ class Domain::Fa::ApiController < ApplicationController
not_followed_similar_users = nil
if exclude_url_name
exclude_folowed_by_user = Domain::Fa::User.find_by(url_name: exclude_url_name)
not_followed_similar_users = if exclude_folowed_by_user.nil?
exclude_folowed_by_user =
Domain::Fa::User.find_by(url_name: exclude_url_name)
not_followed_similar_users =
if exclude_folowed_by_user.nil?
# TODO - enqueue a manual UserFollowsJob for this user and have client
# re-try the request later
{
error: "user '#{exclude_url_name}' not found",
error_type: "exclude_user_not_found",
error_type: "exclude_user_not_found"
}
elsif exclude_folowed_by_user.scanned_follows_at.nil?
{
error: "user '#{exclude_url_name}' followers list hasn't been scanned",
error_type: "exclude_user_not_scanned",
error:
"user '#{exclude_url_name}' followers list hasn't been scanned",
error_type: "exclude_user_not_scanned"
}
else
users_list_to_similar_list(helpers.similar_users_by_followed(
user,
limit: 10,
exclude_followed_by: exclude_folowed_by_user,
))
users_list_to_similar_list(
helpers.similar_users_by_followed(
user,
limit: 10,
exclude_followed_by: exclude_folowed_by_user
)
)
end
end
render json: {
all: all_similar_users,
not_followed: not_followed_similar_users,
not_followed: not_followed_similar_users
}
end
private
def get_best_user_page_http_log_entry_for(user)
for_path = proc { |uri_path|
HttpLogEntry.where(
uri_scheme: "https",
uri_host: "www.furaffinity.net",
uri_path: uri_path,
).order(created_at: :desc).first&.response
}
for_path =
proc do |uri_path|
HttpLogEntry
.where(
uri_scheme: "https",
uri_host: "www.furaffinity.net",
uri_path: uri_path
)
.order(created_at: :desc)
.first
&.response
end
for_hle_id = proc { |hle_id|
hle_id && HttpLogEntry.find_by(id: hle_id)&.response
}
for_hle_id =
proc { |hle_id| hle_id && HttpLogEntry.find_by(id: hle_id)&.response }
# older versions don't end in a trailing slash
hle_id = user.log_entry_detail && user.log_entry_detail["last_user_page_id"]
for_hle_id.call(hle_id) ||
for_path.call("/user/#{user.url_name}/") ||
for_hle_id.call(hle_id) || for_path.call("/user/#{user.url_name}/") ||
for_path.call("/user/#{user.url_name}")
end
def defer_post_scan(post, fa_id)
if !post || !post.scanned?
defer_manual(Domain::Fa::Job::ScanPostJob, {
fa_id: fa_id,
}, -17)
defer_manual(Domain::Fa::Job::ScanPostJob, { fa_id: fa_id }, -17)
end
if post && post.file_uri && !post.file.present?
return defer_manual(Domain::Fa::Job::ScanFileJob, {
post: post,
}, -15, "static_file")
return(
defer_manual(
Domain::Fa::Job::ScanFileJob,
{ post: post },
-15,
"static_file"
)
)
return
end
end
def defer_user_scan(user, url_name, highpri)
if !user || user.due_for_page_scan?
defer_manual(Domain::Fa::Job::UserPageJob, {
url_name: url_name,
}, highpri ? -16 : -6)
defer_manual(
Domain::Fa::Job::UserPageJob,
{ url_name: url_name },
highpri ? -16 : -6
)
return
end
if !user || user.due_for_gallery_scan?
defer_manual(Domain::Fa::Job::UserGalleryJob, {
url_name: url_name,
}, highpri ? -14 : -4)
defer_manual(
Domain::Fa::Job::UserGalleryJob,
{ url_name: url_name },
highpri ? -14 : -4
)
return
end
@@ -312,14 +329,18 @@ class Domain::Fa::ApiController < ApplicationController
end
def users_for_name(name)
users = Domain::Fa::User.where([
"(name ilike :name) OR (url_name ilike :name)",
{ name: "#{ReduxApplicationRecord.sanitize_sql_like(name)}%" },
]).
includes(:avatar).
select(:id, :state, :state_detail, :log_entry_detail, :name, :url_name).
order(name: :asc).
limit(10)
users =
Domain::Fa::User
.where(
[
"(name ilike :name) OR (url_name ilike :name)",
{ name: "#{ReduxApplicationRecord.sanitize_sql_like(name)}%" }
]
)
.includes(:avatar)
.select(:id, :state, :state_detail, :log_entry_detail, :name, :url_name)
.order(name: :asc)
.limit(10)
users.map do |user|
{
@@ -327,7 +348,7 @@ class Domain::Fa::ApiController < ApplicationController
name: user.name,
url_name: user.url_name,
thumb: helpers.fa_user_avatar_path(user, thumb: "64-avatar"),
show_path: domain_fa_user_path(user.url_name),
show_path: domain_fa_user_path(user.url_name)
}
end
end
@@ -335,36 +356,37 @@ class Domain::Fa::ApiController < ApplicationController
def users_list_to_similar_list(users_list)
users_list.map do |user|
profile_thumb_url = user.avatar&.file_uri&.to_s
profile_thumb_url || begin
profile_page_response = get_best_user_page_http_log_entry_for(user)
if profile_page_response
parser = Domain::Fa::Parser::Page.new(profile_page_response.contents, require_logged_in: false)
profile_thumb_url = parser.user_page.profile_thumb_url
else
if user.due_for_follows_scan?
Domain::Fa::Job::UserFollowsJob.set({
priority: -20,
}).perform_later({
user: user,
})
end
if user.due_for_page_scan?
Domain::Fa::Job::UserPageJob.set({
priority: -20,
}).perform_later({
user: user,
})
profile_thumb_url ||
begin
profile_page_response = get_best_user_page_http_log_entry_for(user)
if profile_page_response
parser =
Domain::Fa::Parser::Page.new(
profile_page_response.contents,
require_logged_in: false
)
profile_thumb_url = parser.user_page.profile_thumb_url
else
if user.due_for_follows_scan?
Domain::Fa::Job::UserFollowsJob.set(
{ priority: -20 }
).perform_later({ user: user })
end
if user.due_for_page_scan?
Domain::Fa::Job::UserPageJob.set({ priority: -20 }).perform_later(
{ user: user }
)
end
end
rescue StandardError
logger.error("error getting profile_thumb_url: #{$!.message}")
end
rescue
logger.error("error getting profile_thumb_url: #{$!.message}")
end
{
name: user.name,
url_name: user.url_name,
profile_thumb_url: profile_thumb_url,
url: "https://www.furaffinity.net/user/#{user.url_name}/",
url: "https://www.furaffinity.net/user/#{user.url_name}/"
}
end
end

@@ -1,14 +1,8 @@
class Domain::Fa::ExportController < ApplicationController
skip_before_action :verify_authenticity_token,
only: %i[
enqueue_objects
object_statuses
]
only: %i[enqueue_objects object_statuses]
skip_before_action :validate_api_token,
only: %i[
search_user_names
]
skip_before_action :validate_api_token, only: %i[search_user_names]
def user
@user = Domain::Fa::User.find_by!(url_name: params[:url_name])

@@ -1,22 +1,24 @@
class Domain::Fa::PostsController < ApplicationController
before_action :set_domain_fa_post,
only: %i[ show scan_post scan_post ]
before_action :set_domain_fa_post, only: %i[show scan_post scan_post]
skip_before_action :verify_authenticity_token,
only: %i[ try_scan_post try_scan_posts ]
only: %i[try_scan_post try_scan_posts]
# GET /domain/fa/posts
def index
if params[:user_url_name]
@user = Domain::Fa::User.find_by(url_name: params[:user_url_name]) || raise("404")
@user =
Domain::Fa::User.find_by(url_name: params[:user_url_name]) ||
raise("404")
end
relation = @user ? @user.posts : Domain::Fa::Post
@posts = relation.
includes(:creator, :file).
page(params[:page]).
per(50).
order(fa_id: :desc).
without_count
@posts =
relation
.includes(:creator, :file)
.page(params[:page])
.per(50)
.order(fa_id: :desc)
.without_count
end
# GET /domain/fa/posts/1
@@ -37,38 +39,36 @@ class Domain::Fa::PostsController < ApplicationController
enqueued = try_enqueue_post_scan(post, fa_id)
if post && post.file.present?
state_string = "downloaded #{helpers.time_ago_in_words(post.file.created_at, include_seconds: true)} ago"
state_string =
"downloaded #{helpers.time_ago_in_words(post.file.created_at, include_seconds: true)} ago"
elsif post && post.scanned?
state_string = "scanned #{helpers.time_ago_in_words(post.scanned_at, include_seconds: true)} ago"
state_string =
"scanned #{helpers.time_ago_in_words(post.scanned_at, include_seconds: true)} ago"
else
state_string = []
if !post
state_string << "not seen"
else
state_string << "#{post.state}"
end
!post ? state_string << "not seen" : state_string << "#{post.state}"
if enqueued
state_string << "enqueued"
end
state_string << "enqueued" if enqueued
state_string = state_string.join(", ")
end
render json: {
enqueued: enqueued,
title: post&.title,
state: state_string,
is_terminal_state: post&.scanned? && post&.file&.present? || false,
}
enqueued: enqueued,
title: post&.title,
state: state_string,
is_terminal_state: post&.scanned? && post&.file&.present? || false
}
end
def try_scan_posts
Rails.logger.info "params: #{params.inspect}"
fa_ids = params[:fa_ids].map(&:to_i)
fa_id_to_post = Domain::Fa::Post.where(fa_id: fa_ids).map do |post|
[post.fa_id, post]
end.to_h
fa_id_to_post =
Domain::Fa::Post
.where(fa_id: fa_ids)
.map { |post| [post.fa_id, post] }
.to_h
response = {}
@@ -86,7 +86,7 @@ class Domain::Fa::PostsController < ApplicationController
response[fa_id] = {
state: state,
enqueued: try_enqueue_post_scan(post, fa_id),
enqueued: try_enqueue_post_scan(post, fa_id)
}
end
render json: response
@@ -103,21 +103,19 @@ class Domain::Fa::PostsController < ApplicationController
if !post || !post.scanned?
Rails.logger.info "Enqueue scan #{fa_id}"
Domain::Fa::Job::ScanPostJob.
set(priority: -15, queue: "manual").
perform_later({
fa_id: fa_id,
})
Domain::Fa::Job::ScanPostJob.set(
priority: -15,
queue: "manual"
).perform_later({ fa_id: fa_id })
return true
end
if post && post.file_uri && !post.file.present?
Rails.logger.info "Enqueue file #{fa_id}"
Domain::Fa::Job::ScanFileJob.
set(priority: -15, queue: "manual").
perform_later({
post: post,
})
Domain::Fa::Job::ScanFileJob.set(
priority: -15,
queue: "manual"
).perform_later({ post: post })
return true
end

@@ -1,6 +1,6 @@
class Domain::Fa::UsersController < ApplicationController
before_action :set_user, only: %i[ show ]
skip_before_action :validate_api_token, only: %i[ show ]
before_action :set_user, only: %i[show]
skip_before_action :validate_api_token, only: %i[show]
# GET /domain/fa/users or /domain/fa/users.json
def index

@@ -1,14 +1,11 @@
class Domain::Twitter::ApiController < ApplicationController
skip_before_action :verify_authenticity_token,
only: %i[ enqueue_objects ]
skip_before_action :verify_authenticity_token, only: %i[enqueue_objects]
def enqueue_objects
@enqueue_counts ||= Hash.new { |h, k| h[k] = 0 }
names = (params[:names] || [])
names.each do |name|
defer_user_timeline_scan(name, true)
end
names.each { |name| defer_user_timeline_scan(name, true) }
enqueue_deferred!
render json: @enqueue_counts.to_json
end
@@ -16,9 +13,11 @@ class Domain::Twitter::ApiController < ApplicationController
private
def defer_user_timeline_scan(name, highpri)
defer_manual(Domain::Twitter::Job::UserTimelineTweetsJob, {
name: name,
}, highpri ? -16 : -6)
defer_manual(
Domain::Twitter::Job::UserTimelineTweetsJob,
{ name: name },
highpri ? -16 : -6
)
end
def defer_manual(klass, args, priority, queue = nil)

@@ -10,9 +10,11 @@ class LogEntriesController < ApplicationController
if @uri_filter.path.present?
if @uri_filter.query.present?
query = query.
where("uri_path = ?", @uri_filter.path).
where("uri_query like ?", @uri_filter.query + "%")
query =
query.where("uri_path = ?", @uri_filter.path).where(
"uri_query like ?",
@uri_filter.query + "%"
)
else
query = query.where("uri_path like ?", @uri_filter.path + "%")
end
@@ -21,13 +23,14 @@ class LogEntriesController < ApplicationController
query = HttpLogEntry
end
@log_entries = query.
page(params[:page]).
per(50).
joins(:response).
includes(:response).
order(id: :desc).
without_count
@log_entries =
query
.page(params[:page])
.per(50)
.joins(:response)
.includes(:response)
.order(id: :desc)
.without_count
formats.clear
formats << :html
@@ -39,44 +42,45 @@ class LogEntriesController < ApplicationController
@last_window_count = 0
@last_window_bytes = 0
@last_window_bytes_stored = 0
@content_type_counts = Hash.new do |hash, key|
hash[key] = {
count: 0,
bytes: 0,
bytes_stored: 0,
}
end
@content_type_counts =
Hash.new do |hash, key|
hash[key] = { count: 0, bytes: 0, bytes_stored: 0 }
end
@by_domain_counts = Hash.new do |hash, key|
hash[key] = {
count: 0,
bytes: 0,
bytes_stored: 0,
}
end
@by_domain_counts =
Hash.new do |hash, key|
hash[key] = { count: 0, bytes: 0, bytes_stored: 0 }
end
HttpLogEntry.includes(:response).find_each(batch_size: 100, order: :desc) do |log_entry|
break if log_entry.created_at < @time_window.ago
@last_window_count += 1
@last_window_bytes += log_entry.response.size
@last_window_bytes_stored += log_entry.response.bytes_stored
content_type = log_entry.content_type.split(";").first
HttpLogEntry
.includes(:response)
.find_each(batch_size: 100, order: :desc) do |log_entry|
break if log_entry.created_at < @time_window.ago
@last_window_count += 1
@last_window_bytes += log_entry.response.size
@last_window_bytes_stored += log_entry.response.bytes_stored
content_type = log_entry.content_type.split(";").first
@content_type_counts[content_type][:count] += 1
@content_type_counts[content_type][:bytes] += log_entry.response.size
@content_type_counts[content_type][:bytes_stored] += log_entry.response.bytes_stored
@content_type_counts[content_type][:count] += 1
@content_type_counts[content_type][:bytes] += log_entry.response.size
@content_type_counts[content_type][
:bytes_stored
] += log_entry.response.bytes_stored
@by_domain_counts[log_entry.uri_host][:count] += 1
@by_domain_counts[log_entry.uri_host][:bytes] += log_entry.response.size
@by_domain_counts[log_entry.uri_host][:bytes_stored] += log_entry.response.bytes_stored
end
@by_domain_counts[log_entry.uri_host][:count] += 1
@by_domain_counts[log_entry.uri_host][:bytes] += log_entry.response.size
@by_domain_counts[log_entry.uri_host][
:bytes_stored
] += log_entry.response.bytes_stored
end
end
def show
@log_entry = HttpLogEntry.includes(
:caused_by_entry,
:triggered_entries,
response: :base,
).find(params[:id])
@log_entry =
HttpLogEntry.includes(
:caused_by_entry,
:triggered_entries,
response: :base
).find(params[:id])
end
end

@@ -11,7 +11,7 @@ class UserScriptsController < ApplicationController
when "furecs.user.js"
send_file(
Rails.root.join("user_scripts/furecs.user.js"),
type: "application/json",
type: "application/json"
)
else
render status: 404, text: "not found"

@@ -24,11 +24,8 @@ module Domain::Fa::PostsHelper
def scanned_and_file_description(post)
parts = []
if post.scanned?
time_ago = if post.scanned_at
time_ago_in_words(post.scanned_at)
else
"(unknown)"
end
time_ago =
(post.scanned_at ? time_ago_in_words(post.scanned_at) : "(unknown)")
parts << "Scanned #{time_ago} ago"
else
parts << "Not scanned"
@@ -43,15 +40,15 @@ module Domain::Fa::PostsHelper
def fa_post_description_sanitized(html)
raw Sanitize.fragment(
html,
elements: %w[br img b i span strong],
attributes: {
"span" => %w[style],
"a" => [],
},
css: {
properties: %w[font-size color],
},
)
html,
elements: %w[br img b i span strong],
attributes: {
"span" => %w[style],
"a" => []
},
css: {
properties: %w[font-size color]
}
)
end
end

@@ -1,11 +1,7 @@
module Domain::Fa::UsersHelper
def fa_user_avatar_path(user, thumb: nil)
if (sha256 = user.avatar&.file_sha256)
contents_blob_path(
HexUtil.bin2hex(sha256),
format: "jpg",
thumb: thumb,
)
contents_blob_path(HexUtil.bin2hex(sha256), format: "jpg", thumb: thumb)
else
# default / 'not found' avatar image
# "/blobs/9080fd4e7e23920eb2dccfe2d86903fc3e748eebb2e5aa8c657bbf6f3d941cdc/contents.jpg"
@@ -15,95 +11,97 @@ module Domain::Fa::UsersHelper
def sanitized_fa_user_profile_html(html)
# try to preload all the FA usernames in the profile
maybe_url_names = Nokogiri::HTML(html).css("a").flat_map do |node|
href = URI.parse(node["href"])
right_host = href.host.nil? || href.host == "www.furaffinity.net"
right_path = href.path =~ /\/user\/.+/
if right_host && right_path
[href]
else
[]
end
end.map do |href|
href.path.split("/")[2]&.downcase
end
maybe_url_names =
Nokogiri
.HTML(html)
.css("a")
.flat_map do |node|
href = URI.parse(node["href"])
right_host = href.host.nil? || href.host == "www.furaffinity.net"
right_path = href.path =~ %r{/user/.+}
if right_host && right_path
[href]
else
[]
end
end
.map { |href| href.path.split("/")[2]&.downcase }
preloaded_users = Domain::Fa::User.
where(url_name: maybe_url_names).
select(:id, :state, :state_detail, :log_entry_detail, :url_name).
joins(:avatar).
includes(:avatar).
index_by(&:url_name)
preloaded_users =
Domain::Fa::User
.where(url_name: maybe_url_names)
.select(:id, :state, :state_detail, :log_entry_detail, :url_name)
.joins(:avatar)
.includes(:avatar)
.index_by(&:url_name)
raw Sanitize.fragment(
html,
elements: %w[br img b i span strong],
attributes: {
"span" => %w[style],
"a" => [],
},
css: {
properties: %w[font-size color],
},
transformers: lambda do |env|
return unless env[:node_name] == "a"
node = env[:node]
href = URI.parse(node["href"])
return unless href.host == nil || href.host == "www.furaffinity.net"
return unless href.path =~ /\/user\/.+/
url_name = href.path.split("/")[2]&.downcase
Sanitize.node!(node, {
:elements => %w[a],
:attributes => {
"a" => %w[href],
html,
elements: %w[br img b i span strong],
attributes: {
"span" => %w[style],
"a" => []
},
})
node["href"] = domain_fa_user_path(url_name)
node["class"] =
"text-slate-200 underline decoration-slate-200 " +
"decoration-dashed decoration-dashed decoration-1"
css: {
properties: %w[font-size color]
},
transformers:
lambda do |env|
return unless env[:node_name] == "a"
node = env[:node]
href = URI.parse(node["href"])
unless href.host == nil || href.host == "www.furaffinity.net"
return
end
return unless href.path =~ %r{/user/.+}
url_name = href.path.split("/")[2]&.downcase
Sanitize.node!(
node,
{ elements: %w[a], attributes: { "a" => %w[href] } }
)
node["href"] = domain_fa_user_path(url_name)
node["class"] = "text-slate-200 underline decoration-slate-200 " +
"decoration-dashed decoration-dashed decoration-1"
whitelist = [node]
whitelist = [node]
user = preloaded_users[url_name] || Domain::Fa::User.find_by(url_name: url_name)
if user
img = Nokogiri::XML::Node.new("img", node.document)
img["class"] = "inline w-5"
img["src"] = fa_user_avatar_path(user, thumb: "32-avatar")
node.prepend_child(img)
whitelist << img
end
user =
preloaded_users[url_name] ||
Domain::Fa::User.find_by(url_name: url_name)
if user
img = Nokogiri::XML::Node.new("img", node.document)
img["class"] = "inline w-5"
img["src"] = fa_user_avatar_path(user, thumb: "32-avatar")
node.prepend_child(img)
whitelist << img
end
{ :node_allowlist => whitelist }
end,
)
{ node_allowlist: whitelist }
end
)
end
def similar_users_by_followed(
user,
limit: 10,
exclude_followed_by: nil
)
def similar_users_by_followed(user, limit: 10, exclude_followed_by: nil)
if user.disco.nil?
nil
else
ReduxApplicationRecord.connection.execute("SET ivfflat.probes = 32")
user.similar_users_by_followed(
exclude_followed_by: exclude_followed_by,
exclude_followed_by: exclude_followed_by
).limit(limit)
end
end
def fa_user_account_status(user)
log_entry_id = user.log_entry_detail["last_user_page_id"]
if log_entry_id.nil?
return nil
end
return nil if log_entry_id.nil?
log_entry = HttpLogEntry.find(log_entry_id)
parser = Domain::Fa::Parser::Page.new(log_entry.response.contents, require_logged_in: false)
unless parser.probably_user_page?
return nil
end
parser =
Domain::Fa::Parser::Page.new(
log_entry.response.contents,
require_logged_in: false
)
return nil unless parser.probably_user_page?
parser.user_page.account_status
end
end

@@ -7,21 +7,23 @@ module LogEntriesHelper
def ext_for_content_type(content_type)
case content_type
when "image/jpeg" then "jpeg"
when "image/jpg" then "jpg"
when "image/png" then "png"
when "image/gif" then "gif"
else nil
when "image/jpeg"
"jpeg"
when "image/jpg"
"jpg"
when "image/png"
"png"
when "image/gif"
"gif"
else
nil
end
end
def is_renderable_image_type?(content_type)
[
"image/jpeg",
"image/jpg",
"image/png",
"image/gif",
].any? { |ct| content_type.starts_with?(ct) }
%w[image/jpeg image/jpg image/png image/gif].any? do |ct|
content_type.starts_with?(ct)
end
end
def is_thumbable_content_type?(content_type)
@@ -29,12 +31,10 @@ module LogEntriesHelper
end
def is_renderable_video_type?(content_type)
[
"video/mp4",
].any? { |ct| content_type.starts_with?(ct) }
["video/mp4"].any? { |ct| content_type.starts_with?(ct) }
end
def is_flash_content_type?(content_type)
content_type =~ /application\/x-shockwave-flash/
content_type =~ %r{application/x-shockwave-flash}
end
end

@@ -5,9 +5,11 @@ class ApplicationJob < ActiveJob::Base
retry_on(
StandardError,
wait: :exponentially_longer,
attempts: Float::INFINITY,
attempts: Float::INFINITY
) do |job, exception|
job.logger.error("error: #{exception.message}\n#{exception.backtrace.join("\n")}")
job.logger.error(
"error: #{exception.message}\n#{exception.backtrace.join("\n")}"
)
end
# Automatically retry jobs that encountered a deadlock

@@ -7,13 +7,16 @@ module Domain::E621::Job
def perform(args)
@caused_by_entry = args[:caused_by_entry]
response = http_client.get(
"https://e621.net/posts.json",
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(
"https://e621.net/posts.json",
caused_by_entry: @caused_by_entry
)
if response.status_code != 200
fatal_error("non 200 response for /posts.json: #{response.status_code.to_s.underline}")
fatal_error(
"non 200 response for /posts.json: #{response.status_code.to_s.underline}"
)
end
@log_entry = response.log_entry
@@ -22,22 +25,26 @@ module Domain::E621::Job
fatal_error("no posts in response: HLE #{@log_entry.id}}")
end
e621_id_to_post = Domain::E621::Post.where(
e621_id: json["posts"].map { |post_json| post_json["id"] },
).includes(taggings: :tag).index_by(&:e621_id)
e621_id_to_post =
Domain::E621::Post
.where(e621_id: json["posts"].map { |post_json| post_json["id"] })
.includes(taggings: :tag)
.index_by(&:e621_id)
@num_updated = 0
@num_created = 0
@num_seen = 0
all_tag_names = Set.new json["posts"].map { |post_json|
tag_and_cat_for_json(post_json)
}.flatten.map(&:name)
all_tag_names =
Set.new json["posts"]
.map { |post_json| tag_and_cat_for_json(post_json) }
.flatten
.map(&:name)
@name_to_tag_id, missing_tags =
Domain::E621::TagUtil.tag_names_to_id_map(
all_tag_names,
posts: e621_id_to_post.values,
posts: e621_id_to_post.values
)
@name_to_tag_id.merge!(
Domain::E621::TagUtil.create_tags_from_names(missing_tags)
@@ -46,18 +53,20 @@ module Domain::E621::Job
json["posts"].each do |post_json|
@num_seen += 1
e621_id = post_json["id"]
post = e621_id_to_post[e621_id] || begin
@num_created += 1
Domain::E621::Post.new({ e621_id: e621_id })
end
post =
e621_id_to_post[e621_id] ||
begin
@num_created += 1
Domain::E621::Post.new({ e621_id: e621_id })
end
logger.prefix = proc { "[e621_id #{post.e621_id.to_s.bold}]" }
if update_post!(post, post_json)
@num_updated += 1
end
@num_updated += 1 if update_post!(post, post_json)
end
logger.prefix = nil
logger.info("#{@num_updated} updated, #{@num_created} created, #{@num_seen} seen")
logger.info(
"#{@num_updated} updated, #{@num_created} created, #{@num_seen} seen"
)
end
private
@@ -79,7 +88,7 @@ module Domain::E621::Job
post.state_detail["prev_md5s"] ||= []
post.state_detail["prev_md5s"] << {
"md5" => post.md5,
"file_id" => post.file_id,
"file_id" => post.file_id
}
post.file = nil
file_changed = true
@@ -111,14 +120,16 @@ module Domain::E621::Job
Domain::E621::TagUtil.update_tags_on_post_with_cat(
post,
tag_and_cat_for_json(post_json),
@name_to_tag_id,
@name_to_tag_id
)
end
defer_job(Domain::E621::Job::StaticFileJob, {
post: post,
caused_by_entry: @log_entry,
}) if new_record || file_changed
if new_record || file_changed
defer_job(
Domain::E621::Job::StaticFileJob,
{ post: post, caused_by_entry: @log_entry }
)
end
true
end
@@ -126,11 +137,13 @@ module Domain::E621::Job
TAG_CATEGORIES = %w[general species character copyright artist lore meta]
def tag_and_cat_for_json(post_json)
TAG_CATEGORIES.map do |tc|
post_json["tags"][tc].map do |name|
TagAndCategory.new(name, "cat_#{tc}")
TAG_CATEGORIES
.map do |tc|
post_json["tags"][tc].map do |name|
TagAndCategory.new(name, "cat_#{tc}")
end
end
end.flatten
.flatten
end
end
end

@@ -12,27 +12,23 @@ module Domain::E621::Job
file_url_str = post.file_url_str
if file_url_str.blank?
logger.warn("post has no file_url_str, enqueueing for scan")
defer_job(Domain::E621::Job::ScanPostJob, {
post: post,
caused_by_entry: caused_by_entry,
})
defer_job(
Domain::E621::Job::ScanPostJob,
{ post: post, caused_by_entry: caused_by_entry }
)
return
end
if post.state == "file_error"
retry_count = post.state_detail&.
[]("file_error")&.
[]("retry_count") || 0
retry_count =
post.state_detail&.[]("file_error")&.[]("retry_count") || 0
if retry_count >= 3
logger.error("file has been retried 3 times, giving up")
return
end
end
response = http_client.get(
file_url_str,
caused_by_entry: caused_by_entry,
)
response = http_client.get(file_url_str, caused_by_entry: caused_by_entry)
if response.status_code != 200
post.state = :file_error

@@ -12,7 +12,8 @@ class Domain::Fa::Job::Base < Scraper::JobBase
@caused_by_entry = args[:caused_by_entry]
if build_user
@user = find_or_build_user_from_args(args, caused_by_entry: @caused_by_entry)
@user =
find_or_build_user_from_args(args, caused_by_entry: @caused_by_entry)
else
@user = find_user_from_args(args)
end
@@ -32,32 +33,42 @@ class Domain::Fa::Job::Base < Scraper::JobBase
end
def find_or_build_user_from_args(args, caused_by_entry: nil)
find_user_from_args(args) || begin
url_name = Domain::Fa::User.name_to_url_name(args[:url_name])
user = Domain::Fa::User.new
user.url_name = url_name
user.name = url_name
user.state_detail ||= {}
user.state_detail["first_seen_entry"] = caused_by_entry.id if caused_by_entry
user
end
find_user_from_args(args) ||
begin
url_name = Domain::Fa::User.name_to_url_name(args[:url_name])
user = Domain::Fa::User.new
user.url_name = url_name
user.name = url_name
user.state_detail ||= {}
user.state_detail[
"first_seen_entry"
] = caused_by_entry.id if caused_by_entry
user
end
end
def find_user_from_args(args)
args[:user] || begin
fatal_error("arg 'url_name' is required if arg 'user' is nil") if args[:url_name].blank?
url_name = Domain::Fa::User.name_to_url_name(args[:url_name])
Domain::Fa::User.find_by(url_name: url_name)
end
args[:user] ||
begin
if args[:url_name].blank?
fatal_error("arg 'url_name' is required if arg 'user' is nil")
end
url_name = Domain::Fa::User.name_to_url_name(args[:url_name])
Domain::Fa::User.find_by(url_name: url_name)
end
end
def user_due_for_scan?(scan_type)
unless @user.scan_due?(scan_type)
if @force_scan
logger.warn("scanned #{@user.scanned_ago_in_words(scan_type).bold} - force scanning")
logger.warn(
"scanned #{@user.scanned_ago_in_words(scan_type).bold} - force scanning"
)
return true
else
logger.warn("scanned #{@user.scanned_ago_in_words(scan_type).bold} - skipping")
logger.warn(
"scanned #{@user.scanned_ago_in_words(scan_type).bold} - skipping"
)
return false
end
end
@@ -65,11 +76,7 @@ class Domain::Fa::Job::Base < Scraper::JobBase
return true
end
ListingsPageScanStats = Struct.new(
:new_seen,
:total_seen,
:last_was_new,
)
ListingsPageScanStats = Struct.new(:new_seen, :total_seen, :last_was_new)
def update_and_enqueue_posts_from_listings_page(
job_type,
@@ -106,15 +113,16 @@ class Domain::Fa::Job::Base < Scraper::JobBase
min_fa_id = [max_fa_id - continue_for, 0].max
fa_ids_to_manually_enqueue = Set.new(min_fa_id..max_fa_id)
fa_ids_to_manually_enqueue.subtract(fa_ids)
existing = Domain::Fa::Post.where("fa_id >= ? AND fa_id <= ?", min_fa_id, max_fa_id).pluck(:fa_id)
existing =
Domain::Fa::Post.where(
"fa_id >= ? AND fa_id <= ?",
min_fa_id,
max_fa_id
).pluck(:fa_id)
fa_ids_to_manually_enqueue.subtract(existing)
end
page_desc = if page_desc
"page #{page_desc.to_s.bold}"
else
"page"
end
page_desc = (page_desc ? "page #{page_desc.to_s.bold}" : "page")
listing_page_stats = ListingsPageScanStats.new(0, 0, false)
submissions.each do |submission|
@@ -123,13 +131,20 @@ class Domain::Fa::Job::Base < Scraper::JobBase
listing_page_stats.new_seen += 1 if post.new_record?
listing_page_stats.total_seen += 1
update_and_save_post_from_listings_page(job_type, post, submission, caused_by_entry)
enqueue_user_scan(
post.creator,
caused_by_entry,
enqueue_page_scan: enqueue_page_scan,
enqueue_gallery_scan: enqueue_gallery_scan,
) if post.creator
update_and_save_post_from_listings_page(
job_type,
post,
submission,
caused_by_entry
)
if post.creator
enqueue_user_scan(
post.creator,
caused_by_entry,
enqueue_page_scan: enqueue_page_scan,
enqueue_gallery_scan: enqueue_gallery_scan
)
end
case post.state.to_sym
when :ok
@@ -164,7 +179,12 @@ class Domain::Fa::Job::Base < Scraper::JobBase
listing_page_stats
end
def update_and_save_post_from_listings_page(job_type, post, submission, caused_by_entry)
def update_and_save_post_from_listings_page(
job_type,
post,
submission,
caused_by_entry
)
if job_type == :browse_page
post.log_entry_detail["first_browse_page_id"] ||= caused_by_entry.id
elsif job_type == :gallery_page
@@ -173,9 +193,11 @@ class Domain::Fa::Job::Base < Scraper::JobBase
fatal_error("unhandled job_type: #{job_type}")
end
post.creator ||= Domain::Fa::User.find_or_build_from_submission_parser(submission)
post.creator ||=
Domain::Fa::User.find_or_build_from_submission_parser(submission)
post.title = submission.title || fatal_error("blank title")
post.thumbnail_uri = submission.thumb_path || fatal_error("blank thumb_path")
post.thumbnail_uri =
submission.thumb_path || fatal_error("blank thumb_path")
post.save!
end
@@ -188,16 +210,17 @@ class Domain::Fa::Job::Base < Scraper::JobBase
@users_enqueued_for_page_scan ||= Set.new
@users_enqueued_for_gallery_scan ||= Set.new
args = if user.persisted?
{ user: user }
else
unless user.url_name
logger.warn "user does not have a url name and is not persisted, skipping (#{user.name})"
return
end
args =
if user.persisted?
{ user: user }
else
unless user.url_name
logger.warn "user does not have a url name and is not persisted, skipping (#{user.name})"
return
end
{ url_name: user.url_name }
end.merge({ caused_by_entry: caused_by_entry })
{ url_name: user.url_name }
end.merge({ caused_by_entry: caused_by_entry })
if enqueue_page_scan && @users_enqueued_for_page_scan.add?(user.url_name)
if user.due_for_page_scan?
@@ -209,7 +232,8 @@ class Domain::Fa::Job::Base < Scraper::JobBase
end
end
if enqueue_gallery_scan && @users_enqueued_for_gallery_scan.add?(user.url_name)
if enqueue_gallery_scan &&
@users_enqueued_for_gallery_scan.add?(user.url_name)
if user.due_for_gallery_scan?
logger.info(
"enqueue user gallery job for #{user.url_name.bold}, " +
@@ -222,9 +246,12 @@ class Domain::Fa::Job::Base < Scraper::JobBase
def normalize_enqueue_pri(enqueue_pri)
case enqueue_pri
when :low then -5
when :high then -15
else -10
when :low
-5
when :high
-15
else
-10
end
end
@@ -234,10 +261,11 @@ class Domain::Fa::Job::Base < Scraper::JobBase
if @posts_enqueued_for_scan.add?(fa_id)
logger.info "enqueue post scan for fa_id #{fa_id}"
defer_job(Domain::Fa::Job::ScanPostJob, {
fa_id: fa_id,
caused_by_entry: caused_by_entry,
}, { priority: enqueue_pri })
defer_job(
Domain::Fa::Job::ScanPostJob,
{ fa_id: fa_id, caused_by_entry: caused_by_entry },
{ priority: enqueue_pri }
)
end
end
@@ -249,16 +277,18 @@ class Domain::Fa::Job::Base < Scraper::JobBase
fa_id_str = (post.fa_id || "(nil)").to_s.bold
if !post.scanned?
logger.info "enqueue post scan for fa_id #{fa_id_str}"
defer_job(Domain::Fa::Job::ScanPostJob, {
post: post,
caused_by_entry: caused_by_entry,
}, { priority: enqueue_pri })
defer_job(
Domain::Fa::Job::ScanPostJob,
{ post: post, caused_by_entry: caused_by_entry },
{ priority: enqueue_pri }
)
elsif !post.have_file?
logger.info "enqueue file scan for fa_id #{fa_id_str}"
defer_job(Domain::Fa::Job::ScanFileJob, {
post: post,
caused_by_entry: caused_by_entry,
}, { priority: enqueue_pri })
defer_job(
Domain::Fa::Job::ScanFileJob,
{ post: post, caused_by_entry: caused_by_entry },
{ priority: enqueue_pri }
)
end
end
end

@@ -11,15 +11,16 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
@first_job_entry = nil
init_from_args!(args, build_user: false)
@full_scan = !!args[:full_scan]
@user || begin
Domain::Fa::Job::UserPageJob.perform_later({
url_name: args[:url_name],
caused_by_entry: best_caused_by_entry,
})
fatal_error("user does not exist: #{args}")
end
@user ||
begin
Domain::Fa::Job::UserPageJob.perform_later(
{ url_name: args[:url_name], caused_by_entry: best_caused_by_entry }
)
fatal_error("user does not exist: #{args}")
end
logger.prefix = "[#{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
logger.prefix =
"[#{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
return unless user_due_for_scan?(:favs)
@page_number = 0
@@ -50,33 +51,38 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
to_add = nil
to_remove = nil
measure(proc { |jobs|
"add #{to_add.size.to_s.bold} favs, " +
"remove #{to_remove.size.to_s.bold} favs"
}) do
measure(
proc do |jobs|
"add #{to_add.size.to_s.bold} favs, " +
"remove #{to_remove.size.to_s.bold} favs"
end
) do
to_remove = existing_faved_ids - @seen_post_ids
to_add = @seen_post_ids - existing_faved_ids
end
measure(proc {
"updated favs list to #{@user.fav_post_joins.count.to_s.bold} posts"
}) do
measure(
proc do
"updated favs list to #{@user.fav_post_joins.count.to_s.bold} posts"
end
) do
ReduxApplicationRecord.transaction do
if to_remove.any?
@user.fav_post_joins.where(post_id: to_remove).delete_all
end
slice_size = if to_add.size <= 2500
slice_size =
if to_add.size <= 2500
100
else
1000
end
to_add.each_slice(slice_size) do |slice|
@user.fav_post_joins.insert_all!(slice.map do |id|
{ post_id: id }
end)
end if to_add.any?
if to_add.any?
to_add.each_slice(slice_size) do |slice|
@user.fav_post_joins.insert_all!(slice.map { |id| { post_id: id } })
end
end
@user.scanned_favs_at = Time.now
@user.save!
@@ -89,7 +95,8 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
def scan_page
ret = nil
url = if @page_id
url =
if @page_id
"https://www.furaffinity.net/favorites/#{@user.url_name}/#{@page_id}/next"
else
"https://www.furaffinity.net/favorites/#{@user.url_name}/"
@@ -99,11 +106,14 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
if response.status_code != 200
fatal_error(
"http #{response.status_code.to_s.red.bold}, " +
"log entry #{response.log_entry.id.to_s.bold}"
"log entry #{response.log_entry.id.to_s.bold}"
)
end
if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(@user, response)
if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(
@user,
response
)
logger.error("account disabled / not found, abort")
return :stop
end
@@ -116,35 +126,38 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
@total_items_seen += submissions.length
posts_to_create_hashes = []
measure(proc {
"page #{@page_number.to_s.bold} - " +
"#{submissions.length.to_s.bold} posts on page, " +
"created #{posts_to_create_hashes.size.to_s.bold}"
}) do
existing_fa_id_to_post_id = Domain::Fa::Post.where(
fa_id: submissions.map(&:id),
).pluck(:fa_id, :id).to_h
posts_to_create_hashes = submissions.reject do |submission|
existing_fa_id_to_post_id[submission.id]
end.map do |submission|
Domain::Fa::Post.hash_from_submission_parser_helper(
submission,
first_seen_log_entry: response.log_entry,
)
measure(
proc do
"page #{@page_number.to_s.bold} - " +
"#{submissions.length.to_s.bold} posts on page, " +
"created #{posts_to_create_hashes.size.to_s.bold}"
end
) do
existing_fa_id_to_post_id =
Domain::Fa::Post
.where(fa_id: submissions.map(&:id))
.pluck(:fa_id, :id)
.to_h
posts_to_create_hashes =
submissions
.reject { |submission| existing_fa_id_to_post_id[submission.id] }
.map do |submission|
Domain::Fa::Post.hash_from_submission_parser_helper(
submission,
first_seen_log_entry: response.log_entry
)
end
created_post_ids = []
created_post_ids = Domain::Fa::Post.insert_all!(
posts_to_create_hashes,
returning: %i[id fa_id],
).map do |row|
row["id"]
end unless posts_to_create_hashes.empty?
created_post_ids =
Domain::Fa::Post
.insert_all!(posts_to_create_hashes, returning: %i[id fa_id])
.map { |row| row["id"] } unless posts_to_create_hashes.empty?
enqueue_new_post_scan_jobs(posts_to_create_hashes.map do |hash|
hash[:fa_id]
end)
enqueue_new_post_scan_jobs(
posts_to_create_hashes.map { |hash| hash[:fa_id] }
)
@last_page_post_ids = Set.new
created_post_ids.each do |id|
@@ -163,10 +176,9 @@ class Domain::Fa::Job::FavsJob < Domain::Fa::Job::Base
def enqueue_new_post_scan_jobs(fa_ids)
bulk_enqueue_jobs do
fa_ids.each do |fa_id|
Domain::Fa::Job::ScanPostJob.perform_later({
fa_id: fa_id,
caused_by_entry: best_caused_by_entry,
})
Domain::Fa::Job::ScanPostJob.perform_later(
{ fa_id: fa_id, caused_by_entry: best_caused_by_entry }
)
# sleep 100000
end
end

@@ -11,28 +11,36 @@ class Domain::Fa::Job::HomePageJob < Domain::Fa::Job::Base
@total_num_posts_seen = 0
scan_home_page
logger.info("finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts")
logger.info(
"finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts"
)
end
private
def scan_home_page
url = "https://www.furaffinity.net/"
response = http_client.get(url, caused_by_entry: @first_entry || @caused_by_entry)
response =
http_client.get(url, caused_by_entry: @first_entry || @caused_by_entry)
log_entry = response.log_entry
@first_entry ||= log_entry
if response.status_code != 200
fatal_error("non 200 response for /: #{response.status_code.to_s.underline}")
fatal_error(
"non 200 response for /: #{response.status_code.to_s.underline}"
)
end
page = Domain::Fa::Parser::Page.new(response.body)
listing_page_stats = update_and_enqueue_posts_from_listings_page(
:browse_page, page, log_entry,
enqueue_posts_pri: :high,
page_desc: "HomePage",
continue_for: @continue_for,
)
listing_page_stats =
update_and_enqueue_posts_from_listings_page(
:browse_page,
page,
log_entry,
enqueue_posts_pri: :high,
page_desc: "HomePage",
continue_for: @continue_for
)
@total_num_new_posts_seen += listing_page_stats.new_seen
@total_num_posts_seen += listing_page_stats.total_seen

@@ -9,10 +9,12 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::Base
if @post.nil?
logger.error "no post model - fa_id: #{args[:fa_id]}, enqueue scan"
defer_job(Domain::Fa::Job::ScanPostJob, {
fa_id: args[:fa_id],
caused_by_entry: @caused_by_entry,
}) if args[:fa_id]
if args[:fa_id]
defer_job(
Domain::Fa::Job::ScanPostJob,
{ fa_id: args[:fa_id], caused_by_entry: @caused_by_entry }
)
end
return
end
@@ -39,7 +41,8 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::Base
is_unresolvable_host = false
is_unresolvable_host ||= file_uri_host == "d9.facdn.net"
uri_tld = file_uri_host.split(".").last
is_unresolvable_host ||= uri_tld.length >= 6 && file_uri_host.start_with?("d.facdn.net")
is_unresolvable_host ||=
uri_tld.length >= 6 && file_uri_host.start_with?("d.facdn.net")
if is_unresolvable_host
logger.error("host is #{file_uri_host}, which will not resolve")
@@ -62,10 +65,8 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::Base
private
def download_file
response = http_client.get(
@post.file_uri.to_s,
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(@post.file_uri.to_s, caused_by_entry: @caused_by_entry)
if response.status_code == 404
@post.state_detail["404_count"] ||= 0
@@ -82,13 +83,13 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::Base
end
if response.status_code != 200
defer_job(Domain::Fa::Job::ScanPostJob, {
post: @post,
caused_by_entry: response.log_entry,
force_scan: true,
})
defer_job(
Domain::Fa::Job::ScanPostJob,
{ post: @post, caused_by_entry: response.log_entry, force_scan: true }
)
err_msg = "error downloading - log entry #{response.log_entry.id} / status code #{response.status_code}"
err_msg =
"error downloading - log entry #{response.log_entry.id} / status code #{response.status_code}"
@post.save!
if response.status_code == 404 && @post.state == "removed"

@@ -3,13 +3,16 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
ignore_signature_args :caused_by_entry
def perform(args)
@post = args[:post] || begin
Domain::Fa::Post.find_or_initialize_by(fa_id: args[:fa_id])
end
@post =
args[:post] ||
begin
Domain::Fa::Post.find_or_initialize_by(fa_id: args[:fa_id])
end
@caused_by_entry = args[:caused_by_entry]
@force_scan = !!args[:force_scan]
logger.prefix = proc { "[fa_id #{@post.fa_id.to_s.bold} / #{@post.state.bold}]" }
logger.prefix =
proc { "[fa_id #{@post.fa_id.to_s.bold} / #{@post.state.bold}]" }
if @post.state == "ok" && !@post.scanned? || @force_scan
if (@post.state_detail["404_count"] || 0) > 2
@@ -22,33 +25,36 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
if (@post.state == "ok" && @post.file_uri && @post.file.nil?) || @force_scan
logger.info("enqueue file job (#{self.priority})")
defer_job(Domain::Fa::Job::ScanFileJob, {
post: @post,
caused_by_entry: @submission_entry || @caused_by_entry,
}, { priority: self.priority })
defer_job(
Domain::Fa::Job::ScanFileJob,
{ post: @post, caused_by_entry: @submission_entry || @caused_by_entry },
{ priority: self.priority }
)
end
enqueue_user_scan(@post.creator, @submission_entry) if @post.creator
ensure
enqueue_jobs_from_found_links(
@submission_entry,
suppress_jobs: [{
job: self.class,
fa_id: @post.fa_id,
}],
) if @submission_entry && @submission_entry.status_code == 200
if @submission_entry && @submission_entry.status_code == 200
enqueue_jobs_from_found_links(
@submission_entry,
suppress_jobs: [{ job: self.class, fa_id: @post.fa_id }]
)
end
logger.info "finished post scan"
end
private
def scan_post
response = http_client.get(
"https://www.furaffinity.net/view/#{@post.fa_id}/",
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(
"https://www.furaffinity.net/view/#{@post.fa_id}/",
caused_by_entry: @caused_by_entry
)
if response.status_code != 200
fatal_error("error scanning fa_id #{@post.fa_id}: #{response.status_code}, log entry #{response.log_entry.id}")
fatal_error(
"error scanning fa_id #{@post.fa_id}: #{response.status_code}, log entry #{response.log_entry.id}"
)
end
@submission_entry = response.log_entry
@@ -62,7 +68,8 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
end
unless page.probably_submission?
if response.body =~ /The page you are trying to reach is currently pending deletion/
if response.body =~
/The page you are trying to reach is currently pending deletion/
logger.error("post is pending deletion")
@post.state = :removed
@post.save!
@@ -80,9 +87,15 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::Base
@post.last_submission_page = @submission_entry
@post.title = submission.title
@post.creator = Domain::Fa::User.find_or_build_from_submission_parser(submission)
@post.creator =
Domain::Fa::User.find_or_build_from_submission_parser(submission)
@post.category = submission.category
@post.description = submission.description_html.encode("UTF-8", :invalid => :replace, :undef => :replace)
@post.description =
submission.description_html.encode(
"UTF-8",
invalid: :replace,
undef: :replace
)
@post.keywords = submission.keywords_array
@post.file_uri = submission.full_res_img
@post.theme = submission.theme

@@ -5,15 +5,13 @@ module Domain::Fa::Job
missing = fa_ids - posts.map(&:fa_id)
missing.each do |fa_id|
post = Domain::Fa::Post.create!(fa_id: fa_id)
Domain::Fa::Job::ScanPostJob.perform_later({
post: post, caused_by_entry: caused_by_entry,
})
Domain::Fa::Job::ScanPostJob.perform_later(
{ post: post, caused_by_entry: caused_by_entry }
)
posts << post
end
posts = posts.index_by(&:fa_id)
fa_ids.map do |fa_id|
posts[fa_id]
end
fa_ids.map { |fa_id| posts[fa_id] }
end
end
end

@@ -3,14 +3,16 @@ module Domain::Fa::Job
DISABLED_PAGE_PATTERNS = [
/User ".+" has voluntarily disabled access/,
/User ".+" was not found in our database./,
/The page you are trying to reach is currently pending deletion/,
/The page you are trying to reach is currently pending deletion/
]
def self.user_disabled_or_not_found?(user, response)
if DISABLED_PAGE_PATTERNS.any? { |pattern| response.body =~ pattern }
user.state = :scan_error
user.state_detail ||= {}
user.state_detail["scan_error"] = "account disabled or not found, see last_scanned_page_id"
user.state_detail[
"scan_error"
] = "account disabled or not found, see last_scanned_page_id"
user.state_detail["last_scanned_page_id"] = response.log_entry.id
user.save!
true
@@ -21,46 +23,49 @@ module Domain::Fa::Job
def self.check_disabled_or_not_found(user, response)
if response.status_code != 200
return [:fatal, {
message: "http #{response.status_code}, log entry #{response.log_entry.id}",
}]
return [
:fatal,
{
message:
"http #{response.status_code}, log entry #{response.log_entry.id}"
}
]
end
page = Domain::Fa::Parser::Page.new(response.body, require_logged_in: false)
if page.probably_user_page?
return [:ok, { page: page }]
end
page =
Domain::Fa::Parser::Page.new(response.body, require_logged_in: false)
return :ok, { page: page } if page.probably_user_page?
if response.body =~ /has voluntarily disabled access/
user.state = :scan_error
user.state_detail = {
scan_error: "(user scan) user has disabled account, see last_user_page_id",
last_user_page_id: response.log_entry.id,
scan_error:
"(user scan) user has disabled account, see last_user_page_id",
last_user_page_id: response.log_entry.id
}
try_name = /User "(.+)" has voluntarily disabled/.match(response.body)
user.name ||= try_name && try_name[1] || user.url_name
user.save!
return [:stop, {
message: "account disabled",
}]
return :stop, { message: "account disabled" }
end
if response.body =~ /This user cannot be found./ || response.body =~ /The page you are trying to reach is currently pending deletion/
if response.body =~ /This user cannot be found./ ||
response.body =~
/The page you are trying to reach is currently pending deletion/
user.state = :scan_error
user.state_detail = {
scan_error: "(user scan) user was not found, see last_user_page_id",
last_user_page_id: response.log_entry.id,
last_user_page_id: response.log_entry.id
}
user.name ||= user.url_name
user.save!
return [:stop, {
message: "account not found",
}]
return :stop, { message: "account not found" }
end
return [:fatal, {
message: "not a user page - log entry #{response.log_entry.id}",
}]
return [
:fatal,
{ message: "not a user page - log entry #{response.log_entry.id}" }
]
end
def self.update_user_fields_from_page(user, page, response)
@@ -73,30 +78,35 @@ module Domain::Fa::Job
user.num_comments_given = user_page.num_comments_given
user.num_journals = user_page.num_journals
user.num_favorites = user_page.num_favorites
user.profile_html = user_page.profile_html.encode("UTF-8", :invalid => :replace, :undef => :replace)
user.profile_html =
user_page.profile_html.encode(
"UTF-8",
invalid: :replace,
undef: :replace
)
user.log_entry_detail["last_user_page_id"] = response.log_entry.id
avatar = user.avatar_or_create
user.avatar.file_uri = user_page.profile_thumb_url
if user.avatar.changed?
user.avatar.save!
Domain::Fa::Job::UserAvatarJob.perform_later({
user: user,
caused_by_entry: response.log_entry,
})
Domain::Fa::Job::UserAvatarJob.perform_later(
{ user: user, caused_by_entry: response.log_entry }
)
end
end
# names is an array of structs - [Struct(:name, :url_name)]
def self.find_or_create_by_names(names, caused_by_entry: nil)
users = Domain::Fa::User.where(url_name: names.map(&:url_name)).to_a
missing = names.reject { |name| users.any? { |u| u.url_name == name.url_name } }
missing =
names.reject { |name| users.any? { |u| u.url_name == name.url_name } }
missing.each do |name|
user = Domain::Fa::User.create!(url_name: name.url_name, name: name.name)
Domain::Fa::Job::UserPageJob.perform_later({
user: user,
caused_by_entry: caused_by_entry,
})
user =
Domain::Fa::User.create!(url_name: name.url_name, name: name.name)
Domain::Fa::Job::UserPageJob.perform_later(
{ user: user, caused_by_entry: caused_by_entry }
)
users << user
end
users

@@ -6,10 +6,15 @@ class Domain::Fa::Job::UserAvatarJob < Domain::Fa::Job::Base
init_from_args!(args, build_user: false)
@user || raise("user must exist")
@avatar = @user.avatar_or_create
logger.prefix = proc { "[avatar #{@avatar.id.to_s.bold} / user #{@user.url_name.to_s.bold}]" }
logger.prefix =
proc do
"[avatar #{@avatar.id.to_s.bold} / user #{@user.url_name.to_s.bold}]"
end
if @avatar.file_sha256 && !@force_scan
logger.warn("downloaded #{time_ago_in_words(@avatar.downloaded_file_at)}, skipping")
logger.warn(
"downloaded #{time_ago_in_words(@avatar.downloaded_file_at)}, skipping"
)
return
end
@@ -20,10 +25,10 @@ class Domain::Fa::Job::UserAvatarJob < Domain::Fa::Job::Base
if @avatar.file_uri.blank?
if @user.due_for_page_scan?
defer_job(Domain::Fa::Job::UserPageJob, {
user: @user,
caused_by_entry: @caused_by_entry,
})
defer_job(
Domain::Fa::Job::UserPageJob,
{ user: @user, caused_by_entry: @caused_by_entry }
)
logger.error("no file uri, scanning user page")
else
logger.error("no file uri")
@@ -31,10 +36,8 @@ class Domain::Fa::Job::UserAvatarJob < Domain::Fa::Job::Base
return
end
response = http_client.get(
@avatar.file_uri.to_s,
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(@avatar.file_uri.to_s, caused_by_entry: @caused_by_entry)
@avatar.log_entry = response.log_entry
if [200, 404].include?(response.status_code)
@@ -49,8 +52,12 @@ class Domain::Fa::Job::UserAvatarJob < Domain::Fa::Job::Base
logger.info("downloaded avatar file")
else
@avatar.state = :download_error
@avatar.state_detail["download_error"] = "http status #{response.status_code}"
fatal_error("http #{response.status_code}, log entry #{response.log_entry.id}")
@avatar.state_detail[
"download_error"
] = "http status #{response.status_code}"
fatal_error(
"http #{response.status_code}, log entry #{response.log_entry.id}"
)
end
ensure
@avatar.save! if @avatar

View File

@@ -14,12 +14,14 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
init_from_args!(args)
@first_job_entry = nil
if !@user.due_for_follows_scan? && !@force_scan
logger.warn("scanned #{time_ago_in_words(@user.scanned_follows_at)}, skipping")
logger.warn(
"scanned #{time_ago_in_words(@user.scanned_follows_at)}, skipping"
)
return
end
# buggy (sentinel) user
return if @user.id == 117552 && @user.url_name == "click here"
return if @user.id == 117_552 && @user.url_name == "click here"
@page_number = 1
@total_follows_seen = 0
@@ -35,25 +37,29 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
to_add = nil
to_remove = nil
measure(proc { |jobs|
"add #{to_add.size.to_s.bold} follows, " +
"remove #{to_remove.size.to_s.bold} follows"
}) do
measure(
proc do |jobs|
"add #{to_add.size.to_s.bold} follows, " +
"remove #{to_remove.size.to_s.bold} follows"
end
) do
existing_followed_ids = Set.new(@user.follower_joins.pluck(:followed_id))
to_remove = existing_followed_ids - @scanned_followed_ids
to_add = @scanned_followed_ids - existing_followed_ids
end
measure(proc {
"updated follows list to #{@user.follows.count.to_s.bold} users"
}) do
measure(
proc { "updated follows list to #{@user.follows.count.to_s.bold} users" }
) do
ReduxApplicationRecord.transaction do
if to_remove.any?
@user.follower_joins.where(followed_id: to_remove).delete_all
end
@user.follower_joins.insert_all!(to_add.map do |id|
{ followed_id: id }
end) if to_add.any?
if to_add.any?
@user.follower_joins.insert_all!(
to_add.map { |id| { followed_id: id } }
)
end
@user.scanned_follows_at = Time.now
@user.save!
end
@@ -61,10 +67,9 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
if @created_user
logger.info("user was new record, enqueue page scan job")
Domain::Fa::Job::UserPageJob.perform_later({
user: @user,
caused_by_entry: best_caused_by_entry,
})
Domain::Fa::Job::UserPageJob.perform_later(
{ user: @user, caused_by_entry: best_caused_by_entry }
)
end
end
@@ -73,7 +78,8 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
def scan_follows_page
ret = nil
url = if @page_number > 1
url =
if @page_number > 1
"https://www.furaffinity.net/watchlist/by/#{@user.url_name}/#{@page_number}/?"
else
"https://www.furaffinity.net/watchlist/by/#{@user.url_name}/"
@@ -83,7 +89,7 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
if response.status_code != 200
fatal_error(
"http #{response.status_code.to_s.red.bold}, " +
"log entry #{response.log_entry.id.to_s.bold}"
"log entry #{response.log_entry.id.to_s.bold}"
)
end
@@ -96,7 +102,9 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
end
if user_list.last.url_name == @last_in_user_list
logger.info("page #{@page_number.to_s.bold} saw same user as last page, break")
logger.info(
"page #{@page_number.to_s.bold} saw same user as last page, break"
)
return :break
end
@@ -107,43 +115,49 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
@total_follows_seen += user_list.length
users_to_create_hashes = []
followed_user_ids = measure(proc {
"page #{@page_number.to_s.bold} - " +
"#{user_list.length.to_s.bold} users on page, " +
"created #{users_to_create_hashes.size.to_s.bold}"
}) do
existing_url_name_to_id = Domain::Fa::User.where(
url_name: user_list.map(&:url_name),
).pluck(:id, :url_name).map do |id, url_name|
[url_name, id]
end.to_h
followed_user_ids =
measure(
proc do
"page #{@page_number.to_s.bold} - " +
"#{user_list.length.to_s.bold} users on page, " +
"created #{users_to_create_hashes.size.to_s.bold}"
end
) do
existing_url_name_to_id =
Domain::Fa::User
.where(url_name: user_list.map(&:url_name))
.pluck(:id, :url_name)
.map { |id, url_name| [url_name, id] }
.to_h
users_to_create_hashes = user_list.reject do |user|
existing_url_name_to_id[user.url_name]
end.map do |user|
{
url_name: user.url_name,
name: user.name,
state_detail: { "first_seen_entry" => response.log_entry.id },
}
users_to_create_hashes =
user_list
.reject { |user| existing_url_name_to_id[user.url_name] }
.map do |user|
{
url_name: user.url_name,
name: user.name,
state_detail: {
"first_seen_entry" => response.log_entry.id
}
}
end
created_user_ids =
Domain::Fa::User
.upsert_all(
users_to_create_hashes,
unique_by: :url_name,
update_only: :url_name,
returning: %i[id url_name]
)
.map { |row| row["id"] } unless users_to_create_hashes.empty?
enqueue_new_user_pagescan_jobs(users_to_create_hashes)
(created_user_ids || []) + existing_url_name_to_id.values
end
created_user_ids = Domain::Fa::User.upsert_all(
users_to_create_hashes,
unique_by: :url_name,
update_only: :url_name,
returning: %i[id url_name],
).map do |row|
row["id"]
end unless users_to_create_hashes.empty?
enqueue_new_user_pagescan_jobs(users_to_create_hashes)
(created_user_ids || []) + existing_url_name_to_id.values
end
followed_user_ids.each do |user_id|
@scanned_followed_ids.add(user_id)
end
followed_user_ids.each { |user_id| @scanned_followed_ids.add(user_id) }
ret
end
@@ -151,10 +165,12 @@ class Domain::Fa::Job::UserFollowsJob < Domain::Fa::Job::Base
def enqueue_new_user_pagescan_jobs(user_hashes)
bulk_enqueue_jobs do
user_hashes.each do |user_hash|
Domain::Fa::Job::UserPageJob.perform_later({
url_name: user_hash[:url_name],
caused_by_entry: best_caused_by_entry,
})
Domain::Fa::Job::UserPageJob.perform_later(
{
url_name: user_hash[:url_name],
caused_by_entry: best_caused_by_entry
}
)
end
end
end

View File

@@ -13,27 +13,31 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
end
# buggy (sentinel) user
return if @user.id == 117552 && @user.url_name == "click here"
return if @user.id == 117_552 && @user.url_name == "click here"
@go_until_end = @user.scanned_gallery_at.nil?
@first_gallery_page_entry = nil
@max_page_number = MAX_PAGE_NUMBER
if @user.num_submissions && @user.scanned_page_at && @user.scanned_page_at > 3.days.ago
if @user.num_submissions && @user.scanned_page_at &&
@user.scanned_page_at > 3.days.ago
@max_page_number = (@user.num_submissions * 72) + 3
end
if !@user.due_for_gallery_scan? && !@force_scan
logger.warn("gallery scanned #{time_ago_in_words(@user.scanned_page_at)}, skipping")
logger.warn(
"gallery scanned #{time_ago_in_words(@user.scanned_page_at)}, skipping"
)
return
end
@visited = Set.new
@folders = Set.new(
[
{ href: "/gallery/#{@user.url_name}/", title: "Main Gallery" },
{ href: "/scraps/#{@user.url_name}/", title: "Scraps" },
]
)
@folders =
Set.new(
[
{ href: "/gallery/#{@user.url_name}/", title: "Main Gallery" },
{ href: "/scraps/#{@user.url_name}/", title: "Scraps" }
]
)
until (@folders - @visited).empty?
folder = (@folders - @visited).first
@@ -41,7 +45,9 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
break if scan_folder(folder) == :break
end
@user.log_entry_detail["last_gallery_page_id"] = @first_gallery_page_entry&.id
@user.log_entry_detail[
"last_gallery_page_id"
] = @first_gallery_page_entry&.id
@user.scanned_gallery_at = Time.now
@user.save!
end
@@ -54,23 +60,31 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
total_num_posts_seen = 0
while true
if page_number >= @max_page_number
fatal_error("hit max #{page_number}) pages, num submissions: #{@user.num_submissions}")
fatal_error(
"hit max #{page_number}) pages, num submissions: #{@user.num_submissions}"
)
end
folder_href = folder[:href]
folder_href += "/" unless folder_href.end_with?("/")
folder_href = "/" + folder_href unless folder_href.start_with?("/")
page_url = "https://www.furaffinity.net#{folder_href}#{page_number}?perpage=72"
page_url =
"https://www.furaffinity.net#{folder_href}#{page_number}?perpage=72"
response = http_client.get(page_url, caused_by_entry: @caused_by_entry)
@first_gallery_page_entry ||= response.log_entry
@caused_by_entry = @first_gallery_page_entry
if response.status_code != 200
fatal_error("http #{response.status_code}, log entry #{response.log_entry.id}")
fatal_error(
"http #{response.status_code}, log entry #{response.log_entry.id}"
)
end
if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(@user, response)
if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(
@user,
response
)
logger.error("account disabled / not found, abort")
return :break
end
@@ -80,19 +94,20 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
# newly instantiated users don't have a name yet, but it can be derived from the gallery page
@user.name ||= page.user_page.name || @user.url_name
listing_page_stats = update_and_enqueue_posts_from_listings_page(
:gallery_page, page, response.log_entry,
enqueue_posts_pri: :low,
enqueue_gallery_scan: false,
enqueue_page_scan: false,
page_desc: "#{folder[:title]}@#{page_number}",
)
listing_page_stats =
update_and_enqueue_posts_from_listings_page(
:gallery_page,
page,
response.log_entry,
enqueue_posts_pri: :low,
enqueue_gallery_scan: false,
enqueue_page_scan: false,
page_desc: "#{folder[:title]}@#{page_number}"
)
total_num_new_posts_seen += listing_page_stats.new_seen
total_num_posts_seen += listing_page_stats.total_seen
page.submission_folders.each do |sf|
@folders.add?(sf)
end if @force_scan
page.submission_folders.each { |sf| @folders.add?(sf) } if @force_scan
page_number += 1
break if listing_page_stats.new_seen == 0 && !@go_until_end
@@ -101,12 +116,11 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
logger.info "folder `#{folder[:title].bold}` complete - #{total_num_new_posts_seen.to_s.bold} new, #{total_num_posts_seen.to_s.bold} total"
ensure
enqueue_jobs_from_found_links(
response.log_entry,
suppress_jobs: [{
job: self.class,
url_name: @user.url_name,
}],
) if response && response.status_code == 200
if response && response.status_code == 200
enqueue_jobs_from_found_links(
response.log_entry,
suppress_jobs: [{ job: self.class, url_name: @user.url_name }]
)
end
end
end

View File

@@ -6,8 +6,9 @@ module Domain::Fa::Job
def perform(args)
init_from_args!(args)
# buggy (sentinel) user
return if @user.id == 117552 && @user.url_name == "click here"
logger.prefix = proc { "[ #{@user.id.to_s.bold} / #{@user.url_name.bold} ]" }
return if @user.id == 117_552 && @user.url_name == "click here"
logger.prefix =
proc { "[ #{@user.id.to_s.bold} / #{@user.url_name.bold} ]" }
# this is similar to a user page job, and will update the user page
# however, it will incrementally update user favs & follows / following:
@@ -18,18 +19,24 @@ module Domain::Fa::Job
# and add new follows.
if !@user.due_for_incremental_scan? && !@force_scan
logger.warn("scanned #{@user.time_ago_for_incremental_scan.bold}, skipping")
logger.warn(
"scanned #{@user.time_ago_for_incremental_scan.bold}, skipping"
)
return
end
response = http_client.get(
"https://www.furaffinity.net/user/#{@user.url_name}/",
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(
"https://www.furaffinity.net/user/#{@user.url_name}/",
caused_by_entry: @caused_by_entry
)
@log_entry = response.log_entry
ret, opts = Domain::Fa::Job::ScanUserUtils.
check_disabled_or_not_found(@user, response)
ret, opts =
Domain::Fa::Job::ScanUserUtils.check_disabled_or_not_found(
@user,
response
)
case ret
when :ok
page = opts[:page]
@@ -40,8 +47,11 @@ module Domain::Fa::Job
fatal_error(opts[:message])
end
Domain::Fa::Job::ScanUserUtils.
update_user_fields_from_page(@user, page, response)
Domain::Fa::Job::ScanUserUtils.update_user_fields_from_page(
@user,
page,
response
)
check_favs(@user, page.user_page.recent_fav_fa_ids)
check_watchers(@user, page.user_page.recent_watchers)
@@ -52,21 +62,24 @@ module Domain::Fa::Job
@user.save!
logger.info "completed page scan"
ensure
enqueue_jobs_from_found_links(
response.log_entry,
suppress_jobs: [{
job: self.class,
url_name: @user.url_name,
}],
) if response && response.status_code == 200
if response && response.status_code == 200
enqueue_jobs_from_found_links(
response.log_entry,
suppress_jobs: [{ job: self.class, url_name: @user.url_name }]
)
end
end
def check_favs(user, recent_fav_fa_ids)
recent_fav_posts = Domain::Fa::Job::ScanPostUtils.
find_or_create_by_fa_ids(recent_fav_fa_ids, caused_by_entry: @log_entry)
recent_fav_posts =
Domain::Fa::Job::ScanPostUtils.find_or_create_by_fa_ids(
recent_fav_fa_ids,
caused_by_entry: @log_entry
)
recent_fav_post_ids = recent_fav_posts.map(&:id)
existing_fav_post_ids = user.fav_post_joins.where(post_id: recent_fav_post_ids).pluck(:post_id)
existing_fav_post_ids =
user.fav_post_joins.where(post_id: recent_fav_post_ids).pluck(:post_id)
missing_fav_post_ids = recent_fav_post_ids - existing_fav_post_ids
if missing_fav_post_ids.empty?
logger.info("no new favs for user")
@@ -77,17 +90,19 @@ module Domain::Fa::Job
num_missing = missing_fav_post_ids.size
if num_missing >= 0
logger.info("add #{num_missing.to_s.bold} new favs for user")
@user.fav_post_joins.insert_all!(missing_fav_post_ids.map do |post_id|
{ post_id: post_id }
end)
@user.fav_post_joins.insert_all!(
missing_fav_post_ids.map { |post_id| { post_id: post_id } }
)
end
if missing_fav_post_ids.include? recent_fav_post_ids.last
logger.info("last fav is new (#{num_missing.to_s.bold} missing), enqueue full favs scan")
defer_job(Domain::Fa::Job::FavsJob, {
user: user,
caused_by_entry: @log_entry,
})
logger.info(
"last fav is new (#{num_missing.to_s.bold} missing), enqueue full favs scan"
)
defer_job(
Domain::Fa::Job::FavsJob,
{ user: user, caused_by_entry: @log_entry }
)
else
@user.scanned_favs_at = Time.now
end
@@ -98,8 +113,13 @@ module Domain::Fa::Job
# TODO - may be useful to have a separate 'scan full followed by' job
# to handle users who are watched by a large number of others
def check_watchers(user, recent_watchers)
recent_models = Domain::Fa::Job::ScanUserUtils.find_or_create_by_names(recent_watchers)
existing = user.followed_joins.where(follower_id: recent_models.map(&:id)).pluck(:follower_id)
recent_models =
Domain::Fa::Job::ScanUserUtils.find_or_create_by_names(recent_watchers)
existing =
user
.followed_joins
.where(follower_id: recent_models.map(&:id))
.pluck(:follower_id)
missing = recent_models.reject { |w| existing.include? w.id }
if missing.empty?
logger.info("no new watchers")
@@ -107,15 +127,20 @@ module Domain::Fa::Job
end
num_missing = missing.size
user.followed_joins.insert_all!(missing.map do |watcher|
{ follower_id: watcher.id }
end)
user.followed_joins.insert_all!(
missing.map { |watcher| { follower_id: watcher.id } }
)
logger.info("added #{num_missing.to_s.bold} new watchers")
end
def check_watching(user, recent_watching)
recent_models = Domain::Fa::Job::ScanUserUtils.find_or_create_by_names(recent_watching)
existing = user.follower_joins.where(followed_id: recent_models.map(&:id)).pluck(:followed_id)
recent_models =
Domain::Fa::Job::ScanUserUtils.find_or_create_by_names(recent_watching)
existing =
user
.follower_joins
.where(followed_id: recent_models.map(&:id))
.pluck(:followed_id)
missing = recent_models.reject { |w| existing.include? w.id }
if missing.empty?
logger.info("no new users watched")
@@ -124,17 +149,16 @@ module Domain::Fa::Job
end
num_missing = missing.size
user.follower_joins.insert_all!(missing.map do |watcher|
{ followed_id: watcher.id }
end)
user.follower_joins.insert_all!(
missing.map { |watcher| { followed_id: watcher.id } }
)
logger.info("added #{num_missing.to_s.bold} new users watched")
if missing.any? { |w| w.url_name == recent_watching.last.url_name }
logger.info("last user watched is new, enqueue full follows scan")
Domain::Fa::Job::UserFollowsJob.perform_later({
user: user,
caused_by_entry: @log_entry,
})
Domain::Fa::Job::UserFollowsJob.perform_later(
{ user: user, caused_by_entry: @log_entry }
)
else
@user.scanned_follows_at = Time.now
end

View File

@@ -6,20 +6,26 @@ class Domain::Fa::Job::UserPageJob < Domain::Fa::Job::Base
init_from_args!(args)
# buggy (sentinel) user
return if @user.id == 117552 && @user.url_name == "click here"
return if @user.id == 117_552 && @user.url_name == "click here"
if !@user.due_for_page_scan? && !@force_scan
logger.warn("scanned #{time_ago_in_words(@user.scanned_page_at)}, skipping")
logger.warn(
"scanned #{time_ago_in_words(@user.scanned_page_at)}, skipping"
)
return
end
response = http_client.get(
"https://www.furaffinity.net/user/#{@user.url_name}/",
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(
"https://www.furaffinity.net/user/#{@user.url_name}/",
caused_by_entry: @caused_by_entry
)
ret, opts = Domain::Fa::Job::ScanUserUtils.
check_disabled_or_not_found(@user, response)
ret, opts =
Domain::Fa::Job::ScanUserUtils.check_disabled_or_not_found(
@user,
response
)
case ret
when :ok
page = opts[:page]
@@ -30,18 +36,20 @@ class Domain::Fa::Job::UserPageJob < Domain::Fa::Job::Base
fatal_error(opts[:message])
end
Domain::Fa::Job::ScanUserUtils.
update_user_fields_from_page(@user, page, response)
Domain::Fa::Job::ScanUserUtils.update_user_fields_from_page(
@user,
page,
response
)
@user.scanned_page_at = Time.now
@user.save!
logger.info "completed page scan"
ensure
enqueue_jobs_from_found_links(
response.log_entry,
suppress_jobs: [{
job: self.class,
url_name: @user.url_name,
}],
) if response && response.status_code == 200
if response && response.status_code == 200
enqueue_jobs_from_found_links(
response.log_entry,
suppress_jobs: [{ job: self.class, url_name: @user.url_name }]
)
end
end
end

View File

@@ -3,27 +3,22 @@ module Domain::Inkbunny::Job
def perform(args)
file = args[:file] || fatal_error("file is required")
caused_by_entry = args[:caused_by_entry]
logger.prefix = proc {
"[#{file.id.to_s.bold} / " +
"#{file.ib_file_id.to_s.bold} / " +
"#{file.state.to_s.bold}] "
}
logger.prefix =
proc do
"[#{file.id.to_s.bold} / " + "#{file.ib_file_id.to_s.bold} / " +
"#{file.state.to_s.bold}] "
end
url_str = file.url_str
if file.state == "error"
retry_count = file.state_detail&.
[]("error")&.
[]("retry_count") || 0
retry_count = file.state_detail&.[]("error")&.[]("retry_count") || 0
if retry_count >= 3
logger.error("file has been retried 3 times, giving up")
return
end
end
response = http_client.get(
url_str,
caused_by_entry: caused_by_entry,
)
response = http_client.get(url_str, caused_by_entry: caused_by_entry)
if response.status_code != 200
file.state = :error

View File

@@ -1,8 +1,13 @@
module Domain::Inkbunny::Job
class LatestPostsJob < Base
def perform(args)
url = "https://inkbunny.net/api_search.php?orderby=create_datetime&keywords=no&title=no&description=no"
@api_search_response = http_client.post(url, caused_by_entry: @first_browse_page_entry || @caused_by_entry)
url =
"https://inkbunny.net/api_search.php?orderby=create_datetime&keywords=no&title=no&description=no"
@api_search_response =
http_client.post(
url,
caused_by_entry: @first_browse_page_entry || @caused_by_entry
)
if @api_search_response.status_code != 200
fatal_error("api_search failed: #{@api_search_response.status_code}")
end
@@ -14,11 +19,13 @@ module Domain::Inkbunny::Job
def handle_search_response(api_search_json)
ib_submission_jsons = api_search_json["submissions"]
ib_submission_ids = ib_submission_jsons.map { |j| j["submission_id"]&.to_i }
@ib_post_id_to_model = Domain::Inkbunny::Post
.where(ib_post_id: ib_submission_ids)
.includes(:files, :creator)
.index_by(&:ib_post_id)
ib_submission_ids =
ib_submission_jsons.map { |j| j["submission_id"]&.to_i }
@ib_post_id_to_model =
Domain::Inkbunny::Post
.where(ib_post_id: ib_submission_ids)
.includes(:files, :creator)
.index_by(&:ib_post_id)
new_posts = []
users = []
@@ -26,15 +33,12 @@ module Domain::Inkbunny::Job
ib_submission_jsons.each do |submission_json|
ib_post_id = submission_json["submission_id"]&.to_i
unless @ib_post_id_to_model[ib_post_id]
post = Domain::Inkbunny::Post.new({
ib_post_id: ib_post_id,
})
post = Domain::Inkbunny::Post.new({ ib_post_id: ib_post_id })
user = Domain::Inkbunny::User.find_or_initialize_by({
ib_user_id: submission_json["user_id"].to_i,
}) do |user|
user.name = submission_json["username"]
end
user =
Domain::Inkbunny::User.find_or_initialize_by(
{ ib_user_id: submission_json["user_id"].to_i }
) { |user| user.name = submission_json["username"] }
user.save!
post.creator = user
new_posts << post
@@ -43,9 +47,7 @@ module Domain::Inkbunny::Job
end
Domain::Inkbunny::Post.transaction do
users.select do |user|
user.new_record? || user.changed?
end.each(&:save!)
users.select { |user| user.new_record? || user.changed? }.each(&:save!)
new_posts.each(&:save!)
end
@@ -53,7 +55,8 @@ module Domain::Inkbunny::Job
needs_deep_update_posts = []
Domain::Inkbunny::Post.transaction do
ib_submission_jsons.each do |submission_json|
needs_deep_update, post = shallow_update_post_from_submission_json(submission_json)
needs_deep_update, post =
shallow_update_post_from_submission_json(submission_json)
needs_deep_update_posts << post if needs_deep_update
end
end
@@ -67,15 +70,16 @@ module Domain::Inkbunny::Job
if needs_deep_update_posts.any?
ids_list = needs_deep_update_posts.map(&:ib_post_id).join(",")
url = "https://inkbunny.net/api_submissions.php?" +
"submission_ids=#{ids_list}" +
"&show_description=yes&show_writing=yes&show_pools=yes"
@api_submissions_response = http_client.get(
url,
caused_by_entry: @api_search_response.log_entry,
)
url =
"https://inkbunny.net/api_submissions.php?" +
"submission_ids=#{ids_list}" +
"&show_description=yes&show_writing=yes&show_pools=yes"
@api_submissions_response =
http_client.get(url, caused_by_entry: @api_search_response.log_entry)
if @api_submissions_response.status_code != 200
fatal_error("api_submissions failed: #{@api_submissions_response.status_code}")
fatal_error(
"api_submissions failed: #{@api_submissions_response.status_code}"
)
end
api_submissions_json = JSON.parse(@api_submissions_response.body)
api_submissions_json["submissions"].each do |submission_json|
@@ -96,9 +100,9 @@ module Domain::Inkbunny::Job
post.rating = json["rating_id"]&.to_i
post.submission_type = json["submission_type_id"]&.to_i
post.ib_detail_raw = json
needs_deep_update = post.last_file_updated_at_changed? ||
post.num_files_changed? ||
post.files.count != post.num_files
needs_deep_update =
post.last_file_updated_at_changed? || post.num_files_changed? ||
post.files.count != post.num_files
post.save!
[needs_deep_update, post]
end
@@ -122,41 +126,45 @@ module Domain::Inkbunny::Job
next if post_files_by_md5[md5_initial]
md5_full = file_json["full_file_md5"]
file = post.files.create!({
ib_file_id: file_json["file_id"]&.to_i,
ib_created_at: Time.parse(file_json["create_datetime"]),
file_order: file_json["submission_file_order"]&.to_i,
ib_detail_raw: file_json,
file_name: file_json["file_name"],
url_str: file_json["file_url_full"],
md5_initial: md5_initial,
md5_full: md5_full,
md5s: {
"initial_file_md5": md5_initial,
"full_file_md5": file_json["full_file_md5"],
"large_file_md5": file_json["large_file_md5"],
"small_file_md5": file_json["small_file_md5"],
"thumbnail_md5": file_json["thumbnail_md5"],
},
})
file =
post.files.create!(
{
ib_file_id: file_json["file_id"]&.to_i,
ib_created_at: Time.parse(file_json["create_datetime"]),
file_order: file_json["submission_file_order"]&.to_i,
ib_detail_raw: file_json,
file_name: file_json["file_name"],
url_str: file_json["file_url_full"],
md5_initial: md5_initial,
md5_full: md5_full,
md5s: {
initial_file_md5: md5_initial,
full_file_md5: file_json["full_file_md5"],
large_file_md5: file_json["large_file_md5"],
small_file_md5: file_json["small_file_md5"],
thumbnail_md5: file_json["thumbnail_md5"]
}
}
)
logger.info "[post #{post.ib_post_id.to_s.bold}]: " +
"new file #{file.ib_file_id.to_s.bold} - #{file.file_name.black.bold}"
defer_job(Domain::Inkbunny::Job::FileJob, {
file: file,
caused_by_entry: @api_submissions_response.log_entry,
})
defer_job(
Domain::Inkbunny::Job::FileJob,
{ file: file, caused_by_entry: @api_submissions_response.log_entry }
)
end
post.save!
end
def post_for_json(submission_json)
post_id = submission_json["submission_id"]&.to_i || fatal_error(
"submission_id not found in submission_json: #{submission_json.keys.join(", ")}"
)
@ib_post_id_to_model[post_id] || fatal_error(
"post not found for ib_post_id #{post_id}"
)
post_id =
submission_json["submission_id"]&.to_i ||
fatal_error(
"submission_id not found in submission_json: #{submission_json.keys.join(", ")}"
)
@ib_post_id_to_model[post_id] ||
fatal_error("post not found for ib_post_id #{post_id}")
end
end
end

View File

@@ -7,17 +7,16 @@ class Domain::Twitter::Job::MediaJob < Domain::Twitter::Job::TwitterJobBase
@media = args[:media]
@caused_by_entry = args[:caused_by_entry]
logger.prefix = "[tweet #{@media.tweet_id.to_s.bold} / media #{@media.id.bold}]"
logger.prefix =
"[tweet #{@media.tweet_id.to_s.bold} / media #{@media.id.bold}]"
if @media.file
@logger.warn("already have file, skipping")
return
end
response = http_client.get(
@media.url_str,
caused_by_entry: @caused_by_entry,
)
response =
http_client.get(@media.url_str, caused_by_entry: @caused_by_entry)
logger.debug "#{HexUtil.humansize(response.log_entry.response.size)} / " +
"#{response.log_entry.content_type} / " +
@@ -27,9 +26,8 @@ class Domain::Twitter::Job::MediaJob < Domain::Twitter::Job::TwitterJobBase
if [403, 404].include?(status_code)
@media.inc_error_status_count(status_code)
@media.state = "error"
@media.state_detail = ({
status_code_error: "got #{status_code} too many times, bailing",
})
@media.state_detail =
({ status_code_error: "got #{status_code} too many times, bailing" })
@media.save!
return
end

View File

@@ -18,23 +18,30 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
logger.info("start tweet timeline scan (force: #{@force_scan.to_s.bold})")
if @user.state == "error" && @user.tweets_protected_error_proxies.include?(@proxy_name)
fatal_error("abort scan, this proxy (#{@proxy_name}) is in error proxies: #{@user.tweets_protected_error_proxies}")
if @user.state == "error" &&
@user.tweets_protected_error_proxies.include?(@proxy_name)
fatal_error(
"abort scan, this proxy (#{@proxy_name}) is in error proxies: #{@user.tweets_protected_error_proxies}"
)
end
if !@force_scan && !@user.due_for_timeline_tweets_scan?
logger.warn("scanned #{time_ago_in_words(@user.scanned_timeline_at)}, skipping")
logger.warn(
"scanned #{time_ago_in_words(@user.scanned_timeline_at)}, skipping"
)
return
end
gallery_dl_client.start_twitter_user(
@name || @user.name, caused_by_entry: @caused_by_entry,
@name || @user.name,
caused_by_entry: @caused_by_entry
)
while true
event = gallery_dl_client.next_message(
caused_by_entry: @first_twitter_caused_by || @caused_by_entry,
)
event =
gallery_dl_client.next_message(
caused_by_entry: @first_twitter_caused_by || @caused_by_entry
)
fatal_error("nil event from gallery_dl_client") if event.nil?
case event
@@ -56,11 +63,13 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
end
end
logger.info([
"created #{@num_created_tweets.to_s.bold} tweets",
"scanned #{@num_scanned_tweets.to_s.bold} tweets",
"created #{@num_created_medias.to_s.bold} medias",
].join(", "))
logger.info(
[
"created #{@num_created_tweets.to_s.bold} tweets",
"scanned #{@num_scanned_tweets.to_s.bold} tweets",
"created #{@num_created_medias.to_s.bold} medias"
].join(", ")
)
@user.scanned_timeline_at = Time.now
@user.state = "ok"
@@ -89,7 +98,11 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
def maybe_extract_user_info(http_event)
return unless http_event.response_code == 200
return unless http_event.response_headers[:'content-type'].starts_with?("application/json")
unless http_event.response_headers[:"content-type"].starts_with?(
"application/json"
)
return
end
json = JSON.parse(http_event.body)
return unless json
typename = json.dig("data", "user", "result", "__typename")
@@ -131,15 +144,19 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
return nil
end
logger.info("+ tweet (#{@num_created_tweets.to_s.bold}) #{tweet_hash[:id].to_s.bold}")
logger.info(
"+ tweet (#{@num_created_tweets.to_s.bold}) #{tweet_hash[:id].to_s.bold}"
)
Domain::Twitter::Tweet.new({
id: tweet_hash[:id],
author: @user,
content: tweet_hash[:content],
reply_to_tweet_id: tweet_hash[:reply_to],
tweeted_at: Time.at(tweet_hash[:date]),
}).save!
Domain::Twitter::Tweet.new(
{
id: tweet_hash[:id],
author: @user,
content: tweet_hash[:content],
reply_to_tweet_id: tweet_hash[:reply_to],
tweeted_at: Time.at(tweet_hash[:date])
}
).save!
@num_created_tweets += 1
end
@@ -152,23 +169,29 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
return
end
logger.info("+ media (#{@num_created_medias.to_s.bold}) #{media_event.filename.bold}")
logger.info(
"+ media (#{@num_created_medias.to_s.bold}) #{media_event.filename.bold}"
)
media = Domain::Twitter::Media.new({
id: media_event.filename,
tweet_id: media_event.tweet_id,
url_str: media_event.file_url,
})
media =
Domain::Twitter::Media.new(
{
id: media_event.filename,
tweet_id: media_event.tweet_id,
url_str: media_event.file_url
}
)
media.save!
@num_created_medias += 1
enqueue_media_file(media)
end
def enqueue_media_file(media)
defer_job(Domain::Twitter::Job::MediaJob, {
media: media || raise,
caused_by_entry: @first_twitter_caused_by,
}, { priority: self.priority })
defer_job(
Domain::Twitter::Job::MediaJob,
{ media: media || raise, caused_by_entry: @first_twitter_caused_by },
{ priority: self.priority }
)
end
def update_user_from_tweet(tweet_event)
@@ -176,7 +199,9 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
author = tweet_event.author
raise("no tw_id on user") unless @user.tw_id
raise("tw_id mismatch: #{@user.tw_id} != #{author[:id]}") unless @user.tw_id == author[:id]
unless @user.tw_id == author[:id]
raise("tw_id mismatch: #{@user.tw_id} != #{author[:id]}")
end
@user.name = author[:name]
@user.nick = author[:nick]

View File

@@ -1,25 +1,19 @@
class DbSampler
SCHEMA = {
::Domain::Fa::User => [
:avatar,
:posts,
:disco,
:follower_joins,
:fav_post_joins,
],
::Domain::Fa::UserAvatar => [:file, :log_entry],
::Domain::Fa::Post => [:file, :creator],
::Domain::Fa::Follow => [:follower, :followed],
::Domain::Fa::Fav => [:user, :post],
::Domain::Fa::User => %i[avatar posts disco follower_joins fav_post_joins],
::Domain::Fa::UserAvatar => %i[file log_entry],
::Domain::Fa::Post => %i[file creator],
::Domain::Fa::Follow => %i[follower followed],
::Domain::Fa::Fav => %i[user post],
::Domain::Fa::UserFactor => [],
::BlobEntryP => [:base],
::HttpLogEntry => [
:request_headers,
:response_headers,
:response,
:caused_by_entry,
::HttpLogEntry => %i[
request_headers
response_headers
response
caused_by_entry
],
::HttpLogEntryHeader => [],
::HttpLogEntryHeader => []
}
def initialize(file)
@@ -28,13 +22,18 @@ class DbSampler
end
def export(url_names)
Domain::Fa::User.includes({
avatar: [:file, :log_entry],
follower_joins: [:follower, :followed],
posts: { file: :response },
}).where(url_name: url_names).each do |user|
handle_model(user, 0, 0)
end
Domain::Fa::User
.includes(
{
avatar: %i[file log_entry],
follower_joins: %i[follower followed],
posts: {
file: :response
}
}
)
.where(url_name: url_names)
.each { |user| handle_model(user, 0, 0) }
end
def import
@@ -57,7 +56,7 @@ class DbSampler
ReduxApplicationRecord.transaction do
deferred.each do |model|
import_model(model)
rescue
rescue StandardError
end
end
end
@@ -82,9 +81,12 @@ class DbSampler
$stderr.puts("skipped existing #{model_id(model)}")
else
model2 = model.class.new
model.attribute_names.map(&:to_sym).each do |attr|
model2.write_attribute(attr, model.read_attribute(attr))
end
model
.attribute_names
.map(&:to_sym)
.each do |attr|
model2.write_attribute(attr, model.read_attribute(attr))
end
model2.save(validate: false)
$stderr.puts("imported #{model_id(model)}")
end
@@ -99,15 +101,13 @@ class DbSampler
assocs = SCHEMA[model.class] || raise("invalid: #{model.class.name}")
assocs.each do |assoc|
if user_depth > 1
next unless [:avatar, :disco].include?(assoc)
next unless %i[avatar disco].include?(assoc)
end
model2 = model.send(assoc)
next unless model2
if model2.respond_to? :each
model2.each do |model3|
handle_model(model3, level + 1, user_depth)
end
model2.each { |model3| handle_model(model3, level + 1, user_depth) }
else
handle_model(model2, level + 1, user_depth)
end

View File

@@ -1,11 +1,7 @@
require "csv"
class Domain::E621::CsvPostImporter < LegacyImport::BulkImportJob
def initialize(
csv_path:,
start_at:,
limit: nil
)
def initialize(csv_path:, start_at:, limit: nil)
logger.info "loading..."
@csv_file = CSV.new(File.open(csv_path, "r+"), headers: true)
logger.info "loaded"
@@ -57,12 +53,9 @@ class Domain::E621::CsvPostImporter < LegacyImport::BulkImportJob
logger.prefix = proc { "[e621_id #{e621_id.to_s.bold}]" }
post = Domain::E621::Post.find_by({
e621_id: e621_id,
}) || Domain::E621::Post.new({
e621_id: e621_id,
md5: md5,
})
post =
Domain::E621::Post.find_by({ e621_id: e621_id }) ||
Domain::E621::Post.new({ e621_id: e621_id, md5: md5 })
if post.md5 && post.md5 != md5
logger.error("md5 mismatch, skipping")
@@ -81,7 +74,8 @@ class Domain::E621::CsvPostImporter < LegacyImport::BulkImportJob
end
file_ext = row["file_ext"]
post.file_url_str = "https://static1.e621.net/data/#{md5[0...2]}/#{md5[2...4]}/#{md5}.#{file_ext}"
post.file_url_str =
"https://static1.e621.net/data/#{md5[0...2]}/#{md5[2...4]}/#{md5}.#{file_ext}"
post.description = row["description"]
post.rating = row["rating"]
post.score = row["score"].to_i
@@ -101,10 +95,7 @@ class Domain::E621::CsvPostImporter < LegacyImport::BulkImportJob
post.tags_array = row["tag_string"].split(/[\n\s]/).map(&:strip).sort
name_to_tag_id, missing_tags =
Domain::E621::TagUtil.tag_names_to_id_map(
post.tags_array,
posts: [post],
)
Domain::E621::TagUtil.tag_names_to_id_map(post.tags_array, posts: [post])
name_to_tag_id.merge!(
Domain::E621::TagUtil.create_tags_from_names(missing_tags)
)
@@ -115,13 +106,11 @@ class Domain::E621::CsvPostImporter < LegacyImport::BulkImportJob
Domain::E621::TagUtil.update_tags_on_post(
post,
post.tags_array,
name_to_tag_id,
name_to_tag_id
)
end
Domain::E621::Job::StaticFileJob.perform_later({
post: post,
}) if new_record
Domain::E621::Job::StaticFileJob.perform_later({ post: post }) if new_record
logger.info("updated post (new? #{new_record ? "yes" : "no"})")

View File

@@ -8,15 +8,13 @@ module Domain::E621::TagUtil
tags = Domain::E621::Tag.where(name: tag_names)
name_to_tag_id = tags.map { |tag| [tag.name, tag.id] }.to_h
posts.each do |post|
post.tags.each do |tag|
name_to_tag_id[tag.name] = tag.id
end
post.tags.each { |tag| name_to_tag_id[tag.name] = tag.id }
end
[
# main tag map
name_to_tag_id,
# missing tags
tag_names - tags.map(&:name),
tag_names - tags.map(&:name)
]
end
@@ -24,18 +22,16 @@ module Domain::E621::TagUtil
return {} unless tag_names.any?
logger.info("creating #{tag_names.size.to_s.bold} missing tags")
upsert_hashes = tag_names.map do |name|
{ name: name }
end
upsert_hashes = tag_names.map { |name| { name: name } }
name_to_tag_id = {}
Domain::E621::Tag.upsert_all(
upsert_hashes,
unique_by: :name,
update_only: :name,
returning: %i[id name],
).each do |row|
name_to_tag_id[row["name"]] = row["id"]
end
Domain::E621::Tag
.upsert_all(
upsert_hashes,
unique_by: :name,
update_only: :name,
returning: %i[id name]
)
.each { |row| name_to_tag_id[row["name"]] = row["id"] }
name_to_tag_id
end
@@ -63,43 +59,30 @@ module Domain::E621::TagUtil
raise("post must be persisted") if post.new_record?
if to_remove.any?
to_remove_ids = to_remove.
map { |tacos|
name = if tacos.is_a?(TagAndCategory)
tacos.name
else
tacos
end
name_to_tag_id[name] || raise(
"invariant: #{name} not in id map"
)
}
to_remove_ids =
to_remove.map do |tacos|
name = (tacos.is_a?(TagAndCategory) ? tacos.name : tacos)
name_to_tag_id[name] || raise("invariant: #{name} not in id map")
end
post.
taggings.
where(tag_id: to_remove_ids).
delete_all
post.taggings.where(tag_id: to_remove_ids).delete_all
end
post.taggings.insert_all!(to_add.map do |tacos|
name = tag_and_cat_or_str_to_name(tacos)
id = name_to_tag_id[name] || raise(
"invariant: #{name} not in id map"
)
post.taggings.insert_all!(
to_add.map do |tacos|
name = tag_and_cat_or_str_to_name(tacos)
id = name_to_tag_id[name] || raise("invariant: #{name} not in id map")
if tacos.is_a?(TagAndCategory)
{ tag_id: id, category: tacos.category }
else
{ tag_id: id, category: "cat_general" }
if tacos.is_a?(TagAndCategory)
{ tag_id: id, category: tacos.category }
else
{ tag_id: id, category: "cat_general" }
end
end
end) if to_add.any?
) if to_add.any?
end
def self.tag_and_cat_or_str_to_name(tacos)
if tacos.is_a?(TagAndCategory)
tacos.name
else
tacos
end
tacos.is_a?(TagAndCategory) ? tacos.name : tacos
end
end

View File

@@ -3,20 +3,19 @@ class Domain::Fa::FactorCalculator
def initialize(epochs = 20)
factors = Domain::Fa::UserFactor::FACTORS_WIDTHS
@recommender = Disco::Recommender.new(
factors: factors,
epochs: epochs,
)
@recommender = Disco::Recommender.new(factors: factors, epochs: epochs)
logger.info "epochs=#{epochs.to_s.bold} factors=#{factors.to_s.bold}"
end
def fit
logger.info "loading follow rows..."
dataset = measure(proc { |r| "loaded #{r.length.to_s.bold} follows" }) do
Domain::Fa::Follow.all.pluck(:follower_id, :followed_id).map do |id1, id2|
{ user_id: id1, item_id: id2 }
dataset =
measure(proc { |r| "loaded #{r.length.to_s.bold} follows" }) do
Domain::Fa::Follow
.all
.pluck(:follower_id, :followed_id)
.map { |id1, id2| { user_id: id1, item_id: id2 } }
end
end
measure("fit #{dataset.length.to_s.bold} follows") do
@recommender.fit(dataset)
@@ -25,8 +24,10 @@ class Domain::Fa::FactorCalculator
def write_factors
total = 0
for_followed_width = Domain::Fa::UserFactor.native_factor_width("for_followed")
for_follower_width = Domain::Fa::UserFactor.native_factor_width("for_follower")
for_followed_width =
Domain::Fa::UserFactor.native_factor_width("for_followed")
for_follower_width =
Domain::Fa::UserFactor.native_factor_width("for_follower")
measure("#{"for_followed".bold} - done") do
write_factors_col(:item_ids, :item_factors, :for_followed)
@@ -39,30 +40,31 @@ class Domain::Fa::FactorCalculator
def write_factors_col(id_list_name, getter_name, factors_col_name)
total = 0
id_list = @recommender.send(id_list_name)
native_col_width = Domain::Fa::UserFactor.
columns_hash[factors_col_name.to_s].
sql_type_metadata.
limit
native_col_width =
Domain::Fa::UserFactor.columns_hash[
factors_col_name.to_s
].sql_type_metadata.limit
logger.info "#{factors_col_name.to_s.bold} - writing #{id_list.length.to_s.bold} factors"
id_list.map do |user_id|
factors = @recommender.send(getter_name, user_id)
padding = [0.0] * (native_col_width - factors.length)
{
user_id: user_id,
factors_col_name => padding + factors.to_a,
}
end.each_slice(20000) do |chunk|
total += chunk.size
measure(" -> wrote chunk of #{chunk.size.to_s.bold} - (#{total.to_s.bold} total)") do
Domain::Fa::UserFactor.upsert_all(
chunk,
unique_by: :user_id,
update_only: factors_col_name,
returning: :id,
)
id_list
.map do |user_id|
factors = @recommender.send(getter_name, user_id)
padding = [0.0] * (native_col_width - factors.length)
{ :user_id => user_id, factors_col_name => padding + factors.to_a }
end
.each_slice(20_000) do |chunk|
total += chunk.size
measure(
" -> wrote chunk of #{chunk.size.to_s.bold} - (#{total.to_s.bold} total)"
) do
Domain::Fa::UserFactor.upsert_all(
chunk,
unique_by: :user_id,
update_only: factors_col_name,
returning: :id
)
end
end
end
end
end

View File

@@ -2,17 +2,20 @@ module Domain::Fa::HasCountFailedInQueue
extend ActiveSupport::Concern
included do
def count_failed_in_queue(queue_name)
GoodJob::Job.where(
queue_name: queue_name,
finished_at: nil,
performed_at: nil,
error: nil,
).where(
[
"(serialized_params->'exception_executions' = '{}')",
"(serialized_params->'exception_executions' is null)",
].join(" OR ")
).count
GoodJob::Job
.where(
queue_name: queue_name,
finished_at: nil,
performed_at: nil,
error: nil
)
.where(
[
"(serialized_params->'exception_executions' = '{}')",
"(serialized_params->'exception_executions' is null)"
].join(" OR ")
)
.count
end
end
end

View File

@@ -8,11 +8,12 @@ class Domain::Fa::Parser::ListedSubmissionParserHelper
end
def id
@id ||= /\/view\/(\d+)/.match(view_path).try(:[], 1).try(:to_i)
@id ||= %r{/view/(\d+)}.match(view_path).try(:[], 1).try(:to_i)
end
def artist
@artist ||= if !@new_parse_mode
@artist ||=
if !@new_parse_mode
@elem.css("small a").first.try(:text)
else
@elem.css("figcaption p").last.css("a").text
@@ -20,7 +21,8 @@ class Domain::Fa::Parser::ListedSubmissionParserHelper
end
def artist_user_page_path
@artist_user_page_path ||= if !@new_parse_mode
@artist_user_page_path ||=
if !@new_parse_mode
@elem.css("small a").first.try(:[], "href")
else
@elem.css("figcaption p").last.css("a").first["href"]
@@ -32,7 +34,8 @@ class Domain::Fa::Parser::ListedSubmissionParserHelper
end
def title
@title ||= if !@new_parse_mode
@title ||=
if !@new_parse_mode
@elem.css("> span").first["title"]
else
@elem.css("figcaption p").first.css("a").first["title"]
@@ -40,7 +43,8 @@ class Domain::Fa::Parser::ListedSubmissionParserHelper
end
def view_path
@view_path ||= if !@new_parse_mode
@view_path ||=
if !@new_parse_mode
@elem.css("u s a").first["href"]
else
@elem.css("b u a").first["href"]
@@ -48,7 +52,8 @@ class Domain::Fa::Parser::ListedSubmissionParserHelper
end
def thumb_path
@thumb_path ||= if !@new_parse_mode
@thumb_path ||=
if !@new_parse_mode
@elem.css("u s a img").first["src"]
else
@elem.css("b u a img").first["src"]

View File

@@ -11,16 +11,19 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
VERSION_2 = :redux
def initialize(page_html, require_logged_in: true)
@page = if page_html.is_a? Nokogiri::HTML::Document
@page =
if page_html.is_a? Nokogiri::HTML::Document
page_html
else
phtml = page_html.delete("\u0000")
@phtml = phtml
Nokogiri::HTML(phtml)
Nokogiri.HTML(phtml)
end
@page_version = if @page.css("link[href='/themes/beta/img/favicon.ico']").first
@page_version =
if @page.css("link[href='/themes/beta/img/favicon.ico']").first
VERSION_2
elsif @page.css(".submission-list section").first || @page.css("meta[property='twitter:site']").first
elsif @page.css(".submission-list section").first ||
@page.css("meta[property='twitter:site']").first
VERSION_1
else
VERSION_0
@@ -40,19 +43,18 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
def submission_not_found?
# the username elem is never shown on a "not found" page
return false if logged_in_user_elem
not_found_text = "The submission you are trying to find is not in our database"
not_found_text =
"The submission you are trying to find is not in our database"
case @page_version
when VERSION_2
@page.css("body .section-body")&.
first&.
text&.
include?(not_found_text)
@page.css("body .section-body")&.first&.text&.include?(not_found_text)
else
@page.css("table.maintable td.alt1 font[size=1]")&.
first&.
text&.
include?(not_found_text)
@page
.css("table.maintable td.alt1 font[size=1]")
&.first
&.text
&.include?(not_found_text)
end
end
@@ -61,31 +63,41 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
end
def submissions_parsed
@submissions_hashes ||= submission_elems.map do |elem|
Domain::Fa::Parser::ListedSubmissionParserHelper.new(elem, @page_version)
end
@submissions_hashes ||=
submission_elems.map do |elem|
Domain::Fa::Parser::ListedSubmissionParserHelper.new(
elem,
@page_version
)
end
end
def favorites_next_button_id
button = @page.css(".gallery-section .pagination a.button.right").first
if button
href = button["href"]
match = /\/favorites\/.+\/(\d+)\/next\/?/.match(href)
match = %r{/favorites/.+/(\d+)/next/?}.match(href)
raise("invalid favs button uri #{href}") unless match
match[1].to_i
end
end
def submission_folders
@submission_folders ||= @page.css(".folder-list a.dotted").map do |folder_link|
{ href: folder_link["href"], title: folder_link.text }
end
@submission_folders ||=
@page
.css(".folder-list a.dotted")
.map do |folder_link|
{ href: folder_link["href"], title: folder_link.text }
end
end
def submission_elems
@submission_elems ||= case @page_version
when VERSION_0 then @page.css(".t-image")
when VERSION_1 then @page.css(".submission-list > .gallery > figure")
@submission_elems ||=
case @page_version
when VERSION_0
@page.css(".t-image")
when VERSION_1
@page.css(".submission-list > .gallery > figure")
when VERSION_2
[
# user gallery pages
@@ -98,11 +110,10 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
"#gallery-frontpage-submissions > figure",
"#gallery-frontpage-writing > figure",
"#gallery-frontpage-music > figure",
"#gallery-frontpage-crafts > figure",
].lazy.map do |css|
@page.css(css)
end.reject(&:empty?).to_a.flatten
else unimplemented_version!
"#gallery-frontpage-crafts > figure"
].lazy.map { |css| @page.css(css) }.reject(&:empty?).to_a.flatten
else
unimplemented_version!
end
end
@@ -118,7 +129,8 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
end
def logged_in_user_elem
@logged_in_user_elem ||= case @page_version
@logged_in_user_elem ||=
case @page_version
when VERSION_0, VERSION_1
@page.css("li a#my-username").first
when VERSION_2
@@ -132,17 +144,29 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
# center.flow seems to wrap all lists of submissions
case @page_version
when VERSION_0
first_matching_css(@page, [".submission-list", "td > section.gallery"]) ? true : false
if first_matching_css(@page, [".submission-list", "td > section.gallery"])
true
else
false
end
when VERSION_1
(@page.css(".submission-list").first) ? true : false
when VERSION_2
first_matching_css(@page, [
".submission-list",
"#gallery-browse",
"#gallery-favorites",
"#gallery-frontpage-submissions",
]) ? true : false
else unimplemented_version!
if first_matching_css(
@page,
%w[
.submission-list
#gallery-browse
#gallery-favorites
#gallery-frontpage-submissions
]
)
true
else
false
end
else
unimplemented_version!
end
end
@@ -166,12 +190,18 @@ class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
@page.css("#submissionImg").first ? true : false
when VERSION_2
@page.css(".submission-content").first ? true : false
else unimplemented_version!
else
unimplemented_version!
end
end
def submission
@submission ||= Domain::Fa::Parser::SubmissionParserHelper.new(@page, @phtml, @page_version)
@submission ||=
Domain::Fa::Parser::SubmissionParserHelper.new(
@page,
@phtml,
@page_version
)
end
def user_list
@@ -189,7 +219,5 @@ def elem_after_text_match(children, regex)
end
def elem_idx_after_text_match(children, regex)
children.find_index do |child|
child.text.match(regex)
end
children.find_index { |child| child.text.match(regex) }
end

View File

@@ -11,7 +11,8 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
def id
# @elem.css("form[name=myform]").first['action'].split("/").last.to_i
@id ||= begin
@id ||=
begin
elem = @elem.css("meta[property='og:url']").first
elem["content"].split("/").reject(&:empty?).last.to_i if elem
end
@@ -28,28 +29,33 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
@elem.css("#page-submission td.cat b").first.text.strip
when VERSION_2
@elem.css(".submission-title p").first.text.strip
else unimplemented_version!
else
unimplemented_version!
end
end
def artist
# @elem.css(".cat a").first.text.strip
@artist ||= case @page_version
@artist ||=
case @page_version
when VERSION_0, VERSION_1
@elem.css("#page-submission table.maintable td.cat a").first.text.strip
when VERSION_2
@elem.css(".submission-id-sub-container a")&.first&.text&.strip
else unimplemented_version!
else
unimplemented_version!
end
end
def artist_user_page_path
@artist_user_page_path ||= case @page_version
@artist_user_page_path ||=
case @page_version
when VERSION_2
@elem.css(".submission-id-sub-container a")&.first["href"]
when VERSION_0, VERSION_1
@elem.css("table[align=center] td.cat a")&.first["href"]
else unimplemented_version!
else
unimplemented_version!
end
end
@@ -58,12 +64,14 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
end
def artist_avatar_url
@artist_avatar_url ||= case @page_version
@artist_avatar_url ||=
case @page_version
when VERSION_2
@elem.css(".submission-user-icon.avatar")&.first&.[]("src")
when VERSION_0, VERSION_1
@elem.css("a img.avatar")&.first&.[]("src")
else unimplemented_version!
else
unimplemented_version!
end
end
@@ -75,7 +83,8 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
@elem.css("#page-submission td.alt1 .maintable tr .alt1").last.inner_html
when VERSION_2
@elem.css(".submission-description").first.inner_html
else unimplemented_version!
else
unimplemented_version!
end
end
@@ -87,15 +96,19 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
when VERSION_1
@elem.css("#page-submission div b a")[1]["href"].strip
when VERSION_2
@elem.css("a.button.standard.mobile-fix").find do |elem|
elem.text.strip == "Download"
end["href"]
else unimplemented_version!
@elem
.css("a.button.standard.mobile-fix")
.find { |elem| elem.text.strip == "Download" }[
"href"
]
else
unimplemented_version!
end
end
def posted_date
@posted_date ||= case @page_version
@posted_date ||=
case @page_version
# when VERSION_0
# info_child(6)["title"].strip
when VERSION_0, VERSION_1
@@ -123,110 +136,138 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
else
raise("unable to determine rating")
end
else unimplemented_version!
else
unimplemented_version!
end
end
def category
@category ||= case @page_version
@category ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Category/).text.strip
when VERSION_2
category_full_str_redux&.split(" / ")&.first&.strip
else unimplemented_version!
else
unimplemented_version!
end
end
def theme
@theme ||= case @page_version
@theme ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Theme/).text.strip
when VERSION_2
category_full_str_redux&.split(" / ")&.last&.strip
else unimplemented_version!
else
unimplemented_version!
end
end
# FA started combining "Category" and "Theme" into a single string
def category_full_str_redux
@category_full_str_redux ||= case @page_version
@category_full_str_redux ||=
case @page_version
when VERSION_2
info_text_value_redux("Category")
else unimplemented_version!
else
unimplemented_version!
end
end
def species
@species ||= case @page_version
@species ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Species/).try(:text).try(:strip)
when VERSION_2
info_text_value_redux("Species")
else unimplemented_version!
else
unimplemented_version!
end
end
def gender
@gender ||= case @page_version
@gender ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Gender/).try(:text).try(:strip)
when VERSION_2
info_text_value_redux("Gender")
else unimplemented_version!
else
unimplemented_version!
end
end
def num_favorites
@num_favorites ||= case @page_version
@num_favorites ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Favorites/).text.strip.to_i
when VERSION_2
stats_container_redux.css(".favorites .font-large").first.text.strip.to_i
else unimplemented_version!
stats_container_redux
.css(".favorites .font-large")
.first
.text
.strip
.to_i
else
unimplemented_version!
end
end
def num_comments
@num_comments ||= case @page_version
@num_comments ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Comments/).text.strip.to_i
when VERSION_2
stats_container_redux.css(".comments .font-large").first.text.strip.to_i
else unimplemented_version!
else
unimplemented_version!
end
end
def num_views
@num_views ||= case @page_version
@num_views ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(info_children, /Views/).text.strip.to_i
when VERSION_2
stats_container_redux.css(".views .font-large").first.text.strip.to_i
else unimplemented_version!
else
unimplemented_version!
end
end
def resolution_str
@resolution_str ||= case @page_version
@resolution_str ||=
case @page_version
when VERSION_0
elem_after_text_match(info_children, /Resolution/).try(:text).try(:strip)
elem_after_text_match(info_children, /Resolution/).try(:text).try(
:strip
)
when VERSION_1
idx = elem_idx_after_text_match(info_children, /Resolution/)
info_children[idx + 1].try(:text).try(:strip)
when VERSION_2
parts = info_text_value_redux("Size").split(" ")
parts.first + "x" + parts.last
else unimplemented_version!
else
unimplemented_version!
end
end
def keywords_array
@keywords_array ||= case @page_version
@keywords_array ||=
case @page_version
when VERSION_0, VERSION_1
@elem.css("#keywords a").map(&:text)
when VERSION_2
@elem.css(".tags-row .tags a").map(&:text).map(&:strip)
else unimplemented_version!
else
unimplemented_version!
end
end
@@ -241,12 +282,14 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
end
def information_elem
@information_elem ||= case @page_version
@information_elem ||=
case @page_version
# when VERSION_0
# @elem.css("td td td td td td.alt1[align=left]").first
when VERSION_0, VERSION_1
@elem.css("#page-submission td td td td.alt1[align=left]").first
else unimplemented_version!
else
unimplemented_version!
end
end
@@ -255,10 +298,14 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
end
def info_text_value_redux(info_section)
info_text_elem_redux.
css(".highlight").
find { |e| e.text == info_section }&.
parent&.children&.slice(1..)&.text&.strip
info_text_elem_redux
.css(".highlight")
.find { |e| e.text == info_section }
&.parent
&.children
&.slice(1..)
&.text
&.strip
end
def stats_container_redux

View File

@@ -1,15 +1,10 @@
class Domain::Fa::Parser::UserListParserHelper
User = Struct.new(
:name,
:url_name,
:href,
keyword_init: true,
)
User = Struct.new(:name, :url_name, :href, keyword_init: true)
def self.user_list(page)
page.css(".watch-list .watch-list-items").map do |elem|
watch_list_item_to_user_struct(elem)
end
page
.css(".watch-list .watch-list-items")
.map { |elem| watch_list_item_to_user_struct(elem) }
end
private
@@ -21,10 +16,6 @@ class Domain::Fa::Parser::UserListParserHelper
# strip off '/user/' prefix
url_name = href.split("/").reject(&:blank?).last
User.new(
name: link.text,
url_name: url_name,
href: href,
)
User.new(name: link.text, url_name: url_name, href: href)
end
end

View File

@@ -13,13 +13,15 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
"~", # regular user
"!", # suspended
"-", # banned (permanent)
"@", # admin
"", # deceased
"@", # admin
"" # deceased
]
def name
@name ||= begin
elem = case @page_version
@name ||=
begin
elem =
case @page_version
when VERSION_0, VERSION_1
elem = @elem.css(".addpad.lead b")
when VERSION_2
@@ -28,16 +30,15 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
unimplemented_version!
end
name = elem&.
first&.
text&.
strip
name = elem&.first&.text&.strip
if name
if @elem.css("userpage-nav-header img.userIcon.type-admin").first
# in newer versions of FA user pages, admins have no '@' prefix,
# but rather an indicator image icon
raise("invalid prefix for admin user name: #{name}") if PREFIXES.include?(name[0])
if PREFIXES.include?(name[0])
raise("invalid prefix for admin user name: #{name}")
end
name
elsif PREFIXES.include?(name[0])
name[1..]
@@ -54,27 +55,31 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
end
def account_status
@account_status ||= begin
@account_status ||=
begin
if @elem.css("userpage-nav-header img.userIcon.type-admin").first
:admin
else
elem = case @page_version
elem =
case @page_version
when VERSION_2
@elem.css("userpage-nav-user-details username")
else
unimplemented_version!
end
name = elem&.
first&.
text&.
strip || ""
name = elem&.first&.text&.strip || ""
case name[0]
when "~" then :active
when "!" then :suspended
when "-" then :banned
when "" then :deceased
else nil
when "~"
:active
when "!"
:suspended
when "-"
:banned
when ""
:deceased
else
nil
end
end
end
@@ -85,29 +90,38 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
end
def artist_type
@artist_type ||= elem_after_text_match(main_about.children, /Type/).try(:text).try(:strip)
@artist_type ||=
elem_after_text_match(main_about.children, /Type/).try(:text).try(:strip)
end
def profile_thumb_url
@profile_thumb_url ||= case @page_version
@profile_thumb_url ||=
case @page_version
when VERSION_0
@elem.css(".addpad.alt1 a img.avatar").first.try(:[], "src")
when VERSION_2
@elem.css("userpage-nav-avatar a.current img").first.try(:[], "src")
else unimplemented_version!
else
unimplemented_version!
end
end
def registered_since
@registered_since ||= case @page_version
@registered_since ||=
case @page_version
when VERSION_0, VERSION_1
elem_after_text_match(main_about.children, /Registered/).text.strip
when VERSION_2
date_str = @elem.css("username span").find do |elem|
elem&.text&.strip == "Registered:"
end&.next_sibling&.text&.strip
date_str =
@elem
.css("username span")
.find { |elem| elem&.text&.strip == "Registered:" }
&.next_sibling
&.text
&.strip
DateTime.parse(date_str) if date_str
else unimplemented_version!
else
unimplemented_version!
end
end
@@ -144,23 +158,29 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
end
def recent_fav_fa_ids
@recent_favs ||= case @page_version
@recent_favs ||=
case @page_version
when VERSION_2
@elem.css("#gallery-latest-favorites").first&.css("figure a")&.map do |elem|
href = elem["href"]
/\/view\/(\d+)/.match(href)[1]&.to_i || raise(
"invalid url: #{href}"
)
end || []
else unimplemented_version!
@elem
.css("#gallery-latest-favorites")
.first
&.css("figure a")
&.map do |elem|
href = elem["href"]
%r{/view/(\d+)}.match(href)[1]&.to_i ||
raise("invalid url: #{href}")
end || []
else
unimplemented_version!
end
end
RecentUser = Struct.new(:name, :url_name) do
def to_a
[name, url_name]
RecentUser =
Struct.new(:name, :url_name) do
def to_a
[name, url_name]
end
end
end
def recent_watchers
@recent_watchers ||= recent_users_for_section("Recent Watchers")
@@ -175,54 +195,48 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
def recent_users_for_section(section_name)
case @page_version
when VERSION_2
section_elem = @elem.css(".userpage-section-left").find do |elem|
elem.css(".section-header h2")&.first&.text&.strip == section_name
end
section_elem =
@elem
.css(".userpage-section-left")
.find do |elem|
elem.css(".section-header h2")&.first&.text&.strip == section_name
end
section_elem = section_elem.css(".section-body").first
section_elem.css("a").map do |link_elem|
href = link_elem["href"]
url_name = /\/user\/(.+)\//.match(href)&.[](1) || raise(
"invalid url: #{href}"
)
name = link_elem.css(".artist_name").first.text.strip
RecentUser.new(name, url_name)
end
else unimplemented_version!
section_elem
.css("a")
.map do |link_elem|
href = link_elem["href"]
url_name =
%r{/user/(.+)/}.match(href)&.[](1) || raise("invalid url: #{href}")
name = link_elem.css(".artist_name").first.text.strip
RecentUser.new(name, url_name)
end
else
unimplemented_version!
end
end
def stat_value(legacy_name, redux_idx)
legacy_map = if false # old mode?
{
pvs: 2,
subs: 5,
crec: 8,
cgiv: 11,
njr: 14,
nfav: 17,
}
legacy_map =
if false # old mode?
{ pvs: 2, subs: 5, crec: 8, cgiv: 11, njr: 14, nfav: 17 }
else
{
pvs: 2,
subs: 6,
crec: 10,
cgiv: 14,
njr: 18,
nfav: 22,
}
{ pvs: 2, subs: 6, crec: 10, cgiv: 14, njr: 18, nfav: 22 }
end
value = case @page_version
value =
case @page_version
when VERSION_0, VERSION_1
statistics.children[legacy_map[legacy_name] || raise].text.strip.to_i
when VERSION_2
statistics.css(".highlight")[redux_idx]&.next_sibling&.text&.strip&.to_i
else unimplemented_version!
else
unimplemented_version!
end
# FA databases can be a little weird
if value >= (2 ** 32 - 1)
if value >= (2**32 - 1)
0
else
value
@@ -230,24 +244,30 @@ class Domain::Fa::Parser::UserPageHelper < Domain::Fa::Parser::Base
end
def statistics
@statistics ||= case @page_version
@statistics ||=
case @page_version
when VERSION_0, VERSION_1
@elem.css(".ldot table tr:last-child td").first
when VERSION_2
@elem.css(".userpage-layout-right-col-content .userpage-section-right").find do |child|
child.css(".section-header h2")&.first&.text&.strip == "Stats"
end
else unimplemented_version!
@elem
.css(".userpage-layout-right-col-content .userpage-section-right")
.find do |child|
child.css(".section-header h2")&.first&.text&.strip == "Stats"
end
else
unimplemented_version!
end
end
def main_about
@main_about ||= case @page_version
@main_about ||=
case @page_version
when VERSION_0, VERSION_1
@elem.css(".alt1.addpad .ldot").first
when VERSION_2
@elem.css(".section-body.userpage-profile").first
else unimplemented_version!
else
unimplemented_version!
end
end
end

View File

@@ -4,42 +4,52 @@ class Domain::Fa::PostEnqueuer
include HasMeasureDuration
include Domain::Fa::HasCountFailedInQueue
def initialize(reverse_scan_holes:, start_at:, low_water_mark:, high_water_mark:)
def initialize(
reverse_scan_holes:,
start_at:,
low_water_mark:,
high_water_mark:
)
@low_water_mark = low_water_mark
@high_water_mark = high_water_mark
raise if @high_water_mark <= @low_water_mark
@post_iterator = Enumerator.new do |e|
if reverse_scan_holes
while start_at > 0
if !Domain::Fa::Post.exists?(fa_id: start_at)
e << [nil, start_at, nil]
@post_iterator =
Enumerator.new do |e|
if reverse_scan_holes
while start_at > 0
if !Domain::Fa::Post.exists?(fa_id: start_at)
e << [nil, start_at, nil]
end
start_at -= 1
end
start_at -= 1
end
else
Domain::Fa::Post.
where("id >= ?", start_at).
where("file_id is null").
where(state: "ok").
pluck_each(:id, :fa_id, :file_url_str) do |p|
e << p
else
Domain::Fa::Post
.where("id >= ?", start_at)
.where("file_id is null")
.where(state: "ok")
.pluck_each(:id, :fa_id, :file_url_str) { |p| e << p }
end
end
end
end
def run_once
already_enqueued = enqueued_count
if already_enqueued <= @low_water_mark
to_enqueue = @high_water_mark - already_enqueued
logger.info("enqueuing #{to_enqueue.to_s.bold} more posts - #{already_enqueued.to_s.bold} already enqueued")
rows = measure(proc { |p| "gather #{p.length.to_s.bold} posts to enqueue" }) do
to_enqueue.times.map do
@post_iterator.next
rescue StopIteration
nil
end.reject(&:nil?)
end
logger.info(
"enqueuing #{to_enqueue.to_s.bold} more posts - #{already_enqueued.to_s.bold} already enqueued"
)
rows =
measure(proc { |p| "gather #{p.length.to_s.bold} posts to enqueue" }) do
to_enqueue
.times
.map do
@post_iterator.next
rescue StopIteration
nil
end
.reject(&:nil?)
end
measure("enqueue jobs") do
rows.each do |post_id, fa_id, file_url_str|
@@ -72,6 +82,6 @@ class Domain::Fa::PostEnqueuer
private
def enqueued_count
count_failed_in_queue(["static_file", "fa_post"])
count_failed_in_queue(%w[static_file fa_post])
end
end
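# Sketch of how an enqueuer like this is typically driven; the surrounding rake task or
# daemon loop is not part of this diff, so the loop and the numbers below are illustrative:
enqueuer =
  Domain::Fa::PostEnqueuer.new(
    reverse_scan_holes: false,
    start_at: 1,
    low_water_mark: 500,
    high_water_mark: 2_000
  )
loop do
  enqueuer.run_once # tops the queue back up toward 2,000 jobs once it drains to 500 or fewer
  sleep 30
end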

View File

@@ -8,41 +8,35 @@ class Domain::Fa::SqliteExporter
end
def run
measure("created tables") do
migrate
measure("created tables") { migrate }
measure("drop indexes") { drop_indexes }
if dump_table?(:users)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa users" }) do
dump_fa_users
end
end
measure("drop indexes") do
drop_indexes
if dump_table?(:follows)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa follows" }) do
dump_fa_follows
end
end
measure(proc do |num|
"dumped #{num&.to_s&.bold} fa users"
end) do
dump_fa_users
end if dump_table?(:users)
measure(proc do |num|
"dumped #{num&.to_s&.bold} fa follows"
end) do
dump_fa_follows
end if dump_table?(:follows)
measure(proc do |num|
"dumped #{num&.to_s&.bold} fa favs"
end) do
dump_fa_favs
end if dump_table?(:favs)
measure(proc do |num|
"dumped #{num&.to_s&.bold} fa posts"
end) do
dump_fa_posts
end if dump_table?(:posts)
measure("created indexes") do
create_indexes
if dump_table?(:favs)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa favs" }) do
dump_fa_favs
end
end
if dump_table?(:posts)
measure(proc { |num| "dumped #{num&.to_s&.bold} fa posts" }) do
dump_fa_posts
end
end
measure("created indexes") { create_indexes }
ensure
@db.close
end
@@ -110,24 +104,20 @@ class Domain::Fa::SqliteExporter
["fa_favs", "user_id", false],
["fa_favs", "post_id", false],
["fa_posts", "creator_id", false],
["fa_posts", "fa_id", true],
["fa_posts", "fa_id", true]
]
def create_indexes
@db.execute_batch2(INDEXES.map do |table, col, unique|
<<-SQL
@db.execute_batch2(INDEXES.map { |table, col, unique| <<-SQL }.join("\n"))
create #{unique ? "unique" : ""} index if not exists #{col}_on_#{table}
on #{table} (#{col});
SQL
end.join("\n"))
end
def drop_indexes
@db.execute_batch2(INDEXES.map do |table, col, unique|
<<-SQL
@db.execute_batch2(INDEXES.map { |table, col, unique| <<-SQL }.join("\n"))
drop index if exists #{col}_on_#{table};
SQL
end.join("\n"))
end
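# The `INDEXES.map { |table, col, unique| <<-SQL }.join("\n")` form above works because Ruby
# evaluates the <<- heredoc body once per block invocation, so the interpolations see that
# iteration's block variables. A standalone illustration (table/column names are made up):
pairs = [%w[fa_posts fa_id], %w[fa_favs user_id]]
sql = pairs.map { |table, col| <<-SQL }.join("\n")
  create index if not exists #{col}_on_#{table} on #{table} (#{col});
SQL
# sql now contains one CREATE INDEX statement per pair, joined by newlines.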
def dump_fa_users
@@ -135,12 +125,10 @@ class Domain::Fa::SqliteExporter
model: Domain::Fa::User,
table: "fa_users",
columns: %w[id url_name name num_favorites registered_at],
batch_size: 512,
batch_size: 512
) do |batch|
# format registered_at
batch.each do |row|
row[4] = row[4]&.iso8601
end
batch.each { |row| row[4] = row[4]&.iso8601 }
end
end
@@ -148,8 +136,8 @@ class Domain::Fa::SqliteExporter
dump_table_common(
model: Domain::Fa::Follow,
table: "fa_follows",
columns: ["id", "follower_id", "followed_id"],
batch_size: 4096,
columns: %w[id follower_id followed_id],
batch_size: 4096
)
end
@@ -157,8 +145,8 @@ class Domain::Fa::SqliteExporter
dump_table_common(
model: Domain::Fa::Fav,
table: "fa_favs",
columns: ["id", "user_id", "post_id"],
batch_size: 4096,
columns: %w[id user_id post_id],
batch_size: 4096
)
end
@@ -167,16 +155,19 @@ class Domain::Fa::SqliteExporter
model: Domain::Fa::Post.where("file_url_str is not null"),
table: "fa_posts",
columns: %w[
id fa_id title creator_id
num_views num_comments num_favorites
id
fa_id
title
creator_id
num_views
num_comments
num_favorites
posted_at
],
batch_size: 4096,
batch_size: 4096
) do |batch|
# format posted_at
batch.each do |row|
row[7] = row[7]&.iso8601
end
batch.each { |row| row[7] = row[7]&.iso8601 }
end
end
@@ -186,7 +177,9 @@ class Domain::Fa::SqliteExporter
num_models = 0
start_id = max_id(table, "id")
start_id += 1 if start_id
logger.info("dumping #{table.bold}, start at #{(start_id || "first").to_s.bold}...")
logger.info(
"dumping #{table.bold}, start at #{(start_id || "first").to_s.bold}..."
)
inserter = create_inserter(batch_size, table, columns)
load_duration = 0.0
@@ -198,7 +191,12 @@ class Domain::Fa::SqliteExporter
@db.transaction
pluck_rows(model, columns, start_id: start_id, batch_size: batch_size) do |rows|
pluck_rows(
model,
columns,
start_id: start_id,
batch_size: batch_size
) do |rows|
load_duration += Time.now - load_start
map_start = Time.now
@@ -265,11 +263,7 @@ class Domain::Fa::SqliteExporter
colss = colss[@bulk_size...]
end
if colss.any?
colss.each do |col|
insert_single(col)
end
end
colss.each { |col| insert_single(col) } if colss.any?
end
private
@@ -301,9 +295,7 @@ class Domain::Fa::SqliteExporter
def bind_bulk(stmt, binds)
# stmt.bind_params(binds)
binds.each do |arr|
bind_single(stmt, arr)
end
binds.each { |arr| bind_single(stmt, arr) }
end
def dimensionality!
@@ -320,7 +312,10 @@ class Domain::Fa::SqliteExporter
relation = relation.all unless relation.is_a?(ActiveRecord::Relation)
relation = relation.where("id >= ?", start_id) if start_id
relation.pluck_in_batches(*cols.map(&:to_sym), batch_size: batch_size) do |batch|
relation.pluck_in_batches(
*cols.map(&:to_sym),
batch_size: batch_size
) do |batch|
yield batch
num_models += batch.size
@@ -349,9 +344,7 @@ class Domain::Fa::SqliteExporter
def dump_table?(table)
ret = @tables.include?(:all) || @tables.include?(table)
if !ret
logger.info("skipping #{table.to_s.bold}...")
end
logger.info("skipping #{table.to_s.bold}...") if !ret
ret
end
end

View File

@@ -13,16 +13,17 @@ class Domain::Fa::UserAvatarFixer < LegacyImport::BulkImportJob
@processed = 0
if @url_name
user = Domain::Fa::User.find_by(url_name: @url_name) || raise("user not found")
user =
Domain::Fa::User.find_by(url_name: @url_name) || raise("user not found")
process_avatar(user.avatar)
else
Domain::Fa::UserAvatar.
where(state: "no_file_on_guessed_user_page_error").
find_each(start: @start_at, batch_size: 5) do |avatar|
@processed += 1
break if @limit && @processed > @limit
process_avatar(avatar)
end
Domain::Fa::UserAvatar
.where(state: "no_file_on_guessed_user_page_error")
.find_each(start: @start_at, batch_size: 5) do |avatar|
@processed += 1
break if @limit && @processed > @limit
process_avatar(avatar)
end
end
@processed
@@ -32,7 +33,10 @@ class Domain::Fa::UserAvatarFixer < LegacyImport::BulkImportJob
def process_avatar(avatar)
user = avatar.user
logger.prefix = proc { "[avatar #{avatar.id.to_s.bold}, user #{user.url_name.to_s.bold}]" }
logger.prefix =
proc do
"[avatar #{avatar.id.to_s.bold}, user #{user.url_name.to_s.bold}]"
end
logger.info("guessing...")

View File

@@ -8,30 +8,31 @@ class Domain::Fa::UserEnqueuer
@low_water_mark = low_water_mark
@high_water_mark = high_water_mark
raise if @high_water_mark <= @low_water_mark
@user_iterator = Enumerator.new do |e|
Domain::Fa::User.
where("id >= ?", start_at).
find_each do |user|
e << user
@user_iterator =
Enumerator.new do |e|
Domain::Fa::User
.where("id >= ?", start_at)
.find_each { |user| e << user }
end
end
end
def run_once
already_enqueued = enqueued_count
if already_enqueued <= @low_water_mark
to_enqueue = @high_water_mark - already_enqueued
logger.info("enqueuing #{to_enqueue.to_s.bold} more users - #{already_enqueued.to_s.bold} already enqueued")
rows = measure(proc { |p| "gather #{p.length.to_s.bold} users to enqueue" }) do
to_enqueue.times.map do
@user_iterator.next
logger.info(
"enqueuing #{to_enqueue.to_s.bold} more users - #{already_enqueued.to_s.bold} already enqueued"
)
rows =
measure(proc { |p| "gather #{p.length.to_s.bold} users to enqueue" }) do
to_enqueue.times.map { @user_iterator.next }
end
end
measure("enqueue jobs") do
rows.each do |user|
types = []
if user.state == "ok"
if user.due_for_favs_scan? || user.due_for_page_scan? || user.due_for_follows_scan?
if user.due_for_favs_scan? || user.due_for_page_scan? ||
user.due_for_follows_scan?
Domain::Fa::Job::UserIncrementalJob.perform_later({ user: user })
types << "incremental"
end
@@ -82,16 +83,14 @@ class Domain::Fa::UserEnqueuer
return SpecUtil.enqueued_jobs(Domain::Fa::Job::UserFollowsJob).count
end
[
"fa_post",
"fa_user_avatar",
"fa_user_favs",
"fa_user_follows",
"fa_user_gallery",
"fa_user_page",
"static_file",
].map do |queue_name|
count_failed_in_queue(queue_name)
end.max
%w[
fa_post
fa_user_avatar
fa_user_favs
fa_user_follows
fa_user_gallery
fa_user_page
static_file
].map { |queue_name| count_failed_in_queue(queue_name) }.max
end
end

View File

@@ -2,56 +2,63 @@ class ForkFuture
def initialize(&block)
read, write = ::IO.pipe
@read = read
pid = ::Process.fork do
start = Time.now
read.close
begin
result = block.call
rescue
result = RuntimeError.new([$!.message, $!.backtrace])
pid =
::Process.fork do
start = Time.now
read.close
begin
result = block.call
rescue StandardError
result = RuntimeError.new([$!.message, $!.backtrace])
end
duration = Time.now - start
::Marshal.dump({ duration: duration, result: result }, write)
::Process.exit!(true)
end
duration = Time.now - start
::Marshal.dump({
duration: duration,
result: result,
}, write)
::Process.exit!(true)
end
write.close
end
def self.parallel_map(num_processes, enumerator, &block)
ForkFuture.each_slice_impl(num_processes, enumerator).map do |slice|
ForkFuture.new do
slice.map(&block)
end
end.to_a.map(&:join).flatten(1)
ForkFuture
.each_slice_impl(num_processes, enumerator)
.map { |slice| ForkFuture.new { slice.map(&block) } }
.to_a
.map(&:join)
.flatten(1)
end
def self.parallel_each(num_processes, enumerator, &block)
ForkFuture.each_slice_impl(num_processes, enumerator).map do |slice|
ForkFuture.new do
slice.each(&block)
nil
ForkFuture
.each_slice_impl(num_processes, enumerator)
.map do |slice|
ForkFuture.new do
slice.each(&block)
nil
end
end
end.to_a.map(&:join)
.to_a
.map(&:join)
end
def self.parallel_each_slice(num_processes, enumerator, &block)
ForkFuture.each_slice_impl(num_processes, enumerator).map do |slice|
ForkFuture.new do
block.call(slice)
nil
ForkFuture
.each_slice_impl(num_processes, enumerator)
.map do |slice|
ForkFuture.new do
block.call(slice)
nil
end
end
end.to_a.map(&:join)
.to_a
.map(&:join)
end
def self.parallel_map_slice(num_processes, enumerator, &block)
ForkFuture.each_slice_impl(num_processes, enumerator).map do |slice|
ForkFuture.new do
block.call(slice)
end
end.to_a.map(&:join)
ForkFuture
.each_slice_impl(num_processes, enumerator)
.map { |slice| ForkFuture.new { block.call(slice) } }
.to_a
.map(&:join)
end
def join
@@ -77,7 +84,8 @@ class ForkFuture
end
def wait!
@result ||= begin
@result ||=
begin
result_buffer = @read.read
@read.close
::Marshal.load(result_buffer)

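# ForkFuture runs its block in a forked child and marshals the result (or a RuntimeError built
# from the child's exception) back over a pipe; join blocks in the parent and returns that value.
# A minimal usage sketch:
squares = ForkFuture.parallel_map(4, 1..10) { |n| n * n }
# => [1, 4, 9, 16, 25, 36, 49, 64, 81, 100], computed across up to 4 child processes
single = ForkFuture.new { Process.pid } # the child's pid, showing the block ran in a fork
single.join # blocks until the child exits, then returns the marshalled result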
View File

@@ -7,17 +7,18 @@ module HasBulkEnqueueJobs
old_limit = Scraper::JobBase.good_job_concurrency_config[:total_limit]
Scraper::JobBase.good_job_concurrency_config[:total_limit] = nil
key_to_job = GoodJob::Bulk.capture(&block).map do |job|
[job.good_job_concurrency_key, job]
end.to_h
key_to_job =
GoodJob::Bulk
.capture(&block)
.map { |job| [job.good_job_concurrency_key, job] }
.to_h
ReduxApplicationRecord.transaction do
existing_keys = GoodJob::Job.
where(concurrency_key: key_to_job.keys).
pluck(:concurrency_key)
existing_keys.each do |key|
key_to_job.delete(key)
end
existing_keys =
GoodJob::Job.where(concurrency_key: key_to_job.keys).pluck(
:concurrency_key
)
existing_keys.each { |key| key_to_job.delete(key) }
GoodJob::Bulk.enqueue(key_to_job.values)
end
ensure

View File

@@ -10,9 +10,7 @@ module HasColorLogger
end
included do
define_method(:logger) do
@logger ||= ColorLogger.make(sink, self)
end
define_method(:logger) { @logger ||= ColorLogger.make(sink, self) }
define_singleton_method(:logger) do
@logger ||= ColorLogger.make(sink, self.name)
@@ -23,7 +21,5 @@ module HasColorLogger
# by default, write to stdout
extend ActiveSupport::Concern
included do
include HasColorLogger[$stdout]
end
included { include HasColorLogger[$stdout] }
end
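# Hedged usage sketch of the parameterized concern above: `HasColorLogger[sink]` yields a module
# whose `included` hook defines both an instance-level and a class-level `logger` backed by
# ColorLogger.make. The severity methods below (info etc.) are assumed from the logger calls
# seen elsewhere in this codebase.
class ExampleWorker
  include HasColorLogger[$stderr]
end

ExampleWorker.logger.info("class-level message")
ExampleWorker.new.logger.info("instance-level message")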

View File

@@ -9,8 +9,8 @@ class HexUtil
units = %w[B KiB MiB GiB TiB PiB EiB ZiB]
return "0.0 B" if size == 0
exp = (Math.log(size) / Math.log(1024)).to_i
exp += 1 if (size.to_f / 1024 ** exp >= 1024 - 0.05)
exp += 1 if (size.to_f / 1024**exp >= 1024 - 0.05)
exp = units.size - 1 if exp > units.size - 1
"%.1f %s" % [size.to_f / 1024 ** exp, units[exp]]
"%.1f %s" % [size.to_f / 1024**exp, units[exp]]
end
end
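# Illustrative values for the method above (exp is the truncated base-1024 logarithm of size,
# then size is scaled by 1024**exp and formatted to one decimal place):
HexUtil.humansize(0)        # => "0.0 B"
HexUtil.humansize(1536)     # => "1.5 KiB"
HexUtil.humansize(1024**3)  # => "1.0 GiB"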

View File

@@ -45,15 +45,15 @@ class LegacyImport::AdaptiveCache
@candidates.insert(idx, new_entry)
end
while @candidates.size > @max_size
@candidates.pop
end
@candidates.pop while @candidates.size > @max_size
end
def to_s
@candidates.map do |entry|
" - #{entry.score.round(1)} score, id #{entry.id} - #{entry.extra}"
end.join("\n")
@candidates
.map do |entry|
" - #{entry.score.round(1)} score, id #{entry.id} - #{entry.extra}"
end
.join("\n")
end
private

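# Hedged sketch of how the importers below drive this cache. The constructor arity matches the
# `AdaptiveCache.new(max_size, 1.0, 0.1)` calls elsewhere in this commit, but the meaning of the
# two float arguments (initial score / decay, presumably) is an assumption:
cache = LegacyImport::AdaptiveCache.new(8, 1.0, 0.1)
entry = Object.new # stands in for a BlobEntryP candidate
cache.insert("abcd1234", entry, "https://example.com/page")
candidates = cache.at_capacity? ? cache.candidates : []
cache.reward("abcd1234") # bump the score of the candidate that served as a diff base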
View File

@@ -21,15 +21,22 @@ class LegacyImport::E621LegacyPostImporter < LegacyImport::BulkImportJob
query = ::Legacy::E621::Post.includes(:blob_entry, { taggings: :tag })
# finish = @start_at + (2 * 32 * 32)
finish = nil
query.find_in_batches(start: @start_at, finish: finish, batch_size: @batch_size * @forks) do |batch|
query.find_in_batches(
start: @start_at,
finish: finish,
batch_size: @batch_size * @forks
) do |batch|
last_id = batch.last&.id
if @forks <= 1
progress += import_e621_posts(batch)
else
progress += ForkFuture.parallel_map_slice(@forks, batch) do |fork_batch|
import_e621_posts(fork_batch)
end.sum
progress +=
ForkFuture
.parallel_map_slice(@forks, batch) do |fork_batch|
import_e621_posts(fork_batch)
end
.sum
end
rate = progress.to_f / (Time.now - @start_time)
@@ -44,11 +51,13 @@ class LegacyImport::E621LegacyPostImporter < LegacyImport::BulkImportJob
def import_e621_posts(legacy_posts)
progress = 0
skip_posts_ids = Set.new(
::Domain::E621::Post.select(:e621_id).
where(e621_id: legacy_posts.map(&:e621_id)).
pluck(:e621_id)
)
skip_posts_ids =
Set.new(
::Domain::E621::Post
.select(:e621_id)
.where(e621_id: legacy_posts.map(&:e621_id))
.pluck(:e621_id)
)
legacy_posts.reject! do |legacy_post|
skip_posts_ids.include?(legacy_post.e621_id)
@@ -60,12 +69,14 @@ class LegacyImport::E621LegacyPostImporter < LegacyImport::BulkImportJob
ReduxApplicationRecord.transaction do
post = ::Domain::E621::Post.find_or_build_from_legacy(legacy_post)
unless post.valid?
raise("error building post #{post.id} / #{post.e621_id}: #{post.errors.full_messages}")
raise(
"error building post #{post.id} / #{post.e621_id}: #{post.errors.full_messages}"
)
end
post.save!
progress += 1
end
rescue
rescue StandardError
retries += 1
sleep 0.1 and retry if retries < 3
raise

View File

@@ -27,9 +27,12 @@ class LegacyImport::FaPostImporter < LegacyImport::BulkImportJob
if @forks <= 1
progress += import_fa_posts(batch)
else
progress += ForkFuture.parallel_map_slice(@forks, batch) do |fork_batch|
import_fa_posts(fork_batch)
end.sum
progress +=
ForkFuture
.parallel_map_slice(@forks, batch) do |fork_batch|
import_fa_posts(fork_batch)
end
.sum
end
rate = progress.to_f / (Time.now - @start_time)
@@ -42,12 +45,14 @@ class LegacyImport::FaPostImporter < LegacyImport::BulkImportJob
def import_fa_posts(legacy_posts)
progress = 0
skip_posts_ids = Set.new(
::Domain::Fa::Post.select(:fa_id, :creator_id).
where(fa_id: legacy_posts.map(&:fa_id)).
where("creator_id is not null").
pluck(:fa_id)
)
skip_posts_ids =
Set.new(
::Domain::Fa::Post
.select(:fa_id, :creator_id)
.where(fa_id: legacy_posts.map(&:fa_id))
.where("creator_id is not null")
.pluck(:fa_id)
)
legacy_posts.reject! do |legacy_post|
skip_posts_ids.include?(legacy_post.fa_id)
@@ -59,12 +64,14 @@ class LegacyImport::FaPostImporter < LegacyImport::BulkImportJob
ReduxApplicationRecord.transaction do
post = ::Domain::Fa::Post.find_or_build_from_legacy(legacy_post)
unless post.valid?
raise(" !! error building post #{post.id} / #{post.fa_id}: #{post.errors.full_messages}")
raise(
" !! error building post #{post.id} / #{post.fa_id}: #{post.errors.full_messages}"
)
end
post.save!
progress += 1
end
rescue
rescue StandardError
retries += 1
sleep 0.1 and retry if retries < 3
raise

View File

@@ -14,9 +14,10 @@ class LegacyImport::HttpLogEntryBulkImporter
# key is content_type|domain
# value is the adaptive cache
@blob_entry_cache = Hash.new do |hash, key|
hash[key] = LegacyImport::AdaptiveCache.new(cache_size, 1.0, 0.1)
end
@blob_entry_cache =
Hash.new do |hash, key|
hash[key] = LegacyImport::AdaptiveCache.new(cache_size, 1.0, 0.1)
end
end
def run
@@ -28,23 +29,24 @@ class LegacyImport::HttpLogEntryBulkImporter
start_at = Time.now
last_model_id = nil
stats_printer = Thread.new do
Thread.current.name = "stats-printer"
i = 0
loop do
sleep 3
duration = Time.now - start_at
rate = @insert_stats.http_entries_inserted / duration
hr
puts "insert stats: #{@insert_stats} - " +
"#{rate.round(2)}/sec (last id: #{last_model_id})"
i += 1
if i % 5 == 0
stats_printer =
Thread.new do
Thread.current.name = "stats-printer"
i = 0
loop do
sleep 3
duration = Time.now - start_at
rate = @insert_stats.http_entries_inserted / duration
hr
dump_timings
puts "insert stats: #{@insert_stats} - " +
"#{rate.round(2)}/sec (last id: #{last_model_id})"
i += 1
if i % 5 == 0
hr
dump_timings
end
end
end
end
query = Legacy::HttpLogEntry.order(id: :asc)
@@ -52,7 +54,7 @@ class LegacyImport::HttpLogEntryBulkImporter
query.find_in_batches(
batch_size: @batch_size,
start: @start_id,
finish: @end_id,
finish: @end_id
) do |legacy_models|
@timings.finish :bulk_load
import_legacy_models(legacy_models)
@@ -98,68 +100,82 @@ class LegacyImport::HttpLogEntryBulkImporter
private
def dump_timings
tp(@timings.entries.map do |entry|
{
key: entry[:key],
duration: "#{entry[:key_secs].round(1)} sec",
percent: "#{(100 * entry[:proportion]).round(1)}%".rjust(5),
}
end)
tp(
@timings.entries.map do |entry|
{
key: entry[:key],
duration: "#{entry[:key_secs].round(1)} sec",
percent: "#{(100 * entry[:proportion]).round(1)}%".rjust(5)
}
end
)
end
def import_legacy_models(legacy_models)
@timings.start :lookup_existing_http
already_exist_ids = ::HttpLogEntry.
where(id: legacy_models.map(&:id)).
pluck(:id).
to_set
already_exist_ids =
::HttpLogEntry.where(id: legacy_models.map(&:id)).pluck(:id).to_set
@timings.finish :lookup_existing_http
# ignore the models which have no stored content (for now)
@timings.start :reject_empty_legacy
legacy_models = ForkFuture.parallel_map(@fork_amount, legacy_models) do |legacy_model|
next nil if already_exist_ids.include?(legacy_model.id)
begin
next nil if legacy_model.response_body.nil?
rescue
puts "legacy model #{legacy_model.id} (#{legacy_model.full_path}): error reading response body"
next nil
end
legacy_models =
ForkFuture.parallel_map(@fork_amount, legacy_models) do |legacy_model|
next nil if already_exist_ids.include?(legacy_model.id)
begin
next nil if legacy_model.response_body.nil?
rescue StandardError
puts "legacy model #{legacy_model.id} (#{legacy_model.full_path}): error reading response body"
next nil
end
# legacy model now has response body loaded
legacy_model
end
# legacy model now has response body loaded
legacy_model
end
legacy_models.reject!(&:nil?)
@timings.finish :reject_empty_legacy
blob_creation_future = ForkFuture.new do
bulk_import_blob_entries(legacy_models)
end
header_creation_future = ForkFuture.new do
bulk_import_headers(legacy_models)
end
blob_creation_future =
ForkFuture.new { bulk_import_blob_entries(legacy_models) }
header_creation_future =
ForkFuture.new { bulk_import_headers(legacy_models) }
insert_stats, timings, cache_ops, legacy_model_id_to_response_sha256 = blob_creation_future.join
insert_stats, timings, cache_ops, legacy_model_id_to_response_sha256 =
blob_creation_future.join
@insert_stats.merge!(insert_stats)
@timings.merge!(timings)
cache_ops.each do |op|
@blob_entry_cache[op[0]].send(op[1], *op[2..])
end
cache_ops.each { |op| @blob_entry_cache[op[0]].send(op[1], *op[2..]) }
insert_stats, timings, legacy_model_id_to_header_sha256s, header_sha256_to_header_id = header_creation_future.join
insert_stats,
timings,
legacy_model_id_to_header_sha256s,
header_sha256_to_header_id =
header_creation_future.join
@insert_stats.merge!(insert_stats)
@timings.merge!(timings)
@timings.start :build_new_https
http_models = legacy_models.map do |legacy_model|
request_headers_id = header_sha256_to_header_id[legacy_model_id_to_header_sha256s[legacy_model.id][:req_sha256]]
response_headers_id = header_sha256_to_header_id[legacy_model_id_to_header_sha256s[legacy_model.id][:res_sha256]]
response_sha256 = legacy_model_id_to_response_sha256[legacy_model.id]
request_headers_id || raise("no request header id")
response_headers_id || raise("no response header id")
response_sha256 || raise("no response sha256")
build_http_log_entry(legacy_model, request_headers_id, response_headers_id, response_sha256)
end
http_models =
legacy_models.map do |legacy_model|
request_headers_id =
header_sha256_to_header_id[
legacy_model_id_to_header_sha256s[legacy_model.id][:req_sha256]
]
response_headers_id =
header_sha256_to_header_id[
legacy_model_id_to_header_sha256s[legacy_model.id][:res_sha256]
]
response_sha256 = legacy_model_id_to_response_sha256[legacy_model.id]
request_headers_id || raise("no request header id")
response_headers_id || raise("no response header id")
response_sha256 || raise("no response sha256")
build_http_log_entry(
legacy_model,
request_headers_id,
response_headers_id,
response_sha256
)
end
@timings.finish :build_new_https
@timings.start :insert_new_https
@@ -179,78 +195,113 @@ class LegacyImport::HttpLogEntryBulkImporter
# compute all blob entries for the legacy models, removing duplicates
timings.start :lookup_existing_bes
legacy_model_id_to_response_sha256 = legacy_models.map do |m|
[m.id, Digest::SHA256.digest(m.response_body)]
end.to_h
legacy_model_id_to_response_sha256 =
legacy_models
.map { |m| [m.id, Digest::SHA256.digest(m.response_body)] }
.to_h
sha256_to_existing_blob_entry = ::BlobEntryP.where(sha256: legacy_model_id_to_response_sha256.values).map do |be|
[be.sha256, be]
end.to_h
sha256_to_existing_blob_entry =
::BlobEntryP
.where(sha256: legacy_model_id_to_response_sha256.values)
.map { |be| [be.sha256, be] }
.to_h
timings.finish :lookup_existing_bes
timings.start :build_new_bes
blob_entries_to_insert = ForkFuture.parallel_map(@fork_amount, legacy_models) do |legacy_model|
sha256 = legacy_model_id_to_response_sha256[legacy_model.id] || raise
next nil if sha256_to_existing_blob_entry[sha256]
blob_entries_to_insert =
ForkFuture
.parallel_map(@fork_amount, legacy_models) do |legacy_model|
sha256 = legacy_model_id_to_response_sha256[legacy_model.id] || raise
next nil if sha256_to_existing_blob_entry[sha256]
content_type = legacy_model.content_type
cache_key = "#{legacy_model.host}|#{content_type}"
cache = @blob_entry_cache[cache_key]
content_type = legacy_model.content_type
cache_key = "#{legacy_model.host}|#{content_type}"
cache = @blob_entry_cache[cache_key]
# N% chance (if we're not at cache capacity) to not supply any candidates,
# to give new entries in the cache a chance to replace poor performing ones
candidates = if cache.at_capacity? # && rand(0..100) >= 5
cache.candidates
else
[]
# N% chance (if we're not at cache capacity) to not supply any candidates,
# to give new entries in the cache a chance to replace poor performing ones
candidates =
if cache.at_capacity? # && rand(0..100) >= 5
cache.candidates
else
[]
end
blob_entry =
::BlobEntryP.build_record(
content_type: content_type,
sha256: sha256,
contents: legacy_model.response_body,
candidates: candidates
)
# reward the base if it was used, if not, insert this blob into the
# cache so it'll be a future candidate (unless it's not a new model)
# cache keys are hex encoded for easier viewing / debugging
cache_op = nil
if !blob_entry.persisted? && @cache_size > 0
if blob_entry.base_sha256
cache_op = [
cache_key,
:reward,
HexUtil.bin2hex(blob_entry.base_sha256)[0..8]
]
else
cache_op = [
cache_key,
:insert,
HexUtil.bin2hex(blob_entry.sha256)[0..8],
blob_entry,
legacy_model.full_path
]
end
end
blob_entry.valid? ||
raise(
"invalid blob entry (legacy model id #{legacy_model.id}): #{blob_entry.errors.full_messages}"
)
cache.send(cache_op[1], *cache_op[2..]) if cache_op
[blob_entry, cache_op]
end
blob_entry = ::BlobEntryP.build_record(
content_type: content_type,
sha256: sha256,
contents: legacy_model.response_body,
candidates: candidates,
)
# reward the base if it was used, if not, insert this blob into the
# cache so it'll be a future candidate (unless it's not a new model)
# cache keys are hex encoded for easier viewing / debugging
cache_op = nil
if !blob_entry.persisted? && @cache_size > 0
if blob_entry.base_sha256
cache_op = [cache_key, :reward, HexUtil.bin2hex(blob_entry.base_sha256)[0..8]]
else
cache_op = [cache_key, :insert, HexUtil.bin2hex(blob_entry.sha256)[0..8], blob_entry, legacy_model.full_path]
.reject(&:nil?)
.map do |pair|
blob_entry = pair[0]
cache_op = pair[1]
merged_cache_ops << cache_op if cache_op
blob_entry
end
end
blob_entry.valid? || raise("invalid blob entry (legacy model id #{legacy_model.id}): #{blob_entry.errors.full_messages}")
cache.send(cache_op[1], *cache_op[2..]) if cache_op
[blob_entry, cache_op]
end.reject(&:nil?).map do |pair|
blob_entry = pair[0]
cache_op = pair[1]
merged_cache_ops << cache_op if cache_op
blob_entry
end.uniq do |blob_entry|
blob_entry.sha256
end
.uniq { |blob_entry| blob_entry.sha256 }
timings.finish :build_new_bes
# bulk-insert all the new blob entries
timings.start :insert_new_bes
slice_size = [(blob_entries_to_insert.size.to_f / @fork_amount).ceil, 1].max
blob_entries_to_insert.each_slice(slice_size).map do |slice|
ForkFuture.new do
BlobEntryP.insert_all!(slice.map(&:to_bulk_insert_hash)) if slice.any?
end
end.to_a.map(&:join) if blob_entries_to_insert.any?
if blob_entries_to_insert.any?
blob_entries_to_insert
.each_slice(slice_size)
.map do |slice|
ForkFuture.new do
if slice.any?
BlobEntryP.insert_all!(slice.map(&:to_bulk_insert_hash))
end
end
end
.to_a
.map(&:join)
end
insert_stats.blob_entries_inserted += blob_entries_to_insert.size
insert_stats.bytes_length += blob_entries_to_insert.map(&:contents).map(&:size).sum
insert_stats.bytes_length +=
blob_entries_to_insert.map(&:contents).map(&:size).sum
insert_stats.bytes_stored += blob_entries_to_insert.map(&:bytes_stored).sum
timings.finish :insert_new_bes
[insert_stats, timings, merged_cache_ops, legacy_model_id_to_response_sha256]
[
insert_stats,
timings,
merged_cache_ops,
legacy_model_id_to_response_sha256
]
end
def bulk_import_headers(legacy_models)
@@ -260,75 +311,95 @@ class LegacyImport::HttpLogEntryBulkImporter
timings.start :build_new_headers
header_sha256_to_header_model = {}
legacy_model_id_to_header_sha256s =
ForkFuture.parallel_map(@fork_amount / 2, legacy_models) do |legacy_model|
req_headers = ::HttpLogEntryHeader.build_record(headers: legacy_model.req_headers)
res_headers = ::HttpLogEntryHeader.build_record(headers: legacy_model.res_headers)
[legacy_model.id, {
req: req_headers,
res: res_headers,
}]
end.map do |pair|
legacy_model_id = pair[0]
req_headers = pair[1][:req]
res_headers = pair[1][:res]
header_sha256_to_header_model[req_headers.sha256] = req_headers
header_sha256_to_header_model[res_headers.sha256] = res_headers
[legacy_model_id, {
req_sha256: req_headers.sha256,
res_sha256: res_headers.sha256,
}]
end.to_h
ForkFuture
.parallel_map(@fork_amount / 2, legacy_models) do |legacy_model|
req_headers =
::HttpLogEntryHeader.build_record(headers: legacy_model.req_headers)
res_headers =
::HttpLogEntryHeader.build_record(headers: legacy_model.res_headers)
[legacy_model.id, { req: req_headers, res: res_headers }]
end
.map do |pair|
legacy_model_id = pair[0]
req_headers = pair[1][:req]
res_headers = pair[1][:res]
header_sha256_to_header_model[req_headers.sha256] = req_headers
header_sha256_to_header_model[res_headers.sha256] = res_headers
[
legacy_model_id,
{ req_sha256: req_headers.sha256, res_sha256: res_headers.sha256 }
]
end
.to_h
timings.finish :build_new_headers
# excluding existing headers, and bulk-insert the new headers
timings.start :insert_new_headers
header_sha256_to_header_id = ::HttpLogEntryHeader.where(sha256: header_sha256_to_header_model.keys).map do |model|
[model.sha256, model.id]
end.to_h
header_sha256_to_header_id =
::HttpLogEntryHeader
.where(sha256: header_sha256_to_header_model.keys)
.map { |model| [model.sha256, model.id] }
.to_h
headers_to_insert = header_sha256_to_header_model.map do |sha256, header_model|
next nil if header_sha256_to_header_id[sha256]
header_model.valid? || raise("invalid header models")
header_model
end.reject(&:nil?).uniq do |header_model|
header_model.sha256
end
headers_to_insert =
header_sha256_to_header_model
.map do |sha256, header_model|
next nil if header_sha256_to_header_id[sha256]
header_model.valid? || raise("invalid header models")
header_model
end
.reject(&:nil?)
.uniq { |header_model| header_model.sha256 }
::HttpLogEntryHeader.insert_all!(
headers_to_insert.map(&:to_bulk_insert_hash),
returning: [:id, :sha256],
).rows.each do |row|
id, sha256 = row
# rails does not deserialize the returned sha256 - we have to do that ourselves
# postgres prefixes hex-encoded binaries with "\x", must strip that first
raise("invariant") unless sha256[0..1] == "\\x"
sha256 = ::HexUtil.hex2bin(sha256[2..])
header_sha256_to_header_id[sha256] = id
end if headers_to_insert.any?
::HttpLogEntryHeader
.insert_all!(
headers_to_insert.map(&:to_bulk_insert_hash),
returning: %i[id sha256]
)
.rows
.each do |row|
id, sha256 = row
# rails does not deserialize the returned sha256 - we have to do that ourselves
# postgres prefixes hex-encoded binaries with "\x", must strip that first
raise("invariant") unless sha256[0..1] == "\\x"
sha256 = ::HexUtil.hex2bin(sha256[2..])
header_sha256_to_header_id[sha256] = id
end if headers_to_insert.any?
insert_stats.header_entries_inserted += headers_to_insert.size
timings.finish :insert_new_headers
[insert_stats, timings, legacy_model_id_to_header_sha256s, header_sha256_to_header_id]
[
insert_stats,
timings,
legacy_model_id_to_header_sha256s,
header_sha256_to_header_id
]
end
def build_http_log_entry(legacy_model, request_headers_id, response_headers_id, response_sha256)
model = ::HttpLogEntry.new(
id: legacy_model.id,
uri_scheme: legacy_model.scheme,
uri_host: legacy_model.host,
uri_path: legacy_model.path,
uri_query: legacy_model.query,
verb: legacy_model.verb,
content_type: legacy_model.content_type,
status_code: legacy_model.status,
response_time_ms: legacy_model.response_time,
request_headers_id: request_headers_id,
response_headers_id: response_headers_id,
response_sha256: response_sha256,
requested_at: legacy_model.requested_at,
created_at: legacy_model.created_at,
updated_at: legacy_model.updated_at,
)
def build_http_log_entry(
legacy_model,
request_headers_id,
response_headers_id,
response_sha256
)
model =
::HttpLogEntry.new(
id: legacy_model.id,
uri_scheme: legacy_model.scheme,
uri_host: legacy_model.host,
uri_path: legacy_model.path,
uri_query: legacy_model.query,
verb: legacy_model.verb,
content_type: legacy_model.content_type,
status_code: legacy_model.status,
response_time_ms: legacy_model.response_time,
request_headers_id: request_headers_id,
response_headers_id: response_headers_id,
response_sha256: response_sha256,
requested_at: legacy_model.requested_at,
created_at: legacy_model.created_at,
updated_at: legacy_model.updated_at
)
model
end
@@ -336,102 +407,102 @@ class LegacyImport::HttpLogEntryBulkImporter
puts "-" * 40
end
Timings = Struct.new(
:keys,
:totals
) do
def initialize
@start_at = Time.now
self.keys = []
self.totals = {}
end
Timings =
Struct.new(:keys, :totals) do
def initialize
@start_at = Time.now
self.keys = []
self.totals = {}
end
def merge!(other)
raise if other.nil?
other.keys.each do |key|
def merge!(other)
raise if other.nil?
other.keys.each do |key|
self.keys << key unless self.keys.include?(key)
self.entry_for(key)[:secs] += other.totals[key][:secs]
end
end
def start(key)
self.keys << key unless self.keys.include?(key)
self.entry_for(key)[:secs] += other.totals[key][:secs]
entry = self.entry_for(key)
raise("#{key} already started") if entry[:started]
entry[:started] = Time.now
end
def finish(key)
entry = self.totals[key]
raise("#{key} does not exist") unless entry
started = entry[:started]
entry[:started] = nil
raise("#{key} not started") unless started
entry[:secs] += Time.now - started
end
def entries
total_secs = Time.now - @start_at
total_measured_secs = self.totals.values.map { |e| e[:secs] }.sum
self
.keys
.map do |key|
key_secs = self.totals[key][:secs]
{
key: key,
key_secs: key_secs,
proportion: key_secs / total_measured_secs
}
end
.chain(
[
{
key: :measured_total,
key_secs: total_measured_secs,
proportion: total_measured_secs / total_secs
},
{ key: :actual_total, key_secs: total_secs, proportion: 1.0 }
]
)
end
def entry_for(key)
self.totals[key] ||= { started: nil, secs: 0.0 }
end
end
def start(key)
self.keys << key unless self.keys.include?(key)
entry = self.entry_for(key)
raise("#{key} already started") if entry[:started]
entry[:started] = Time.now
end
InsertStats =
Struct.new(
:http_entries_inserted,
:blob_entries_inserted,
:header_entries_inserted,
:bytes_stored,
:bytes_length
) do
def initialize
self.http_entries_inserted = 0
self.blob_entries_inserted = 0
self.header_entries_inserted = 0
self.bytes_stored = 0
self.bytes_length = 0
end
def finish(key)
entry = self.totals[key]
raise("#{key} does not exist") unless entry
started = entry[:started]
entry[:started] = nil
raise("#{key} not started") unless started
entry[:secs] += Time.now - started
end
def merge!(other)
self.http_entries_inserted += other.http_entries_inserted
self.blob_entries_inserted += other.blob_entries_inserted
self.header_entries_inserted += other.header_entries_inserted
self.bytes_stored += other.bytes_stored
self.bytes_length += other.bytes_length
end
def entries
total_secs = Time.now - @start_at
total_measured_secs = self.totals.values.map { |e| e[:secs] }.sum
self.keys.map do |key|
key_secs = self.totals[key][:secs]
{
key: key,
key_secs: key_secs,
proportion: key_secs / total_measured_secs,
}
end.chain([
{
key: :measured_total,
key_secs: total_measured_secs,
proportion: total_measured_secs / total_secs,
},
{
key: :actual_total,
key_secs: total_secs,
proportion: 1.0,
},
])
end
def to_s
ratio = self.bytes_stored.to_f / self.bytes_length
[
"+#{self.http_entries_inserted} requests, +#{self.blob_entries_inserted} blobs, +#{self.header_entries_inserted} headers",
"size ratio: #{ratio.round(2)} - #{self.class.humansize(self.bytes_stored)}/#{self.class.humansize(self.bytes_length)}"
].join("\n")
end
def entry_for(key)
self.totals[key] ||= { started: nil, secs: 0.0 }
def self.humansize(size)
HexUtil.humansize(size)
end
end
end
InsertStats = Struct.new(
:http_entries_inserted,
:blob_entries_inserted,
:header_entries_inserted,
:bytes_stored,
:bytes_length,
) do
def initialize
self.http_entries_inserted = 0
self.blob_entries_inserted = 0
self.header_entries_inserted = 0
self.bytes_stored = 0
self.bytes_length = 0
end
def merge!(other)
self.http_entries_inserted += other.http_entries_inserted
self.blob_entries_inserted += other.blob_entries_inserted
self.header_entries_inserted += other.header_entries_inserted
self.bytes_stored += other.bytes_stored
self.bytes_length += other.bytes_length
end
def to_s
ratio = self.bytes_stored.to_f / self.bytes_length
[
"+#{self.http_entries_inserted} requests, +#{self.blob_entries_inserted} blobs, +#{self.header_entries_inserted} headers",
"size ratio: #{ratio.round(2)} - #{self.class.humansize(self.bytes_stored)}/#{self.class.humansize(self.bytes_length)}",
].join("\n")
end
def self.humansize(size)
HexUtil.humansize(size)
end
end
end
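# Hedged usage sketch of the Timings helper defined above; entries() reports each key's share of
# the measured time plus measured/actual totals, which dump_timings feeds to `tp`:
timings = LegacyImport::HttpLogEntryBulkImporter::Timings.new
timings.start(:bulk_load)
sleep 0.1 # stand-in for a batch load
timings.finish(:bulk_load)
timings.entries.map { |e| e[:key] } # => [:bulk_load, :measured_total, :actual_total]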

View File

@@ -6,7 +6,8 @@ module LiteTrail::ActiveRecordClassMethods
)
self_class = self
versions_table_name = if separate_versions_table.is_a?(String)
versions_table_name =
if separate_versions_table.is_a?(String)
separate_versions_table
elsif separate_versions_table == true
self.table_name.singularize + "_versions"
@@ -14,14 +15,16 @@ module LiteTrail::ActiveRecordClassMethods
nil
end
lite_trail_class = if versions_table_name.nil?
lite_trail_class =
if versions_table_name.nil?
::LiteTrail::Version
else
# separate table defined, use that, no need for polymorphism
klass = Class.new(::LiteTrail::AbstractVersion) do
self.table_name = versions_table_name
belongs_to :item, class_name: self_class.to_s
end
klass =
Class.new(::LiteTrail::AbstractVersion) do
self.table_name = versions_table_name
belongs_to :item, class_name: self_class.to_s
end
# "foo_bar_versions" => define "LiteTrail::PerTable::FooBarVersions"
LiteTrail::PerTable.const_set(versions_table_name.camelize, klass)
@@ -33,7 +36,7 @@ module LiteTrail::ActiveRecordClassMethods
class_attribute :lite_trail_options
self.lite_trail_options = {
schema_version: schema_version,
map_attribute: map_attribute,
map_attribute: map_attribute
}
if !separate_versions_table
@@ -62,9 +65,13 @@ module LiteTrail::ActiveRecordClassMethods
map_attribute.each do |attr_name, mapper|
if changes[attr_name]
# value before the update
changes[attr_name][0] = mapper.map_to(changes[attr_name][0]) if changes[attr_name][0]
changes[attr_name][0] = mapper.map_to(
changes[attr_name][0]
) if changes[attr_name][0]
# value after the update
changes[attr_name][1] = mapper.map_to(changes[attr_name][1]) if changes[attr_name][1]
changes[attr_name][1] = mapper.map_to(
changes[attr_name][1]
) if changes[attr_name][1]
end
end if map_attribute
@@ -74,18 +81,18 @@ module LiteTrail::ActiveRecordClassMethods
model_updated_at = Time.now
end
self.versions << lite_trail_class.new({
event: "update",
item: self,
schema_version: schema_version,
diff: changes,
created_at: model_updated_at,
})
self.versions << lite_trail_class.new(
{
event: "update",
item: self,
schema_version: schema_version,
diff: changes,
created_at: model_updated_at
}
)
end
end
after_save do
self.versions.filter(&:new_record?).each(&:save!)
end
after_save { self.versions.filter(&:new_record?).each(&:save!) }
end
end

View File

@@ -11,6 +11,9 @@ module LiteTrail::MigrationExtensions
t.datetime :created_at, null: false
end
add_foreign_key versions_table_name, table_name, column: :item_id, validate: true
add_foreign_key versions_table_name,
table_name,
column: :item_id,
validate: true
end
end

View File

@@ -1,19 +1,12 @@
class Scraper::E621HttpClientConfig < Scraper::HttpClientConfig
DEFAULT_ALLOWED_DOMAINS = [
"e621.net",
"*.e621.net",
]
DEFAULT_ALLOWED_DOMAINS = %w[e621.net *.e621.net]
def cookies
[]
end
def ratelimit
[
["static1.e621.net", 1],
["facdn.net", 2],
["*", 0.25],
]
[["static1.e621.net", 1], ["facdn.net", 2], ["*", 0.25]]
end
def allowed_domains

View File

@@ -3,32 +3,31 @@ class Scraper::GalleryDlClient
StartEvent = Struct.new(:url, :extractor)
FinishEvent = Struct.new(:ignore)
HttpRequestEvent = Struct.new(
:method,
:url,
:kwargs,
:requested_at,
:request_headers,
:response_headers,
:response_code,
:response_time_ms,
:body,
:log_entry,
keyword_init: true,
)
TweetEvent = Struct.new(
:tweet,
:author
)
TweetMediaEvent = Struct.new(
:tweet_id,
:file_url,
:filename,
:media_num,
:extension,
:height,
:width,
)
HttpRequestEvent =
Struct.new(
:method,
:url,
:kwargs,
:requested_at,
:request_headers,
:response_headers,
:response_code,
:response_time_ms,
:body,
:log_entry,
keyword_init: true
)
TweetEvent = Struct.new(:tweet, :author)
TweetMediaEvent =
Struct.new(
:tweet_id,
:file_url,
:filename,
:media_num,
:extension,
:height,
:width
)
def initialize(name, host)
name || raise("no name provided")
@@ -37,14 +36,19 @@ class Scraper::GalleryDlClient
@performed_by = name
@client = Ripcord::Client.new(host)
@max_cache_size = 8
@blob_entry_cache = Hash.new do |hash, key|
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
end
@blob_entry_cache =
Hash.new do |hash, key|
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
end
end
def start_twitter_user(username, caused_by_entry: nil)
@token = SecureRandom.uuid
rpc = @client.call("start_user", [@token, "https://twitter.com/#{username}/tweets"])
rpc =
@client.call(
"start_user",
[@token, "https://twitter.com/#{username}/tweets"]
)
raise rpc_error_str(rpc) unless rpc.successful?
decode_message(rpc.result, caused_by_entry)
end
@@ -66,25 +70,29 @@ class Scraper::GalleryDlClient
raise("token mismatch: #{token} != #{@token}") if token != @token
case response[:event]
when "start" then StartEvent.new(response[:url], response[:extractor])
when "finish" then FinishEvent.new(nil)
when "start"
StartEvent.new(response[:url], response[:extractor])
when "finish"
FinishEvent.new(nil)
when "http_request"
http_request = response[:http_request]
event = HttpRequestEvent.new(
method: http_request[:method],
url: http_request[:url],
kwargs: http_request[:kwargs],
requested_at: Time.at(http_request[:requested_at]),
request_headers: http_request[:kwargs][:headers],
response_headers: http_request[:response_headers],
response_code: http_request[:status_code],
response_time_ms: (http_request[:duration] * 1000).to_i,
body: Base64.decode64(http_request[:content_base64]),
log_entry: nil,
)
event =
HttpRequestEvent.new(
method: http_request[:method],
url: http_request[:url],
kwargs: http_request[:kwargs],
requested_at: Time.at(http_request[:requested_at]),
request_headers: http_request[:kwargs][:headers],
response_headers: http_request[:response_headers],
response_code: http_request[:status_code],
response_time_ms: (http_request[:duration] * 1000).to_i,
body: Base64.decode64(http_request[:content_base64]),
log_entry: nil
)
log_and_set_http_request_event(event, caused_by_entry)
event
when "tweet" then TweetEvent.new(response[:tweet], response[:author])
when "tweet"
TweetEvent.new(response[:tweet], response[:author])
when "tweet_media"
media = response[:media]
TweetMediaEvent.new(
@@ -94,7 +102,7 @@ class Scraper::GalleryDlClient
media[:media_num],
media[:extension],
media[:height],
media[:width],
media[:width]
)
end
end
@@ -102,14 +110,15 @@ class Scraper::GalleryDlClient
def log_and_set_http_request_event(http_event, caused_by_entry)
request_headers = http_event.request_headers
response_headers = http_event.response_headers
content_type = response_headers[:"Content-Type"] ||
response_headers[:"content-type"] ||
raise("no content type provided: #{response_headers}")
content_type =
response_headers[:"Content-Type"] || response_headers[:"content-type"] ||
raise("no content type provided: #{response_headers}")
url = http_event.url
uri = Addressable::URI.parse(url)
if http_event.kwargs && http_event.kwargs[:params] && http_event.kwargs[:params][:variables]
if http_event.kwargs && http_event.kwargs[:params] &&
http_event.kwargs[:params][:variables]
uri.query = JSON.parse(http_event.kwargs[:params][:variables]).to_query
end
@@ -117,38 +126,47 @@ class Scraper::GalleryDlClient
cache_key = "#{uri.host}|#{content_type}"
blob_entry_cache = @blob_entry_cache[cache_key]
candidates = if blob_entry_cache.at_capacity? && rand(0..100) >= 5
candidates =
if blob_entry_cache.at_capacity? && rand(0..100) >= 5
blob_entry_cache.candidates
else
[]
end
candidates << caused_by_entry.response if caused_by_entry&.response
candidates << caused_by_entry.response.base if caused_by_entry&.response&.base
if caused_by_entry&.response&.base
candidates << caused_by_entry.response.base
end
retries = 0
begin
response_blob_entry = BlobEntryP.find_or_build(
content_type: content_type,
contents: http_event.body,
candidates: candidates,
)
response_blob_entry =
BlobEntryP.find_or_build(
content_type: content_type,
contents: http_event.body,
candidates: candidates
)
log_entry = HttpLogEntry.new({
uri: url,
verb: http_event.method.downcase,
content_type: content_type,
status_code: http_event.response_code,
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
response: response_blob_entry,
response_time_ms: http_event.response_time_ms,
requested_at: http_event.requested_at,
caused_by_entry: caused_by_entry,
performed_by: @performed_by,
})
log_entry =
HttpLogEntry.new(
{
uri: url,
verb: http_event.method.downcase,
content_type: content_type,
status_code: http_event.response_code,
request_headers:
HttpLogEntryHeader.find_or_build(headers: request_headers),
response_headers:
HttpLogEntryHeader.find_or_build(headers: response_headers),
response: response_blob_entry,
response_time_ms: http_event.response_time_ms,
requested_at: http_event.requested_at,
caused_by_entry: caused_by_entry,
performed_by: @performed_by
}
)
log_entry.save!
rescue
rescue StandardError
retries += 1
retry if retries < 2
raise
@@ -158,10 +176,14 @@ class Scraper::GalleryDlClient
http_event.log_entry = log_entry
if response_blob_entry.base_sha256
blob_entry_cache.reward(HexUtil.bin2hex(response_blob_entry.base_sha256)[0..8])
blob_entry_cache.reward(
HexUtil.bin2hex(response_blob_entry.base_sha256)[0..8]
)
else
blob_entry_cache.insert(
HexUtil.bin2hex(response_blob_entry.sha256)[0..8], response_blob_entry, url
HexUtil.bin2hex(response_blob_entry.sha256)[0..8],
response_blob_entry,
url
)
end
end

View File

@@ -1,13 +1,11 @@
class Scraper::InkbunnyHttpClientConfig < Scraper::HttpClientConfig
DEFAULT_ALLOWED_DOMAINS = [
"inkbunny.net",
"*.ib.metapix.net",
]
DEFAULT_ALLOWED_DOMAINS = %w[inkbunny.net *.ib.metapix.net]
def do_login(performer)
sid_model = Domain::Inkbunny::GlobalState.find_or_create_by(
key: "#{performer.name}-sid",
)
sid_model =
Domain::Inkbunny::GlobalState.find_or_create_by(
key: "#{performer.name}-sid"
)
sid_model.with_lock do
if sid_model.value.blank?
sid_model.value = do_ib_login(performer)
@@ -28,13 +26,12 @@ class Scraper::InkbunnyHttpClientConfig < Scraper::HttpClientConfig
def scrub_stored_uri(uri)
if uri.path == "/api_login.php"
uri = uri.dup
new_query_ar = URI.decode_www_form(uri.query || "").map do |k, v|
if ["username", "password"].include?(k)
[k, "*****"]
else
[k, v]
end
end
new_query_ar =
URI
.decode_www_form(uri.query || "")
.map do |k, v|
%w[username password].include?(k) ? [k, "*****"] : [k, v]
end
uri.query = URI.encode_www_form(new_query_ar)
end
uri
@@ -44,10 +41,7 @@ class Scraper::InkbunnyHttpClientConfig < Scraper::HttpClientConfig
end
def ratelimit
[
["inkbunny.net", 2],
["*.ib.metapix.net", 1],
]
[["inkbunny.net", 2], ["*.ib.metapix.net", 1]]
end
def allowed_domains
@@ -61,23 +55,29 @@ class Scraper::InkbunnyHttpClientConfig < Scraper::HttpClientConfig
private
def do_ib_login(performer)
username = Domain::Inkbunny::GlobalState.find_by(key: "#{performer.name}-username")&.value || raise(
"missing inkbunny username in global state"
)
password = Domain::Inkbunny::GlobalState.find_by(key: "#{performer.name}-password")&.value || raise(
"missing inkbunny password in global state"
)
uri = "https://inkbunny.net/api_login.php?username=#{username}&password=#{password}"
username =
Domain::Inkbunny::GlobalState.find_by(
key: "#{performer.name}-username"
)&.value || raise("missing inkbunny username in global state")
password =
Domain::Inkbunny::GlobalState.find_by(
key: "#{performer.name}-password"
)&.value || raise("missing inkbunny password in global state")
uri =
"https://inkbunny.net/api_login.php?username=#{username}&password=#{password}"
logger.info("logging in to inkbunny as #{username}...")
response = performer.do_request(:post, uri, {})
if response.response_code != 200
message = "inkbunny login failed: #{response.response_code} #{response.body}"
message =
"inkbunny login failed: #{response.response_code} #{response.body}"
logger.error(message)
raise message
end
sid = JSON.parse(response.body)["sid"] || raise(
"inkbunny login failed: no sid in response (#{response.body[0..1000]})"
)
sid =
JSON.parse(response.body)["sid"] ||
raise(
"inkbunny login failed: no sid in response (#{response.body[0..1000]})"
)
logger.info("logged in to inkbunny as #{username}: #{sid}")
sid
end

View File

@@ -3,68 +3,64 @@ class Scraper::LinkFinder
def initialize(origin_host, document)
@origin_host = origin_host
@document = document.encode("UTF-8", :invalid => :replace, :undef => :replace)
@document = document.encode("UTF-8", invalid: :replace, undef: :replace)
end
RAW_REGEXES = [
URI.regexp,
/(?:(?:https?:\/\/)?(?:[-\w\.]+)\.\w+(?:\/(?:[\w\-\/_\.…]*(?:\?\S+)?)?))/,
%r{(?:(?:https?://)?(?:[-\w\.]+)\.\w+(?:/(?:[\w\-/_\.…]*(?:\?\S+)?)?))}
]
def find_links
from_text_uris = RAW_REGEXES.map do |regex|
@document.scan(regex)
end.flatten.reject(&:blank?).reject do |str|
str.include?("…")
end.map do |str|
uri = Addressable::URI.heuristic_parse(str)
if uri.host
uri
else
nil
end
rescue
logger.warn("invalid parsed uri detected - #{str}")
nil
end.reject(&:blank?)
from_text_uris =
RAW_REGEXES
.map { |regex| @document.scan(regex) }
.flatten
.reject(&:blank?)
.reject { |str| str.include?("…") }
.map do |str|
uri = Addressable::URI.heuristic_parse(str)
uri.host ? uri : nil
rescue StandardError
logger.warn("invalid parsed uri detected - #{str}")
nil
end
.reject(&:blank?)
parsed_uris = Nokogiri::HTML(@document).css("a").map do |link|
uri = Addressable::URI.parse(link[:href])
uri.host ||= @origin_host
uri
rescue
logger.warn("invalid parsed uri detected - #{link[:href]}")
nil
end.reject(&:blank?)
parsed_uris =
Nokogiri
.HTML(@document)
.css("a")
.map do |link|
uri = Addressable::URI.parse(link[:href])
uri.host ||= @origin_host
uri
rescue StandardError
logger.warn("invalid parsed uri detected - #{link[:href]}")
nil
end
.reject(&:blank?)
all_uris = from_text_uris + parsed_uris
all_uris.map do |uri|
if uri.host.end_with?("furaffinity.net")
process_fa(uri)
elsif uri.host.end_with?("e621.net")
process_e621(uri)
elsif uri.host.end_with?("twitter.com")
process_twitter(uri)
all_uris
.map do |uri|
if uri.host.end_with?("furaffinity.net")
process_fa(uri)
elsif uri.host.end_with?("e621.net")
process_e621(uri)
elsif uri.host.end_with?("twitter.com")
process_twitter(uri)
end
end
end.
reject(&:nil?).
uniq
.reject(&:nil?)
.uniq
end
private
FA_USER_PREFIXES = [
"/user/",
"/gallery/",
"/scraps/",
"/journals/",
"/favorites/",
]
FA_USER_PREFIXES = %w[/user/ /gallery/ /scraps/ /journals/ /favorites/]
FA_POST_PREFIXES = [
"/view/",
"/full/",
]
FA_POST_PREFIXES = %w[/view/ /full/]
def process_fa(uri)
if prefixed_with?(FA_USER_PREFIXES, uri.path)
@@ -80,21 +76,21 @@ class Scraper::LinkFinder
nil
end
TWITTER_IGNORE_FIRST_PATH_PART = [
"messages",
"explore",
"home",
"i",
"notifications",
"privacy",
"search",
"tos",
TWITTER_IGNORE_FIRST_PATH_PART = %w[
messages
explore
home
i
notifications
privacy
search
tos
]
TWITTER_IGNORE_HOSTS = [
"business.twitter.com",
"help.twitter.com",
"support.twitter.com",
TWITTER_IGNORE_HOSTS = %w[
business.twitter.com
help.twitter.com
support.twitter.com
]
def process_twitter(uri)
@@ -106,9 +102,7 @@ class Scraper::LinkFinder
end
def prefixed_with?(prefixes, path)
prefixes.any? do |prefix|
path.start_with?(prefix)
end
prefixes.any? { |prefix| path.start_with?(prefix) }
end
def path_parts(path)
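Condensing the two passes above into a standalone sketch (a regex scan over raw text, plus an <a href> walk that falls back to the origin host): the simplified URL_RE and the method name are illustrative, not part of the class.

require "addressable/uri"
require "nokogiri"

URL_RE = %r{https?://[^\s"'<>]+} # deliberately simpler than the class's regexes

def extract_links(origin_host, document)
  text_uris =
    document
      .scan(URL_RE)
      .filter_map do |str|
        uri = Addressable::URI.heuristic_parse(str)
        uri if uri&.host
      rescue StandardError
        nil
      end

  anchor_uris =
    Nokogiri::HTML(document)
      .css("a")
      .filter_map do |link|
        next nil if link[:href].nil? || link[:href].empty?
        uri = Addressable::URI.parse(link[:href])
        uri.host ||= origin_host # relative links inherit the page's host
        uri
      rescue StandardError
        nil
      end

  (text_uris + anchor_uris).uniq
end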

View File

@@ -1,18 +1,12 @@
class Scraper::TwitterHttpClientConfig < Scraper::HttpClientConfig
DEFAULT_ALLOWED_DOMAINS = [
"*.twimg.com",
"ipinfo.io",
]
DEFAULT_ALLOWED_DOMAINS = %w[*.twimg.com ipinfo.io]
def cookies
[]
end
def ratelimit
[
["*.twimg.com", 0.5],
["*", 1],
]
[["*.twimg.com", 0.5], ["*", 1]]
end
def allowed_domains

View File

@@ -1,14 +1,14 @@
class VpnOnlyRouteConstraint
def matches?(request)
if request.params[:force_vpn_off] == '1'
if request.params[:force_vpn_off] == "1"
false
elsif Rails.env.test?
true
elsif Rails.env.development? || Rails.env.staging?
request.ip == '127.0.0.1' || request.ip == '::1'
request.ip == "127.0.0.1" || request.ip == "::1"
elsif Rails.env.production?
# curtus IP on vpn
request.ip == '10.200.0.3'
request.ip == "10.200.0.3"
else
false
end

View File

@@ -5,12 +5,15 @@ class BlobEntryP < ReduxApplicationRecord
before_destroy { raise ActiveRecord::ReadOnlyRecord }
self.primary_key = :sha256
EMPTY_FILE_SHA256 = HexUtil.hex2bin("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855")
EMPTY_FILE_SHA256 =
HexUtil.hex2bin(
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
)
belongs_to :base,
optional: true,
foreign_key: :base_sha256,
class_name: "::BlobEntryP"
optional: true,
foreign_key: :base_sha256,
class_name: "::BlobEntryP"
after_create do
actual_sha256 = Digest::SHA256.digest(contents)
@@ -21,17 +24,19 @@ class BlobEntryP < ReduxApplicationRecord
@base_model ||= super || self.class.ensure(base_sha256) if base_sha256
end
validates_presence_of(
:sha256,
:content_type,
:size
)
validates :contents, length: { minimum: 0, allow_nil: false, message: "can't be nil" }
validates_presence_of(:sha256, :content_type, :size)
validates :contents,
length: {
minimum: 0,
allow_nil: false,
message: "can't be nil"
}
validates :sha256, length: { is: 32 }
validates :base_sha256, length: { is: 32 }, if: :base_sha256
def self.ensure(sha256)
find_by(sha256: sha256) || raise("blob #{HexUtil.bin2hex(sha256)} does not exist")
find_by(sha256: sha256) ||
raise("blob #{HexUtil.bin2hex(sha256)} does not exist")
end
def sha256_hex
@@ -39,13 +44,10 @@ class BlobEntryP < ReduxApplicationRecord
end
def contents
@contents ||= begin
@contents ||=
begin
contents_raw = self.read_attribute(:contents)
if self.base
XDiff.patch(self.base.contents, contents_raw)
else
contents_raw
end
self.base ? XDiff.patch(self.base.contents, contents_raw) : contents_raw
end
end
@@ -55,47 +57,48 @@ class BlobEntryP < ReduxApplicationRecord
def self.find_or_build(content_type:, contents:, candidates: [])
sha256 = Digest::SHA256.digest(contents)
BlobEntryP.find_by(sha256: sha256) || begin
build_record(
content_type: content_type,
sha256: sha256,
contents: contents,
candidates: candidates,
)
end
BlobEntryP.find_by(sha256: sha256) ||
begin
build_record(
content_type: content_type,
sha256: sha256,
contents: contents,
candidates: candidates
)
end
end
DIFFABLE_CONTENT_TYPES = [
/text\/html/,
/text\/plain/,
/application\/json/,
]
DIFFABLE_CONTENT_TYPES = [%r{text/html}, %r{text/plain}, %r{application/json}]
def self.build_record(content_type:, sha256: nil, contents:, candidates: [])
sha256 ||= Digest::SHA256.digest(contents)
record = self.new(sha256: sha256, content_type: content_type, size: contents.size)
record =
self.new(sha256: sha256, content_type: content_type, size: contents.size)
smallest_patch_size = nil
smallest_patch = nil
smallest_candidate = nil
candidates.map do |candidate|
# only consider candidates with the same content type (may relax this later)
next nil if candidate.content_type != content_type
# only consider candidates who themselves aren't patch-based
next nil unless candidate.base.nil?
# only consider diffable content types
next nil unless DIFFABLE_CONTENT_TYPES.any? { |ct| content_type =~ ct }
candidates
.map do |candidate|
# only consider candidates with the same content type (may relax this later)
next nil if candidate.content_type != content_type
# only consider candidates who themselves aren't patch-based
next nil unless candidate.base.nil?
# only consider diffable content types
next nil unless DIFFABLE_CONTENT_TYPES.any? { |ct| content_type =~ ct }
[candidate, XDiff.diff(candidate.contents, contents)]
end.reject(&:nil?).each do |pair|
candidate, patch = pair
if smallest_patch_size.nil? || patch.size < smallest_patch_size
smallest_patch_size = patch.size
smallest_patch = patch
smallest_candidate = candidate
[candidate, XDiff.diff(candidate.contents, contents)]
end
.reject(&:nil?)
.each do |pair|
candidate, patch = pair
if smallest_patch_size.nil? || patch.size < smallest_patch_size
smallest_patch_size = patch.size
smallest_patch = patch
smallest_candidate = candidate
end
end
end
# only use a patch if it's <= 60% the original content size
if smallest_patch_size && smallest_patch_size <= (contents.size * 0.6)
@@ -106,9 +109,7 @@ class BlobEntryP < ReduxApplicationRecord
record.contents = contents
end
if record.contents != contents
raise RuntimeError.new("invariant!")
end
raise RuntimeError.new("invariant!") if record.contents != contents
record
end
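The storage decision above reduces to: diff the new body against each eligible candidate, keep the smallest patch, and persist the patch only when it is at most 60% of the full body. A hedged sketch of just that decision follows; candidate filtering is omitted, the record fields are illustrative, and the XDiff calls match those above.

# Illustrative helper; `record` stands in for a BlobEntryP-like object with
# base/contents writers.
def assign_contents(record, contents, candidates)
  candidate, patch =
    candidates
      .map { |c| [c, XDiff.diff(c.contents, contents)] }
      .min_by { |(_c, p)| p.size }

  if patch && patch.size <= contents.size * 0.6
    record.base = candidate    # store only the delta against the chosen base
    record.contents = patch
  else
    record.contents = contents # delta not worth it; store the full body
  end
  record
end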

View File

@@ -1,7 +1,5 @@
module ImmutableModel
extend ActiveSupport::Concern
included do
before_update { raise ActiveRecord::ReadOnlyRecord }
end
included { before_update { raise ActiveRecord::ReadOnlyRecord } }
end

View File

@@ -10,10 +10,7 @@ class Domain::E621::Post < ReduxApplicationRecord
enum state: %i[ok scan_error, file_error]
enum rating: %i[s q e]
validates_presence_of(
:e621_id,
:state,
)
validates_presence_of(:e621_id, :state)
after_initialize do
self.state ||= :ok
self.state_detail ||= {}
@@ -24,12 +21,9 @@ class Domain::E621::Post < ReduxApplicationRecord
self.tags_array ||= []
end
has_many :taggings,
class_name: "Domain::E621::Tagging"
has_many :taggings, class_name: "Domain::E621::Tagging"
has_many :tags,
class_name: "Domain::E621::Tag",
through: :taggings
has_many :tags, class_name: "Domain::E621::Tag", through: :taggings
# If the file was scraped, this is the blob entry that represents it
belongs_to :file,
@@ -42,67 +36,67 @@ class Domain::E621::Post < ReduxApplicationRecord
foreign_key: :e621_id,
optional: true
SKIP_MISMATCH_LEGACY_IDS = Set.new([
836414,
1070178,
])
SKIP_MISMATCH_LEGACY_IDS = Set.new([836_414, 1_070_178])
def self.find_or_build_from_legacy(legacy_model)
model = self.find_by(e621_id: legacy_model.e621_id)
return model if model
model = self.new({
state: :ok,
file_url_str: legacy_model.file_url,
rating: legacy_model.rating,
sources_array: legacy_model.sources,
tags_array: legacy_model.tags.map(&:value),
artists_array: legacy_model.artists || [],
})
model =
self.new(
{
state: :ok,
file_url_str: legacy_model.file_url,
rating: legacy_model.rating,
sources_array: legacy_model.sources,
tags_array: legacy_model.tags.map(&:value),
artists_array: legacy_model.artists || []
}
)
if legacy_model.e621_status != "active"
model.flags_array << legacy_model.e621_status
model.flags_array.uniq!
end
[
:e621_id,
:md5,
:description,
:score,
:created_at,
].each do |attr|
%i[e621_id md5 description score created_at].each do |attr|
model.send(:"#{attr}=", legacy_model.send(attr))
end
http_log_entries = ::HttpLogEntry.where(
uri_host: model.file_uri.host,
uri_path: model.file_uri.path,
)
http_log_entries =
::HttpLogEntry.where(
uri_host: model.file_uri.host,
uri_path: model.file_uri.path
)
http_log_entry = http_log_entries.first
if !http_log_entry && legacy_model.blob_entry
legacy_hles = ::Legacy::HttpLogEntry.where(
host: model.file_uri.host,
path: model.file_uri.path,
)
legacy_hles =
::Legacy::HttpLogEntry.where(
host: model.file_uri.host,
path: model.file_uri.path
)
legacy_hle = legacy_hles.first
if legacy_hle
http_log_entry = ::HttpLogEntry.build_from_legacy(legacy_hle)
else
http_log_entry = ::HttpLogEntry.new({
uri: model.file_uri || raise,
status_code: 200,
verb: "get",
response_time_ms: -1,
requested_at: Time.now,
request_headers: ::HttpLogEntryHeader.empty,
response_headers: ::HttpLogEntryHeader.empty,
performed_by: "legacy",
})
http_log_entry =
::HttpLogEntry.new(
{
uri: model.file_uri || raise,
status_code: 200,
verb: "get",
response_time_ms: -1,
requested_at: Time.now,
request_headers: ::HttpLogEntryHeader.empty,
response_headers: ::HttpLogEntryHeader.empty,
performed_by: "legacy"
}
)
end
http_log_entry.response ||= ::BlobEntryP.find_or_build_from_legacy(legacy_model.blob_entry)
http_log_entry.response ||=
::BlobEntryP.find_or_build_from_legacy(legacy_model.blob_entry)
blob_entry = http_log_entry.response
if blob_entry && http_log_entry

View File

@@ -1,19 +1,17 @@
class Domain::E621::Tagging < ReduxApplicationRecord
self.table_name = "domain_e621_taggings"
belongs_to :post,
class_name: "Domain::E621::Post"
belongs_to :tag,
class_name: "Domain::E621::Tag"
belongs_to :post, class_name: "Domain::E621::Post"
belongs_to :tag, class_name: "Domain::E621::Tag"
enum category: [
:cat_general,
:cat_artist,
:cat_copyright,
:cat_character,
:cat_species,
:cat_invalid,
:cat_meta,
:cat_lore,
]
enum category: %i[
cat_general
cat_artist
cat_copyright
cat_character
cat_species
cat_invalid
cat_meta
cat_lore
]
validates_inclusion_of(:category, in: self.categories.keys)
end

View File

@@ -1,8 +1,6 @@
class Domain::Fa::Fav < ReduxApplicationRecord
self.table_name = "domain_fa_favs"
belongs_to :user,
class_name: "::Domain::Fa::User"
belongs_to :post,
class_name: "::Domain::Fa::Post"
belongs_to :user, class_name: "::Domain::Fa::User"
belongs_to :post, class_name: "::Domain::Fa::Post"
end

View File

@@ -2,8 +2,6 @@ class Domain::Fa::Follow < ReduxApplicationRecord
self.table_name = "domain_fa_follows"
self.primary_keys = :follower_id, :followed_id
belongs_to :follower,
class_name: "::Domain::Fa::User"
belongs_to :followed,
class_name: "::Domain::Fa::User"
belongs_to :follower, class_name: "::Domain::Fa::User"
belongs_to :followed, class_name: "::Domain::Fa::User"
end

View File

@@ -4,16 +4,17 @@ class Domain::Fa::Post < ReduxApplicationRecord
has_lite_trail(
schema_version: 1,
map_attribute: {
file_sha256: ::Sha256AttributeMapper,
},
file_sha256: ::Sha256AttributeMapper
}
)
enum :state, [
:ok, # so far so good, post may not yet be scanned or have file downloaded
:removed, # post has been removed
:scan_error, # error scanning post page (see state_detail)
:file_error, # error downloading post file (see state_detail)
]
enum :state,
[
:ok, # so far so good, post may not yet be scanned or have file downloaded
:removed, # post has been removed
:scan_error, # error scanning post page (see state_detail)
:file_error # error downloading post file (see state_detail)
]
validates_inclusion_of(:state, in: self.states.keys)
after_initialize do
self.state_detail ||= {}
@@ -27,20 +28,18 @@ class Domain::Fa::Post < ReduxApplicationRecord
validates_presence_of(:fa_id, :state)
belongs_to :creator,
class_name: "::Domain::Fa::User",
inverse_of: :posts,
optional: true,
autosave: true
class_name: "::Domain::Fa::User",
inverse_of: :posts,
optional: true,
autosave: true
# If the file was scraped, this is the blob entry that represents it
belongs_to :file,
class_name: "::HttpLogEntry",
optional: :true,
autosave: true
class_name: "::HttpLogEntry",
optional: :true,
autosave: true
has_many :fav_post_joins,
class_name: "::Domain::Fa::Fav",
inverse_of: :post
has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :post
has_many :faved_by,
class_name: "::Domain::Fa::User",
@@ -95,22 +94,23 @@ class Domain::Fa::Post < ReduxApplicationRecord
end
def scanned_at=(time)
unless time.is_a?(Time)
raise ArgumentError("time must be Time, was #{time.class}")
end unless time.nil?
unless time.nil?
unless time.is_a?(Time)
raise ArgumentError("time must be Time, was #{time.class}")
end
end
self.state_detail["scanned_at"] = time&.to_i
end
def posted_at
pa = super
return pa if pa
@posted_at ||= begin
@posted_at ||=
begin
contents = guess_last_submission_page&.response&.contents
if contents
parser = Domain::Fa::Parser::Page.new(contents)
if parser.probably_submission?
parser.submission.posted_date
end
parser.submission.posted_date if parser.probably_submission?
end
end
end
@@ -124,26 +124,32 @@ class Domain::Fa::Post < ReduxApplicationRecord
end
def guess_last_submission_page
last_submission_page || begin
HttpLogEntry.where(
uri_host: "www.furaffinity.net",
uri_path: ["/view/#{self.fa_id}/", "/view/#{self.fa_id}"],
uri_query: nil,
status_code: 200,
).order(created_at: :desc).first
end
last_submission_page ||
begin
HttpLogEntry
.where(
uri_host: "www.furaffinity.net",
uri_path: ["/view/#{self.fa_id}/", "/view/#{self.fa_id}"],
uri_query: nil,
status_code: 200
)
.order(created_at: :desc)
.first
end
end
def have_file?
self.file_id.present?
end
def self.hash_from_submission_parser_helper(submission, first_seen_log_entry: nil)
creator = Domain::Fa::User.find_or_create_by({
url_name: submission.artist_url_name,
}) do |user|
user.name = submission.artist
end
def self.hash_from_submission_parser_helper(
submission,
first_seen_log_entry: nil
)
creator =
Domain::Fa::User.find_or_create_by(
{ url_name: submission.artist_url_name }
) { |user| user.name = submission.artist }
{
fa_id: submission.id,
@@ -151,8 +157,8 @@ class Domain::Fa::Post < ReduxApplicationRecord
title: submission.title,
state_detail: {
"first_seen_entry" => first_seen_log_entry&.id,
"thumbnail_url_str" => submission.thumb_path,
},
"thumbnail_url_str" => submission.thumb_path
}
}
end
end

View File

@@ -4,15 +4,16 @@ class Domain::Fa::UserAvatar < ReduxApplicationRecord
schema_version: 1,
separate_versions_table: true,
map_attribute: {
file_sha256: ::Sha256AttributeMapper,
},
file_sha256: ::Sha256AttributeMapper
}
)
enum :state, [
enum :state,
[
:ok, # got the file, no problem
:download_error, # other error processing the file
:no_file_on_guessed_user_page_error,
:file_not_found, # 404 from server
:file_not_found # 404 from server
]
after_initialize do
self.state ||= :ok
@@ -20,16 +21,17 @@ class Domain::Fa::UserAvatar < ReduxApplicationRecord
end
belongs_to :user, class_name: "::Domain::Fa::User"
belongs_to :file, foreign_key: :file_sha256, class_name: "::BlobEntryP", optional: true
belongs_to :file,
foreign_key: :file_sha256,
class_name: "::BlobEntryP",
optional: true
belongs_to :log_entry, class_name: "::HttpLogEntry", optional: true
def file
@file_model ||= BlobEntryP.ensure(file_sha256) if file_sha256
end
before_validation do
file_uri = Addressable::URI.parse(file_url_str)
end
before_validation { file_uri = Addressable::URI.parse(file_url_str) }
def file_uri
Addressable::URI.parse(file_url_str) unless file_url_str.blank?
@@ -48,25 +50,30 @@ class Domain::Fa::UserAvatar < ReduxApplicationRecord
def guess_file_uri_from_hles_with_info
hle = guess_user_page_log_entry
if hle
page = Domain::Fa::Parser::Page.new(hle.response.contents, require_logged_in: false)
page =
Domain::Fa::Parser::Page.new(
hle.response.contents,
require_logged_in: false
)
if page.probably_user_page? && (url = page.user_page.profile_thumb_url)
return [:user_page, url]
return :user_page, url
end
end
posts = user.
posts.
where(state: [:ok, nil]).
where("file_url_str IS NOT NULL").
order(created_at: :desc).
limit(3)
posts =
user
.posts
.where(state: [:ok, nil])
.where("file_url_str IS NOT NULL")
.order(created_at: :desc)
.limit(3)
for post in posts
if (hle = post.guess_last_submission_page)
page = Domain::Fa::Parser::Page.new(hle.response.contents)
next unless page.probably_submission?
url = page.submission.artist_avatar_url
return [:post_page, url, post.fa_id] if url
return :post_page, url, post.fa_id if url
end
end

View File

@@ -1,18 +1,14 @@
class Domain::Inkbunny::File < ReduxApplicationRecord
self.table_name = "domain_inkbunny_files"
belongs_to :post,
class_name: "::Domain::Inkbunny::Post",
inverse_of: :files
belongs_to :post, class_name: "::Domain::Inkbunny::Post", inverse_of: :files
belongs_to :blob_entry,
class_name: "::BlobEntryP",
foreign_key: :blob_entry_sha256,
optional: true
belongs_to :log_entry,
class_name: "::HttpLogEntry",
optional: true
belongs_to :log_entry, class_name: "::HttpLogEntry", optional: true
enum :state, %i[ok error]
after_initialize do
@@ -20,9 +16,16 @@ class Domain::Inkbunny::File < ReduxApplicationRecord
self.state_detail ||= {}
end
validates_presence_of(%i[
ib_file_id file_name url_str
ib_created_at file_order
md5_initial md5_full md5s
])
validates_presence_of(
%i[
ib_file_id
file_name
url_str
ib_created_at
file_order
md5_initial
md5_full
md5s
]
)
end

View File

@@ -5,29 +5,28 @@ class Domain::Inkbunny::Post < ReduxApplicationRecord
class_name: "::Domain::Inkbunny::User",
inverse_of: :posts
has_many :files,
class_name: "::Domain::Inkbunny::File",
inverse_of: :post
has_many :files, class_name: "::Domain::Inkbunny::File", inverse_of: :post
enum :state, %i[ok error]
enum :rating, %i[general mature adult]
enum :submission_type, %i[
unknown
picture_pinup
sketch
picture_series
comic
portfolio
flash_animation
flash_interactive
video_feature
video_animation
music_single
music_album
writing_document
character_sheet
photography
]
enum :submission_type,
%i[
unknown
picture_pinup
sketch
picture_series
comic
portfolio
flash_animation
flash_interactive
video_feature
video_animation
music_single
music_album
writing_document
character_sheet
photography
]
after_initialize do
self.state ||= :ok

View File

@@ -6,9 +6,7 @@ class Domain::Twitter::Tweet < ReduxApplicationRecord
foreign_key: :author_id,
primary_key: :tw_id
has_many :medias,
class_name: "Domain::Twitter::Media",
foreign_key: :tweet_id
has_many :medias, class_name: "Domain::Twitter::Media", foreign_key: :tweet_id
enum state: %i[ok error]
after_initialize do

View File

@@ -10,29 +10,28 @@ class HttpLogEntry < ReduxApplicationRecord
dedipath-1
direct-gdl
serverhost-1
], _prefix: true
],
_prefix: true
belongs_to :response,
foreign_key: :response_sha256,
class_name: "::BlobEntryP",
autosave: true
foreign_key: :response_sha256,
class_name: "::BlobEntryP",
autosave: true
belongs_to :request_headers,
class_name: "::HttpLogEntryHeader"
belongs_to :request_headers, class_name: "::HttpLogEntryHeader"
belongs_to :response_headers,
class_name: "::HttpLogEntryHeader"
belongs_to :response_headers, class_name: "::HttpLogEntryHeader"
validates :response_sha256, length: { is: 32 }
belongs_to :caused_by_entry,
class_name: "::HttpLogEntry",
foreign_key: :caused_by_id,
optional: true
class_name: "::HttpLogEntry",
foreign_key: :caused_by_id,
optional: true
has_many :triggered_entries,
class_name: "::HttpLogEntry",
foreign_key: :caused_by_id
class_name: "::HttpLogEntry",
foreign_key: :caused_by_id
validates_presence_of(
:uri_scheme,
@@ -48,25 +47,23 @@ class HttpLogEntry < ReduxApplicationRecord
def self.find_by_uri_host_path(uri)
uri = Addressable::URI.parse(uri) unless uri.is_a?(Addressable::URI)
find_by(
uri_host: uri.host,
uri_path: uri.path,
)
find_by(uri_host: uri.host, uri_path: uri.path)
end
def self.build_from_legacy(legacy_model)
response_body = legacy_model.response_body
can_reconstruct_be =
response_body.nil? &&
legacy_model.parent_log_entry_id.nil? &&
legacy_model.resp_body.present? &&
legacy_model.blob_entry.present?
response_body.nil? && legacy_model.parent_log_entry_id.nil? &&
legacy_model.resp_body.present? && legacy_model.blob_entry.present?
if can_reconstruct_be
blob_entry = ::BlobEntryP.find_or_build_from_legacy(legacy_model.blob_entry)
blob_entry =
::BlobEntryP.find_or_build_from_legacy(legacy_model.blob_entry)
blob_sha256 = HexUtil.hex2bin(legacy_model.resp_body)
unless blob_entry.sha256 == blob_sha256
raise("mismatch for legacy http entry #{legacy_model.id} / legacy blob entry #{legacy_model.blob_entry.id}")
raise(
"mismatch for legacy http entry #{legacy_model.id} / legacy blob entry #{legacy_model.blob_entry.id}"
)
end
else
blob_entry = nil
@@ -76,20 +73,24 @@ class HttpLogEntry < ReduxApplicationRecord
uri.scheme ||= "https"
uri.path ||= "/"
::HttpLogEntry.new({
verb: legacy_model.verb,
uri: uri,
content_type: legacy_model.content_type,
status_code: legacy_model.status,
response_time_ms: legacy_model.response_time,
request_headers: ::HttpLogEntryHeader.find_or_build(headers: legacy_model.req_headers),
response_headers: ::HttpLogEntryHeader.find_or_build(headers: legacy_model.res_headers),
response: blob_entry,
requested_at: legacy_model.requested_at,
created_at: legacy_model.created_at,
updated_at: legacy_model.updated_at,
performed_by: "legacy",
})
::HttpLogEntry.new(
{
verb: legacy_model.verb,
uri: uri,
content_type: legacy_model.content_type,
status_code: legacy_model.status,
response_time_ms: legacy_model.response_time,
request_headers:
::HttpLogEntryHeader.find_or_build(headers: legacy_model.req_headers),
response_headers:
::HttpLogEntryHeader.find_or_build(headers: legacy_model.res_headers),
response: blob_entry,
requested_at: legacy_model.requested_at,
created_at: legacy_model.created_at,
updated_at: legacy_model.updated_at,
performed_by: "legacy"
}
)
end
def uri=(uri)
@@ -133,7 +134,7 @@ class HttpLogEntry < ReduxApplicationRecord
response_sha256: self.response_sha256,
requested_at: self.requested_at,
created_at: self.created_at,
updated_at: self.updated_at,
updated_at: self.updated_at
}
end
end
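Usage note for find_by_uri_host_path above: URIs are persisted split into uri_host/uri_path (plus uri_query) columns, so lookups can stay on the composite index instead of parsing full URL strings. An illustrative call (the id is made up):

# Illustrative lookup; 12345 is a made-up id.
uri = Addressable::URI.parse("https://www.furaffinity.net/view/12345/")
entry = HttpLogEntry.find_by_uri_host_path(uri)
# equivalent to: HttpLogEntry.find_by(uri_host: uri.host, uri_path: uri.path)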

View File

@@ -27,10 +27,7 @@ class HttpLogEntryHeader < ReduxApplicationRecord
headers = headers.sort.to_h
sha256 = Digest::SHA256.digest(headers.to_s)
HttpLogEntryHeader.new(
sha256: sha256,
headers: headers,
)
HttpLogEntryHeader.new(sha256: sha256, headers: headers)
end
def self.find_or_build(headers:)
@@ -49,16 +46,11 @@ class HttpLogEntryHeader < ReduxApplicationRecord
end
def self.scrub_datetime_header(hash, key)
if hash[key]
hash[key].gsub!(/\d\d:\d\d:\d\d/, "(scrubbed)")
end
hash[key].gsub!(/\d\d:\d\d:\d\d/, "(scrubbed)") if hash[key]
end
def to_bulk_insert_hash
{
sha256: sha256,
headers: headers,
}
{ sha256: sha256, headers: headers }
end
def self.empty
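The model above deduplicates header sets by normalizing (sorting) the hash and keying rows on the SHA-256 of its string form. The lookup inside find_or_build is not shown in this hunk, so the following find-or-build flow is an assumption sketched from build_headers:

require "digest"

# Assumed flow; the real find_or_build may differ in detail.
def find_or_build_headers(headers)
  normalized = headers.sort.to_h
  sha256 = Digest::SHA256.digest(normalized.to_s)
  HttpLogEntryHeader.find_by(sha256: sha256) ||
    HttpLogEntryHeader.new(sha256: sha256, headers: normalized)
end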

View File

@@ -46,16 +46,19 @@ class Legacy::BlobEntry < LegacyApplicationRecord
unless File.exist?(file_path)
found = false
(2..5).each do |depth|
path = File.join(
Legacy::SConfig.blob_static_dir,
self.class.file_path_at_depth(sha256: sha256, depth: depth)
)
path =
File.join(
Legacy::SConfig.blob_static_dir,
self.class.file_path_at_depth(sha256: sha256, depth: depth)
)
next unless File.exist?(path)
self.dir_depth = depth
save!
found = true
Legacy::SConfig.logger.warn("found fixed path at #{depth} for BE id #{id}")
Legacy::SConfig.logger.warn(
"found fixed path at #{depth} for BE id #{id}"
)
break
end
@@ -69,9 +72,9 @@ class Legacy::BlobEntry < LegacyApplicationRecord
# generate something like sha256[0]/sha256[1]/sha256
raise("invalid sha256: #{sha256}") unless sha256.length == hash_length
parts = (0...depth).map do |idx|
sha256[(idx * stride)...((idx + 1) * stride)]
end + [sha256]
parts =
(0...depth).map { |idx| sha256[(idx * stride)...((idx + 1) * stride)] } +
[sha256]
File.join(*parts)
end
@@ -90,28 +93,31 @@ class Legacy::BlobEntry < LegacyApplicationRecord
def self.create_from_blob(blob:, opts: {})
sha256 = Digest::SHA256.hexdigest blob
write_out = lambda { |be, _contents|
dir = File.dirname be.file_path
FileUtils.mkdir_p dir
f = File.open(be.file_path, "wb")
begin
f.write(blob)
f.fsync
write_out =
lambda do |be, _contents|
dir = File.dirname be.file_path
FileUtils.mkdir_p dir
f = File.open(be.file_path, "wb")
begin
f.write(blob)
f.fsync
unless File.exist?(be.file_path)
raise("error ensuring blob exists for #{be.id}")
unless File.exist?(be.file_path)
raise("error ensuring blob exists for #{be.id}")
end
ensure
f.close
end
ensure
f.close
end
}
be = nil
::Legacy::BlobEntry.transaction do
be = ::Legacy::BlobEntry.find_by(sha256: sha256)
if be && !be.ensure_file_path
# correct directory depth as well
Legacy::SConfig.logger.warn("file doesn't exist for #{be.id}, writing again...")
Legacy::SConfig.logger.warn(
"file doesn't exist for #{be.id}, writing again..."
)
write_out.call(be, blob)
elsif !be
new_be = ::Legacy::BlobEntry.new(opts.merge(sha256: sha256))
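file_path_at_depth above fans blobs out into nested directories built from slices of the digest, so no single directory holds every file. A standalone sketch of the same slicing (the defaults here are illustrative):

# e.g. sharded_path("e3b0c44298...", depth: 2, stride: 1) => "e/3/e3b0c44298..."
def sharded_path(digest_hex, depth: 2, stride: 1)
  parts = (0...depth).map { |idx| digest_hex[(idx * stride)...((idx + 1) * stride)] }
  File.join(*parts, digest_hex)
end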

View File

@@ -31,42 +31,49 @@
class Legacy::E621::Post < LegacyApplicationRecord
self.table_name = "e621_posts"
validates_presence_of :e621_id, :md5, :author, :file_url, :file_ext, :rating, :tags_string, :status, :score
validates_presence_of :e621_id,
:md5,
:author,
:file_url,
:file_ext,
:rating,
:tags_string,
:status,
:score
validates_uniqueness_of :md5, :e621_id
serialize :sources, Array
serialize :artists, Array
belongs_to :blob_entry,
class_name: "Legacy::BlobEntry"
belongs_to :blob_entry, class_name: "Legacy::BlobEntry"
# just inserted into db: :not_processed
# we've checked for the existance of its file on the
# disk and it isn't there: :should_download
# we've made an attempt to download its file: :processed
enum status: %i[not_processed should_download processed processed_404 processed_err]
validates_inclusion_of :status,
in: statuses.keys
enum status: %i[
not_processed
should_download
processed
processed_404
processed_err
]
validates_inclusion_of :status, in: statuses.keys
has_many :taggings, class_name: "Legacy::E621::Tagging"
has_many :tags, through: :taggings
enum rating: %i[s q e]
validates_inclusion_of :rating,
in: ratings.keys
validates_inclusion_of :rating, in: ratings.keys
def file_relative_path
base = File.basename(file_url)
"#{base[0]}/#{base[1]}/#{base}"
end
before_validation do
self.file_ext ||= File.extname(file_path)[1..-1]
end
before_validation { self.file_ext ||= File.extname(file_path)[1..-1] }
before_destroy do
blob_entry.dec_refcount
end
before_destroy { blob_entry.dec_refcount }
def file_path
File.join SConfig.e621_static_dir, file_relative_path
@@ -75,32 +82,43 @@ class Legacy::E621::Post < LegacyApplicationRecord
def resized_file_path(style)
raise("no md5") unless md5
hashed_path = Legacy::BlobEntry.file_path_at_depth(
sha256: md5,
depth: 4, stride: 2, hash_length: 32,
)
File.join SConfig.e621_data_dir, "resized", style.to_s, (hashed_path + "." + file_ext)
hashed_path =
Legacy::BlobEntry.file_path_at_depth(
sha256: md5,
depth: 4,
stride: 2,
hash_length: 32
)
File.join SConfig.e621_data_dir,
"resized",
style.to_s,
(hashed_path + "." + file_ext)
end
FASource = Struct.new(:type, :id, :url)
def fa_sources
self.sources.flatten.map do |source|
if matches = /furaffinity.net\/view\/(\d+)/.match(source)
fa_id = matches[1]
FASource.new(:post, fa_id.to_i, source)
elsif matches = /furaffinity.net\/(gallery|user)\/([^\/]+)/.match(source)
url_name = FA::User.name_to_url_name(matches[2])
FASource.new(:user, url_name, source)
else
nil
self
.sources
.flatten
.map do |source|
if matches = %r{furaffinity.net/view/(\d+)}.match(source)
fa_id = matches[1]
FASource.new(:post, fa_id.to_i, source)
elsif matches = %r{furaffinity.net/(gallery|user)/([^/]+)}.match(source)
url_name = FA::User.name_to_url_name(matches[2])
FASource.new(:user, url_name, source)
else
nil
end
end
end.reject(&:nil?)
.reject(&:nil?)
end
def update_taggings(assume_total_overwrite: false)
tags_string_split = tags_string.split(/\s+/).map(&:strip).reject(&:blank?)
tags_arr = Legacy::E621::Tag.where(value: tags_string_split).select(:id, :value).to_a
tags_arr =
Legacy::E621::Tag.where(value: tags_string_split).select(:id, :value).to_a
missing = Set.new(tags_string_split) - Set.new(tags_arr.map(&:value))
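fa_sources above classifies each source URL as an FA post or an FA user via the two regexes shown. A self-contained sketch of that classification (method name and return shape are illustrative; the real code also normalizes the user name through FA::User.name_to_url_name):

# Returns [:post, id], [:user, name], or nil.
def classify_fa_source(source)
  if (m = %r{furaffinity\.net/view/(\d+)}.match(source))
    [:post, m[1].to_i]
  elsif (m = %r{furaffinity\.net/(?:gallery|user)/([^/]+)}.match(source))
    [:user, m[1]]
  end
end

classify_fa_source("https://www.furaffinity.net/view/12345/") # => [:post, 12345]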

View File

@@ -46,12 +46,12 @@ class Legacy::Fa::Post < LegacyApplicationRecord
serialize :in_folders, Array
enum state: [
:seen_listing, # have seen a reference to this post on a listing page
:scanned_submission, # have scanned the actual submission page
:scan_error, # error scanning the submission page
:have_static, # have the static asset associated with the page
:static_error,
] # error getting the static asset
:seen_listing, # have seen a reference to this post on a listing page
:scanned_submission, # have scanned the actual submission page
:scan_error, # error scanning the submission page
:have_static, # have the static asset associated with the page
:static_error
] # error getting the static asset
validates_inclusion_of :state, in: Legacy::Fa::Post.states.keys
serialize :state_error
@@ -60,10 +60,10 @@ class Legacy::Fa::Post < LegacyApplicationRecord
belongs_to :creator, class_name: "::Legacy::Fa::User"
belongs_to :blob_entry, class_name: "::Legacy::BlobEntry"
belongs_to :submission_page_log_entry,
class_name: "::Legacy::HttpLogEntry"
belongs_to :submission_page_log_entry, class_name: "::Legacy::HttpLogEntry"
belongs_to :description_ref, nil,
belongs_to :description_ref,
nil,
class_name: "::Legacy::Fa::PostDescription",
foreign_key: :description_id,
inverse_of: :fa_post
@@ -86,7 +86,12 @@ class Legacy::Fa::Post < LegacyApplicationRecord
name = ensure_creator_name_filename
file = self.class.file_name_filter(File.basename(file_url))
File.join(name, file).encode(Encoding.find("UTF-8"), invalid: :replace, undef: :replace, replace: "")
File.join(name, file).encode(
Encoding.find("UTF-8"),
invalid: :replace,
undef: :replace,
replace: ""
)
end
def file_path
@@ -102,11 +107,17 @@ class Legacy::Fa::Post < LegacyApplicationRecord
def resized_file_path(style)
raise("no fa_id") unless fa_id
hashed_path = Legacy::BlobEntry.file_path_at_depth(
sha256: Digest::SHA256.hexdigest(fa_id.to_s),
depth: 4, stride: 2,
)
File.join Legacy::SConfig.fa_data_dir, "static", "resized", style.to_s, (hashed_path + file_ext)
hashed_path =
Legacy::BlobEntry.file_path_at_depth(
sha256: Digest::SHA256.hexdigest(fa_id.to_s),
depth: 4,
stride: 2
)
File.join Legacy::SConfig.fa_data_dir,
"static",
"resized",
style.to_s,
(hashed_path + file_ext)
end
private

View File

@@ -13,7 +13,5 @@
class Legacy::Fa::PostDescription < LegacyApplicationRecord
self.table_name = "fa_post_descriptions"
has_one :fa_post,
class_name: "Legacy::Fa::Post",
foreign_key: :description_id
has_one :fa_post, class_name: "Legacy::Fa::Post", foreign_key: :description_id
end

View File

@@ -29,22 +29,17 @@
class Legacy::Fa::User < LegacyApplicationRecord
self.table_name = "fa_users"
validates :name,
uniqueness: true,
presence: true
validates :name, uniqueness: true, presence: true
validates :url_name,
uniqueness: true,
presence: true
validates :url_name, uniqueness: true, presence: true
has_many :posts, class_name: "Legacy::Fa::Post",
foreign_key: :creator_name,
primary_key: :name
has_many :posts,
class_name: "Legacy::Fa::Post",
foreign_key: :creator_name,
primary_key: :name
# if present, this user is being watched
has_one :watched_user,
foreign_key: :user_id,
inverse_of: :user
has_one :watched_user, foreign_key: :user_id, inverse_of: :user
belongs_to :user_page_log_entry, class_name: "Legacy::Cache::HttpLogEntry"

View File

@@ -74,20 +74,20 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
# native: use the native LogStore server to store the entry
enum diff_type: %i[text binary native]
validates_inclusion_of :diff_type, in: Legacy::HttpLogEntry.diff_types.keys
after_initialize do
self.diff_type = "native" if new_record?
end
after_initialize { self.diff_type = "native" if new_record? }
# out of line req/response headers
belongs_to :req_headers_ref,
foreign_key: :req_headers_id, class_name: "Legacy::HttpLogEntryHeader"
foreign_key: :req_headers_id,
class_name: "Legacy::HttpLogEntryHeader"
belongs_to :res_headers_ref,
foreign_key: :res_headers_id, class_name: "Legacy::HttpLogEntryHeader"
foreign_key: :res_headers_id,
class_name: "Legacy::HttpLogEntryHeader"
belongs_to :native_blob_entry,
foreign_key: :native_blob_entry_sha256,
primary_key: :key,
class_name: "::LogStoreSstEntry"
foreign_key: :native_blob_entry_sha256,
primary_key: :key,
class_name: "::LogStoreSstEntry"
attr_accessor :can_force_update
before_update do
@@ -137,25 +137,15 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
def req_headers
ref = req_headers_ref
if ref
ref.headers
else
{}
end
ref ? ref.headers : {}
end
def res_headers
ref = res_headers_ref
if ref
ref.headers
else
{}
end
ref ? ref.headers : {}
end
before_create do
self.requested_at ||= DateTime.now
end
before_create { self.requested_at ||= DateTime.now }
def body_stored?
!!(blob_entry_id || parent_log_entry_id || resp_body)
@@ -177,20 +167,31 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
raise("legacy can't write")
# try and find a good HttpLogEntry to diff this against
candidate_keys = if !opts[:skip_find_candidates]
Legacy::HttpLogEntry.where(
host: host,
path: path,
diff_type: self.class.diff_types["native"],
).limit(5).to_a.map(&:resp_body).reject(&:nil?).reject(&:empty?)
candidate_keys =
if !opts[:skip_find_candidates]
Legacy::HttpLogEntry
.where(
host: host,
path: path,
diff_type: self.class.diff_types["native"]
)
.limit(5)
.to_a
.map(&:resp_body)
.reject(&:nil?)
.reject(&:empty?)
else
[]
end
SConfig.with_log_store_client do |lsc|
ret = lsc.put_binary(
LogStore::PutBinaryArgs.new(hint_hashes: candidate_keys, contents: body_string)
)
ret =
lsc.put_binary(
LogStore::PutBinaryArgs.new(
hint_hashes: candidate_keys,
contents: body_string
)
)
self.resp_body = ret.key
end
@@ -202,23 +203,24 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
return set_response_body_native(body_string, opts) if diff_type == "native"
# try and find a good HttpLogEntry to diff this against
candidate_entries = Legacy::HttpLogEntry.where(
host: host,
path: path,
parent_log_entry_id: nil,
).limit(3).to_a
candidate_entries =
Legacy::HttpLogEntry
.where(host: host, path: path, parent_log_entry_id: nil)
.limit(3)
.to_a
# add or remove trailing slash to each of the paths
hint_paths = opts[:similar_content_path_hints] || []
hint_paths += hint_paths.map do |p|
if p == "/"
p
elsif p[-1] == "/"
p[0..-2]
else
p + "/"
hint_paths +=
hint_paths.map do |p|
if p == "/"
p
elsif p[-1] == "/"
p[0..-2]
else
p + "/"
end
end
end
body_string = body_string.force_encoding("UTF-8")
@@ -228,15 +230,17 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
if body_string.valid_encoding?
if hint_paths.any?
candidate_entries += Legacy::HttpLogEntry.where(
host: host,
path: hint_paths,
parent_log_entry_id: nil,
).limit(50).to_a
candidate_entries +=
Legacy::HttpLogEntry
.where(host: host, path: hint_paths, parent_log_entry_id: nil)
.limit(50)
.to_a
end
SConfig.logger.info("Comparing against #{candidate_entries.length} " \
"candidates: #{candidate_entries.map(&:path).join(", ")}")
SConfig.logger.info(
"Comparing against #{candidate_entries.length} " \
"candidates: #{candidate_entries.map(&:path).join(", ")}"
)
candidate_entries.each do |ce|
SConfig.logger.info "Comparing diff against HLE (#{ce.id}: #{ce.path})"
ce_body = ce.response_body
@@ -246,14 +250,19 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
end
ce_diff = self.class.get_diff(ce_body, body_string, diff_type)
if (diff_type == "text") && (/^Binary files .+ and .+ differ/ =~ ce_diff)
SConfig.logger.warn("diff detected HLE #{ce.id} was a binary, skipping...")
if (diff_type == "text") &&
(/^Binary files .+ and .+ differ/ =~ ce_diff)
SConfig.logger.warn(
"diff detected HLE #{ce.id} was a binary, skipping..."
)
next
end
# verify we can reconstruct the original body string
if self.class.apply_patch(ce_body, ce_diff, diff_type) != body_string
SConfig.logger.error("couldn't succesfully apply patch to get orig...")
SConfig.logger.error(
"couldn't succesfully apply patch to get orig..."
)
next
end
@@ -273,13 +282,13 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
if use_string.length < ce_use_string.length
SConfig.logger.info(
"Previous config was still smaller (#{use_string.length} vs" \
" #{ce_use_string.length} bytes)"
" #{ce_use_string.length} bytes)"
)
next
else
SConfig.logger.info(
"HLE (#{ce.id}) is good candidate: #{ce_use_string.length} bytes " \
"(gz: #{ce_gzipped})"
"(gz: #{ce_gzipped})"
)
end
@@ -298,7 +307,13 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
self.resp_body = use_string
SConfig.logger.info "Storing data interally"
else
self.blob_entry = Legacy::BlobEntry.create_from_blob(blob: use_string, opts: { dir_depth: 4 })
self.blob_entry =
Legacy::BlobEntry.create_from_blob(
blob: use_string,
opts: {
dir_depth: 4
}
)
blob_entry.inc_refcount
SConfig.logger.info "Storing data in blob entry #{blob_entry.id}..."
end
@@ -311,7 +326,7 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
total_bytes = body_string.length
SConfig.logger.info(
"Stored #{stored_bytes}/#{total_bytes} bytes" \
" (#{(stored_bytes.to_f / total_bytes.to_f * 100.0).round(1)}\% of original)"
" (#{(stored_bytes.to_f / total_bytes.to_f * 100.0).round(1)}\% of original)"
)
response_body
@@ -325,14 +340,17 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
# -> string
def response_body
@response_body ||= begin
@response_body ||=
begin
return response_body_native if diff_type == "native"
our_string = if blob_entry
our_string =
if blob_entry
path = blob_entry.ensure_file_path
unless path
raise NoBEPathException, "no path for blob entry " \
"#{blob_entry_id} (HLE id: #{id}) (#{blob_entry.file_path})"
raise NoBEPathException,
"no path for blob entry " \
"#{blob_entry_id} (HLE id: #{id}) (#{blob_entry.file_path})"
end
File.read(path)
else
@@ -346,7 +364,11 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
# our_string = our_string.force_encoding("UTF-8")
if parent_log_entry
self.class.apply_patch(parent_log_entry.response_body, our_string, diff_type)
self.class.apply_patch(
parent_log_entry.response_body,
our_string,
diff_type
)
else
our_string
end
@@ -360,12 +382,17 @@ class Legacy::HttpLogEntry < LegacyApplicationRecord
# new:
@response_body_native ||=
self.native_blob_entry&.patched_value ||
::FlatSstEntry.find_by_hex_key(resp_body)&.contents ||
::LogStoreSstEntry.find_by_hex_key(resp_body)&.patched_value
::FlatSstEntry.find_by_hex_key(resp_body)&.contents ||
::LogStoreSstEntry.find_by_hex_key(resp_body)&.patched_value
end
def self.encode_str(str)
str.encode(Encoding.find("UTF-8"), invalid: :replace, undef: :replace, replace: "")
str.encode(
Encoding.find("UTF-8"),
invalid: :replace,
undef: :replace,
replace: ""
)
end
def self.gunzip(data)

View File

@@ -77,21 +77,28 @@ class Legacy::SConfig
end
def self.logger
@@logger ||= begin
@@logger ||=
begin
l = Logger.new(STDOUT)
l.level = Logger::INFO
l.datetime_format = "%Y-%m-%d %H:%M:%S"
l.formatter = proc do |sev, datetime, _prog, msg|
color = case sev
when "INFO" then :blue
when "ERROR" then :red
when "DEBUG" then :yellow
else :white
end
l.formatter =
proc do |sev, datetime, _prog, msg|
color =
case sev
when "INFO"
:blue
when "ERROR"
:red
when "DEBUG"
:yellow
else
:white
end
date_format = datetime.strftime("%Y-%m-%d %H:%M:%S")
"[#{date_format}] #{sev.ljust(5).send(color)}: #{msg}\n"
end
date_format = datetime.strftime("%Y-%m-%d %H:%M:%S")
"[#{date_format}] #{sev.ljust(5).send(color)}: #{msg}\n"
end
l
end
end
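The formatter rewrite above keeps the same shape: a proc receiving (severity, datetime, progname, msg) that returns the finished log line. A minimal runnable version with the color calls dropped, so it does not need the String color monkey patch:

require "logger"

logger = Logger.new($stdout)
logger.level = Logger::INFO
logger.formatter =
  proc do |severity, datetime, _progname, msg|
    stamp = datetime.strftime("%Y-%m-%d %H:%M:%S")
    "[#{stamp}] #{severity.ljust(5)}: #{msg}\n"
  end

logger.info("hello") # e.g. "[2024-01-01 00:00:00] INFO : hello"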

View File

@@ -6,16 +6,20 @@ class LiteTrail::AbstractVersion < ReduxApplicationRecord
versions_arr = item.versions
self_idx = versions_arr.find_index(self)
if self_idx == nil
raise("item.versions (#{item.item_type}/#{item.item_id}) does not contain self: #{self.id}")
raise(
"item.versions (#{item.item_type}/#{item.item_id}) does not contain self: #{self.id}"
)
end
model = self.item.dup
# unapply versions in reverse order
(versions_arr.length - 1).downto(self_idx).each do |idx|
version = versions_arr[idx]
version._unapply(model)
end
(versions_arr.length - 1)
.downto(self_idx)
.each do |idx|
version = versions_arr[idx]
version._unapply(model)
end
model
end
@@ -31,14 +35,20 @@ class LiteTrail::AbstractVersion < ReduxApplicationRecord
attr_name_sym = attr_name.to_sym
if mapper_config[attr_name_sym]
attr_before = mapper_config[attr_name_sym].map_from(attr_before) if attr_before
attr_after = mapper_config[attr_name_sym].map_from(attr_after) if attr_after
attr_before =
mapper_config[attr_name_sym].map_from(attr_before) if attr_before
attr_after =
mapper_config[attr_name_sym].map_from(attr_after) if attr_after
end
# sanity check - but ignore updated_at due to rounding issues
if model.send(attr_name.to_sym) != attr_after
raise("expected #{attr_name} to be #{attr_after}, was #{item_attributes[attr_name]}")
end if attr_name_sym != :updated_at
if attr_name_sym != :updated_at
if model.send(attr_name.to_sym) != attr_after
raise(
"expected #{attr_name} to be #{attr_after}, was #{item_attributes[attr_name]}"
)
end
end
model.send(:"#{attr_name}=", attr_before)
end

View File

@@ -11,10 +11,10 @@ class LogStoreSstEntry < ReduxApplicationRecord
# contents - bytea
belongs_to :base,
foreign_key: :base_key,
primary_key: :key,
class_name: "::LogStoreSstEntry",
optional: true
foreign_key: :base_key,
primary_key: :key,
class_name: "::LogStoreSstEntry",
optional: true
def hex_key
self.class.bin2hex(self.key)
@@ -24,13 +24,16 @@ class LogStoreSstEntry < ReduxApplicationRecord
if contents
contents_digest = Digest::SHA256.digest(contents)
if contents_digest != key
raise RuntimeError("digest mismatch: #{HexUtil.bin2hex(contents_digest)} != #{hex_key}")
raise RuntimeError(
"digest mismatch: #{HexUtil.bin2hex(contents_digest)} != #{hex_key}"
)
end
return contents
end
@patched_value ||= begin
@patched_value ||=
begin
if base
# format is:
# 0..4 - version
@@ -42,7 +45,9 @@ class LogStoreSstEntry < ReduxApplicationRecord
parent_value = base.patched_value
patched_value = XDiff.patch(parent_value, patch_value)
if patched_value.length != value_length
raise RuntimeError.new("length mismatch: #{patched_value.length} != #{value_length}")
raise RuntimeError.new(
"length mismatch: #{patched_value.length} != #{value_length}"
)
end
else
# format is:
@@ -54,7 +59,9 @@ class LogStoreSstEntry < ReduxApplicationRecord
value_digest = Digest::SHA256.digest(patched_value)
if value_digest != key
raise RuntimeError("digest mismatch: #{HexUtil.bin2hex(value_digest)} != #{hex_key}")
raise RuntimeError(
"digest mismatch: #{HexUtil.bin2hex(value_digest)} != #{hex_key}"
)
end
patched_value
@@ -82,11 +89,7 @@ class LogStoreSstEntry < ReduxApplicationRecord
end
def parent_key
if has_parent?
self.value[12...44]
else
nil
end
has_parent? ? self.value[12...44] : nil
end
def self.find_by_hex_key(hex_key)
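The integrity model in patched_value above: an entry's key is the SHA-256 of the fully reconstructed value, so both plain and patch-chained entries are verified after reconstruction. A small sketch of that final check (the helper name is illustrative):

require "digest"

# Raise if a reconstructed value does not hash back to the entry's key.
def verify_reconstructed!(key, value)
  digest = Digest::SHA256.digest(value)
  if digest != key
    raise "digest mismatch: #{digest.unpack1("H*")} != #{key.unpack1("H*")}"
  end
  value
end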

View File

@@ -27,7 +27,7 @@ Rails.application.configure do
config.cache_store = :memory_store
config.public_file_server.headers = {
"Cache-Control" => "public, max-age=#{2.days.to_i}",
"Cache-Control" => "public, max-age=#{2.days.to_i}"
}
else
config.action_controller.perform_caching = false

View File

@@ -91,11 +91,11 @@ Rails.application.configure do
end
config.log_tags = {
ip: ->request {
ip: ->(request) do
request.headers["HTTP_CF_CONNECTING_IP"] || request.remote_ip
},
api_token: ->request { request.params[:api_token] || "(nil api token)" },
user_name: ->request {
end,
api_token: ->(request) { request.params[:api_token] || "(nil api token)" },
user_name: ->(request) do
api_token = request.params[:api_token]
if api_token
user = ApplicationController::API_TOKENS[api_token]
@@ -103,7 +103,7 @@ Rails.application.configure do
else
"(nil api_token)"
end
},
end
}
# Do not dump schema after migrations.

View File

@@ -28,7 +28,7 @@ Rails.application.configure do
config.cache_store = :memory_store
config.public_file_server.headers = {
"Cache-Control" => "public, max-age=#{2.days.to_i}",
"Cache-Control" => "public, max-age=#{2.days.to_i}"
}
else
config.action_controller.perform_caching = false

View File

@@ -19,7 +19,7 @@ Rails.application.configure do
# Configure public file server for tests with Cache-Control for performance.
config.public_file_server.enabled = true
config.public_file_server.headers = {
"Cache-Control" => "public, max-age=#{1.hour.to_i}",
"Cache-Control" => "public, max-age=#{1.hour.to_i}"
}
# Show full error reports and disable caching.

View File

@@ -1,9 +1,11 @@
if Rails.env.staging?
Rails.application.config.middleware.insert_before 0, Rack::Cors, debug: true do
Rails.application.config.middleware.insert_before 0,
Rack::Cors,
debug: true do
allow do
origins "localhost:3000"
resource "/api/fa/search_users", headers: :any, methods: [:get, :options]
resource "/api/fa/load_users", headers: :any, methods: [:get, :options]
resource "/api/fa/search_users", headers: :any, methods: %i[get options]
resource "/api/fa/load_users", headers: :any, methods: %i[get options]
end
end
end

View File

@@ -3,6 +3,14 @@
# Configure parameters to be filtered from the log file. Use this to limit dissemination of
# sensitive information. See the ActiveSupport::ParameterFilter documentation for supported
# notations and behaviors.
Rails.application.config.filter_parameters += [
:passw, :secret, :token, :_key, :crypt, :salt, :certificate, :otp, :ssn,
Rails.application.config.filter_parameters += %i[
passw
secret
token
_key
crypt
salt
certificate
otp
ssn
]

View File

@@ -3,5 +3,7 @@ require_relative Rails.root.join("app/lib/lite_trail/migration_extensions")
ActiveRecord::Migration.send(:include, ::LiteTrail::MigrationExtensions)
require_relative Rails.root.join("app/models/redux_application_record")
require_relative Rails.root.join("app/lib/lite_trail/active_record_class_methods")
require_relative Rails.root.join(
"app/lib/lite_trail/active_record_class_methods"
)
ReduxApplicationRecord.send(:extend, ::LiteTrail::ActiveRecordClassMethods)

View File

@@ -1,5 +1,5 @@
if Rails.env.development? || Rails.env.staging?
Rack::MiniProfiler.config.position = "top-right"
Rack::MiniProfiler.config.disable_caching = false
Rack::MiniProfiler.config.skip_paths = [/\/blobs\/.+\/contents.jpg$/]
Rack::MiniProfiler.config.skip_paths = [%r{/blobs/.+/contents.jpg$}]
end

View File

@@ -97,7 +97,6 @@
# "Referrer-Policy" => "strict-origin-when-cross-origin"
# }
# ** Please read carefully, this must be configured in config/application.rb **
# Change the format of the cache entry.
# Changing this default means that all new cache entries added to the cache
@@ -107,9 +106,8 @@
# When you're ready to change format, add this to `config/application.rb` (NOT this file):
# config.active_support.cache_format_version = 7.0
# Cookie serializer: 2 options
#
#
# If you're upgrading and haven't set `cookies_serializer` previously, your cookie serializer
# is `:marshal`. The default for new apps is `:json`.
#
@@ -117,10 +115,10 @@
#
#
# To migrate an existing application to the `:json` serializer, use the `:hybrid` option.
#
#
# Rails transparently deserializes existing (Marshal-serialized) cookies on read and
# re-writes them in the JSON format.
#
#
# It is fine to use `:hybrid` long term; you should do that until you're confident *all* your cookies
# have been converted to JSON. To keep using `:hybrid` long term, move this config to its own
# initializer or to `config/application.rb`.
@@ -131,5 +129,5 @@
# If your cookies can't yet be serialized to JSON, keep using `:marshal` for backward-compatibility.
#
# If you have configured the serializer elsewhere, you can remove this section of the file.
#
#
# See https://guides.rubyonrails.org/action_controller_overview.html#cookies for more information.

View File

@@ -1,11 +1,11 @@
Rails.application.routes.draw do
root to: 'pages#root'
root to: "pages#root"
# Define your application routes per the DSL in https://guides.rubyonrails.org/routing.html
namespace :api do
namespace :fa do
get :similar_users, to: '/domain/fa/api#similar_users'
get :search_user_names, to: '/domain/fa/api#search_user_names'
get :similar_users, to: "/domain/fa/api#similar_users"
get :search_user_names, to: "/domain/fa/api#search_user_names"
end
end
@@ -20,30 +20,30 @@ Rails.application.routes.draw do
get :contents, on: :member
end
get 'us/:script', to: 'user_scripts#get', constraints: { script: /.*/ }
get "us/:script", to: "user_scripts#get", constraints: { script: /.*/ }
scope constraints: VpnOnlyRouteConstraint.new do
mount PgHero::Engine => 'pghero'
mount GoodJob::Engine => 'jobs'
mount PgHero::Engine => "pghero"
mount GoodJob::Engine => "jobs"
namespace :api do
get 'search/user/:prefix', to: 'search#user'
get "search/user/:prefix", to: "search#user"
namespace :fa do
post :enqueue_objects, to: '/domain/fa/api#enqueue_objects'
post :object_statuses, to: '/domain/fa/api#object_statuses'
post :enqueue_objects, to: "/domain/fa/api#enqueue_objects"
post :object_statuses, to: "/domain/fa/api#object_statuses"
get :export_user, to: '/domain/fa/export#user'
get :export_user, to: "/domain/fa/export#user"
end
namespace :twitter do
post :enqueue_objects, to: '/domain/twitter/api#enqueue_objects'
post :object_statuses, to: '/domain/twitter/api#object_statuses'
post :enqueue_objects, to: "/domain/twitter/api#enqueue_objects"
post :object_statuses, to: "/domain/twitter/api#object_statuses"
end
end
namespace :domain do
namespace :fa do
resources :users, param: :url_name, only: [] do
resources :posts, controller: '/domain/fa/posts'
resources :posts, controller: "/domain/fa/posts"
end
resources :posts, param: :fa_id, only: [:inde] do
post :scan_post, on: :member
@@ -56,7 +56,12 @@ Rails.application.routes.draw do
resources :log_entries, only: %i[index show] do
get :stats, on: :collection
get 'filter/*filter', on: :collection, action: :index, constraints: { filter: /.*/ }
get "filter/*filter",
on: :collection,
action: :index,
constraints: {
filter: /.*/
}
end
end
end

View File

@@ -50,19 +50,28 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.integer "res_headers_id"
t.integer "diff_type", default: 0
t.binary "native_blob_entry_sha256"
t.index ["imported_from_file"], name: "index_cache_http_log_entries_on_imported_from_file"
t.index ["path", "host"], name: "index_cache_http_log_entries_on_path_and_host"
t.index ["path"], name: "cache_http_log_entries_path_idx", opclass: :gist_trgm_ops, using: :gist
t.index ["imported_from_file"],
name: "index_cache_http_log_entries_on_imported_from_file"
t.index %w[path host], name: "index_cache_http_log_entries_on_path_and_host"
t.index ["path"],
name: "cache_http_log_entries_path_idx",
opclass: :gist_trgm_ops,
using: :gist
t.index ["path"], name: "index_pattern_ops_on_hle_entry_path"
t.index ["requested_at"], name: "index_cache_http_log_entries_on_requested_at"
t.index ["requested_at"],
name: "index_cache_http_log_entries_on_requested_at"
end
create_table "cache_http_log_entry_headers", id: :serial, force: :cascade do |t|
create_table "cache_http_log_entry_headers",
id: :serial,
force: :cascade do |t|
t.hstore "headers", null: false
t.binary "sha256", null: false
t.datetime "created_at"
t.datetime "updated_at"
t.index ["sha256"], name: "index_cache_http_log_entry_headers_on_sha256", unique: true
t.index ["sha256"],
name: "index_cache_http_log_entry_headers_on_sha256",
unique: true
end
create_table "e621_posts", id: :serial, force: :cascade do |t|
@@ -89,7 +98,8 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.integer "imgsearch_entry_id"
t.index ["blob_entry_id"], name: "index_e621_posts_on_blob_entry_id"
t.index ["e621_id"], name: "index_e621_posts_on_e621_id", unique: true
t.index ["imgsearch_entry_id"], name: "index_e621_posts_on_imgsearch_entry_id"
t.index ["imgsearch_entry_id"],
name: "index_e621_posts_on_imgsearch_entry_id"
t.index ["md5"], name: "index_e621_posts_on_md5"
end
@@ -238,7 +248,10 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.integer "object_type", limit: 2, null: false
t.integer "object_id", null: false
t.bigint "fingerprints", array: true
t.index ["object_type", "object_id"], name: "index_lite_media_file_fingerprints_on_object_type_and_object_id", unique: true
t.index %w[object_type object_id],
name:
"index_lite_media_file_fingerprints_on_object_type_and_object_id",
unique: true
end
create_table "lite_s3_objects", force: :cascade do |t|
@@ -258,7 +271,8 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.float "total_time"
t.bigint "calls"
t.datetime "captured_at"
t.index ["database", "captured_at"], name: "index_pghero_query_stats_on_database_and_captured_at"
t.index %w[database captured_at],
name: "index_pghero_query_stats_on_database_and_captured_at"
end
create_table "pghero_space_stats", id: :serial, force: :cascade do |t|
@@ -267,7 +281,8 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.text "relation"
t.bigint "size"
t.datetime "captured_at"
t.index ["database", "captured_at"], name: "index_pghero_space_stats_on_database_and_captured_at"
t.index %w[database captured_at],
name: "index_pghero_space_stats_on_database_and_captured_at"
end
create_table "watch_jobs", id: :serial, force: :cascade do |t|
@@ -304,7 +319,9 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.integer "status"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["enum_type", "weasyl_id"], name: "index_weasyl_attempted_posts_on_enum_type_and_weasyl_id", unique: true
t.index %w[enum_type weasyl_id],
name: "index_weasyl_attempted_posts_on_enum_type_and_weasyl_id",
unique: true
end
create_table "weasyl_descriptions", id: :serial, force: :cascade do |t|
@@ -316,13 +333,18 @@ ActiveRecord::Schema[7.0].define(version: 0) do
create_table "weasyl_joins_user_follows", id: :serial, force: :cascade do |t|
t.integer "follower_id", null: false
t.integer "followed_id", null: false
t.index ["follower_id", "followed_id"], name: "index_weasyl_joins_user_follows_on_follower_id_and_followed_id", unique: true
t.index %w[follower_id followed_id],
name:
"index_weasyl_joins_user_follows_on_follower_id_and_followed_id",
unique: true
end
create_table "weasyl_joins_user_friends", id: :serial, force: :cascade do |t|
t.integer "a_id", null: false
t.integer "b_id", null: false
t.index ["a_id", "b_id"], name: "index_weasyl_joins_user_friends_on_a_id_and_b_id", unique: true
t.index %w[a_id b_id],
name: "index_weasyl_joins_user_friends_on_a_id_and_b_id",
unique: true
end
create_table "weasyl_medias", id: :serial, force: :cascade do |t|
@@ -355,7 +377,9 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.datetime "full_scanned_at"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["enum_type", "weasyl_id"], name: "index_weasyl_posts_on_enum_type_and_weasyl_id", unique: true
t.index %w[enum_type weasyl_id],
name: "index_weasyl_posts_on_enum_type_and_weasyl_id",
unique: true
end
create_table "weasyl_users", id: :serial, force: :cascade do |t|
@@ -369,7 +393,9 @@ ActiveRecord::Schema[7.0].define(version: 0) do
t.integer "userid"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["login_name"], name: "index_weasyl_users_on_login_name", unique: true
t.index ["login_name"],
name: "index_weasyl_users_on_login_name",
unique: true
end
create_table "xtwitter_tweets", id: :serial, force: :cascade do |t|

View File

@@ -7,7 +7,9 @@ class AddServiceNameToActiveStorageBlobs < ActiveRecord::Migration[6.0]
add_column :active_storage_blobs, :service_name, :string
if configured_service = ActiveStorage::Blob.service.name
ActiveStorage::Blob.unscoped.update_all(service_name: configured_service)
ActiveStorage::Blob.unscoped.update_all(
service_name: configured_service
)
end
change_column :active_storage_blobs, :service_name, :string, null: false

View File

@@ -4,11 +4,18 @@ class CreateActiveStorageVariantRecords < ActiveRecord::Migration[6.0]
return unless table_exists?(:active_storage_blobs)
# Use Active Record's configured type for primary key
create_table :active_storage_variant_records, id: primary_key_type, if_not_exists: true do |t|
t.belongs_to :blob, null: false, index: false, type: blobs_primary_key_type
create_table :active_storage_variant_records,
id: primary_key_type,
if_not_exists: true do |t|
t.belongs_to :blob,
null: false,
index: false,
type: blobs_primary_key_type
t.string :variation_digest, null: false
t.index %i[blob_id variation_digest], name: 'index_active_storage_variant_records_uniqueness', unique: true
t.index %i[blob_id variation_digest],
name: "index_active_storage_variant_records_uniqueness",
unique: true
t.foreign_key :active_storage_blobs, column: :blob_id
end
end
@@ -22,7 +29,8 @@ class CreateActiveStorageVariantRecords < ActiveRecord::Migration[6.0]
def blobs_primary_key_type
pkey_name = connection.primary_key(:active_storage_blobs)
pkey_column = connection.columns(:active_storage_blobs).find { |c| c.name == pkey_name }
pkey_column =
connection.columns(:active_storage_blobs).find { |c| c.name == pkey_name }
pkey_column.bigint? ? :bigint : pkey_column.type
end
end

View File

@@ -24,7 +24,8 @@ class CreateHttpLogEntries < ActiveRecord::Migration[7.0]
t.datetime :requested_at, null: false
t.timestamps null: false
t.index ["uri_host", "uri_path", "uri_query"], name: "index_http_log_entries_on_uri_host_path_query"
t.index %w[uri_host uri_path uri_query],
name: "index_http_log_entries_on_uri_host_path_query"
end
create_table :http_log_entry_headers do |t|

View File

@@ -1,8 +1,24 @@
class AddForeignKeyConstraints < ActiveRecord::Migration[7.0]
def change
add_foreign_key :http_log_entries, :blob_entries, column: :response_sha256, primary_key: :sha256, validate: true
add_foreign_key :http_log_entries, :http_log_entry_headers, column: :request_headers_id, primary_key: :id, validate: true
add_foreign_key :http_log_entries, :http_log_entry_headers, column: :response_headers_id, primary_key: :id, validate: true
add_foreign_key :blob_entries, :blob_entries, column: :base_sha256, primary_key: :sha256, validate: true
add_foreign_key :http_log_entries,
:blob_entries,
column: :response_sha256,
primary_key: :sha256,
validate: true
add_foreign_key :http_log_entries,
:http_log_entry_headers,
column: :request_headers_id,
primary_key: :id,
validate: true
add_foreign_key :http_log_entries,
:http_log_entry_headers,
column: :response_headers_id,
primary_key: :id,
validate: true
add_foreign_key :blob_entries,
:blob_entries,
column: :base_sha256,
primary_key: :sha256,
validate: true
end
end

Some files were not shown because too many files have changed in this diff.