moduleify color logger, use rspec

2023-03-06 09:58:32 -08:00
parent ea261b228e
commit 406080cef6
39 changed files with 803 additions and 273 deletions

.rspec (new file, +1 line)
View File

@@ -0,0 +1 @@
--require spec_helper

View File

@@ -80,7 +80,7 @@ group :test do
gem "capybara"
gem "selenium-webdriver"
gem "webdrivers"
# gem "minitest-rspec_mocks"
gem "rspec-rails"
end
gem "xdiff", path: "../xdiff-rb"
@@ -104,6 +104,7 @@ gem "colorize"
gem "daemons"
gem "delayed_job_worker_pool"
gem "ripcord"
gem "influxdb-client"
# gem 'cli-ui'
# gem "paper_trail"
# gem "paper_trail-hashdiff"

View File

@@ -121,6 +121,7 @@ GEM
sinatra (>= 1.4.4)
delayed_job_worker_pool (1.0.0)
delayed_job (>= 3.0, < 4.2)
diff-lcs (1.5.0)
diffy (3.4.2)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
@@ -135,6 +136,7 @@ GEM
importmap-rails (1.1.5)
actionpack (>= 6.0.0)
railties (>= 6.0.0)
influxdb-client (2.9.0)
io-console (0.6.0)
irb (1.6.2)
reline (>= 0.3.0)
@@ -239,6 +241,23 @@ GEM
io-console (~> 0.5)
rexml (3.2.5)
ripcord (2.0.0)
rspec-core (3.12.1)
rspec-support (~> 3.12.0)
rspec-expectations (3.12.2)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.12.0)
rspec-mocks (3.12.3)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.12.0)
rspec-rails (6.0.1)
actionpack (>= 6.1)
activesupport (>= 6.1)
railties (>= 6.1)
rspec-core (~> 3.11)
rspec-expectations (~> 3.11)
rspec-mocks (~> 3.11)
rspec-support (~> 3.11)
rspec-support (3.12.0)
ruby-prof (1.4.5)
ruby-prof-speedscope (0.3.0)
ruby-prof (~> 1.0)
@@ -314,6 +333,7 @@ DEPENDENCIES
diffy
http-cookie
importmap-rails
influxdb-client
jbuilder
kaminari
listen
@@ -326,6 +346,7 @@ DEPENDENCIES
rails (~> 7.0.4, >= 7.0.4.2)
rb-bsdiff!
ripcord
rspec-rails
ruby-prof
ruby-prof-speedscope
selenium-webdriver

View File

@@ -9,7 +9,7 @@ $LOAD_PATH << Rails.root.join("rake")
Rake.application.rake_require "sst"
Rake.application.rake_require "log_entry"
Rake.application.rake_require "worker"
Rake.application.rake_require "metrics"
Rake.application.rake_require "fa"
Rake.application.rake_require "e621"

View File

@@ -11,6 +11,11 @@ class Domain::Fa::Job::FaJobBase < Scraper::JobBase
end
end
def self.reset_http_client
@@fa_base_http_client.close!
@@fa_base_http_client = nil
end
protected
def find_or_intitialize_user_from_args(args)

View File

@@ -18,7 +18,7 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::FaJobBase
return
end
logger_prefix! "[fa_id #{@post.fa_id.to_s.bold} / #{@post.state.bold}]"
logger.prefix = "[fa_id #{@post.fa_id.to_s.bold} / #{@post.state.bold}]"
if @post.state == "removed" && @post.file_uri.nil?
logger.error "removed and has no file, skipping"
@@ -36,12 +36,20 @@ class Domain::Fa::Job::ScanFileJob < Domain::Fa::Job::FaJobBase
return
end
if @post.file_uri.host == "d9.facdn.net"
logger.warn("host is d9.facdn.net, which will not resolve")
@post.state = :file_error
@post.state_detail = { file_error: "d9.facdn.net is unresolvable" }
@post.save!
return
file_uri_host = @post.file_uri&.host
if file_uri_host
is_unresolvable_host = false
is_unresolvable_host ||= file_uri_host == "d9.facdn.net"
uri_tld = file_uri_host.split(".").last
is_unresolvable_host ||= uri_tld.length >= 6 && file_uri_host.start_with?("d.facdn.net")
if is_unresolvable_host
logger.error("host is #{file_uri_host}, which will not resolve")
@post.state = :file_error
@post.state_detail = { file_error: "#{file_uri_host} is unresolvable" }
@post.save!
return
end
end
if @post.state == "file_error" && !@force_scan

View File

@@ -8,7 +8,7 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::FaJobBase
end
@caused_by_entry = args[:caused_by_entry]
@force_scan = !!args[:force_scan]
logger_prefix! "[fa_id #{@post.fa_id.to_s.bold} / #{@post.state.bold}]"
logger.prefix = "[fa_id #{@post.fa_id.to_s.bold} / #{@post.state.bold}]"
logger.info "start scan, triggered by #{@caused_by_entry&.id}"
@@ -74,7 +74,7 @@ class Domain::Fa::Job::ScanPostJob < Domain::Fa::Job::FaJobBase
@post.title = submission.title
@post.creator = Domain::Fa::User.find_or_build_from_submission_parser(submission)
@post.category = submission.category
@post.description = submission.description_html
@post.description = submission.description_html.encode("UTF-8", :invalid => :replace, :undef => :replace)
@post.keywords = submission.keywords_array
@post.file_uri = submission.full_res_img
@post.theme = submission.theme

View File

@@ -7,7 +7,7 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::FaJobBase
def perform(args)
@force_scan = !!args[:force_scan]
@user = find_or_intitialize_user_from_args(args)
logger_prefix! "[user #{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
logger.prefix = "[user #{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
if @user.state != "ok" && @user.scanned_gallery_at
logger.warn("state == #{@user.state} and already scanned, skipping")

View File

@@ -6,7 +6,7 @@ class Domain::Fa::Job::UserPageJob < Domain::Fa::Job::FaJobBase
@user = find_or_intitialize_user_from_args(args)
@caused_by_entry = args[:caused_by_entry]
@force_scan = !!args[:force_scan]
logger_prefix! "[#{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
logger.prefix = "[#{(@user.url_name || @user.name).bold} / #{@user.state.bold}]"
# buggy (sentinel) user
return if @user.id == 117552 && @user.url_name == "click here"

View File

@@ -1,5 +1,5 @@
class Scraper::JobBase < ApplicationJob
attr_reader :logger_prefix
include HasColorLogger
def self.ignore_signature_args(args = nil)
@ignore_signature_args ||= []
@@ -9,16 +9,43 @@ class Scraper::JobBase < ApplicationJob
def initialize(http_client = nil)
@http_client = http_client || self.class.build_http_client
@logger_prefix = ""
super
end
def logger
@logger ||= ColorLogger.make($stdout, self)
def write_point(name, tags: {}, fields: {})
Metrics::Reporter.singleton.write_point(
name,
tags: tags.merge({
class_name: self.class.name,
}),
fields: fields,
)
end
def logger_prefix!(prefix)
@logger_prefix = prefix
around_perform do |job, block|
error = nil
start = Time.now
block.call
rescue Net::ReadTimeout => e
logger.error "Net::ReadTimeout - resetting http client"
error = e
job.class.reset_http_client
raise
rescue Exception => e
error = e
raise
ensure
duration_ms = (Time.now - start) * 1000
job.write_point(
"delayed_job_performed",
tags: {
success: error.nil?,
error_class: error&.class&.name,
},
fields: {
duration_ms: duration_ms,
},
)
end
def enqueue_job(job_class, args, set_args = {})
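
A sketch (not part of the diff) of how a job subclass can use the new write_point helper; the job class, queue tag, and field values below are hypothetical:

class Domain::Fa::Job::ExampleMetricJob < Scraper::JobBase
  def perform(args)
    # write_point merges class_name into the tags before forwarding the point to
    # Metrics::Reporter.singleton; the around_perform hook above additionally
    # emits a "delayed_job_performed" point with duration_ms for every run.
    write_point(
      "example_job_progress",
      tags: { queue: "manual" },
      fields: { items_processed: 42 },
    )
  end
end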

View File

@@ -15,17 +15,22 @@ class ColorLogger
klass_name = "Fa::#{klass_name}"
end
klass_name = "[#{klass_name.light_blue}]".ljust(32)
Logger.new(sink).tap do |logger|
logger.formatter = proc do |severity, datetime, progname, msg|
prefix = if instance.respond_to?(:logger_prefix)
instance.logger_prefix || ""
else
""
end.strip
prefix = ""
[klass_name, prefix, msg].reject(&:blank?).join(" ") + "\n"
def logger.prefix=(p)
prefix = p
end
logger.formatter = proc do |severity, datetime, progname, msg|
color = case severity
when "ERROR" then :red
when "WARN" then :yellow
else :light_blue
end
klass_name_str = "[#{klass_name.send(color)}]".ljust(32)
[klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n"
end
end
end

View File

@@ -0,0 +1,11 @@
require "active_support/concern"
module HasColorLogger
extend ActiveSupport::Concern
included do
def logger
@logger ||= ColorLogger.make($stdout, self)
end
end
end
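
A minimal usage sketch of the new concern; the class name and prefix here are illustrative only:

class Scraper::ExampleClient
  include HasColorLogger

  def fetch
    # `logger` is memoized per instance via ColorLogger.make($stdout, self);
    # the prefix= setter is added in the ColorLogger diff above and replaces
    # the old logger_prefix! mechanism.
    logger.prefix = "[example]"
    logger.info "fetching"   # prints roughly: [ExampleClient] [example] fetching
  end
end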

View File

@@ -1,88 +0,0 @@
require "csv"
class LegacyImport::E621CsvImporter < LegacyImport::BulkImportJob
def initialize(
posts_csv_path:,
batch_size:,
forks:,
start_at:
)
@posts_csv_file = CSV.new(File.open(posts_csv_path, "r+"), headers: true)
@forks = forks || 16
@batch_size = batch_size || @forks * 32
@start_at = start_at || 0
@start_time = Time.now
logger.info "forks=#{@forks} batch_size=#{@batch_size} start_at=#{@start_at}"
end
def name
"e621_csv_post_importer"
end
def profile?
false
end
def run
start_profiling!
progress = 0
query = ::Legacy::E621::Post.includes(:blob_entry, :description_ref)
query.find_in_batches(start: @start_at, batch_size: @batch_size) do |batch|
last_id = batch.last&.id
if @forks <= 1
progress += import_e621_posts(batch)
else
progress += ForkFuture.parallel_map_slice(@forks, batch) do |fork_batch|
import_e621_posts(fork_batch)
end.sum
end
rate = progress.to_f / (Time.now - @start_time)
puts "finish batch, last id #{last_id} - #{progress} - #{rate.round(1)} / second"
write_progress last_id
end
stop_profiling!
end
private
def import_e621_posts(legacy_posts)
progress = 0
skip_posts_ids = Set.new(
::Domain::E621::Post.select(:e621_id).
where(e621_id: legacy_posts.map(&:e621_id)).
pluck(:e621_id)
)
legacy_posts.reject! do |legacy_post|
skip_posts_ids.include?(legacy_post.e621_id)
end
legacy_posts.each do |legacy_post|
retries = 0
begin
ReduxApplicationRecord.transaction do
post = ::Domain::E621::Post.find_or_build_from_legacy(legacy_post)
unless post.valid?
raise("error building post #{post.id} / #{post.e621_id}: #{post.errors.full_messages}")
end
post.save!
progress += 1
end
rescue
retries += 1
sleep 0.1 and retry if retries < 3
raise
end
end
ReduxApplicationRecord.clear_active_connections!
LegacyApplicationRecord.clear_active_connections!
progress
end
end

View File

@@ -0,0 +1,87 @@
require "csv"
class LegacyImport::E621CsvPostImporter < LegacyImport::BulkImportJob
def initialize(
csv_path:,
batch_size:,
forks:,
start_at:
)
@csv_file = CSV.new(File.open(csv_path, "r+"), headers: true)
@forks = forks || 16
@batch_size = batch_size || @forks * 32
@start_at = start_at || 0
@start_time = Time.now
logger.info "forks=#{@forks} batch_size=#{@batch_size} start_at=#{@start_at}"
end
def name
"e621_csv_post_importer"
end
def profile?
false
end
def run_impl
progress = 0
while row = @csv_file.shift&.to_h
e621_id = row["id"].to_i
if @start_at && e621_id < @start_at
next
end
progress += import_e621_post(row)
rate = progress.to_f / (Time.now - @start_time)
puts "finish batch, last id #{e621_id} - #{progress} - #{rate.round(1)} / second"
write_last_id e621_id
end
progress
end
private
def import_e621_post(row)
e621_id = row["id"].to_i
post = Domain::E621::Post.find_by(e621_id: e621_id)
md5 = row["md5"]
if post
unless post.md5 == md5
post.file = nil
post.md5 = md5
end
else
post = Domain::E621::Post.new({
e621_id: e621_id,
md5: md5,
})
end
file_ext = row["file_ext"]
post.file_url_str = "https://static1.e621.net/data/#{md5[0...2]}/#{md5[2...4]}/#{md5}.#{file_ext}"
post.sources_array = row["source"].split("\n")
post.rating = row["rating"]
post.tags_array = row["tag_string"].split(" ").sort
post.num_favorites = row["fav_count"].to_i
post.num_comments = row["comment_count"].to_i
post.description = row["description"]
post.score = row["score"].to_i
post.score_up = row["up_score"].to_i
post.score_down = row["down_score"].to_i
post.parent_e621_id = row["parent_id"]&.to_i
flags_array = []
flags_array << "deleted" if row["is_deleted"] == "t"
flags_array << "pending" if row["is_pending"] == "t"
flags_array << "flagged" if row["is_flagged"] == "t"
post.flags_array = flags_array
if post.changed?
post.save!
return 1
else
return 0
end
end
end
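
For clarity, a worked example of the file_url_str template above with a hypothetical md5 and extension:

md5 = "0a1b2c3d4e5f60718293a4b5c6d7e8f9"   # hypothetical value
file_ext = "jpg"
# md5[0...2] => "0a", md5[2...4] => "1b", so the URL becomes:
# https://static1.e621.net/data/0a/1b/0a1b2c3d4e5f60718293a4b5c6d7e8f9.jpg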

View File

@@ -62,7 +62,6 @@ class LegacyImport::E621LegacyPostImporter < LegacyImport::BulkImportJob
unless post.valid?
raise("error building post #{post.id} / #{post.e621_id}: #{post.errors.full_messages}")
end
# binding.pry
post.save!
progress += 1
end

View File

@@ -0,0 +1,32 @@
class Metrics::DelayedJobReporter < Metrics::Reporter
def report_impl
total = Delayed::Job.count
by_queue = Delayed::Job.group("queue").count
num_working = Delayed::Job.where("locked_at is not null").count
num_failed = Delayed::Job.where("last_error is not null").count
num_pending = total - num_working - num_failed
by_status = {
"working" => num_working,
"failed" => num_failed,
"pending" => num_pending,
}
write_point(
"delayed_jobs",
tags: { aggregation: "total" },
fields: { "total" => total },
)
write_point(
"delayed_jobs",
tags: { aggregation: "queue" },
fields: by_queue,
)
write_point(
"delayed_jobs",
tags: { aggregation: "status" },
fields: by_status,
)
end
end

View File

@@ -0,0 +1,24 @@
class Metrics::EstimateDbRowsReporter < Metrics::Reporter
def report_impl
extra_tables = [
"http_log_entries",
"http_log_entry_headers",
"versions",
"delayed_jobs",
]
extra_tables_sql = extra_tables.map { |t| "'#{t}'" }.join(",")
row_estimates = ReduxApplicationRecord.connection.exec_query(
[
"SELECT relname, n_live_tup",
"FROM pg_stat_all_tables",
"WHERE relname IN (#{extra_tables_sql})",
"OR relname like 'domain_%'",
].join(" ")
).rows.to_h
write_point(
"estimate_db_rows",
fields: row_estimates,
)
end
end

View File

@@ -0,0 +1,78 @@
class Metrics::Reporter
attr_reader :logger
def self.singleton
@singleton ||= Metrics::Reporter.new
end
def self.singleton=(instance)
@singleton = instance
end
def initialize(host: nil, token: nil, org: nil, bucket: nil, default_tags: {})
host ||= Rails.application.config.x.influxdb.host || raise("no host")
token ||= Rails.application.config.x.influxdb.token || raise("no token")
org ||= Rails.application.config.x.influxdb.org || raise("no org")
bucket ||= Rails.application.config.x.influxdb.bucket || raise("no bucket")
@logger = ColorLogger.make($stdout, self)
@client = InfluxDB2::Client.new(
host, token,
org: org,
bucket: bucket,
use_ssl: false,
precision: InfluxDB2::WritePrecision::MILLISECOND,
)
write_options = InfluxDB2::WriteOptions.new(
write_type: InfluxDB2::WriteType::BATCHING,
batch_size: 100, flush_interval: 5_000,
max_retries: 3, max_retry_delay: 15_000,
exponential_base: 2,
)
point_settings = InfluxDB2::PointSettings.new(default_tags: default_tags)
@writer = @client.create_write_api(
write_options: write_options,
point_settings: point_settings,
)
end
def report
start_at = Time.now
success = true
begin
self.report_impl
rescue Exception => e
success = false
ex = e
end
duration = Time.now - start_at
write_point(
"metrics_reporter",
tags: { class_name: self.class.name },
fields: { success: success, duration: duration },
)
close!
color = success ? :light_blue : :red
if success
logger.info "reporter finished (#{duration.round(2)} sec)"
else
logger.error "reporter failed (#{duration.round(2)} sec) - #{ex.message}"
end
raise ex unless success
end
def close!
@client.close!
end
def write_point(name, tags: {}, fields: {})
@writer.write(data: { name: name, tags: tags, fields: fields })
end
def report_impl
raise NotImplementedError, "need to implement #report on #{self.class.name}"
end
end
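
A usage sketch, assuming config/influxdb.yml below has been loaded into config.x.influxdb; the subclass and measurement names are made up:

class Metrics::WorkerCountReporter < Metrics::Reporter
  def report_impl
    # subclasses only implement report_impl; #report wraps it with timing,
    # a success/failure "metrics_reporter" point, and client shutdown.
    write_point("worker_count", fields: { "workers" => 3 })
  end
end

# mirrors the tasks in rake/metrics.rake further down
Metrics::WorkerCountReporter.new.report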

View File

@@ -1,12 +1,12 @@
class Scraper::BaseHttpClient
include HasColorLogger
Response = Struct.new(
:status_code,
:body,
:log_entry,
)
attr_accessor :logger
class InvalidURLError < ArgumentError; end
def initialize(http_performer_or_proxy)
@@ -27,8 +27,11 @@ class Scraper::BaseHttpClient
@http_performer = http_performer_or_proxy
end
@domain_last_requested_at = {}
@logger ||= ColorLogger.make($stdout, self)
@logger.level = :info
logger.level = :info
end
def close!
@http_performer.close!
end
def cookies
@@ -53,6 +56,10 @@ class Scraper::BaseHttpClient
get_impl(url, caused_by_entry)
end
def reporter
Metrics::Reporter.singleton
end
private
def get_impl(url, caused_by_entry)
@@ -152,12 +159,12 @@ class Scraper::BaseHttpClient
response_code.to_s.yellow.bold
end
actual_time_ms = ((Time.now - requested_at) * 1000).round(0).to_s
total_time_ms = ((Time.now - requested_at) * 1000).round(0)
logger.info([
"[entry #{log_entry.id.to_s.bold} /",
"GET #{response_code_colorized} /",
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
"[#{response_time_ms.to_s.bold} ms / #{actual_time_ms.bold} ms]",
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
uri.to_s.black,
].reject(&:nil?).join(" "))
@@ -166,6 +173,22 @@ class Scraper::BaseHttpClient
sleep 15
end
reporter.write_point(
"http_client_response",
tags: {
method: "GET",
host: uri.host,
status_code: response_code,
performed_by: @http_performer.name,
},
fields: {
response_time_ms: response_time_ms,
total_time_ms: total_time_ms,
content_size: response_blob_entry.size,
content_stored: response_blob_entry.bytes_stored,
},
)
Response.new(
response_code,
response_body,

View File

@@ -9,7 +9,7 @@ class Scraper::CurlHttpPerformer < Scraper::HttpPerformer
def get(url, request_headers)
start_at = Time.now
@curl.url = url
@curl.url = Addressable::URI.encode url
@curl.headers = {}
request_headers.each do |key, value|
@curl.headers[key.to_s] = value
@@ -28,6 +28,10 @@ class Scraper::CurlHttpPerformer < Scraper::HttpPerformer
)
end
def close!
# no-op - nothing to close for the curl performer
end
private
def self.parse_header_str(header_str)

View File

@@ -1,4 +1,6 @@
class Scraper::HttpPerformer
include HasColorLogger
Response = Struct.new(
:response_code,
:response_headers,
@@ -9,4 +11,12 @@ class Scraper::HttpPerformer
def get(url, request_headers)
raise NotImplementedError, "implement in subclass"
end
def close!
raise NotImplementedError, "implement in subclass"
end
def name
raise NotImplementedError, "implement in subclass"
end
end

View File

@@ -1,9 +1,9 @@
require "base64"
class Scraper::ProxyHttpPerformer < Scraper::HttpPerformer
def initialize(proxy)
@name = proxy
proxy_url = Rails.application.config.x.proxies[proxy]
def initialize(proxy_name)
@name = proxy_name
proxy_url = Rails.application.config.x.proxies[proxy_name]
@client = Ripcord::Client.new(proxy_url)
end
@@ -21,4 +21,12 @@ class Scraper::ProxyHttpPerformer < Scraper::HttpPerformer
end
Scraper::HttpPerformer::Response.new(code, headers, time_ms, body)
end
def close!
begin
@client.instance_variable_get("@http_client").finish
rescue IOError => e
Rails.logger.error("Failed to close http client: #{e.inspect}")
end
end
end

View File

@@ -1,5 +1,6 @@
module ImmutableModel
extend ActiveSupport::Concern
included do
def readonly?
!new_record?

View File

@@ -4,7 +4,7 @@ class Domain::E621::Post < ReduxApplicationRecord
# see state_detail for scan_error/file_error
enum state: %i[ok scan_error file_error]
enum rating: %i[safe questionable explicit]
enum rating: %i[s q e]
validates_presence_of(
:e621_id,
@@ -33,8 +33,14 @@ class Domain::E621::Post < ReduxApplicationRecord
optional: true,
autosave: true
belongs_to :parent_e621,
class_name: "Domain::E621::Post",
foreign_key: :e621_id,
optional: true
SKIP_MISMATCH_LEGACY_IDS = Set.new([
836414,
1070178,
])
def self.find_or_build_from_legacy(legacy_model)
@@ -43,7 +49,7 @@ class Domain::E621::Post < ReduxApplicationRecord
model = self.new({
state: :ok,
file_url_str: legacy_model.file_url,
rating: legacy_model.class.ratings[legacy_model.rating],
rating: legacy_model.rating,
sources_array: legacy_model.sources,
tags_array: legacy_model.tags.map(&:value),
artists_array: legacy_model.artists || [],
@@ -94,18 +100,28 @@ class Domain::E621::Post < ReduxApplicationRecord
http_log_entry.response ||= ::BlobEntry.find_or_build_from_legacy(legacy_model.blob_entry)
blob_entry = http_log_entry.response
http_log_entry.content_type ||= blob_entry.content_type
raise("#{http_log_entry.content_type} != #{blob_entry.content_type}") unless blob_entry.content_type.start_with?(http_log_entry.content_type)
if blob_entry && http_log_entry
http_log_entry.content_type ||= blob_entry.content_type
else
# unable to construct http & blob entries, skip
File.write(
Rails.root.join("tmp/e621_legacy_post_importer_failures"),
"#{model.e621_id} - (no hle) - unable to reconstruct http / blob entry\n"
)
http_log_entry = nil
end
end
if http_log_entry
blob_entry = http_log_entry.response
if model.md5 != Digest::MD5.hexdigest(blob_entry.contents)
if http_log_entry.status_code != 404
raise("#{model.md5} != #{Digest::MD5.hexdigest(blob_entry.contents)}")
end
File.write(
Rails.root.join("tmp/e621_legacy_post_importer_failures"),
"#{model.e621_id} - #{http_log_entry.status_code} - expected #{model.md5} != actual #{Digest::MD5.hexdigest(blob_entry.contents)}\n"
)
http_log_entry = nil
end
end

View File

@@ -27,5 +27,7 @@ module ReduxScraper
"proxy-1" => "http://proxy-1.local:9292",
"dedipath-1" => "http://10.200.0.6:9292",
}
config.x.influxdb = ReduxScraper::Application.config_for("influxdb")
end
end

View File

@@ -31,6 +31,24 @@ vipvillageworker: &vipvillageworker
value: 1905x976
path: /
blazeandwish: &blazeandwish
- domain: .furaffinity.net
cookies:
- name: a
value: 343f3180-1ba4-4e20-af45-b0ce39201a78
- name: b
value: 05784dcd-1fe7-482a-8191-4470e68b776a
- domain: rv.furaffinity.net
cookies:
- name: OAID
value: a0196c5a7bdc3fa4865b86dab9be9ce4
path: /
- domain: www.furaffinity.net
cookies:
- name: sz
value: 1600x944
path: /
testcookies: &testcookies
- domain: .furaffinity.net
cookies:
@@ -49,14 +67,14 @@ testcookies: &testcookies
development:
direct: *ddwhatnow
proxy-1: *vipvillageworker
dedipath-1: []
dedipath-1: *blazeandwish
production:
direct: *ddwhatnow
proxy-1: *vipvillageworker
dedipath-1: []
dedipath-1: *blazeandwish
test:
direct: *testcookies
proxy-1: *testcookies
dedipath-1: []
dedipath-1: *testcookies

config/influxdb.yml (new file, +17 lines)
View File

@@ -0,0 +1,17 @@
development:
host: "http://grafana.local:8086"
token: "W2ikhmpQxQHZAStgSEK6s6aJxnOqeD4Zz2MI1m_lnD1JX57V9Esqm0zXb3DWbN7Gnj2GdmF_YrcvE8cy6NbIqQ=="
org: "primary"
bucket: "redux-scraper"
production:
host: "http://grafana.local:8086"
token: "W2ikhmpQxQHZAStgSEK6s6aJxnOqeD4Zz2MI1m_lnD1JX57V9Esqm0zXb3DWbN7Gnj2GdmF_YrcvE8cy6NbIqQ=="
org: "primary"
bucket: "redux-scraper"
test:
host: "http://grafana.local:8086"
token: "P0hzfg-vKiBTmpE-wPMJKkU8wj8VjUN1_OMxNVFhKRqxSdq3msQZ0ZWpVCT9MPAzsvoR40Caaxd2kq_LEIJpxQ=="
org: "primary"
bucket: "redux-scraper-test"

pool_combined.rb (new file, +52 lines)
View File

@@ -0,0 +1,52 @@
# This runs in the master process after it preloads the app
after_preload_app do
puts "Master #{Process.pid} preloaded app"
# Don't hang on to database connections from the master after we've
# completed initialization
ActiveRecord::Base.connection_pool.disconnect!
end
# This runs in the worker processes after they have been forked
on_worker_boot do |worker_info|
ActiveJob::Base.logger = Logger.new("/dev/null")
Delayed::Worker.logger = Logger.new(STDOUT)
Delayed::Worker.logger.level = :error
Rails.logger = Logger.new(STDOUT)
puts "Worker #{Process.pid} started"
sleep rand(1..5)
# Reconnect to the database
ActiveRecord::Base.establish_connection
end
# This runs in the master process after a worker starts
after_worker_boot do |worker_info|
puts "Master #{Process.pid} booted worker #{worker_info.name} with " \
"process id #{worker_info.process_id}"
end
# This runs in the master process after a worker shuts down
after_worker_shutdown do |worker_info|
puts "Master #{Process.pid} detected dead worker #{worker_info.name} " \
"with process id #{worker_info.process_id}"
end
{
default: 1,
fa_user_page: 1,
fa_user_gallery: 1,
fa_post: 3,
manual: 4,
static_file: 6,
}.each do |queue, workers|
worker_group(queue) do |g|
g.read_ahead = 8
g.sleep_delay = 5
g.workers = workers
g.queues = [queue.to_s]
end
end
preload_app

View File

@@ -15,10 +15,13 @@ namespace :e621 do
batch_size = ENV["batch_size"]&.to_i
forks = ENV["forks"]&.to_i
start_at = ENV["start_at"]&.to_i
posts_csv_path = ENV["posts_csv_path"] || raise("must supply `posts_csv_path`")
csv_path = ENV["csv"] || raise("must supply `csv`")
LegacyImport::E621CsvPostImporter.
new(batch_size: batch_size, forks: forks, start_at: start_at).
run
LegacyImport::E621CsvPostImporter.new(
csv_path: csv_path,
batch_size: batch_size,
forks: forks,
start_at: start_at,
).run
end
end

View File

@@ -115,106 +115,4 @@ namespace :fa do
})
end
end
# treat this like a bomb
# desc "delete / undo changes made by buggy proxy responses"
# task :fix_buggy_posts => [:set_logger_stdout, :environment] do
# raise "Already ran, would destroy everything if it ran again"
# # log_entries = HttpLogEntry.where("id > ?", 54040076).to_a
# earliest_created_at = log_entries.map(&:created_at).min
# puts "earliest: #{earliest_created_at}"
# users_to_revert = Set.new
# posts_to_revert = Set.new
# files_to_revert = Set.new
# log_entries.each do |log_entry|
# uri = log_entry.uri
# # puts "[id #{log_entry.id} / uri #{uri}]"
# case uri.host
# when "ipinfo.io"
# puts "ipinfo, do nothing"
# when "d.furaffinity.net"
# # puts "fix static file #{uri.path.bold}"
# files_to_revert.add(uri.to_s)
# post = Domain::Fa::Post.find_by(file_id: log_entry.id)
# posts_to_revert.add(post) if post
# when "www.furaffinity.net"
# if uri.path.start_with?("/user/")
# url_name = uri.path.split("/")[2]
# user = Domain::Fa::User.find_by(url_name: url_name) || raise
# # puts "fix fa user #{url_name.bold}"
# users_to_revert.add(user)
# elsif uri.path.start_with?("/view/")
# fa_id = uri.path.split("/")[2]
# post = Domain::Fa::Post.find_by(fa_id: fa_id) || raise
# # puts "fix fa post #{fa_id.to_s.bold}"
# posts_to_revert.add(post)
# elsif uri.path.start_with?("/gallery/") || uri.path.start_with?("/scraps/")
# url_name = uri.path.split("/")[2]
# user = Domain::Fa::User.find_by(url_name: url_name) || raise
# # puts "fix fa user gallery #{url_name.bold}"
# users_to_revert.add(user)
# elsif uri.path == "/"
# next
# else
# raise("unsupported path: #{uri.path}")
# end
# else raise("unsupported host: #{uri.host}")
# end
# end
# puts "entries: #{log_entries.size}"
# puts "users: #{users_to_revert.size}"
# puts "posts: #{posts_to_revert.size}"
# puts "files: #{files_to_revert.size}"
# # puts "----- users -----"
# # users_to_revert.each do |user|
# # puts user.to_json
# # end
# # puts "----- posts -----"
# # posts_to_revert.each do |post|
# # puts post.to_json
# # end
# reset_detail = ({ :info => "reverted on #{Time.now}" })
# job_params = ({ :queue => "manual", :priority => -20 })
# ReduxApplicationRecord.transaction do
# users_to_revert.each do |user|
# user.state = :ok
# user.state_detail = reset_detail
# user.log_entry_detail = reset_detail
# user.scanned_page_at = nil
# user.scanned_gallery_at = nil
# user.save!
# Domain::Fa::Job::UserGalleryJob.set(job_params).perform_later({
# user: user,
# })
# Domain::Fa::Job::UserPageJob.set(job_params).perform_later({
# user: user,
# })
# end
# posts_to_revert.each do |post|
# post.state = :ok
# post.state_detail = reset_detail
# post.log_entry_detail = reset_detail
# post.file = nil
# post.file_url_str = nil
# post.save!
# Domain::Fa::Job::ScanPostJob.set(job_params).perform_later({
# post: post,
# })
# end
# log_entries.reverse.each(&:destroy!)
# end
# end
end

rake/metrics.rake (new file, +37 lines)
View File

@@ -0,0 +1,37 @@
namespace :metrics do
desc "run reporters periodically"
task :report_all => [:environment, :set_logger_stdout] do
schedule = {
Rake::Task["metrics:delayed_job"] => 10.seconds,
Rake::Task["metrics:estimate_db_rows"] => 10.seconds,
}
last_ran = {}
while true
schedule.each do |task, run_every|
task_name = task.name
if last_ran[task_name].nil? || last_ran[task_name] < run_every.ago
begin
task.execute
rescue NotImplementedError, StandardError => e
Rails.logger.error("error invoking #{task_name}: #{e.message.red}")
end
last_ran[task_name] = Time.now
end
end
sleep_amt = schedule.values.min
puts "sleeping #{sleep_amt}"
sleep sleep_amt
end
end
desc "Report DelayedJob queue metrics"
task :delayed_job => :environment do
Metrics::DelayedJobReporter.new.report
end
desc "Report estimated db row metrics"
task :estimate_db_rows => :environment do
Metrics::EstimateDbRowsReporter.new.report
end
end

View File

@@ -1,6 +1,8 @@
class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
test "creates an http log entry" do
client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_http_performer(
require "rails_helper"
describe Domain::Fa::Scraper::HttpClient do
it "creates an http log entry" do
client = Domain::Fa::Scraper::HttpClient.new(SpecUtil.mock_http_performer(
"https://www.furaffinity.net/",
request_headers: Hash,
response_code: 200,

View File

@@ -1,4 +1,6 @@
class Scraper::BaseHttpClientTest < ActiveSupport::TestCase
require "rails_helper"
describe Scraper::BaseHttpClient do
class TestHttpClient < Scraper::BaseHttpClient
def cookies
[]
@@ -9,21 +11,21 @@ class Scraper::BaseHttpClientTest < ActiveSupport::TestCase
end
end
test "throws on unallowed domain" do
client = TestHttpClient.new(TestUtil.mock_http_performer(""))
it "throws on unallowed domain" do
client = TestHttpClient.new(SpecUtil.mock_http_performer(""))
assert_raises(Scraper::BaseHttpClient::InvalidURLError) do
client.get("https://foobar.com")
end
end
test "creates an http log entry" do
client = TestHttpClient.new(TestUtil.mock_http_performer(
it "creates an http log entry" do
client = TestHttpClient.new(SpecUtil.mock_http_performer(
"https://www.example.com/",
request_headers: { "cookie" => "" },
response_code: 200,
response_time_ms: 15,
response_headers: { "content-type" => "text/plain" },
response_body: "the response " + TestUtil.random_string(16),
response_body: "the response " + SpecUtil.random_string(16),
))
client.logger.level = :error

View File

@@ -0,0 +1,35 @@
describe Scraper::CurlHttpPerformer do
it "can parse header string" do
headers = Scraper::CurlHttpPerformer.parse_header_str(
"HTTP/2 200 \r\n" +
"date: Mon, 20 Feb 2023 00:57:22 GMT\r\n" +
"content-type: text/html; charset=UTF-8\r\n" +
"cache-control: no-cache\r\n" +
"expires: Thu, 01 Jan 1970 00:00:01 GMT\r\n" +
"referrer-policy: strict-origin-when-cross-origin\r\n" +
"x-frame-options: SAMEORIGIN\r\n" +
"content-security-policy: frame-ancestors 'self';\r\n" +
"fa-upstream: mainsite-01\r\n" +
"content-encoding: gzip\r\n" +
"cf-cache-status: DYNAMIC\r\n" +
"server: cloudflare\r\n" +
"cf-ray: 79c349abd918abd2-CPH\r\n" +
"\r\n"
)
assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
assert_equal ["content-encoding", "gzip"], headers[8]
end
it "correctly wraps curl" do
body_str = "the response body " + SpecUtil.random_string(16)
client = Scraper::CurlHttpPerformer.new(
SpecUtil.mock_curl_easy("https://www.example.com/", body_str: body_str)
)
response = client.get("https://www.example.com/", {})
assert_equal 200, response.response_code
assert_equal ({ "content-type" => "text/plain", "resp" => "respheader" }),
response.response_headers
assert_equal 0, response.response_time_ms
end
end

spec/rails_helper.rb (new file, +65 lines)
View File

@@ -0,0 +1,65 @@
# This file is copied to spec/ when you run 'rails generate rspec:install'
require "spec_helper"
ENV["RAILS_ENV"] ||= "test"
require_relative "../config/environment"
# Prevent database truncation if the environment is production
abort("The Rails environment is running in production mode!") if Rails.env.production?
require "rspec/rails"
# Add additional requires below this line. Rails is not loaded until this point!
# Requires supporting ruby files with custom matchers and macros, etc, in
# spec/support/ and its subdirectories. Files matching `spec/**/*_spec.rb` are
# run as spec files by default. This means that files in spec/support that end
# in _spec.rb will both be required and run as specs, causing the specs to be
# run twice. It is recommended that you do not name files matching this glob to
# end with _spec.rb. You can configure this pattern with the --pattern
# option on the command line or in ~/.rspec, .rspec or `.rspec-local`.
#
# The following line is provided for convenience purposes. It has the downside
# of increasing the boot-up time by auto-requiring all files in the support
# directory. Alternatively, in the individual `*_spec.rb` files, manually
# require only the support files necessary.
#
# Dir[Rails.root.join('spec', 'support', '**', '*.rb')].sort.each { |f| require f }
# Checks for pending migrations and applies them before tests are run.
# If you are not using ActiveRecord, you can remove these lines.
begin
ActiveRecord::Migration.maintain_test_schema!
rescue ActiveRecord::PendingMigrationError => e
abort e.to_s.strip
end
RSpec.configure do |config|
# Remove this line if you're not using ActiveRecord or ActiveRecord fixtures
config.fixture_path = "#{::Rails.root}/spec/fixtures"
# If you're not using ActiveRecord, or you'd prefer not to run each of your
# examples within a transaction, remove the following line or assign false
# instead of true.
config.use_transactional_fixtures = true
# You can uncomment this line to turn off ActiveRecord support entirely.
# config.use_active_record = false
# RSpec Rails can automatically mix in different behaviours to your tests
# based on their file location, for example enabling you to call `get` and
# `post` in specs under `spec/controllers`.
#
# You can disable this behaviour by removing the line below, and instead
# explicitly tag your specs with their type, e.g.:
#
# RSpec.describe UsersController, type: :controller do
# # ...
# end
#
# The different available types are documented in the features, such as in
# https://relishapp.com/rspec/rspec-rails/docs
config.infer_spec_type_from_file_location!
# Filter lines from Rails gems in backtraces.
config.filter_rails_from_backtrace!
# arbitrary gems may also be filtered via:
# config.filter_gems_from_backtrace("gem name")
end
require "spec_util"

spec/spec_helper.rb (new file, +94 lines)
View File

@@ -0,0 +1,94 @@
# This file was generated by the `rails generate rspec:install` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# The generated `.rspec` file contains `--require spec_helper` which will cause
# this file to always be loaded, without a need to explicitly require it in any
# files.
#
# Given that it is always loaded, you are encouraged to keep this file as
# light-weight as possible. Requiring heavyweight dependencies from this file
# will add to the boot time of your test suite on EVERY test run, even for an
# individual file that may not need all of that loaded. Instead, consider making
# a separate helper file that requires the additional dependencies and performs
# the additional setup, and require it from the spec files that actually need
# it.
#
# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
# rspec-expectations config goes here. You can use an alternate
# assertion/expectation library such as wrong or the stdlib/minitest
# assertions if you prefer.
config.expect_with :rspec do |expectations|
# This option will default to `true` in RSpec 4. It makes the `description`
# and `failure_message` of custom matchers include text for helper methods
# defined using `chain`, e.g.:
# be_bigger_than(2).and_smaller_than(4).description
# # => "be bigger than 2 and smaller than 4"
# ...rather than:
# # => "be bigger than 2"
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
end
# rspec-mocks config goes here. You can use an alternate test double
# library (such as bogus or mocha) by changing the `mock_with` option here.
config.mock_with :rspec do |mocks|
# Prevents you from mocking or stubbing a method that does not exist on
# a real object. This is generally recommended, and will default to
# `true` in RSpec 4.
mocks.verify_partial_doubles = true
end
# This option will default to `:apply_to_host_groups` in RSpec 4 (and will
# have no way to turn it off -- the option exists only for backwards
# compatibility in RSpec 3). It causes shared context metadata to be
# inherited by the metadata hash of host groups and examples, rather than
# triggering implicit auto-inclusion in groups with matching metadata.
config.shared_context_metadata_behavior = :apply_to_host_groups
# The settings below are suggested to provide a good initial experience
# with RSpec, but feel free to customize to your heart's content.
=begin
# This allows you to limit a spec run to individual examples or groups
# you care about by tagging them with `:focus` metadata. When nothing
# is tagged with `:focus`, all examples get run. RSpec also provides
# aliases for `it`, `describe`, and `context` that include `:focus`
# metadata: `fit`, `fdescribe` and `fcontext`, respectively.
config.filter_run_when_matching :focus
# Allows RSpec to persist some state between runs in order to support
# the `--only-failures` and `--next-failure` CLI options. We recommend
# you configure your source control system to ignore this file.
config.example_status_persistence_file_path = "spec/examples.txt"
# Limits the available syntax to the non-monkey patched syntax that is
# recommended. For more details, see:
# https://relishapp.com/rspec/rspec-core/docs/configuration/zero-monkey-patching-mode
config.disable_monkey_patching!
# Many RSpec users commonly either run the entire suite or an individual
# file, and it's useful to allow more verbose output when running an
# individual spec file.
if config.files_to_run.one?
# Use the documentation formatter for detailed output,
# unless a formatter has already been configured
# (e.g. via a command-line flag).
config.default_formatter = "doc"
end
# Print the 10 slowest examples and example groups at the
# end of the spec run, to help surface which specs are running
# particularly slow.
config.profile_examples = 10
# Run specs in random order to surface order dependencies. If you find an
# order dependency and want to debug it, you can fix the order by providing
# the seed, which is printed after each run.
# --seed 1234
config.order = :random
# Seed global randomization in this process using the `--seed` CLI option.
# Setting this allows you to use `--seed` to deterministically reproduce
# test failures related to randomization by passing the same `--seed` value
# as the one that triggered the failure.
Kernel.srand config.seed
=end
end

spec/spec_util.rb (new file, +52 lines)
View File

@@ -0,0 +1,52 @@
require "rspec/mocks"
class SpecUtil
extend RSpec::Mocks::ExampleMethods
def self.random_string(length)
(0...length).map { (65 + rand(26)).chr }.join
end
def self.mock_http_performer(
expected_url,
request_headers: {},
response_code: 200,
response_time_ms: 15,
response_headers: { "content-type" => "text/plain" },
response_body: "http body"
)
mock = instance_double("Scraper::HttpPerformer")
allow(mock).to receive(:is_a?).with(String).and_return(false)
allow(mock).to receive(:name).and_return("direct")
allow(mock).to receive(:get).
with(expected_url, request_headers).
and_return(Scraper::HttpPerformer::Response.new(
response_code,
response_headers,
response_time_ms,
response_body
))
mock
end
def self.mock_curl_easy(
expected_url,
response_code: 200,
body_str: "http body",
request_headers: {},
response_headers_str: "HTTP 200\r\nresp: respheader\r\ncontent-type: text/plain\r\n\r\n"
)
mock = instance_double("Curl::Easy")
allow(mock).to receive(:url=).with(expected_url)
allow(mock).to receive(:headers=).with(Hash)
allow(mock).to receive(:perform)
allow(mock).to receive(:response_code).and_return(response_code)
allow(mock).to receive(:body_str).and_return(body_str)
allow(mock).to receive(:header_str).and_return(response_headers_str)
allow(mock).to receive(:headers).and_return(request_headers)
mock
end
end

View File

@@ -1,20 +0,0 @@
class Scraper::CurlHttpPerformerTest < ActiveSupport::TestCase
test "can parse header string" do
headers = Scraper::CurlHttpPerformer.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
assert_equal ["content-encoding", "gzip"], headers[8]
end
test "correctly wraps curl" do
body_str = "the response body " + TestUtil.random_string(16)
client = Scraper::CurlHttpPerformer.new(
TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str)
)
response = client.get("https://www.example.com/", {})
assert_equal 200, response.response_code
assert_equal ({ "content-type" => "text/plain", "resp" => "respheader" }),
response.response_headers
assert_equal 0, response.response_time_ms
end
end

View File

@@ -28,7 +28,7 @@ class Domain::E621::PostTest < ActiveSupport::TestCase
assert_equal post.updated_at, updated_rating_version.created_at
}
post.rating = "safe"
post.rating = "s"
assert post.save
check_update.call