remove proxy code

Dylan Knutson
2024-12-17 17:57:17 +00:00
parent aad2f29e96
commit 9de3c1b5e8
36 changed files with 565 additions and 771 deletions

View File

@@ -1,9 +0,0 @@
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=password
PGADMIN_LISTEN_PORT=8080
# no password required
PGADMIN_CONFIG_SERVER_MODE=False
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED=False
RAILS_ENV=development
NODE_ENV=development

View File

@@ -19,7 +19,7 @@ RUN bundle install
RUN rake compile
# Primary image
FROM mcr.microsoft.com/devcontainers/base:bookworm
FROM mcr.microsoft.com/devcontainers/ruby:1-3.2-bookworm
# apt caching & install packages
RUN rm -f /etc/apt/apt.conf.d/docker-clean; \
@@ -48,21 +48,34 @@ RUN \
libvips42 \
libyaml-dev \
patch \
rbenv \
# rbenv \
iputils-ping \
rustc \
uuid-dev \
zlib1g-dev
USER vscode
ENV PATH=/home/vscode/.rbenv/shims:$PATH
ENV CONFIGURE_OPTS="--disable-install-rdoc"
# USER vscode
# ENV PATH=/home/vscode/.rbenv/shims:$PATH
# ENV CONFIGURE_OPTS="--disable-install-rdoc"
RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build
RUN rbenv install 3.2.0
RUN rbenv global 3.2.0
# RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build
# RUN rbenv install 3.2.0
# RUN rbenv global 3.2.0
COPY --from=native-gems /usr/src/app/gems/xdiff-rb /gems/xdiff-rb
COPY --from=native-gems /usr/src/app/gems/rb-bsdiff /gems/rb-bsdiff
ENV RAILS_ENV development
ENV RUBOCOP_OPTS --server
# [Optional] Uncomment this line to install additional gems.
RUN su vscode -c "gem install bundler -v '2.4.1'" && \
su vscode -c "gem install rake -v '13.0.6'" && \
su vscode -c "gem install ruby-lsp -v '0.22.1'" && \
su vscode -c "gem install rubocop -v '1.69.2'"
# install exo
RUN su vscode -c "curl -sL https://exo.deref.io/install | bash"
ENV PATH "/home/vscode/.exo/bin:$PATH"
RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && nvm install 18 && nvm use 18 && npm install -g yarn" 2>&1
ENV PATH /usr/local/share/nvm/current/bin:$PATH

View File

@@ -0,0 +1,7 @@
FROM postgres:17
RUN apt-get update && apt-get install -y \
postgresql-17-pgvector \
&& rm -rf /var/lib/apt/lists/*
RUN echo "CREATE EXTENSION pgvector;" >> /docker-entrypoint-initdb.d/01-pgvector.sql

View File

@@ -0,0 +1,5 @@
CREATE USER vscode CREATEDB;
CREATE DATABASE vscode WITH OWNER vscode;
CREATE DATABASE redux_test WITH OWNER vscode;
CREATE DATABASE legacy_test WITH OWNER vscode;

View File

@@ -1,42 +1,25 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
// README at: https://github.com/devcontainers/templates/tree/main/src/ruby-rails-postgres
{
"name": "Redux Scraper",
// "build": {
// // Sets the run context to one level up instead of the .devcontainer folder.
// "context": "..",
// // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
// "dockerfile": "Dockerfile"
// },
"name": "Ruby on Rails & Postgres",
"dockerComposeFile": "docker-compose.yml",
"service": "app",
"workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",
// "mounts": [
// {
// "type": "volume",
// "source": "${devcontainerId}-gems",
// "target": "/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems"
// }
// ],
"features": {
"ghcr.io/meaningful-ooo/devcontainer-features/fish:1": {},
"ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {},
"ghcr.io/devcontainers/features/node:1": {
"installYarnUsingApt": true,
"version": "18",
"pnpmVersion": "none",
"nvmVersion": "latest"
}
"ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {}
},
// "runArgs": [
// "--network=host"
// ],
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Uncomment the next line to run commands after the container is created.
"postCreateCommand": "${containerWorkspaceFolder}/.devcontainer/post-create.sh"
// This can be used to network with other containers or the host.
// "forwardPorts": [3000, 5432],
// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "bundle install && rake db:setup",
"postCreateCommand": ".devcontainer/post-create.sh",
"forwardPorts": [
8080, // pgadmin
3000 // rails
]
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "devcontainer"
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}

View File

@@ -1,33 +1,45 @@
version: '3.8'
volumes:
devcontainer-redux-postgres-data:
devcontainer-redux-gem-cache:
version: '3'
services:
app:
build:
context: ..
dockerfile: Dockerfile.devcontainer
dockerfile: .devcontainer/Dockerfile.devcontainer
volumes:
# - ../..:/workspaces:cached
- devcontainer-redux-gem-cache:/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems
- ../..:/workspaces:cached
- devcontainer-redux-gem-cache:/usr/local/rvm/gems
# Overrides default command so things don't shut down after the process ends.
command: sleep infinity
# Runs app on the same network as the database container, allows "forwardPorts" in devcontainer.json function.
network_mode: service:db
env_file:
- .env
db:
image: pgvector/pgvector:pg17
build:
context: .
dockerfile: Dockerfile.postgres
restart: unless-stopped
volumes:
- devcontainer-redux-postgres-data:/var/lib/postgresql/data
env_file:
- .env
- postgres-data:/var/lib/postgresql/data
- ./create-db-user.sql:/docker-entrypoint-initdb.d/create-db-user.sql
environment:
POSTGRES_USER: postgres
POSTGRES_DB: postgres
POSTGRES_PASSWORD: postgres
pgadmin:
image: dpage/pgadmin4:8.13.0
restart: unless-stopped
network_mode: service:db
env_file:
- .env
environment:
PGADMIN_DEFAULT_EMAIL: admin@example.com
PGADMIN_DEFAULT_PASSWORD: password
PGADMIN_LISTEN_PORT: 8080
PGADMIN_CONFIG_SERVER_MODE: False
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: False
volumes:
postgres-data:
devcontainer-redux-gem-cache:

View File

@@ -0,0 +1,11 @@
#!/bin/bash -ex
# EDITOR_BIN="$(ls /vscode/cursor-server/bin/*/*/bin/remote-cli/cursor | head -n 1)"
EDITOR_BIN=cursor
function install_extension() {
$EDITOR_BIN --install-extension "$1"
}
install_extension Shopify.ruby-extensions-pack
install_extension dbaeumer.vscode-eslint
install_extension aliariff.vscode-erb-beautify

View File

@@ -6,14 +6,9 @@ echo "Gem: $(which gem)"
echo "Bundler: $(which bundler)"
echo "Rake: $(which rake)"
gem install bundler -v '2.4.1'
gem install rake -v '13.0.6'
gem install ruby-lsp -v '0.22.1'
gem install rubocop -v '1.69.2'
bundle install --jobs $(getconf _NPROCESSORS_ONLN)
rbenv rehash
rubocop --start-server
npm install yarn
bin/rails yarn:install
yarn

View File

@@ -1 +1 @@
3.2.0
system

View File

@@ -1,8 +0,0 @@
{
"recommendations": [
"Shopify.ruby-extensions-pack",
"LoranKloeze.ruby-rubocop-revived",
"CraigMaslowski.erb",
"dbaeumer.vscode-eslint"
]
}

View File

@@ -4,9 +4,11 @@
"window.title": "${activeEditorMedium}${separator}${rootName}${separator}${profileName}",
"workbench.preferredDarkColorTheme": "Spinel",
"workbench.preferredLightColorTheme": "Spinel Light",
"rubyLsp.formatter": "auto",
"rubyLsp.formatter": "syntax_tree",
"[ruby]": {
"editor.formatOnSave": true,
"editor.defaultFormatter": "LoranKloeze.ruby-rubocop-revived"
"editor.defaultFormatter": "Shopify.ruby-lsp"
},
"[erb]": {
"editor.defaultFormatter": "aliariff.vscode-erb-beautify"
}
}

View File

@@ -1,7 +1,7 @@
source 'https://rubygems.org'
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
ruby '3.2.0'
ruby '3.2.6'
# ruby "3.0.3"
# Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main"
@@ -98,7 +98,6 @@ gem 'daemons'
gem 'discard'
gem 'good_job', '3.17.2'
gem 'http-cookie'
gem 'influxdb-client'
gem 'kaminari'
gem 'nokogiri'
gem 'pluck_each'
@@ -131,3 +130,5 @@ gem 'rack-cors'
gem 'react_on_rails'
gem 'sanitize'
gem 'shakapacker'
gem "syntax_tree", "~> 6.2"

View File

@@ -154,7 +154,6 @@ GEM
domain_name (~> 0.5)
i18n (1.12.0)
concurrent-ruby (~> 1.0)
influxdb-client (2.9.0)
io-console (0.6.0)
irb (1.6.2)
reline (>= 0.3.0)
@@ -216,6 +215,7 @@ GEM
pluck_each (0.2.0)
activerecord (> 3.2.0)
activesupport (> 3.0.0)
prettier_print (1.2.1)
progressbar (1.13.0)
pry (0.14.2)
coderay (~> 1.1)
@@ -341,6 +341,8 @@ GEM
stackprof (0.2.24)
stimulus-rails (1.2.1)
railties (>= 6.0.0)
syntax_tree (6.2.0)
prettier_print (>= 1.2.0)
table_print (1.5.7)
tailwindcss-rails (2.0.26)
railties (>= 6.0.0)
@@ -394,7 +396,6 @@ DEPENDENCIES
good_job (= 3.17.2)
htmlbeautifier
http-cookie
influxdb-client
jbuilder
kaminari
memory_profiler
@@ -429,6 +430,7 @@ DEPENDENCIES
sqlite3 (~> 1.4)
stackprof
stimulus-rails
syntax_tree (~> 6.2)
table_print
tailwindcss-rails (~> 2.0)
turbo-rails
@@ -439,7 +441,7 @@ DEPENDENCIES
zstd-ruby
RUBY VERSION
ruby 3.2.0p0
ruby 3.2.6p234
BUNDLED WITH
2.4.6

View File

@@ -1,6 +1,6 @@
# Procfile for development using HMR
# You can run these commands in separate shells
rails: bundle exec rails s -p 3000
tailwind: bundle exec rake tailwindcss:watch
wp-client: HMR=true bin/webpacker-dev-server
wp-server: HMR=true SERVER_BUNDLE_ONLY=yes bin/webpacker --watch
rails: RAILS_ENV=development bundle exec rails s -p 3000
tailwind: RAILS_ENV=development bundle exec rake tailwindcss:watch
wp-client: HMR=true RAILS_ENV=development bin/webpacker-dev-server
wp-server: HMR=true RAILS_ENV=development SERVER_BUNDLE_ONLY=yes bin/webpacker --watch

View File

@@ -1,4 +1,4 @@
rails: RAILS_ENV=staging ./bin/rails s -p 3000
tailwind: ./bin/rake rake tailwindcss:watch
tailwind: RAILS_ENV=staging ./bin/rake tailwindcss:watch
wp-client: RAILS_ENV=staging HMR=true ./bin/webpacker-dev-server
wp-server: RAILS_ENV=staging HMR=true SERVER_BUNDLE_ONLY=yes ./bin/webpacker --watch

View File

@@ -14,21 +14,20 @@ Rake.application.rake_require "e621"
Rake.application.rake_require "twitter"
Rake.application.rake_require "ib"
task :set_ar_stdout => :environment do
task set_ar_stdout: :environment do
ActiveRecord::Base.logger = Logger.new($stdout)
end
task :set_logger_stdout => :environment do
task set_logger_stdout: :environment do
Rails.logger = Logger.new($stdout)
Rails.logger.formatter = proc do |severity, datetime, progname, msg|
"#{severity}: #{msg}\n"
end
Rails.logger.formatter =
proc { |severity, datetime, progname, msg| "#{severity}: #{msg}\n" }
ActiveRecord::Base.logger = nil
ActiveJob::Base.logger = nil
GoodJob.logger = Rails.logger
end
task :periodic_tasks => [:environment, :set_logger_stdout] do
task periodic_tasks: %i[environment set_logger_stdout] do
Thread.new do
loop do
Rake::Task["pghero:capture_space_stats"].execute
@@ -55,22 +54,21 @@ task :periodic_tasks => [:environment, :set_logger_stdout] do
end
end
loop do
sleep 10
end
loop { sleep 10 }
end
namespace :blob_entries do
task :export_samples => :environment do
task export_samples: :environment do
limit = ENV["limit"]&.to_i || raise("need 'limit' (num)")
outfile = ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
outfile =
ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
BlobEntrySampleExporter.new.export_samples(limit, outfile)
end
task :import_samples => :environment do
task import_samples: :environment do
infile = ENV["infile"] || raise("need 'infile' (file path, .json encoded)")
BlobEntrySampleExporter.new.import_samples(infile)
end
task :migrate_entries => :environment do
task migrate_entries: :environment do
start_at = ENV["start_at"]
batch_size = ENV["batch_size"]&.to_i || 64
BlobEntrySampleExporter.new.migrate_blob_entries(start_at, batch_size)
@@ -78,7 +76,7 @@ namespace :blob_entries do
end
namespace :db_sampler do
task :export => :environment do
task export: :environment do
url_names = ENV["url_names"] || raise("need 'url_names' (comma-separated)")
# outfile_path = ENV["outfile"] || raise("need 'outfile' (file path)")
# outfile = File.open(outfile_path, "wb")
@@ -88,7 +86,7 @@ namespace :db_sampler do
outfile.close if outfile
end
task :import => [:environment] do
task import: [:environment] do
# infile_path = ENV["infile"] || raise("need 'infile' (file path)")
# infile = File.open(infile_path, "rb")
infile = $stdin
@@ -98,23 +96,16 @@ namespace :db_sampler do
end
end
task :good_job do
proxies = ["direct", "proxy-1", "dedipath-1", "serverhost-1"]
proxy = ENV["proxy"]
raise("'proxy' must be set") unless proxy
raise("'proxy' must be one of #{proxies}") unless proxies.include?(proxy)
task good_job: %i[environment set_ar_stdout set_logger_stdout] do
env_hash = {
"RAILS_ENV" => "worker",
"GOOD_JOB_POLL_INTERVAL" => "5",
"GOOD_JOB_MAX_CACHE" => "10000",
"GOOD_JOB_QUEUE_SELECT_LIMIT" => "4096",
"GOOD_JOB_MAX_THREADS" => "4",
"GOOD_JOB_QUEUES" => [
"manual:4",
"fa_post,e621:2",
"*:6",
].reject(&:nil?).join(";"),
"GOOD_JOB_QUEUES" =>
ENV["GOOD_JOB_QUEUES"] ||
%w[manual:4 fa_post,e621:2 *:6].reject(&:nil?).join(";")
}
env_hash.each do |key, value|
@@ -127,7 +118,7 @@ task :good_job do
exec(cmd)
end
task :recompute_job_signatures => :environment do
task recompute_job_signatures: :environment do
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Base.logger.level = :error
@@ -148,21 +139,22 @@ task :recompute_job_signatures => :environment do
end
end
task :workoff_failed_jobs => [:environment, :set_ar_stdout, :set_logger_stdout] do
task workoff_failed_jobs: %i[environment set_ar_stdout set_logger_stdout] do
worker = Delayed::Worker.new
Delayed::Job.where(
"last_error is not null and attempts <= 2"
).find_each(batch_size: 1) do |job|
worker.run(job)
end
Delayed::Job
.where("last_error is not null and attempts <= 2")
.find_each(batch_size: 1) { |job| worker.run(job) }
end
task :reverse_csv do
file = ENV["file"] || raise("need 'file' (file path)")
in_csv = CSV.parse(File.open(file, "r+"), headers: true)
out_csv = CSV.new(File.open("rev_" + file, "w"), write_headers: true, headers: in_csv.headers)
in_csv.reverse_each do |row|
out_csv << row.map(&:second)
end
out_csv =
CSV.new(
File.open("rev_" + file, "w"),
write_headers: true,
headers: in_csv.headers
)
in_csv.reverse_each { |row| out_csv << row.map(&:second) }
out_csv.close
end

View File

@@ -3,7 +3,8 @@ module Domain::E621::Job
queue_as :e621
ignore_signature_args :caused_by_entry
def perform(post:, caused_by_entry:)
def perform(**args)
logger.info("ScanPostJob args: #{args.inspect}")
end
end
end

View File

@@ -2,7 +2,7 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
queue_as :fa_browse_page
ignore_signature_args :caused_by_entry
def perform(args)
def perform(**args)
@caused_by_entry = args[:caused_by_entry]
@first_browse_page_entry = nil
@@ -16,7 +16,9 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
@page_number += 1
end
logger.info("finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages")
logger.info(
"finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages"
)
end
private
@@ -28,20 +30,29 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
url = "https://www.furaffinity.net/browse/#{@page_number}/"
end
response = http_client.get(url, caused_by_entry: @first_browse_page_entry || @caused_by_entry)
response =
http_client.get(
url,
caused_by_entry: @first_browse_page_entry || @caused_by_entry
)
log_entry = response.log_entry
@first_browse_page_entry ||= log_entry
if response.status_code != 200
fatal_error("non 200 response for /browse: #{response.status_code.to_s.underline}")
fatal_error(
"non 200 response for /browse: #{response.status_code.to_s.underline}"
)
end
page = Domain::Fa::Parser::Page.new(response.body)
listing_page_stats = update_and_enqueue_posts_from_listings_page(
:browse_page, page, log_entry,
listing_page_stats =
update_and_enqueue_posts_from_listings_page(
:browse_page,
page,
log_entry,
enqueue_posts_pri: :high,
page_desc: "Browse@#{@page_number}",
fill_id_gaps: true,
fill_id_gaps: true
)
@total_num_new_posts_seen += listing_page_stats.new_seen

View File

@@ -1,5 +1,6 @@
class Scraper::JobBase < ApplicationJob
class JobError < RuntimeError; end
class JobError < RuntimeError
end
def initialize(...)
@deferred_jobs = []
@@ -20,7 +21,8 @@ class Scraper::JobBase < ApplicationJob
good_job_control_concurrency_with(
total_limit: 1,
key: proc do
key:
proc do
raise("too many arguments") if arguments.size > 1
first_argument = arguments || []
first_argument = first_argument[0] || {}
@@ -28,9 +30,10 @@ class Scraper::JobBase < ApplicationJob
ignore_signature_args = self.class.ignore_signature_args
ignore_signature_args << :_aj_symbol_keys
ignore_signature_args << :_aj_ruby2_keywords
sig_arguments = first_argument.reject do |key, value|
ignore_signature_args.include?(key.to_sym)
end.to_h
sig_arguments =
first_argument
.reject { |key, value| ignore_signature_args.include?(key.to_sym) }
.to_h
sig = []
sig << self.class.name || raise
@@ -39,7 +42,7 @@ class Scraper::JobBase < ApplicationJob
sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
sig = sig.join("|")
sig
end,
end
)
# make the concurrency config threadlocal so it can be modified
@@ -50,16 +53,15 @@ class Scraper::JobBase < ApplicationJob
@@gjcc_tl.value
end
PERMITTED_CONTENT_TYPES = [
/text\/html/,
/application\/json/,
]
PERMITTED_CONTENT_TYPES = [%r{text/html}, %r{application/json}]
def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
return unless ENV["enqueue_jobs_from_found_links"] == "1"
start_time = Time.now
unless PERMITTED_CONTENT_TYPES.any? { |ct| ct.match(log_entry.content_type) }
unless PERMITTED_CONTENT_TYPES.any? { |ct|
ct.match(log_entry.content_type)
}
raise("unsupported content type: #{log_entry.content_type}")
end
document = log_entry.response.contents
@@ -68,28 +70,37 @@ class Scraper::JobBase < ApplicationJob
links = link_finder.find_links
job_defs = []
filter_mapper = proc { |list, type, key|
filter_mapper =
proc do |list, type, key|
list.filter { |item| item[:type] == type }.map { |item| item[key] }
}
end
attr_to_mapper = proc { |model_class, attr_sym, in_values|
model_class.where({ attr_sym => in_values }).map do |model|
[model.send(attr_sym), model]
end.to_h
}
attr_to_mapper =
proc do |model_class, attr_sym, in_values|
model_class
.where({ attr_sym => in_values })
.map { |model| [model.send(attr_sym), model] }
.to_h
end
url_name_to_fa_user = attr_to_mapper.call(
Domain::Fa::User, :url_name,
url_name_to_fa_user =
attr_to_mapper.call(
Domain::Fa::User,
:url_name,
filter_mapper.call(links, :fa_user, :url_name)
)
fa_id_to_fa_post = attr_to_mapper.call(
Domain::Fa::Post, :fa_id,
fa_id_to_fa_post =
attr_to_mapper.call(
Domain::Fa::Post,
:fa_id,
filter_mapper.call(links, :fa_post, :fa_id)
)
name_to_twitter_user = attr_to_mapper.call(
Domain::Twitter::User, :name,
name_to_twitter_user =
attr_to_mapper.call(
Domain::Twitter::User,
:name,
filter_mapper.call(links, :twitter_user, :name)
)
@@ -99,57 +110,52 @@ class Scraper::JobBase < ApplicationJob
when :fa_user
url_name = link[:url_name]
user = url_name_to_fa_user[url_name]
params = if user
{ user: user }
else
{ url_name: url_name }
end
params = (user ? { user: user } : { url_name: url_name })
job_def = ({ params: params, desc: "fa user " + (user&.name || url_name).bold })
job_def =
({ params: params, desc: "fa user " + (user&.name || url_name).bold })
if !user || user.due_for_page_scan?
job_defs << job_def.merge(
job: Domain::Fa::Job::UserPageJob,
)
job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob)
elsif !user || user.due_for_gallery_scan?
job_defs << job_def.merge(
job: Domain::Fa::Job::UserGalleryJob,
)
job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob)
end
when :fa_post
fa_id = link[:fa_id]
post = fa_id_to_fa_post[fa_id]
params = if post
{ post: post }
else
{ fa_id: fa_id }
end
params = (post ? { post: post } : { fa_id: fa_id })
job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" })
job_defs << job_def.merge(
job: Domain::Fa::Job::ScanPostJob,
) if !post || (post.state == "ok" && !post.scanned?)
if !post || (post.state == "ok" && !post.scanned?)
job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob)
end
job_defs << job_def.merge(
job: Domain::Fa::Job::ScanFileJob,
) if post && post.state == "ok" && post.scanned? && !post.have_file?
if post && post.state == "ok" && post.scanned? && !post.have_file?
job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob)
end
when :twitter_user
name = link[:name]
user = name_to_twitter_user[name]
params = if user
{ user: user }
else
{ name: name }
end
params = (user ? { user: user } : { name: name })
job_def = ({ params: params, desc: "twitter user " + (user&.name || name).bold })
job_def =
(
{
params: params,
desc: "twitter user " + (user&.name || name).bold
}
)
if !user || user.due_for_timeline_tweets_scan?
job_defs << job_def.merge(
job: Domain::Twitter::Job::UserTimelineTweetsJob,
) if !user || user.due_for_timeline_tweets_scan?
job: Domain::Twitter::Job::UserTimelineTweetsJob
)
end
else
logger.warn("unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}")
logger.warn(
"unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}"
)
end
end
@@ -163,19 +169,20 @@ class Scraper::JobBase < ApplicationJob
job_class = job_def[:job]
params = job_def[:params]
desc = job_def[:desc]
logger.debug([
logger.debug(
[
"link finder -",
job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
desc,
].join(" "))
defer_job(
job_class,
params.merge({ caused_by_entry: log_entry }),
desc
].join(" ")
)
defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
end
duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
logger.info("link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)")
logger.info(
"link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)"
)
end
around_perform do |job, block|
@@ -194,27 +201,9 @@ class Scraper::JobBase < ApplicationJob
rescue => e
error = e
raise e
ensure
Metrics::Client.singleton.write_point(
self,
"job_performed",
tags: {
job_queue: job.queue_name,
job_class: job.class.name,
success: error.nil?,
error_class: error&.class&.name,
},
fields: {
duration_ms: duration_ms,
},
)
end
after_perform do |job|
ColorLogger.quiet do
job.enqueue_deferred_jobs!
end
end
after_perform { |job| ColorLogger.quiet { job.enqueue_deferred_jobs! } }
def defer_job(job_class, params, set_args = {})
@deferred_jobs << [job_class, params, set_args]

View File

@@ -24,7 +24,10 @@ class ColorLogger
klass_name = "(Anonymous)"
end
logger = Logger.new(sink).tap do |logger|
logger =
Logger
.new(sink)
.tap do |logger|
prefix = ""
logger.define_singleton_method :prefix= do |p|
prefix = p
@@ -33,17 +36,20 @@ class ColorLogger
sink
end
logger.formatter = proc do |severity, datetime, progname, msg|
color = case severity
when "ERROR" then :red
when "WARN" then :yellow
else :light_blue
logger.formatter =
proc do |severity, datetime, progname, msg|
color =
case severity
when "ERROR"
:red
when "WARN"
:yellow
else
:light_blue
end
klass_name_str = "[#{klass_name.send(color)}]".ljust(32)
if prefix.is_a?(Proc)
prefix = prefix.call
end
prefix = prefix.call if prefix.is_a?(Proc)
if @quiet.value > 0
""
@@ -53,11 +59,7 @@ class ColorLogger
end
end
if Rails.env.production?
ActiveSupport::TaggedLogging.new(logger)
else
logger
end
end
def self.klass_name_from_instance(instance)
@@ -66,40 +68,7 @@ class ColorLogger
else
klass_name = instance.class.name.dup
end
klass_name.delete_prefix!("Domain::")
prefixes = [
{
if_prefix: "Fa::Scraper::",
replace_with: "Fa::",
},
{
if_prefix: "Fa::Job::",
replace_with: "Fa::",
delete_suffix: "Job",
},
{
if_prefix: "Twitter::Job::",
replace_with: "Twitter::",
delete_suffix: "Job",
},
{
if_prefix: "E621::Job::",
replace_with: "E621::",
delete_suffix: "Job",
},
].each do |conf|
if klass_name.start_with?(conf[:if_prefix])
klass_name.delete_prefix!(conf[:if_prefix])
if (suffix = conf[:delete_suffix])
klass_name.delete_suffix!(suffix)
end
klass_name = conf[:replace_with] + klass_name
break
end
end
klass_name
end
end

View File

@@ -1,54 +0,0 @@
class Metrics::Client
include HasColorLogger
REPORT = !Rails.env.test?
def self.singleton
@singleton ||= Metrics::Client.new
end
def self.singleton=(instance)
@singleton = instance
end
private
def initialize(default_tags: {})
unless REPORT
logger.warn "not reporting metrics for '#{Rails.env.to_s.bold}' environment"
return
end
host = Rails.application.config.x.influxdb.host || raise("no host")
bucket = Rails.application.config.x.influxdb.bucket || raise("no bucket")
@client = InfluxDB2::Client.new(
host, "",
org: "",
bucket: bucket,
precision: InfluxDB2::WritePrecision::MILLISECOND,
use_ssl: false,
)
write_options = InfluxDB2::WriteOptions.new(
write_type: InfluxDB2::WriteType::BATCHING,
batch_size: 100, flush_interval: 5_000,
max_retries: 3, max_retry_delay: 15_000,
exponential_base: 2,
)
point_settings = InfluxDB2::PointSettings.new(default_tags: default_tags)
@writer = @client.create_write_api(
write_options: write_options,
point_settings: point_settings,
)
end
public
def write_point(caller, name, tags: {}, fields: {})
return unless REPORT
@writer.write(data: { name: name, tags: tags, fields: fields })
rescue => e
logger.error("error writing influxdb metric: #{e}")
end
end

View File

@@ -1,30 +0,0 @@
class Metrics::EstimateDbRowsReporter < Metrics::Reporter
def initialize
log_writes!
super
end
def report
extra_tables = [
"http_log_entries",
"http_log_entry_headers",
"versions",
"delayed_jobs",
"blob_entries_p",
]
extra_tables_sql = extra_tables.map { |t| "'#{t}'" }.join(",")
row_estimates = ReduxApplicationRecord.connection.exec_query(
[
"SELECT relname, n_live_tup",
"FROM pg_stat_all_tables",
"WHERE relname IN (#{extra_tables_sql})",
"OR relname like 'domain_%'",
"OR relname like 'blob_entries_p_%'",
].join(" ")
).rows.to_h
write_point(
"estimate_db_rows", fields: row_estimates,
)
end
end

View File

@@ -1,30 +0,0 @@
class Metrics::GoodJobReporter < Metrics::Reporter
def initialize
super
end
def report
total = GoodJob::Job.count
by_queue = GoodJob::Job.group("queue_name").count
by_state = GoodJob::JobsFilter.new({}).states
logger.info "job queue metrics: total=#{total}, by_queue=#{by_queue}, by_state=#{by_state}"
write_point(
"job_queues",
tags: { aggregation: "total" },
fields: { "total" => total },
)
write_point(
"job_queues",
tags: { aggregation: "queue" },
fields: by_queue,
)
write_point(
"job_queues",
tags: { aggregation: "state" },
fields: by_state,
)
end
end

View File

@@ -1,20 +0,0 @@
class Metrics::Reporter
include HasColorLogger
def initialize
@client = Metrics::Client.singleton
end
def log_writes!
@log_writes = true
end
def write_point(name, tags: {}, fields: {})
@client.write_point(self, name, tags: tags, fields: fields)
logger.info("reporter wrote point '#{name.bold}'") if @log_writes
end
def report
raise NotImplementedError, "implement in subclass"
end
end

View File

@@ -58,19 +58,27 @@ class Scraper::ClientFactory
end
def self._gallery_dl_client_impl
@gallery_dl_clients.value ||= begin
proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
@gallery_dl_clients.value ||=
begin
proxy_config =
Rails.application.config.x.proxy || raise("no proxy config")
if proxy_config[:gallery_dl].blank?
raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}")
raise(
"no gallery_dl host defined for proxy config #{proxy_config[:name]}"
)
end
Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl])
Scraper::GalleryDlClient.new(
proxy_config[:name],
proxy_config[:gallery_dl]
)
end
end
def self._http_client_impl(key, config_klass)
@http_clients.value[key] ||= begin
proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
# proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
# performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
performer = Scraper::CurlHttpPerformer.new
Scraper::HttpClient.new(config_klass.new, performer)
end
end

View File

@@ -0,0 +1,106 @@
class Scraper::CurlHttpPerformer
Response =
Struct.new(:response_code, :response_headers, :response_time_ms, :body)
def get(url, request_headers)
do_request(:http_get, url, request_headers)
end
def post(url, request_headers)
do_request(:http_post, url, request_headers)
end
def name
"direct"
end
METHOD_MAP = { http_get: "GET", http_post: "POST" }
def do_request(method, url, request_headers)
t, curl = get_curl
start_at = Time.now
curl.url = Addressable::URI.encode url
curl.follow_location = true
request_headers.each { |key, value| curl.headers[key.to_s] = value }
curl.headers["User-Agent"] = "FurryArchiver/1.0 / dhelta"
curl.send(method)
response_time_ms = ((Time.now - start_at) * 1000).to_i
response_code = curl.response_code
body_str = curl.body_str
response_headers = self.class.parse_header_str(curl.header_str).to_h
rc_color =
case response_code / 100
when 2
[:green]
when 3
[:cyan]
when 4
[:red]
when 5
%i[red bold]
else
[:white]
end
rc_string = response_code.to_s
rc_color.each { |m| rc_string = rc_string.send(m) }
cf_cache_status =
if response_headers["cf-cache-status"]
"(#{response_headers["cf-cache-status"].light_blue})"
else
nil
end
puts [
"[#{Process.pid.to_s.black.bold} / #{t.name.to_s.black.bold}]",
"[#{METHOD_MAP[method]} #{rc_string}",
"#{self.class.humansize(body_str.size).to_s.bold}",
(response_time_ms.to_s + "ms").light_blue.bold + "]",
cf_cache_status,
"#{url.bold}"
].reject(&:nil?).join(" ")
Response.new(response_code, response_headers, response_time_ms, body_str)
end
private
def get_curl
t = Thread.current
unless t.thread_variable?(:curl)
t.thread_variable_set(:curl, Curl::Easy.new)
end
curl = t.thread_variable_get(:curl)
curl.headers = {}
[t, curl]
end
def self.humansize(size)
units = %w[B KiB MiB GiB TiB PiB EiB ZiB]
return "0.0 B" if size == 0
exp = (Math.log(size) / Math.log(1024)).to_i
exp += 1 if (size.to_f / 1024**exp >= 1024 - 0.05)
exp = units.size - 1 if exp > units.size - 1
unit = units[exp]
if unit == "B"
"%d B" % [size]
else
"%.1f %s" % [size.to_f / 1024**exp, unit]
end
end
def self.parse_header_str(header_str)
header_str
.split("\r\n")
.each_with_index
.map do |str, idx|
next nil if idx == 0
idx = str.index(": ")
next nil unless idx
[str[0...idx], str[idx + 2..-1]]
end
.reject(&:nil?)
end
end

View File

@@ -1,23 +1,13 @@
class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig
DEFAULT_ALLOWED_DOMAINS = [
"*.furaffinity.net",
"*.facdn.net",
"ipinfo.io",
]
DEFAULT_ALLOWED_DOMAINS = %w[*.furaffinity.net *.facdn.net ipinfo.io]
def cookies
proxy_name = Rails.application.config.x.proxy_name
Rails.application.config.x.cookies.fa[
proxy_name.to_s
] || raise("no fa cookies defined for #{proxy_name}")
Rails.application.config.x.cookies.fa["direct"] ||
raise("no fa cookies defined for direct")
end
def ratelimit
[
["d.furaffinity.net", :none],
["*.facdn.net", :none],
["*", 0.25],
]
[["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 0.25]]
end
def allowed_domains

View File

@@ -2,13 +2,10 @@ class Scraper::HttpClient
include HasColorLogger
attr_reader :config
Response = Struct.new(
:status_code,
:body,
:log_entry,
)
Response = Struct.new(:status_code, :body, :log_entry)
class InvalidURLError < ArgumentError; end
class InvalidURLError < ArgumentError
end
def initialize(config, http_performer)
@config = config
@@ -17,7 +14,8 @@ class Scraper::HttpClient
init_cookie_jar
# TODO - populate AdaptiveCache with domains this client cares about
@max_cache_size = 8
@blob_entry_cache = Hash.new do |hash, key|
@blob_entry_cache =
Hash.new do |hash, key|
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
end
logger.level = :info
@@ -46,12 +44,14 @@ class Scraper::HttpClient
url = uri.to_s
unless allowed_domain?(uri.host)
raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}")
raise InvalidURLError.new(
"domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}"
)
end
ratelimit_conf = config.ratelimit.find do |conf|
File.fnmatch? conf[0], uri.host
end || ["*", :none]
ratelimit_conf =
config.ratelimit.find { |conf| File.fnmatch? conf[0], uri.host } ||
["*", :none]
ratelimit_delay = ratelimit_conf[1]
if ratelimit_delay == :none
logger.debug("no rate limit for #{uri.host}")
@@ -68,7 +68,10 @@ class Scraper::HttpClient
end
request_headers = {
"cookie" => HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url)),
"cookie" =>
HTTP::Cookie.cookie_value(
@cookie_jar.cookies(Addressable::URI.encode url)
)
}
requested_at = Time.now
response = @http_performer.do_request(method, url, request_headers)
@@ -77,47 +80,57 @@ class Scraper::HttpClient
response_headers = response.response_headers
response_time_ms = response.response_time_ms
response_body = response.body
@domain_last_requested_at[uri.host] = requested_at + (response_time_ms.to_f / 1000)
@domain_last_requested_at[uri.host] = requested_at +
(response_time_ms.to_f / 1000)
content_type = response_headers["Content-Type"] ||
response_headers["content-type"] ||
content_type =
response_headers["Content-Type"] || response_headers["content-type"] ||
"none/none"
cache_key = "#{uri.host}|#{content_type}"
blob_entry_cache = @blob_entry_cache[cache_key]
candidates = if blob_entry_cache.at_capacity? && rand(0..100) >= 5
candidates =
if blob_entry_cache.at_capacity? && rand(0..100) >= 5
blob_entry_cache.candidates
else
[]
end
candidates << caused_by_entry.response if caused_by_entry&.response
candidates << caused_by_entry.response.base if caused_by_entry&.response&.base
if caused_by_entry&.response&.base
candidates << caused_by_entry.response.base
end
retries = 0
begin
response_blob_entry = BlobEntryP.find_or_build(
response_blob_entry =
BlobEntryP.find_or_build(
content_type: content_type,
contents: response_body,
candidates: candidates,
candidates: candidates
)
scrubbed_uri = @config.scrub_stored_uri(uri)
log_entry = HttpLogEntry.new({
log_entry =
HttpLogEntry.new(
{
uri: scrubbed_uri.to_s,
verb: method,
content_type: content_type,
status_code: response_code,
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
request_headers:
HttpLogEntryHeader.find_or_build(headers: request_headers),
response_headers:
HttpLogEntryHeader.find_or_build(headers: response_headers),
response: response_blob_entry,
response_time_ms: response_time_ms,
requested_at: requested_at,
caused_by_entry: caused_by_entry,
performed_by: @http_performer.name,
})
performed_by: @http_performer.name
}
)
log_entry.save!
rescue
rescue StandardError
retries += 1
retry if retries < 2
raise
@@ -129,11 +142,14 @@ class Scraper::HttpClient
)
else
blob_entry_cache.insert(
HexUtil.bin2hex(response_blob_entry.sha256)[0..8], response_blob_entry, scrubbed_uri
HexUtil.bin2hex(response_blob_entry.sha256)[0..8],
response_blob_entry,
scrubbed_uri
)
end
response_code_colorized = if response_code == 200
response_code_colorized =
if response_code == 200
response_code.to_s.light_green
elsif (response_code / 100) == 4 || (response_code / 100) == 5
response_code.to_s.red.bold
@@ -142,51 +158,30 @@ class Scraper::HttpClient
end
total_time_ms = ((Time.now - requested_at) * 1000).round(0)
logger.info([
logger.info(
[
"[entry #{log_entry.id.to_s.bold} /",
"GET #{response_code_colorized} /",
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
scrubbed_uri.to_s.black,
].reject(&:nil?).join(" "))
scrubbed_uri.to_s.black
].reject(&:nil?).join(" ")
)
if response_code == 524 || response_code == 502 || response_code == 503 || response_code == 403
if response_code == 524 || response_code == 502 || response_code == 503 ||
response_code == 403
logger.error("HTTP #{response_code} - sleeping to not throttle server")
sleep 15
end
if content_type =~ /text\/plain/ && response_body =~ /Your IP address has been temporarily banned/
if content_type =~ %r{text/plain} &&
response_body =~ /Your IP address has been temporarily banned/
logger.error("IP temp banned, sleeping & raising...")
sleep 60 * 5
raise
end
Metrics::Client.singleton.write_point(
self,
"http_client_response",
tags: {
method: "GET",
host: uri.host,
status_code: response_code,
content_type: clean_content_type(content_type),
performed_by: @http_performer.name,
},
fields: {
response_time_ms: response_time_ms,
total_time_ms: total_time_ms,
content_size: response_blob_entry.size,
content_stored: response_blob_entry.bytes_stored,
uri: scrubbed_uri.to_s,
status_code: response_code,
content_type: clean_content_type(content_type),
},
)
Response.new(
response_code,
response_body,
log_entry
)
Response.new(response_code, response_body, log_entry)
end
def allowed_domain?(domain)
@@ -206,7 +201,7 @@ class Scraper::HttpClient
name: cookie_key_conf[:name],
value: cookie_key_conf[:value],
path: cookie_key_conf[:path] || "",
for_domain: true,
for_domain: true
}
@cookie_jar.add(HTTP::Cookie.new(conf))
end

View File

@@ -1,56 +0,0 @@
require "base64"
class Scraper::HttpPerformer
include HasColorLogger
Response = Struct.new(
:response_code,
:response_headers,
:response_time_ms,
:body
)
def initialize(name, host)
name || raise("must provide name")
host || raise("must provide host")
logger.info("#{Thread.name.to_s.light_black} - build performer #{name.to_s.green} - #{host.to_s.green.bold}")
@name = name
@client = Ripcord::Client.new(host)
end
def name
@name
end
def get(uri, headers)
do_request("get", uri, headers)
end
def post(uri, headers)
do_request("post", uri, headers)
end
METHODS = %i[get post].freeze
def do_request(method, uri, headers)
raise "invalid method #{method}" unless METHODS.include?(method)
send_token = SecureRandom.uuid
rpc_result = @client.call(method.to_s, [uri, headers, send_token])
raise "rpc error: #{rpc_result.error.inspect}" unless rpc_result.successful?
recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(rpc_result.result))
if send_token != recv_token
raise "token mismatch: #{send_token} != #{recv_token}"
end
Scraper::HttpPerformer::Response.new(code, headers, time_ms, body)
end
def close!
begin
@client.instance_variable_get("@http_client")&.finish
rescue IOError => e
Rails.logger.error("Failed to close http client: #{e.inspect}")
end
end
private
end

View File

@@ -1,59 +1,54 @@
class Domain::Fa::User < ReduxApplicationRecord
self.table_name = 'domain_fa_users'
self.table_name = "domain_fa_users"
has_lite_trail(schema_version: 1)
has_many :posts,
class_name: '::Domain::Fa::Post',
class_name: "::Domain::Fa::Post",
inverse_of: :creator,
foreign_key: :creator_id
has_one :disco,
class_name: '::Domain::Fa::UserFactor',
class_name: "::Domain::Fa::UserFactor",
inverse_of: :user,
foreign_key: :user_id,
dependent: :destroy
has_one :avatar,
class_name: '::Domain::Fa::UserAvatar',
class_name: "::Domain::Fa::UserAvatar",
inverse_of: :user,
dependent: :destroy
enum :state, [
enum :state,
[
:ok, # so far so good, user may not yet be scanned
:scan_error # user has been removed or otherwise, see state_detail
]
# Who this user follows (join table)
has_many :follower_joins,
class_name: '::Domain::Fa::Follow',
class_name: "::Domain::Fa::Follow",
foreign_key: :follower_id,
inverse_of: :follower,
dependent: :destroy
# Who this user follows (User model)
has_many :follows,
through: :follower_joins,
source: :followed
has_many :follows, through: :follower_joins, source: :followed
# Who follows this user (join table)
has_many :followed_joins,
class_name: '::Domain::Fa::Follow',
class_name: "::Domain::Fa::Follow",
foreign_key: :followed_id,
inverse_of: :followed,
dependent: :destroy
# Who follows this user (User model)
has_many :followed_by,
through: :followed_joins,
source: :follower
has_many :followed_by, through: :followed_joins, source: :follower
has_many :fav_post_joins,
class_name: '::Domain::Fa::Fav',
inverse_of: :user
has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :user
has_many :fav_posts,
class_name: '::Domain::Fa::Post',
class_name: "::Domain::Fa::Post",
through: :fav_post_joins,
source: :post
@@ -64,7 +59,8 @@ class Domain::Fa::User < ReduxApplicationRecord
if name && url_name
expected = self.class.name_to_url_name(name)
matches = if name.length >= 30
matches =
if name.length >= 30
url_name.starts_with?(expected)
else
url_name == expected
@@ -79,17 +75,11 @@ class Domain::Fa::User < ReduxApplicationRecord
end
if url_name && url_name =~ /[A-Z]/
errors.add(
:url_name,
"url_name '#{url_name}' contains uppercase"
)
errors.add(:url_name, "url_name '#{url_name}' contains uppercase")
end
if url_name && url_name =~ /\s/
errors.add(
:url_name,
"url_name '#{url_name}' contains whitespace"
)
errors.add(:url_name, "url_name '#{url_name}' contains whitespace")
end
end
@@ -99,9 +89,7 @@ class Domain::Fa::User < ReduxApplicationRecord
self.log_entry_detail ||= {}
end
before_destroy do
throw :abort if posts.any?
end
before_destroy { throw :abort if posts.any? }
SCAN_TYPES = {
page: 1.month,
@@ -120,9 +108,7 @@ class Domain::Fa::User < ReduxApplicationRecord
}
SCAN_TYPES.keys.each do |scan_type|
define_method(:"due_for_#{scan_type}_scan?") do
scan_due?(scan_type)
end
define_method(:"due_for_#{scan_type}_scan?") { scan_due?(scan_type) }
define_method(:"time_ago_for_#{scan_type}_scan") do
scanned_ago_in_words(scan_type)
@@ -143,14 +129,15 @@ class Domain::Fa::User < ReduxApplicationRecord
def scanned_ago_in_words(scan_type)
if (timestamp = get_scanned_at_value(scan_type))
DATE_HELPER.time_ago_in_words(timestamp) + ' ago'
DATE_HELPER.time_ago_in_words(timestamp) + " ago"
else
'never'
"never"
end
end
def scan_due?(scan_type)
duration = SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'")
duration =
SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'")
timestamp = get_scanned_at_value(scan_type)
timestamp.nil? || timestamp <= duration.ago
end
@@ -165,13 +152,13 @@ class Domain::Fa::User < ReduxApplicationRecord
end
def avatar_or_create
self.class.transaction do
avatar || create_avatar!
end
self.class.transaction { avatar || create_avatar! }
end
def self.find_or_build_from_submission_parser(submission_parser)
unless submission_parser.is_a?(Domain::Fa::Parser::ListedSubmissionParserHelper) ||
unless submission_parser.is_a?(
Domain::Fa::Parser::ListedSubmissionParserHelper
) ||
submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper)
raise ArgumentError
end
@@ -181,16 +168,11 @@ class Domain::Fa::User < ReduxApplicationRecord
end
end
URL_NAME_EXCEPTIONS = {
'Kammiu' => 'rammiu'
}
URL_NAME_EXCEPTIONS = { "Kammiu" => "rammiu" }
def self.name_to_url_name(name)
name = name.strip
URL_NAME_EXCEPTIONS[name] || name
.delete('_')
.gsub(/\s/, '')
.downcase
URL_NAME_EXCEPTIONS[name] || name.delete("_").gsub(/\s/, "").downcase
end
# TODO: - write method for getting suggested users to follow
@@ -215,20 +197,22 @@ class Domain::Fa::User < ReduxApplicationRecord
end
def guess_user_page_log_entry
for_path = proc { |uri_path|
HttpLogEntry.where(
uri_scheme: 'https',
uri_host: 'www.furaffinity.net',
for_path =
proc do |uri_path|
HttpLogEntry
.where(
uri_scheme: "https",
uri_host: "www.furaffinity.net",
uri_path: uri_path
).order(created_at: :desc).first
}
)
.order(created_at: :desc)
.first
end
for_hle_id = proc { |hle_id|
hle_id && HttpLogEntry.find_by(id: hle_id)
}
for_hle_id = proc { |hle_id| hle_id && HttpLogEntry.find_by(id: hle_id) }
# older versions don't end in a trailing slash
hle_id = self.log_entry_detail && self.log_entry_detail['last_user_page_id']
hle_id = self.log_entry_detail && self.log_entry_detail["last_user_page_id"]
# first try the last scanned user page (present on most fa user models)
for_hle_id.call(hle_id) ||
@@ -247,18 +231,19 @@ class Domain::Fa::User < ReduxApplicationRecord
private
def similar_users_by(factor_col, exclude_followed_by)
query =
disco
.nearest_neighbors(factor_col, distance: 'euclidean')
query = disco.nearest_neighbors(factor_col, distance: "euclidean")
query = query.where.not(user_id: exclude_followed_by.follows.select(:followed_id)) if exclude_followed_by
query =
query.where.not(
user_id: exclude_followed_by.follows.select(:followed_id)
) if exclude_followed_by
users_from_disco_query(query)
end
def users_from_disco_query(disco_query)
Domain::Fa::User
.select('domain_fa_users.*', disco_query.select_values.last)
.select("domain_fa_users.*", disco_query.select_values.last)
.joins(:disco)
.merge(disco_query.reselect(:user_id))
end
@@ -270,7 +255,8 @@ class Domain::Fa::User < ReduxApplicationRecord
when :state_detail
str = state_detail["scanned_#{scan_type}_at"]
Time.parse(str) if str
else raise("invalid scan type '#{scan_type}'")
else
raise("invalid scan type '#{scan_type}'")
end
end
@@ -280,7 +266,8 @@ class Domain::Fa::User < ReduxApplicationRecord
send(:"scanned_#{scan_type}_at=", value)
when :state_detail
state_detail["scanned_#{scan_type}_at"] = value.iso8601
else raise("invalid scan type '#{scan_type}'")
else
raise("invalid scan type '#{scan_type}'")
end
end
end

View File

@@ -25,12 +25,5 @@ module ReduxScraper
config.time_zone = "Pacific Time (US & Canada)"
# config.eager_load_paths << Rails.root.join("extras")
config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
config.x.proxies = ReduxScraper::Application.config_for("proxies")
config.x.influxdb = ReduxScraper::Application.config_for("influxdb")
proxy_name = (ENV["proxy"] || "direct").to_sym
config.x.proxy_name = proxy_name
config.x.proxy = Rails.application.config.x.proxies[proxy_name] || raise("no proxy config for '#{proxy_name}'")
config.x.proxy[:name] = proxy_name
end
end

View File

@@ -28,9 +28,12 @@ redux_dev: &redux_dev
adapter: postgresql
host: localhost
port: 5432
database: redux_development
username: scraper_redux
password: pdkFLqRmQwPUPaDDC4pX
# database: redux_development
# username: scraper_redux
# password: pdkFLqRmQwPUPaDDC4pX
database: postgres
username: postgres
password: postgres
migrations_paths: db/redux_migrate
pool: 4
@@ -53,21 +56,25 @@ legacy_staging: &legacy_staging
local_redux_test: &local_redux_test
adapter: postgresql
host: localhost
host: db
port: 5432
database: redux_test
username: scraper_redux
password: pdkFLqRmQwPUPaDDC4pX
# username: scraper_redux
# password: pdkFLqRmQwPUPaDDC4pX
username: postgres
password: postgres
migrations_paths: db/redux_migrate
pool: 4
local_legacy_test: &local_legacy_test
adapter: postgresql
host: localhost
host: db
port: 5432
database: legacy_test
username: scraper_redux
password: pdkFLqRmQwPUPaDDC4pX
# username: scraper_redux
# password: pdkFLqRmQwPUPaDDC4pX
username: postgres
password: postgres
migrations_paths: db/legacy_migrate
pool: 4

View File

@@ -12,11 +12,13 @@ Rails.application.configure do
config.good_job.cleanup_interval_jobs = 100_000
config.good_job.cleanup_interval_seconds = 4.hours
config.good_job.logger = Logger.new(STDOUT)
config.good_job.logger.level = :warn
config.good_job.logger.level = :info
if Rails.env.worker?
config.good_job.execution_mode = :async
config.good_job.on_thread_error = ->(exception) { Rails.logger.error("GoodJob exception: #{exception}") }
config.good_job.on_thread_error = ->(exception) do
Rails.logger.error("GoodJob exception: #{exception}")
end
else
config.good_job.execution_mode = :external
end

View File

@@ -1,41 +0,0 @@
client = InfluxDB2::Client.new(
Rails.application.config.x.influxdb.host || raise("no host"),
"",
bucket: Rails.application.config.x.influxdb.bucket || raise("no bucket"),
org: "",
precision: InfluxDB2::WritePrecision::NANOSECOND,
use_ssl: false,
)
write_api = client.create_write_api
ActiveSupport::Notifications.subscribe "process_action.action_controller" do |name, started, finished, unique_id, data|
request = data[:request]
api_token = request.params[:api_token]
api_user = api_token ? ApplicationController::API_TOKENS[api_token] : nil
ip = request.headers["HTTP_CF_CONNECTING_IP"] || request.remote_ip
hash = {
name: "process_action.action_controller",
tags: {
method: "#{data[:controller]}##{data[:action]}",
format: data[:format],
http_method: data[:method],
status: data[:status],
exception: data[:exception]&.first,
},
fields: {
ip: ip,
api_token: api_token,
api_user: api_user,
time_in_controller: (finished - started) * 1000,
time_in_view: (data[:view_runtime] || 0).ceil,
time_in_db: (data[:db_runtime] || 0).ceil,
request_id: unique_id,
},
time: started,
}
write_api.write(data: hash)
rescue => e
Rails.logger.error("error writing influxdb metric: #{e}")
end

View File

@@ -1,29 +0,0 @@
default: &default
direct:
http: http://127.0.0.1:9292
gallery_dl: http://127.0.0.1:5001
proxy-1:
http: http://127.0.0.1:9292
gallery_dl: http://127.0.0.1:5001
production:
<<: *default
staging:
<<: *default
development:
direct: {}
proxy-1: {}
dedipath-1: {}
serverhost-1: {}
worker:
<<: *default
test:
direct: {}
proxy-1: {}
dedipath-1: {}
serverhost-1: {}

View File

@@ -1,9 +1,9 @@
namespace :metrics do
desc "run reporters periodically"
task :report_all => [:environment, :set_logger_stdout] do
task report_all: %i[environment set_logger_stdout] do
schedule = {
Rake::Task["metrics:jobs"] => 60.seconds,
Rake::Task["metrics:estimate_db_rows"] => 60.seconds,
Rake::Task["metrics:estimate_db_rows"] => 60.seconds
}
last_ran = {}
@@ -24,14 +24,4 @@ namespace :metrics do
sleep sleep_amt
end
end
desc "Report job queue metrics"
task :jobs => :environment do
Metrics::GoodJobReporter.new.report
end
desc "Report estimated db row metrics"
task :estimate_db_rows => :environment do
Metrics::EstimateDbRowsReporter.new.report
end
end