remove proxy code

Dylan Knutson
2024-12-17 17:57:17 +00:00
parent aad2f29e96
commit 9de3c1b5e8
36 changed files with 565 additions and 771 deletions

View File

@@ -1,9 +0,0 @@
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=password
PGADMIN_LISTEN_PORT=8080
# no password required
PGADMIN_CONFIG_SERVER_MODE=False
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED=False
RAILS_ENV=development
NODE_ENV=development

View File

@@ -19,7 +19,7 @@ RUN bundle install
 RUN rake compile
 
 # Primary image
-FROM mcr.microsoft.com/devcontainers/base:bookworm
+FROM mcr.microsoft.com/devcontainers/ruby:1-3.2-bookworm
 
 # apt caching & install packages
 RUN rm -f /etc/apt/apt.conf.d/docker-clean; \
@@ -48,21 +48,34 @@ RUN \
   libvips42 \
   libyaml-dev \
   patch \
-  rbenv \
+  # rbenv \
+  iputils-ping \
   rustc \
   uuid-dev \
   zlib1g-dev
 
-USER vscode
-ENV PATH=/home/vscode/.rbenv/shims:$PATH
-ENV CONFIGURE_OPTS="--disable-install-rdoc"
-RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build
-RUN rbenv install 3.2.0
-RUN rbenv global 3.2.0
+# USER vscode
+# ENV PATH=/home/vscode/.rbenv/shims:$PATH
+# ENV CONFIGURE_OPTS="--disable-install-rdoc"
+# RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build
+# RUN rbenv install 3.2.0
+# RUN rbenv global 3.2.0
 
 COPY --from=native-gems /usr/src/app/gems/xdiff-rb /gems/xdiff-rb
 COPY --from=native-gems /usr/src/app/gems/rb-bsdiff /gems/rb-bsdiff
 
 ENV RAILS_ENV development
+ENV RUBOCOP_OPTS --server
+
+# [Optional] Uncomment this line to install additional gems.
+RUN su vscode -c "gem install bundler -v '2.4.1'" && \
+  su vscode -c "gem install rake -v '13.0.6'" && \
+  su vscode -c "gem install ruby-lsp -v '0.22.1'" && \
+  su vscode -c "gem install rubocop -v '1.69.2'"
+
+# install exo
+RUN su vscode -c "curl -sL https://exo.deref.io/install | bash"
+ENV PATH "/home/vscode/.exo/bin:$PATH"
+
+RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && nvm install 18 && nvm use 18 && npm install -g yarn" 2>&1
+ENV PATH /usr/local/share/nvm/current/bin:$PATH

View File

@@ -0,0 +1,7 @@
FROM postgres:17
RUN apt-get update && apt-get install -y \
postgresql-17-pgvector \
&& rm -rf /var/lib/apt/lists/*
RUN echo "CREATE EXTENSION pgvector;" >> /docker-entrypoint-initdb.d/01-pgvector.sql

View File

@@ -0,0 +1,5 @@
CREATE USER vscode CREATEDB;
CREATE DATABASE vscode WITH OWNER vscode;
CREATE DATABASE redux_test WITH OWNER vscode;
CREATE DATABASE legacy_test WITH OWNER vscode;

View File

@@ -1,42 +1,25 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
+// README at: https://github.com/devcontainers/templates/tree/main/src/ruby-rails-postgres
 {
-  "name": "Redux Scraper",
-  // "build": {
-  //   // Sets the run context to one level up instead of the .devcontainer folder.
-  //   "context": "..",
-  //   // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
-  //   "dockerfile": "Dockerfile"
-  // },
+  "name": "Ruby on Rails & Postgres",
   "dockerComposeFile": "docker-compose.yml",
   "service": "app",
   "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",
-  // "mounts": [
-  //   {
-  //     "type": "volume",
-  //     "source": "${devcontainerId}-gems",
-  //     "target": "/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems"
-  //   }
-  // ],
   "features": {
     "ghcr.io/meaningful-ooo/devcontainer-features/fish:1": {},
-    "ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {},
-    "ghcr.io/devcontainers/features/node:1": {
-      "installYarnUsingApt": true,
-      "version": "18",
-      "pnpmVersion": "none",
-      "nvmVersion": "latest"
-    }
+    "ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {}
   },
-  // "runArgs": [
-  //   "--network=host"
-  // ],
   // Use 'forwardPorts' to make a list of ports inside the container available locally.
-  // "forwardPorts": [],
-  // Uncomment the next line to run commands after the container is created.
-  "postCreateCommand": "${containerWorkspaceFolder}/.devcontainer/post-create.sh"
+  // This can be used to network with other containers or the host.
+  // "forwardPorts": [3000, 5432],
+  // Use 'postCreateCommand' to run commands after the container is created.
+  // "postCreateCommand": "bundle install && rake db:setup",
+  "postCreateCommand": ".devcontainer/post-create.sh",
+  "forwardPorts": [
+    8080, // pgadmin
+    3000 // rails
+  ]
   // Configure tool-specific properties.
-  // "customizations": {},
-  // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
-  // "remoteUser": "devcontainer"
+  // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+  // "remoteUser": "root"
 }

View File

@@ -1,33 +1,45 @@
-version: '3.8'
-
-volumes:
-  devcontainer-redux-postgres-data:
-  devcontainer-redux-gem-cache:
+version: '3'
 
 services:
   app:
     build:
       context: ..
-      dockerfile: Dockerfile.devcontainer
+      dockerfile: .devcontainer/Dockerfile.devcontainer
     volumes:
-      # - ../..:/workspaces:cached
-      - devcontainer-redux-gem-cache:/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems
+      - ../..:/workspaces:cached
+      - devcontainer-redux-gem-cache:/usr/local/rvm/gems
+    # Overrides default command so things don't shut down after the process ends.
     command: sleep infinity
+    # Runs app on the same network as the database container, allows "forwardPorts" in devcontainer.json function.
     network_mode: service:db
-    env_file:
-      - .env
 
   db:
-    image: pgvector/pgvector:pg17
+    build:
+      context: .
+      dockerfile: Dockerfile.postgres
     restart: unless-stopped
     volumes:
-      - devcontainer-redux-postgres-data:/var/lib/postgresql/data
-    env_file:
-      - .env
+      - postgres-data:/var/lib/postgresql/data
+      - ./create-db-user.sql:/docker-entrypoint-initdb.d/create-db-user.sql
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_DB: postgres
+      POSTGRES_PASSWORD: postgres
 
   pgadmin:
     image: dpage/pgadmin4:8.13.0
     restart: unless-stopped
     network_mode: service:db
-    env_file:
-      - .env
+    environment:
+      PGADMIN_DEFAULT_EMAIL: admin@example.com
+      PGADMIN_DEFAULT_PASSWORD: password
+      PGADMIN_LISTEN_PORT: 8080
+      PGADMIN_CONFIG_SERVER_MODE: False
+      PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: False
+
+volumes:
+  postgres-data:
+  devcontainer-redux-gem-cache:

View File

@@ -0,0 +1,11 @@
#!/bin/bash -ex
# EDITOR_BIN="$(ls /vscode/cursor-server/bin/*/*/bin/remote-cli/cursor | head -n 1)"
EDITOR_BIN=cursor
function install_extension() {
$EDITOR_BIN --install-extension "$1"
}
install_extension Shopify.ruby-extensions-pack
install_extension dbaeumer.vscode-eslint
install_extension aliariff.vscode-erb-beautify

View File

@@ -6,14 +6,9 @@ echo "Gem: $(which gem)"
 echo "Bundler: $(which bundler)"
 echo "Rake: $(which rake)"
 
-gem install bundler -v '2.4.1'
-gem install rake -v '13.0.6'
-gem install ruby-lsp -v '0.22.1'
-gem install rubocop -v '1.69.2'
-
 bundle install --jobs $(getconf _NPROCESSORS_ONLN)
 rbenv rehash
+rubocop --start-server
 
 npm install yarn
 bin/rails yarn:install
 yarn

View File

@@ -1 +1 @@
-3.2.0
+system

View File

@@ -1,8 +0,0 @@
{
"recommendations": [
"Shopify.ruby-extensions-pack",
"LoranKloeze.ruby-rubocop-revived",
"CraigMaslowski.erb",
"dbaeumer.vscode-eslint"
]
}

View File

@@ -4,9 +4,11 @@
   "window.title": "${activeEditorMedium}${separator}${rootName}${separator}${profileName}",
   "workbench.preferredDarkColorTheme": "Spinel",
   "workbench.preferredLightColorTheme": "Spinel Light",
-  "rubyLsp.formatter": "auto",
+  "rubyLsp.formatter": "syntax_tree",
   "[ruby]": {
-    "editor.formatOnSave": true,
-    "editor.defaultFormatter": "LoranKloeze.ruby-rubocop-revived"
+    "editor.defaultFormatter": "Shopify.ruby-lsp"
+  },
+  "[erb]": {
+    "editor.defaultFormatter": "aliariff.vscode-erb-beautify"
   }
 }

View File

@@ -1,7 +1,7 @@
 source 'https://rubygems.org'
 git_source(:github) { |repo| "https://github.com/#{repo}.git" }
 
-ruby '3.2.0'
+ruby '3.2.6'
 # ruby "3.0.3"
 
 # Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main"
@@ -98,7 +98,6 @@ gem 'daemons'
 gem 'discard'
 gem 'good_job', '3.17.2'
 gem 'http-cookie'
-gem 'influxdb-client'
 gem 'kaminari'
 gem 'nokogiri'
 gem 'pluck_each'
@@ -131,3 +130,5 @@ gem 'rack-cors'
 gem 'react_on_rails'
 gem 'sanitize'
 gem 'shakapacker'
+
+gem "syntax_tree", "~> 6.2"

View File

@@ -154,7 +154,6 @@ GEM
       domain_name (~> 0.5)
     i18n (1.12.0)
       concurrent-ruby (~> 1.0)
-    influxdb-client (2.9.0)
     io-console (0.6.0)
     irb (1.6.2)
       reline (>= 0.3.0)
@@ -216,6 +215,7 @@ GEM
     pluck_each (0.2.0)
       activerecord (> 3.2.0)
       activesupport (> 3.0.0)
+    prettier_print (1.2.1)
     progressbar (1.13.0)
     pry (0.14.2)
       coderay (~> 1.1)
@@ -341,6 +341,8 @@ GEM
     stackprof (0.2.24)
     stimulus-rails (1.2.1)
       railties (>= 6.0.0)
+    syntax_tree (6.2.0)
+      prettier_print (>= 1.2.0)
     table_print (1.5.7)
     tailwindcss-rails (2.0.26)
       railties (>= 6.0.0)
@@ -394,7 +396,6 @@ DEPENDENCIES
   good_job (= 3.17.2)
   htmlbeautifier
   http-cookie
-  influxdb-client
   jbuilder
   kaminari
   memory_profiler
@@ -429,6 +430,7 @@ DEPENDENCIES
   sqlite3 (~> 1.4)
   stackprof
   stimulus-rails
+  syntax_tree (~> 6.2)
   table_print
   tailwindcss-rails (~> 2.0)
   turbo-rails
@@ -439,7 +441,7 @@ DEPENDENCIES
   zstd-ruby
 
 RUBY VERSION
-   ruby 3.2.0p0
+   ruby 3.2.6p234
 
 BUNDLED WITH
    2.4.6

View File

@@ -1,6 +1,6 @@
 # Procfile for development using HMR
 # You can run these commands in separate shells
-rails: bundle exec rails s -p 3000
-tailwind: bundle exec rake tailwindcss:watch
-wp-client: HMR=true bin/webpacker-dev-server
-wp-server: HMR=true SERVER_BUNDLE_ONLY=yes bin/webpacker --watch
+rails: RAILS_ENV=development bundle exec rails s -p 3000
+tailwind: RAILS_ENV=development bundle exec rake tailwindcss:watch
+wp-client: HMR=true RAILS_ENV=development bin/webpacker-dev-server
+wp-server: HMR=true RAILS_ENV=development SERVER_BUNDLE_ONLY=yes bin/webpacker --watch

View File

@@ -1,4 +1,4 @@
 rails: RAILS_ENV=staging ./bin/rails s -p 3000
-tailwind: ./bin/rake rake tailwindcss:watch
+tailwind: RAILS_ENV=staging ./bin/rake rake tailwindcss:watch
 wp-client: RAILS_ENV=staging HMR=true ./bin/webpacker-dev-server
 wp-server: RAILS_ENV=staging HMR=true SERVER_BUNDLE_ONLY=yes ./bin/webpacker --watch

View File

@@ -14,21 +14,20 @@ Rake.application.rake_require "e621"
 Rake.application.rake_require "twitter"
 Rake.application.rake_require "ib"
 
-task :set_ar_stdout => :environment do
+task set_ar_stdout: :environment do
   ActiveRecord::Base.logger = Logger.new($stdout)
 end
 
-task :set_logger_stdout => :environment do
+task set_logger_stdout: :environment do
   Rails.logger = Logger.new($stdout)
-  Rails.logger.formatter = proc do |severity, datetime, progname, msg|
-    "#{severity}: #{msg}\n"
-  end
+  Rails.logger.formatter =
+    proc { |severity, datetime, progname, msg| "#{severity}: #{msg}\n" }
   ActiveRecord::Base.logger = nil
   ActiveJob::Base.logger = nil
   GoodJob.logger = Rails.logger
 end
 
-task :periodic_tasks => [:environment, :set_logger_stdout] do
+task periodic_tasks: %i[environment set_logger_stdout] do
   Thread.new do
     loop do
       Rake::Task["pghero:capture_space_stats"].execute
@@ -55,22 +54,21 @@ task :periodic_tasks => [:environment, :set_logger_stdout] do
     end
   end
 
-  loop do
-    sleep 10
-  end
+  loop { sleep 10 }
 end
 
 namespace :blob_entries do
-  task :export_samples => :environment do
+  task export_samples: :environment do
     limit = ENV["limit"]&.to_i || raise("need 'limit' (num)")
-    outfile = ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
+    outfile =
+      ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
     BlobEntrySampleExporter.new.export_samples(limit, outfile)
   end
 
-  task :import_samples => :environment do
+  task import_samples: :environment do
     infile = ENV["infile"] || raise("need 'infile' (file path, .json encoded)")
     BlobEntrySampleExporter.new.import_samples(infile)
   end
 
-  task :migrate_entries => :environment do
+  task migrate_entries: :environment do
     start_at = ENV["start_at"]
     batch_size = ENV["batch_size"]&.to_i || 64
     BlobEntrySampleExporter.new.migrate_blob_entries(start_at, batch_size)
@@ -78,7 +76,7 @@ namespace :blob_entries do
 end
 
 namespace :db_sampler do
-  task :export => :environment do
+  task export: :environment do
     url_names = ENV["url_names"] || raise("need 'url_names' (comma-separated)")
     # outfile_path = ENV["outfile"] || raise("need 'outfile' (file path)")
     # outfile = File.open(outfile_path, "wb")
@@ -88,7 +86,7 @@ namespace :db_sampler do
     outfile.close if outfile
   end
 
-  task :import => [:environment] do
+  task import: [:environment] do
     # infile_path = ENV["infile"] || raise("need 'infile' (file path)")
     # infile = File.open(infile_path, "rb")
     infile = $stdin
@@ -98,23 +96,16 @@ namespace :db_sampler do
   end
 end
 
-task :good_job do
-  proxies = ["direct", "proxy-1", "dedipath-1", "serverhost-1"]
-  proxy = ENV["proxy"]
-  raise("'proxy' must be set") unless proxy
-  raise("'proxy' must be one of #{proxies}") unless proxies.include?(proxy)
-
+task good_job: %i[environment set_ar_stdout set_logger_stdout] do
   env_hash = {
     "RAILS_ENV" => "worker",
     "GOOD_JOB_POLL_INTERVAL" => "5",
     "GOOD_JOB_MAX_CACHE" => "10000",
     "GOOD_JOB_QUEUE_SELECT_LIMIT" => "4096",
     "GOOD_JOB_MAX_THREADS" => "4",
-    "GOOD_JOB_QUEUES" => [
-      "manual:4",
-      "fa_post,e621:2",
-      "*:6",
-    ].reject(&:nil?).join(";"),
+    "GOOD_JOB_QUEUES" =>
+      ENV["GOOD_JOB_QUEUES"] ||
+        %w[manual:4 fa_post,e621:2 *:6].reject(&:nil?).join(";")
   }
 
   env_hash.each do |key, value|
@@ -127,7 +118,7 @@ task :good_job do
   exec(cmd)
 end
 
-task :recompute_job_signatures => :environment do
+task recompute_job_signatures: :environment do
   ActiveRecord::Base.logger = Logger.new(STDOUT)
   ActiveRecord::Base.logger.level = :error
@@ -148,21 +139,22 @@ task :recompute_job_signatures => :environment do
   end
 end
 
-task :workoff_failed_jobs => [:environment, :set_ar_stdout, :set_logger_stdout] do
+task workoff_failed_jobs: %i[environment set_ar_stdout set_logger_stdout] do
   worker = Delayed::Worker.new
-  Delayed::Job.where(
-    "last_error is not null and attempts <= 2"
-  ).find_each(batch_size: 1) do |job|
-    worker.run(job)
-  end
+  Delayed::Job
+    .where("last_error is not null and attempts <= 2")
+    .find_each(batch_size: 1) { |job| worker.run(job) }
 end
 
 task :reverse_csv do
   file = ENV["file"] || raise("need 'file' (file path)")
   in_csv = CSV.parse(File.open(file, "r+"), headers: true)
-  out_csv = CSV.new(File.open("rev_" + file, "w"), write_headers: true, headers: in_csv.headers)
-  in_csv.reverse_each do |row|
-    out_csv << row.map(&:second)
-  end
+  out_csv =
+    CSV.new(
+      File.open("rev_" + file, "w"),
+      write_headers: true,
+      headers: in_csv.headers
+    )
+  in_csv.reverse_each { |row| out_csv << row.map(&:second) }
   out_csv.close
 end
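The reworked good_job task above drops the proxy whitelist entirely and lets GOOD_JOB_QUEUES come from the environment, falling back to the old hard-coded queue weights. A minimal sketch of that precedence (queue names taken from the diff; the rest is illustrative):

# Environment wins; otherwise fall back to the default queue weights.
default_queues = %w[manual:4 fa_post,e621:2 *:6].join(";")
queues = ENV["GOOD_JOB_QUEUES"] || default_queues
puts queues # => "manual:4;fa_post,e621:2;*:6" when the variable is unset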

View File

@@ -3,7 +3,8 @@ module Domain::E621::Job
     queue_as :e621
     ignore_signature_args :caused_by_entry
 
-    def perform(post:, caused_by_entry:)
+    def perform(**args)
+      logger.info("ScanPostJob args: #{args.inspect}")
     end
   end
 end

View File

@@ -2,7 +2,7 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
   queue_as :fa_browse_page
   ignore_signature_args :caused_by_entry
 
-  def perform(args)
+  def perform(**args)
     @caused_by_entry = args[:caused_by_entry]
     @first_browse_page_entry = nil
@@ -16,7 +16,9 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
       @page_number += 1
     end
 
-    logger.info("finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages")
+    logger.info(
+      "finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages"
+    )
   end
 
   private
@@ -28,21 +30,30 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
       url = "https://www.furaffinity.net/browse/#{@page_number}/"
     end
 
-    response = http_client.get(url, caused_by_entry: @first_browse_page_entry || @caused_by_entry)
+    response =
+      http_client.get(
+        url,
+        caused_by_entry: @first_browse_page_entry || @caused_by_entry
+      )
     log_entry = response.log_entry
     @first_browse_page_entry ||= log_entry
 
     if response.status_code != 200
-      fatal_error("non 200 response for /browse: #{response.status_code.to_s.underline}")
+      fatal_error(
+        "non 200 response for /browse: #{response.status_code.to_s.underline}"
+      )
     end
 
     page = Domain::Fa::Parser::Page.new(response.body)
-    listing_page_stats = update_and_enqueue_posts_from_listings_page(
-      :browse_page, page, log_entry,
-      enqueue_posts_pri: :high,
-      page_desc: "Browse@#{@page_number}",
-      fill_id_gaps: true,
-    )
+    listing_page_stats =
+      update_and_enqueue_posts_from_listings_page(
+        :browse_page,
+        page,
+        log_entry,
+        enqueue_posts_pri: :high,
+        page_desc: "Browse@#{@page_number}",
+        fill_id_gaps: true
+      )
     @total_num_new_posts_seen += listing_page_stats.new_seen
     @total_num_posts_seen += listing_page_stats.total_seen
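Both job classes above switch perform to a keyword splat, so callers' keyword arguments arrive as a single hash instead of one positional argument. A tiny generic illustration (method names invented for the comparison):

def perform_positional(args)
  args # expects exactly one positional hash
end

def perform_splat(**args)
  args # collects keyword arguments into a hash
end

perform_splat(post: 1, caused_by_entry: nil) # => {:post=>1, :caused_by_entry=>nil}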

View File

@@ -1,5 +1,6 @@
class Scraper::JobBase < ApplicationJob class Scraper::JobBase < ApplicationJob
class JobError < RuntimeError; end class JobError < RuntimeError
end
def initialize(...) def initialize(...)
@deferred_jobs = [] @deferred_jobs = []
@@ -20,26 +21,28 @@ class Scraper::JobBase < ApplicationJob
good_job_control_concurrency_with( good_job_control_concurrency_with(
total_limit: 1, total_limit: 1,
key: proc do key:
raise("too many arguments") if arguments.size > 1 proc do
first_argument = arguments || [] raise("too many arguments") if arguments.size > 1
first_argument = first_argument[0] || {} first_argument = arguments || []
first_argument = first_argument[0] || {}
ignore_signature_args = self.class.ignore_signature_args ignore_signature_args = self.class.ignore_signature_args
ignore_signature_args << :_aj_symbol_keys ignore_signature_args << :_aj_symbol_keys
ignore_signature_args << :_aj_ruby2_keywords ignore_signature_args << :_aj_ruby2_keywords
sig_arguments = first_argument.reject do |key, value| sig_arguments =
ignore_signature_args.include?(key.to_sym) first_argument
end.to_h .reject { |key, value| ignore_signature_args.include?(key.to_sym) }
.to_h
sig = [] sig = []
sig << self.class.name || raise sig << self.class.name || raise
sig << self.queue_name || "*" sig << self.queue_name || "*"
sig << self.priority || "*" sig << self.priority || "*"
sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16] sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
sig = sig.join("|") sig = sig.join("|")
sig sig
end, end
) )
# make the concurrency config threadlocal so it can be modified # make the concurrency config threadlocal so it can be modified
@@ -50,16 +53,15 @@ class Scraper::JobBase < ApplicationJob
@@gjcc_tl.value @@gjcc_tl.value
end end
PERMITTED_CONTENT_TYPES = [ PERMITTED_CONTENT_TYPES = [%r{text/html}, %r{application/json}]
/text\/html/,
/application\/json/,
]
def enqueue_jobs_from_found_links(log_entry, suppress_jobs: []) def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
return unless ENV["enqueue_jobs_from_found_links"] == "1" return unless ENV["enqueue_jobs_from_found_links"] == "1"
start_time = Time.now start_time = Time.now
unless PERMITTED_CONTENT_TYPES.any? { |ct| ct.match(log_entry.content_type) } unless PERMITTED_CONTENT_TYPES.any? { |ct|
ct.match(log_entry.content_type)
}
raise("unsupported content type: #{log_entry.content_type}") raise("unsupported content type: #{log_entry.content_type}")
end end
document = log_entry.response.contents document = log_entry.response.contents
@@ -68,30 +70,39 @@ class Scraper::JobBase < ApplicationJob
links = link_finder.find_links links = link_finder.find_links
job_defs = [] job_defs = []
filter_mapper = proc { |list, type, key| filter_mapper =
list.filter { |item| item[:type] == type }.map { |item| item[key] } proc do |list, type, key|
} list.filter { |item| item[:type] == type }.map { |item| item[key] }
end
attr_to_mapper = proc { |model_class, attr_sym, in_values| attr_to_mapper =
model_class.where({ attr_sym => in_values }).map do |model| proc do |model_class, attr_sym, in_values|
[model.send(attr_sym), model] model_class
end.to_h .where({ attr_sym => in_values })
} .map { |model| [model.send(attr_sym), model] }
.to_h
end
url_name_to_fa_user = attr_to_mapper.call( url_name_to_fa_user =
Domain::Fa::User, :url_name, attr_to_mapper.call(
filter_mapper.call(links, :fa_user, :url_name) Domain::Fa::User,
) :url_name,
filter_mapper.call(links, :fa_user, :url_name)
)
fa_id_to_fa_post = attr_to_mapper.call( fa_id_to_fa_post =
Domain::Fa::Post, :fa_id, attr_to_mapper.call(
filter_mapper.call(links, :fa_post, :fa_id) Domain::Fa::Post,
) :fa_id,
filter_mapper.call(links, :fa_post, :fa_id)
)
name_to_twitter_user = attr_to_mapper.call( name_to_twitter_user =
Domain::Twitter::User, :name, attr_to_mapper.call(
filter_mapper.call(links, :twitter_user, :name) Domain::Twitter::User,
) :name,
filter_mapper.call(links, :twitter_user, :name)
)
links.each do |link| links.each do |link|
link_type = link[:type] link_type = link[:type]
@@ -99,57 +110,52 @@ class Scraper::JobBase < ApplicationJob
when :fa_user when :fa_user
url_name = link[:url_name] url_name = link[:url_name]
user = url_name_to_fa_user[url_name] user = url_name_to_fa_user[url_name]
params = if user params = (user ? { user: user } : { url_name: url_name })
{ user: user }
else
{ url_name: url_name }
end
job_def = ({ params: params, desc: "fa user " + (user&.name || url_name).bold }) job_def =
({ params: params, desc: "fa user " + (user&.name || url_name).bold })
if !user || user.due_for_page_scan? if !user || user.due_for_page_scan?
job_defs << job_def.merge( job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob)
job: Domain::Fa::Job::UserPageJob,
)
elsif !user || user.due_for_gallery_scan? elsif !user || user.due_for_gallery_scan?
job_defs << job_def.merge( job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob)
job: Domain::Fa::Job::UserGalleryJob,
)
end end
when :fa_post when :fa_post
fa_id = link[:fa_id] fa_id = link[:fa_id]
post = fa_id_to_fa_post[fa_id] post = fa_id_to_fa_post[fa_id]
params = if post params = (post ? { post: post } : { fa_id: fa_id })
{ post: post }
else
{ fa_id: fa_id }
end
job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" }) job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" })
job_defs << job_def.merge( if !post || (post.state == "ok" && !post.scanned?)
job: Domain::Fa::Job::ScanPostJob, job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob)
) if !post || (post.state == "ok" && !post.scanned?) end
job_defs << job_def.merge( if post && post.state == "ok" && post.scanned? && !post.have_file?
job: Domain::Fa::Job::ScanFileJob, job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob)
) if post && post.state == "ok" && post.scanned? && !post.have_file? end
when :twitter_user when :twitter_user
name = link[:name] name = link[:name]
user = name_to_twitter_user[name] user = name_to_twitter_user[name]
params = if user params = (user ? { user: user } : { name: name })
{ user: user }
else
{ name: name }
end
job_def = ({ params: params, desc: "twitter user " + (user&.name || name).bold }) job_def =
(
{
params: params,
desc: "twitter user " + (user&.name || name).bold
}
)
job_defs << job_def.merge( if !user || user.due_for_timeline_tweets_scan?
job: Domain::Twitter::Job::UserTimelineTweetsJob, job_defs << job_def.merge(
) if !user || user.due_for_timeline_tweets_scan? job: Domain::Twitter::Job::UserTimelineTweetsJob
)
end
else else
logger.warn("unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}") logger.warn(
"unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}"
)
end end
end end
@@ -163,19 +169,20 @@ class Scraper::JobBase < ApplicationJob
job_class = job_def[:job] job_class = job_def[:job]
params = job_def[:params] params = job_def[:params]
desc = job_def[:desc] desc = job_def[:desc]
logger.debug([ logger.debug(
"link finder -", [
job_class.name.split("::").last.to_s.ljust(22).bold.light_black, "link finder -",
desc, job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
].join(" ")) desc
].join(" ")
defer_job(
job_class,
params.merge({ caused_by_entry: log_entry }),
) )
defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
end end
duration_ms = (1000 * (Time.now - start_time)).to_i.to_s duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
logger.info("link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)") logger.info(
"link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)"
)
end end
around_perform do |job, block| around_perform do |job, block|
@@ -194,27 +201,9 @@ class Scraper::JobBase < ApplicationJob
rescue => e rescue => e
error = e error = e
raise e raise e
ensure
Metrics::Client.singleton.write_point(
self,
"job_performed",
tags: {
job_queue: job.queue_name,
job_class: job.class.name,
success: error.nil?,
error_class: error&.class&.name,
},
fields: {
duration_ms: duration_ms,
},
)
end end
after_perform do |job| after_perform { |job| ColorLogger.quiet { job.enqueue_deferred_jobs! } }
ColorLogger.quiet do
job.enqueue_deferred_jobs!
end
end
def defer_job(job_class, params, set_args = {}) def defer_job(job_class, params, set_args = {})
@deferred_jobs << [job_class, params, set_args] @deferred_jobs << [job_class, params, set_args]
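The concurrency key assembled in Scraper::JobBase (reflowed but functionally unchanged by this diff) joins the job class, queue, priority, and a truncated SHA-256 of the filtered arguments. A rough standalone sketch with made-up values:

require "digest"

# Hypothetical stand-ins for the job's real state.
class_name = "Domain::Fa::Job::ScanPostJob"
queue_name = "fa_post"
priority = "*"
sig_arguments = { fa_id: 123 } # what remains after ignore_signature_args filtering

sig = [
  class_name,
  queue_name,
  priority,
  Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
].join("|")
# => "Domain::Fa::Job::ScanPostJob|fa_post|*|<16 hex chars>"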

View File

@@ -24,40 +24,42 @@ class ColorLogger
klass_name = "(Anonymous)" klass_name = "(Anonymous)"
end end
logger = Logger.new(sink).tap do |logger| logger =
prefix = "" Logger
logger.define_singleton_method :prefix= do |p| .new(sink)
prefix = p .tap do |logger|
end prefix = ""
logger.define_singleton_method :sink do logger.define_singleton_method :prefix= do |p|
sink prefix = p
end end
logger.define_singleton_method :sink do
logger.formatter = proc do |severity, datetime, progname, msg| sink
color = case severity
when "ERROR" then :red
when "WARN" then :yellow
else :light_blue
end end
klass_name_str = "[#{klass_name.send(color)}]".ljust(32) logger.formatter =
if prefix.is_a?(Proc) proc do |severity, datetime, progname, msg|
prefix = prefix.call color =
case severity
when "ERROR"
:red
when "WARN"
:yellow
else
:light_blue
end
klass_name_str = "[#{klass_name.send(color)}]".ljust(32)
prefix = prefix.call if prefix.is_a?(Proc)
if @quiet.value > 0
""
else
[klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n"
end
end
end end
if @quiet.value > 0 ActiveSupport::TaggedLogging.new(logger)
""
else
[klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n"
end
end
end
if Rails.env.production?
ActiveSupport::TaggedLogging.new(logger)
else
logger
end
end end
def self.klass_name_from_instance(instance) def self.klass_name_from_instance(instance)
@@ -66,40 +68,7 @@ class ColorLogger
else else
klass_name = instance.class.name.dup klass_name = instance.class.name.dup
end end
klass_name.delete_prefix!("Domain::") klass_name.delete_prefix!("Domain::")
prefixes = [
{
if_prefix: "Fa::Scraper::",
replace_with: "Fa::",
},
{
if_prefix: "Fa::Job::",
replace_with: "Fa::",
delete_suffix: "Job",
},
{
if_prefix: "Twitter::Job::",
replace_with: "Twitter::",
delete_suffix: "Job",
},
{
if_prefix: "E621::Job::",
replace_with: "E621::",
delete_suffix: "Job",
},
].each do |conf|
if klass_name.start_with?(conf[:if_prefix])
klass_name.delete_prefix!(conf[:if_prefix])
if (suffix = conf[:delete_suffix])
klass_name.delete_suffix!(suffix)
end
klass_name = conf[:replace_with] + klass_name
break
end
end
klass_name klass_name
end end
end end

View File

@@ -1,54 +0,0 @@
class Metrics::Client
include HasColorLogger
REPORT = !Rails.env.test?
def self.singleton
@singleton ||= Metrics::Client.new
end
def self.singleton=(instance)
@singleton = instance
end
private
def initialize(default_tags: {})
unless REPORT
logger.warn "not reporting metrics for '#{Rails.env.to_s.bold}' environment"
return
end
host = Rails.application.config.x.influxdb.host || raise("no host")
bucket = Rails.application.config.x.influxdb.bucket || raise("no bucket")
@client = InfluxDB2::Client.new(
host, "",
org: "",
bucket: bucket,
precision: InfluxDB2::WritePrecision::MILLISECOND,
use_ssl: false,
)
write_options = InfluxDB2::WriteOptions.new(
write_type: InfluxDB2::WriteType::BATCHING,
batch_size: 100, flush_interval: 5_000,
max_retries: 3, max_retry_delay: 15_000,
exponential_base: 2,
)
point_settings = InfluxDB2::PointSettings.new(default_tags: default_tags)
@writer = @client.create_write_api(
write_options: write_options,
point_settings: point_settings,
)
end
public
def write_point(caller, name, tags: {}, fields: {})
return unless REPORT
@writer.write(data: { name: name, tags: tags, fields: fields })
rescue => e
logger.error("error writing influxdb metric: #{e}")
end
end

View File

@@ -1,30 +0,0 @@
class Metrics::EstimateDbRowsReporter < Metrics::Reporter
def initialize
log_writes!
super
end
def report
extra_tables = [
"http_log_entries",
"http_log_entry_headers",
"versions",
"delayed_jobs",
"blob_entries_p",
]
extra_tables_sql = extra_tables.map { |t| "'#{t}'" }.join(",")
row_estimates = ReduxApplicationRecord.connection.exec_query(
[
"SELECT relname, n_live_tup",
"FROM pg_stat_all_tables",
"WHERE relname IN (#{extra_tables_sql})",
"OR relname like 'domain_%'",
"OR relname like 'blob_entries_p_%'",
].join(" ")
).rows.to_h
write_point(
"estimate_db_rows", fields: row_estimates,
)
end
end

View File

@@ -1,30 +0,0 @@
class Metrics::GoodJobReporter < Metrics::Reporter
def initialize
super
end
def report
total = GoodJob::Job.count
by_queue = GoodJob::Job.group("queue_name").count
by_state = GoodJob::JobsFilter.new({}).states
logger.info "job queue metrics: total=#{total}, by_queue=#{by_queue}, by_state=#{by_state}"
write_point(
"job_queues",
tags: { aggregation: "total" },
fields: { "total" => total },
)
write_point(
"job_queues",
tags: { aggregation: "queue" },
fields: by_queue,
)
write_point(
"job_queues",
tags: { aggregation: "state" },
fields: by_state,
)
end
end

View File

@@ -1,20 +0,0 @@
class Metrics::Reporter
include HasColorLogger
def initialize
@client = Metrics::Client.singleton
end
def log_writes!
@log_writes = true
end
def write_point(name, tags: {}, fields: {})
@client.write_point(self, name, tags: tags, fields: fields)
logger.info("reporter wrote point '#{name.bold}'") if @log_writes
end
def report
raise NotImplementedError, "implement in subclass"
end
end

View File

@@ -58,21 +58,29 @@ class Scraper::ClientFactory
   end
 
   def self._gallery_dl_client_impl
-    @gallery_dl_clients.value ||= begin
-      proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
+    @gallery_dl_clients.value ||=
+      begin
+        proxy_config =
+          Rails.application.config.x.proxy || raise("no proxy config")
         if proxy_config[:gallery_dl].blank?
-          raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}")
+          raise(
+            "no gallery_dl host defined for proxy config #{proxy_config[:name]}"
+          )
         end
-      Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl])
-    end
+        Scraper::GalleryDlClient.new(
+          proxy_config[:name],
+          proxy_config[:gallery_dl]
+        )
+      end
   end
 
   def self._http_client_impl(key, config_klass)
     @http_clients.value[key] ||= begin
-      proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
-      performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
+      # proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
+      # performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
+      performer = Scraper::CurlHttpPerformer.new
       Scraper::HttpClient.new(config_klass.new, performer)
     end
   end
 
   def self._get_ib_client_sid(client)

View File

@@ -0,0 +1,106 @@
class Scraper::CurlHttpPerformer
Response =
Struct.new(:response_code, :response_headers, :response_time_ms, :body)
def get(url, request_headers)
do_request(:http_get, url, request_headers)
end
def post(url, request_headers)
do_request(:http_post, url, request_headers)
end
def name
"direct"
end
METHOD_MAP = { http_get: "GET", http_post: "POST" }
def do_request(method, url, request_headers)
t, curl = get_curl
start_at = Time.now
curl.url = Addressable::URI.encode url
curl.follow_location = true
request_headers.each { |key, value| curl.headers[key.to_s] = value }
curl.headers["User-Agent"] = "FurryArchiver/1.0 / dhelta"
curl.send(method)
response_time_ms = ((Time.now - start_at) * 1000).to_i
response_code = curl.response_code
body_str = curl.body_str
response_headers = self.class.parse_header_str(curl.header_str).to_h
rc_color =
case response_code / 100
when 2
[:green]
when 3
[:cyan]
when 4
[:red]
when 5
%i[red bold]
else
[:white]
end
rc_string = response_code.to_s
rc_color.each { |m| rc_string = rc_string.send(m) }
cf_cache_status =
if response_headers["cf-cache-status"]
"(#{response_headers["cf-cache-status"].light_blue})"
else
nil
end
puts [
"[#{Process.pid.to_s.black.bold} / #{t.name.to_s.black.bold}]",
"[#{METHOD_MAP[method]} #{rc_string}",
"#{self.class.humansize(body_str.size).to_s.bold}",
(response_time_ms.to_s + "ms").light_blue.bold + "]",
cf_cache_status,
"#{url.bold}"
].reject(&:nil?).join(" ")
Response.new(response_code, response_headers, response_time_ms, body_str)
end
private
def get_curl
t = Thread.current
unless t.thread_variable?(:curl)
t.thread_variable_set(:curl, Curl::Easy.new)
end
curl = t.thread_variable_get(:curl)
curl.headers = {}
[t, curl]
end
def self.humansize(size)
units = %w[B KiB MiB GiB TiB Pib EiB ZiB]
return "0.0 B" if size == 0
exp = (Math.log(size) / Math.log(1024)).to_i
exp += 1 if (size.to_f / 1024**exp >= 1024 - 0.05)
exp = units.size - 1 if exp > units.size - 1
unit = units[exp]
if unit == "B"
"%d B" % [size]
else
"%.1f %s" % [size.to_f / 1024**exp, unit]
end
end
def self.parse_header_str(header_str)
header_str
.split("\r\n")
.each_with_index
.map do |str, idx|
next nil if idx == 0
idx = str.index(": ")
next nil unless idx
[str[0...idx], str[idx + 2..-1]]
end
.reject(&:nil?)
end
end
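This new Scraper::CurlHttpPerformer replaces the RPC-backed proxy performer with a direct curb request. A hedged usage sketch (the URL and header are illustrative; the struct fields come from the class above, which assumes the curb, addressable, and colorize gems are loaded):

performer = Scraper::CurlHttpPerformer.new
response = performer.get("https://www.furaffinity.net/browse/1/", { "Accept" => "text/html" })

response.response_code     # e.g. 200
response.response_headers  # hash parsed from the raw header string
response.response_time_ms  # integer milliseconds
response.body              # response body string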

View File

@@ -1,23 +1,13 @@
 class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig
-  DEFAULT_ALLOWED_DOMAINS = [
-    "*.furaffinity.net",
-    "*.facdn.net",
-    "ipinfo.io",
-  ]
+  DEFAULT_ALLOWED_DOMAINS = %w[*.furaffinity.net *.facdn.net ipinfo.io]
 
   def cookies
-    proxy_name = Rails.application.config.x.proxy_name
-    Rails.application.config.x.cookies.fa[
-      proxy_name.to_s
-    ] || raise("no fa cookies defined for #{proxy_name}")
+    Rails.application.config.x.cookies.fa["direct"] ||
+      raise("no fa cookies defined for direct")
   end
 
   def ratelimit
-    [
-      ["d.furaffinity.net", :none],
-      ["*.facdn.net", :none],
-      ["*", 0.25],
-    ]
+    [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 0.25]]
   end
 
   def allowed_domains

View File

@@ -2,13 +2,10 @@ class Scraper::HttpClient
include HasColorLogger include HasColorLogger
attr_reader :config attr_reader :config
Response = Struct.new( Response = Struct.new(:status_code, :body, :log_entry)
:status_code,
:body,
:log_entry,
)
class InvalidURLError < ArgumentError; end class InvalidURLError < ArgumentError
end
def initialize(config, http_performer) def initialize(config, http_performer)
@config = config @config = config
@@ -17,9 +14,10 @@ class Scraper::HttpClient
init_cookie_jar init_cookie_jar
# TODO - populate AdaptiveCache with domains this client cares about # TODO - populate AdaptiveCache with domains this client cares about
@max_cache_size = 8 @max_cache_size = 8
@blob_entry_cache = Hash.new do |hash, key| @blob_entry_cache =
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1) Hash.new do |hash, key|
end hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
end
logger.level = :info logger.level = :info
@config.do_login(http_performer) @config.do_login(http_performer)
end end
@@ -46,12 +44,14 @@ class Scraper::HttpClient
url = uri.to_s url = uri.to_s
unless allowed_domain?(uri.host) unless allowed_domain?(uri.host)
raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}") raise InvalidURLError.new(
"domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}"
)
end end
ratelimit_conf = config.ratelimit.find do |conf| ratelimit_conf =
File.fnmatch? conf[0], uri.host config.ratelimit.find { |conf| File.fnmatch? conf[0], uri.host } ||
end || ["*", :none] ["*", :none]
ratelimit_delay = ratelimit_conf[1] ratelimit_delay = ratelimit_conf[1]
if ratelimit_delay == :none if ratelimit_delay == :none
logger.debug("no rate limit for #{uri.host}") logger.debug("no rate limit for #{uri.host}")
@@ -68,7 +68,10 @@ class Scraper::HttpClient
end end
request_headers = { request_headers = {
"cookie" => HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url)), "cookie" =>
HTTP::Cookie.cookie_value(
@cookie_jar.cookies(Addressable::URI.encode url)
)
} }
requested_at = Time.now requested_at = Time.now
response = @http_performer.do_request(method, url, request_headers) response = @http_performer.do_request(method, url, request_headers)
@@ -77,47 +80,57 @@ class Scraper::HttpClient
response_headers = response.response_headers response_headers = response.response_headers
response_time_ms = response.response_time_ms response_time_ms = response.response_time_ms
response_body = response.body response_body = response.body
@domain_last_requested_at[uri.host] = requested_at + (response_time_ms.to_f / 1000) @domain_last_requested_at[uri.host] = requested_at +
(response_time_ms.to_f / 1000)
content_type = response_headers["Content-Type"] || content_type =
response_headers["content-type"] || response_headers["Content-Type"] || response_headers["content-type"] ||
"none/none" "none/none"
cache_key = "#{uri.host}|#{content_type}" cache_key = "#{uri.host}|#{content_type}"
blob_entry_cache = @blob_entry_cache[cache_key] blob_entry_cache = @blob_entry_cache[cache_key]
candidates = if blob_entry_cache.at_capacity? && rand(0..100) >= 5 candidates =
if blob_entry_cache.at_capacity? && rand(0..100) >= 5
blob_entry_cache.candidates blob_entry_cache.candidates
else else
[] []
end end
candidates << caused_by_entry.response if caused_by_entry&.response candidates << caused_by_entry.response if caused_by_entry&.response
candidates << caused_by_entry.response.base if caused_by_entry&.response&.base if caused_by_entry&.response&.base
candidates << caused_by_entry.response.base
end
retries = 0 retries = 0
begin begin
response_blob_entry = BlobEntryP.find_or_build( response_blob_entry =
content_type: content_type, BlobEntryP.find_or_build(
contents: response_body, content_type: content_type,
candidates: candidates, contents: response_body,
) candidates: candidates
)
scrubbed_uri = @config.scrub_stored_uri(uri) scrubbed_uri = @config.scrub_stored_uri(uri)
log_entry = HttpLogEntry.new({ log_entry =
uri: scrubbed_uri.to_s, HttpLogEntry.new(
verb: method, {
content_type: content_type, uri: scrubbed_uri.to_s,
status_code: response_code, verb: method,
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers), content_type: content_type,
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers), status_code: response_code,
response: response_blob_entry, request_headers:
response_time_ms: response_time_ms, HttpLogEntryHeader.find_or_build(headers: request_headers),
requested_at: requested_at, response_headers:
caused_by_entry: caused_by_entry, HttpLogEntryHeader.find_or_build(headers: response_headers),
performed_by: @http_performer.name, response: response_blob_entry,
}) response_time_ms: response_time_ms,
requested_at: requested_at,
caused_by_entry: caused_by_entry,
performed_by: @http_performer.name
}
)
log_entry.save! log_entry.save!
rescue rescue StandardError
retries += 1 retries += 1
retry if retries < 2 retry if retries < 2
raise raise
@@ -129,11 +142,14 @@ class Scraper::HttpClient
) )
else else
blob_entry_cache.insert( blob_entry_cache.insert(
HexUtil.bin2hex(response_blob_entry.sha256)[0..8], response_blob_entry, scrubbed_uri HexUtil.bin2hex(response_blob_entry.sha256)[0..8],
response_blob_entry,
scrubbed_uri
) )
end end
response_code_colorized = if response_code == 200 response_code_colorized =
if response_code == 200
response_code.to_s.light_green response_code.to_s.light_green
elsif (response_code / 100) == 4 || (response_code / 100) == 5 elsif (response_code / 100) == 4 || (response_code / 100) == 5
response_code.to_s.red.bold response_code.to_s.red.bold
@@ -142,51 +158,30 @@ class Scraper::HttpClient
end end
total_time_ms = ((Time.now - requested_at) * 1000).round(0) total_time_ms = ((Time.now - requested_at) * 1000).round(0)
logger.info([ logger.info(
"[entry #{log_entry.id.to_s.bold} /", [
"GET #{response_code_colorized} /", "[entry #{log_entry.id.to_s.bold} /",
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]", "GET #{response_code_colorized} /",
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]", "#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
scrubbed_uri.to_s.black, "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
].reject(&:nil?).join(" ")) scrubbed_uri.to_s.black
].reject(&:nil?).join(" ")
)
if response_code == 524 || response_code == 502 || response_code == 503 || response_code == 403 if response_code == 524 || response_code == 502 || response_code == 503 ||
response_code == 403
logger.error("HTTP #{response_code} - sleeping to not throttle server") logger.error("HTTP #{response_code} - sleeping to not throttle server")
sleep 15 sleep 15
end end
if content_type =~ /text\/plain/ && response_body =~ /Your IP address has been temporarily banned/ if content_type =~ %r{text/plain} &&
response_body =~ /Your IP address has been temporarily banned/
logger.error("IP temp banned, sleeping & raising...") logger.error("IP temp banned, sleeping & raising...")
sleep 60 * 5 sleep 60 * 5
raise raise
end end
Metrics::Client.singleton.write_point( Response.new(response_code, response_body, log_entry)
self,
"http_client_response",
tags: {
method: "GET",
host: uri.host,
status_code: response_code,
content_type: clean_content_type(content_type),
performed_by: @http_performer.name,
},
fields: {
response_time_ms: response_time_ms,
total_time_ms: total_time_ms,
content_size: response_blob_entry.size,
content_stored: response_blob_entry.bytes_stored,
uri: scrubbed_uri.to_s,
status_code: response_code,
content_type: clean_content_type(content_type),
},
)
Response.new(
response_code,
response_body,
log_entry
)
end end
def allowed_domain?(domain) def allowed_domain?(domain)
@@ -206,7 +201,7 @@ class Scraper::HttpClient
name: cookie_key_conf[:name], name: cookie_key_conf[:name],
value: cookie_key_conf[:value], value: cookie_key_conf[:value],
path: cookie_key_conf[:path] || "", path: cookie_key_conf[:path] || "",
for_domain: true, for_domain: true
} }
@cookie_jar.add(HTTP::Cookie.new(conf)) @cookie_jar.add(HTTP::Cookie.new(conf))
end end

View File

@@ -1,56 +0,0 @@
require "base64"
class Scraper::HttpPerformer
include HasColorLogger
Response = Struct.new(
:response_code,
:response_headers,
:response_time_ms,
:body
)
def initialize(name, host)
name || raise("must provide name")
host || raise("must provide host")
logger.info("#{Thread.name.to_s.light_black} - build performer #{name.to_s.green} - #{host.to_s.green.bold}")
@name = name
@client = Ripcord::Client.new(host)
end
def name
@name
end
def get(uri, headers)
do_request("get", uri, headers)
end
def post(uri, headers)
do_request("post", uri, headers)
end
METHODS = %i[get post].freeze
def do_request(method, uri, headers)
raise "invalid method #{method}" unless METHODS.include?(method)
send_token = SecureRandom.uuid
rpc_result = @client.call(method.to_s, [uri, headers, send_token])
raise "rpc error: #{rpc_result.error.inspect}" unless rpc_result.successful?
recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(rpc_result.result))
if send_token != recv_token
raise "token mismatch: #{send_token} != #{recv_token}"
end
Scraper::HttpPerformer::Response.new(code, headers, time_ms, body)
end
def close!
begin
@client.instance_variable_get("@http_client")&.finish
rescue IOError => e
Rails.logger.error("Failed to close http client: #{e.inspect}")
end
end
private
end

View File

@@ -1,59 +1,54 @@
class Domain::Fa::User < ReduxApplicationRecord class Domain::Fa::User < ReduxApplicationRecord
self.table_name = 'domain_fa_users' self.table_name = "domain_fa_users"
has_lite_trail(schema_version: 1) has_lite_trail(schema_version: 1)
has_many :posts, has_many :posts,
class_name: '::Domain::Fa::Post', class_name: "::Domain::Fa::Post",
inverse_of: :creator, inverse_of: :creator,
foreign_key: :creator_id foreign_key: :creator_id
has_one :disco, has_one :disco,
class_name: '::Domain::Fa::UserFactor', class_name: "::Domain::Fa::UserFactor",
inverse_of: :user, inverse_of: :user,
foreign_key: :user_id, foreign_key: :user_id,
dependent: :destroy dependent: :destroy
has_one :avatar, has_one :avatar,
class_name: '::Domain::Fa::UserAvatar', class_name: "::Domain::Fa::UserAvatar",
inverse_of: :user, inverse_of: :user,
dependent: :destroy dependent: :destroy
enum :state, [ enum :state,
:ok, # so far so good, user may not yet be scanned [
:scan_error # user has been removed or otherwise, see state_detail :ok, # so far so good, user may not yet be scanned
] :scan_error # user has been removed or otherwise, see state_detail
]
# Who this user follows (join table) # Who this user follows (join table)
has_many :follower_joins, has_many :follower_joins,
class_name: '::Domain::Fa::Follow', class_name: "::Domain::Fa::Follow",
foreign_key: :follower_id, foreign_key: :follower_id,
inverse_of: :follower, inverse_of: :follower,
dependent: :destroy dependent: :destroy
# Who this user follows (User model) # Who this user follows (User model)
has_many :follows, has_many :follows, through: :follower_joins, source: :followed
through: :follower_joins,
source: :followed
# Who follows this user (join table) # Who follows this user (join table)
has_many :followed_joins, has_many :followed_joins,
class_name: '::Domain::Fa::Follow', class_name: "::Domain::Fa::Follow",
foreign_key: :followed_id, foreign_key: :followed_id,
inverse_of: :followed, inverse_of: :followed,
dependent: :destroy dependent: :destroy
# Who follows this user (User model) # Who follows this user (User model)
has_many :followed_by, has_many :followed_by, through: :followed_joins, source: :follower
through: :followed_joins,
source: :follower
has_many :fav_post_joins, has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :user
class_name: '::Domain::Fa::Fav',
inverse_of: :user
has_many :fav_posts, has_many :fav_posts,
class_name: '::Domain::Fa::Post', class_name: "::Domain::Fa::Post",
through: :fav_post_joins, through: :fav_post_joins,
source: :post source: :post
@@ -64,11 +59,12 @@ class Domain::Fa::User < ReduxApplicationRecord
if name && url_name if name && url_name
expected = self.class.name_to_url_name(name) expected = self.class.name_to_url_name(name)
matches = if name.length >= 30 matches =
url_name.starts_with?(expected) if name.length >= 30
else url_name.starts_with?(expected)
url_name == expected else
end url_name == expected
end
unless matches unless matches
errors.add( errors.add(
@@ -79,17 +75,11 @@ class Domain::Fa::User < ReduxApplicationRecord
end end
if url_name && url_name =~ /[A-Z]/ if url_name && url_name =~ /[A-Z]/
errors.add( errors.add(:url_name, "url_name '#{url_name}' contains uppercase")
:url_name,
"url_name '#{url_name}' contains uppercase"
)
end end
if url_name && url_name =~ /\s/ if url_name && url_name =~ /\s/
errors.add( errors.add(:url_name, "url_name '#{url_name}' contains whitespace")
:url_name,
"url_name '#{url_name}' contains whitespace"
)
end end
end end
@@ -99,9 +89,7 @@ class Domain::Fa::User < ReduxApplicationRecord
self.log_entry_detail ||= {} self.log_entry_detail ||= {}
end end
before_destroy do before_destroy { throw :abort if posts.any? }
throw :abort if posts.any?
end
SCAN_TYPES = { SCAN_TYPES = {
page: 1.month, page: 1.month,
@@ -120,9 +108,7 @@ class Domain::Fa::User < ReduxApplicationRecord
} }
SCAN_TYPES.keys.each do |scan_type| SCAN_TYPES.keys.each do |scan_type|
define_method(:"due_for_#{scan_type}_scan?") do define_method(:"due_for_#{scan_type}_scan?") { scan_due?(scan_type) }
scan_due?(scan_type)
end
define_method(:"time_ago_for_#{scan_type}_scan") do define_method(:"time_ago_for_#{scan_type}_scan") do
scanned_ago_in_words(scan_type) scanned_ago_in_words(scan_type)
@@ -143,14 +129,15 @@ class Domain::Fa::User < ReduxApplicationRecord
def scanned_ago_in_words(scan_type) def scanned_ago_in_words(scan_type)
if (timestamp = get_scanned_at_value(scan_type)) if (timestamp = get_scanned_at_value(scan_type))
DATE_HELPER.time_ago_in_words(timestamp) + ' ago' DATE_HELPER.time_ago_in_words(timestamp) + " ago"
else else
'never' "never"
end end
end end
def scan_due?(scan_type) def scan_due?(scan_type)
duration = SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'") duration =
SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'")
timestamp = get_scanned_at_value(scan_type) timestamp = get_scanned_at_value(scan_type)
timestamp.nil? || timestamp <= duration.ago timestamp.nil? || timestamp <= duration.ago
end end
@@ -165,14 +152,14 @@ class Domain::Fa::User < ReduxApplicationRecord
end end
def avatar_or_create def avatar_or_create
self.class.transaction do self.class.transaction { avatar || create_avatar! }
avatar || create_avatar!
end
end end
def self.find_or_build_from_submission_parser(submission_parser) def self.find_or_build_from_submission_parser(submission_parser)
unless submission_parser.is_a?(Domain::Fa::Parser::ListedSubmissionParserHelper) || unless submission_parser.is_a?(
submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper) Domain::Fa::Parser::ListedSubmissionParserHelper
) ||
submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper)
raise ArgumentError raise ArgumentError
end end
@@ -181,16 +168,11 @@ class Domain::Fa::User < ReduxApplicationRecord
end end
end end
URL_NAME_EXCEPTIONS = { URL_NAME_EXCEPTIONS = { "Kammiu" => "rammiu" }
'Kammiu' => 'rammiu'
}
def self.name_to_url_name(name) def self.name_to_url_name(name)
name = name.strip name = name.strip
URL_NAME_EXCEPTIONS[name] || name URL_NAME_EXCEPTIONS[name] || name.delete("_").gsub(/\s/, "").downcase
.delete('_')
.gsub(/\s/, '')
.downcase
end end
# TODO: - write method for getting suggested users to follow # TODO: - write method for getting suggested users to follow
@@ -215,20 +197,22 @@ class Domain::Fa::User < ReduxApplicationRecord
end end
def guess_user_page_log_entry def guess_user_page_log_entry
for_path = proc { |uri_path| for_path =
HttpLogEntry.where( proc do |uri_path|
uri_scheme: 'https', HttpLogEntry
uri_host: 'www.furaffinity.net', .where(
uri_path: uri_path uri_scheme: "https",
).order(created_at: :desc).first uri_host: "www.furaffinity.net",
} uri_path: uri_path
)
.order(created_at: :desc)
.first
end
for_hle_id = proc { |hle_id| for_hle_id = proc { |hle_id| hle_id && HttpLogEntry.find_by(id: hle_id) }
hle_id && HttpLogEntry.find_by(id: hle_id)
}
# older versions don't end in a trailing slash # older versions don't end in a trailing slash
hle_id = self.log_entry_detail && self.log_entry_detail['last_user_page_id'] hle_id = self.log_entry_detail && self.log_entry_detail["last_user_page_id"]
# first try the last scanned user page (present on most fa user models) # first try the last scanned user page (present on most fa user models)
for_hle_id.call(hle_id) || for_hle_id.call(hle_id) ||
@@ -247,18 +231,19 @@ class Domain::Fa::User < ReduxApplicationRecord
private private
def similar_users_by(factor_col, exclude_followed_by) def similar_users_by(factor_col, exclude_followed_by)
query = query = disco.nearest_neighbors(factor_col, distance: "euclidean")
disco
.nearest_neighbors(factor_col, distance: 'euclidean')
query = query.where.not(user_id: exclude_followed_by.follows.select(:followed_id)) if exclude_followed_by query =
query.where.not(
user_id: exclude_followed_by.follows.select(:followed_id)
) if exclude_followed_by
users_from_disco_query(query) users_from_disco_query(query)
end end
def users_from_disco_query(disco_query) def users_from_disco_query(disco_query)
Domain::Fa::User Domain::Fa::User
.select('domain_fa_users.*', disco_query.select_values.last) .select("domain_fa_users.*", disco_query.select_values.last)
.joins(:disco) .joins(:disco)
.merge(disco_query.reselect(:user_id)) .merge(disco_query.reselect(:user_id))
end end
@@ -270,7 +255,8 @@ class Domain::Fa::User < ReduxApplicationRecord
when :state_detail when :state_detail
str = state_detail["scanned_#{scan_type}_at"] str = state_detail["scanned_#{scan_type}_at"]
Time.parse(str) if str Time.parse(str) if str
else raise("invalid scan type '#{scan_type}'") else
raise("invalid scan type '#{scan_type}'")
end end
end end
@@ -280,7 +266,8 @@ class Domain::Fa::User < ReduxApplicationRecord
send(:"scanned_#{scan_type}_at=", value) send(:"scanned_#{scan_type}_at=", value)
when :state_detail when :state_detail
state_detail["scanned_#{scan_type}_at"] = value.iso8601 state_detail["scanned_#{scan_type}_at"] = value.iso8601
else raise("invalid scan type '#{scan_type}'") else
raise("invalid scan type '#{scan_type}'")
end end
end end
end end
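The SCAN_TYPES metaprogramming in Domain::Fa::User (the diff above only reflows it) defines one due_for_X_scan? predicate per scan type. A simplified, self-contained sketch of the pattern, using an invented Example class and durations in plain seconds instead of ActiveSupport ones:

class Example
  SCAN_TYPES = { page: 2_592_000, gallery: 604_800 } # seconds: ~1 month, 1 week

  attr_accessor :scanned_page_at, :scanned_gallery_at

  SCAN_TYPES.keys.each do |scan_type|
    define_method(:"due_for_#{scan_type}_scan?") { scan_due?(scan_type) }
  end

  def scan_due?(scan_type)
    duration = SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'")
    timestamp = send(:"scanned_#{scan_type}_at")
    timestamp.nil? || timestamp <= Time.now - duration
  end
end

Example.new.due_for_page_scan? # => true, because it was never scanned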

View File

@@ -25,12 +25,5 @@ module ReduxScraper
     config.time_zone = "Pacific Time (US & Canada)"
     # config.eager_load_paths << Rails.root.join("extras")
     config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
-    config.x.proxies = ReduxScraper::Application.config_for("proxies")
-    config.x.influxdb = ReduxScraper::Application.config_for("influxdb")
-
-    proxy_name = (ENV["proxy"] || "direct").to_sym
-    config.x.proxy_name = proxy_name
-    config.x.proxy = Rails.application.config.x.proxies[proxy_name] || raise("no proxy config for '#{proxy_name}'")
-    config.x.proxy[:name] = proxy_name
   end
 end

View File

@@ -28,9 +28,12 @@ redux_dev: &redux_dev
   adapter: postgresql
   host: localhost
   port: 5432
-  database: redux_development
-  username: scraper_redux
-  password: pdkFLqRmQwPUPaDDC4pX
+  # database: redux_development
+  # username: scraper_redux
+  # password: pdkFLqRmQwPUPaDDC4pX
+  database: postgres
+  username: postgres
+  password: postgres
   migrations_paths: db/redux_migrate
   pool: 4
@@ -53,21 +56,25 @@ legacy_staging: &legacy_staging
 local_redux_test: &local_redux_test
   adapter: postgresql
-  host: localhost
+  host: db
   port: 5432
   database: redux_test
-  username: scraper_redux
-  password: pdkFLqRmQwPUPaDDC4pX
+  # username: scraper_redux
+  # password: pdkFLqRmQwPUPaDDC4pX
+  username: postgres
+  password: postgres
   migrations_paths: db/redux_migrate
   pool: 4
 
 local_legacy_test: &local_legacy_test
   adapter: postgresql
-  host: localhost
+  host: db
   port: 5432
   database: legacy_test
-  username: scraper_redux
-  password: pdkFLqRmQwPUPaDDC4pX
+  # username: scraper_redux
+  # password: pdkFLqRmQwPUPaDDC4pX
+  username: postgres
+  password: postgres
   migrations_paths: db/legacy_migrate
   pool: 4

View File

@@ -12,11 +12,13 @@ Rails.application.configure do
   config.good_job.cleanup_interval_jobs = 100_000
   config.good_job.cleanup_interval_seconds = 4.hours
   config.good_job.logger = Logger.new(STDOUT)
-  config.good_job.logger.level = :warn
+  config.good_job.logger.level = :info
 
   if Rails.env.worker?
     config.good_job.execution_mode = :async
-    config.good_job.on_thread_error = ->(exception) { Rails.logger.error("GoodJob exception: #{exception}") }
+    config.good_job.on_thread_error = ->(exception) do
+      Rails.logger.error("GoodJob exception: #{exception}")
+    end
   else
     config.good_job.execution_mode = :external
   end

View File

@@ -1,41 +0,0 @@
client = InfluxDB2::Client.new(
Rails.application.config.x.influxdb.host || raise("no host"),
"",
bucket: Rails.application.config.x.influxdb.bucket || raise("no bucket"),
org: "",
precision: InfluxDB2::WritePrecision::NANOSECOND,
use_ssl: false,
)
write_api = client.create_write_api
ActiveSupport::Notifications.subscribe "process_action.action_controller" do |name, started, finished, unique_id, data|
request = data[:request]
api_token = request.params[:api_token]
api_user = api_token ? ApplicationController::API_TOKENS[api_token] : nil
ip = request.headers["HTTP_CF_CONNECTING_IP"] || request.remote_ip
hash = {
name: "process_action.action_controller",
tags: {
method: "#{data[:controller]}##{data[:action]}",
format: data[:format],
http_method: data[:method],
status: data[:status],
exception: data[:exception]&.first,
},
fields: {
ip: ip,
api_token: api_token,
api_user: api_user,
time_in_controller: (finished - started) * 1000,
time_in_view: (data[:view_runtime] || 0).ceil,
time_in_db: (data[:db_runtime] || 0).ceil,
request_id: unique_id,
},
time: started,
}
write_api.write(data: hash)
rescue => e
Rails.logger.error("error writing influxdb metric: #{e}")
end

View File

@@ -1,29 +0,0 @@
default: &default
direct:
http: http://127.0.0.1:9292
gallery_dl: http://127.0.0.1:5001
proxy-1:
http: http://127.0.0.1:9292
gallery_dl: http://127.0.0.1:5001
production:
<<: *default
staging:
<<: *default
development:
direct: {}
proxy-1: {}
dedipath-1: {}
serverhost-1: {}
worker:
<<: *default
test:
direct: {}
proxy-1: {}
dedipath-1: {}
serverhost-1: {}

View File

@@ -1,9 +1,9 @@
 namespace :metrics do
   desc "run reporters periodically"
-  task :report_all => [:environment, :set_logger_stdout] do
+  task report_all: %i[environment set_logger_stdout] do
     schedule = {
       Rake::Task["metrics:jobs"] => 60.seconds,
-      Rake::Task["metrics:estimate_db_rows"] => 60.seconds,
+      Rake::Task["metrics:estimate_db_rows"] => 60.seconds
     }
 
     last_ran = {}
@@ -24,14 +24,4 @@ namespace :metrics do
       sleep sleep_amt
     end
   end
-
-  desc "Report job queue metrics"
-  task :jobs => :environment do
-    Metrics::GoodJobReporter.new.report
-  end
-
-  desc "Report estimated db row metrics"
-  task :estimate_db_rows => :environment do
-    Metrics::EstimateDbRowsReporter.new.report
-  end
 end
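What survives of metrics.rake is the report_all poll loop: each scheduled Rake task is re-executed once its interval has elapsed. The loop body is not shown in the hunk, so the sketch below is an illustrative reconstruction of that pattern, not the file's exact code:

schedule = { "metrics:jobs" => 60, "metrics:estimate_db_rows" => 60 } # seconds
last_ran = {}

3.times do # the real task loops forever
  now = Time.now
  schedule.each do |task_name, interval|
    next if last_ran[task_name] && now - last_ran[task_name] < interval
    puts "would run #{task_name}" # real code executes Rake::Task[task_name]
    last_ran[task_name] = now
  end
  sleep 1
end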