From 9de3c1b5e821d6465e24b1a01aff32f3550e641a Mon Sep 17 00:00:00 2001 From: Dylan Knutson Date: Tue, 17 Dec 2024 17:57:17 +0000 Subject: [PATCH] remove proxy code --- .devcontainer/.env | 9 - .devcontainer/Dockerfile.devcontainer | 31 ++- .devcontainer/Dockerfile.postgres | 7 + .devcontainer/create-db-user.sql | 5 + .devcontainer/devcontainer.json | 45 ++--- .devcontainer/docker-compose.yml | 44 ++-- .devcontainer/install-extensions.sh | 11 + .devcontainer/post-create.sh | 5 - .ruby-version | 2 +- .vscode/extensions.json | 8 - .vscode/settings.json | 8 +- Gemfile | 5 +- Gemfile.lock | 8 +- Procfile.dev | 8 +- Procfile.staging | 4 +- Rakefile | 66 +++--- app/jobs/domain/e621/job/scan_post_job.rb | 3 +- app/jobs/domain/fa/job/browse_page_job.rb | 31 ++- app/jobs/scraper/job_base.rb | 201 +++++++++---------- app/lib/color_logger.rb | 95 +++------ app/lib/metrics/client.rb | 54 ----- app/lib/metrics/estimate_db_rows_reporter.rb | 30 --- app/lib/metrics/good_job_reporter.rb | 30 --- app/lib/metrics/reporter.rb | 20 -- app/lib/scraper/client_factory.rb | 24 ++- app/lib/scraper/curl_http_performer.rb | 106 ++++++++++ app/lib/scraper/fa_http_client_config.rb | 18 +- app/lib/scraper/http_client.rb | 149 +++++++------- app/lib/scraper/http_performer.rb | 56 ------ app/models/domain/fa/user.rb | 131 ++++++------ config/application.rb | 7 - config/database.yml | 25 ++- config/initializers/good_job.rb | 6 +- config/initializers/influxdb_rails.rb | 41 ---- config/proxies.yml | 29 --- rake/metrics.rake | 14 +- 36 files changed, 565 insertions(+), 771 deletions(-) delete mode 100644 .devcontainer/.env create mode 100644 .devcontainer/Dockerfile.postgres create mode 100644 .devcontainer/create-db-user.sql create mode 100755 .devcontainer/install-extensions.sh delete mode 100644 .vscode/extensions.json delete mode 100644 app/lib/metrics/client.rb delete mode 100644 app/lib/metrics/estimate_db_rows_reporter.rb delete mode 100644 app/lib/metrics/good_job_reporter.rb delete mode 100644 app/lib/metrics/reporter.rb create mode 100644 app/lib/scraper/curl_http_performer.rb delete mode 100644 app/lib/scraper/http_performer.rb delete mode 100644 config/initializers/influxdb_rails.rb delete mode 100644 config/proxies.yml diff --git a/.devcontainer/.env b/.devcontainer/.env deleted file mode 100644 index bf111b32..00000000 --- a/.devcontainer/.env +++ /dev/null @@ -1,9 +0,0 @@ -PGADMIN_DEFAULT_EMAIL=admin@example.com -PGADMIN_DEFAULT_PASSWORD=password -PGADMIN_LISTEN_PORT=8080 -# no password required -PGADMIN_CONFIG_SERVER_MODE=False -PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED=False - -RAILS_ENV=development -NODE_ENV=development diff --git a/.devcontainer/Dockerfile.devcontainer b/.devcontainer/Dockerfile.devcontainer index 39af6dae..975a4de3 100644 --- a/.devcontainer/Dockerfile.devcontainer +++ b/.devcontainer/Dockerfile.devcontainer @@ -19,7 +19,7 @@ RUN bundle install RUN rake compile # Primary image -FROM mcr.microsoft.com/devcontainers/base:bookworm +FROM mcr.microsoft.com/devcontainers/ruby:1-3.2-bookworm # apt caching & install packages RUN rm -f /etc/apt/apt.conf.d/docker-clean; \ @@ -48,21 +48,34 @@ RUN \ libvips42 \ libyaml-dev \ patch \ - rbenv \ + # rbenv \ + iputils-ping \ rustc \ uuid-dev \ zlib1g-dev -USER vscode -ENV PATH=/home/vscode/.rbenv/shims:$PATH -ENV CONFIGURE_OPTS="--disable-install-rdoc" +# USER vscode +# ENV PATH=/home/vscode/.rbenv/shims:$PATH +# ENV CONFIGURE_OPTS="--disable-install-rdoc" -RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build -RUN rbenv 
install 3.2.0 -RUN rbenv global 3.2.0 +# RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build +# RUN rbenv install 3.2.0 +# RUN rbenv global 3.2.0 COPY --from=native-gems /usr/src/app/gems/xdiff-rb /gems/xdiff-rb COPY --from=native-gems /usr/src/app/gems/rb-bsdiff /gems/rb-bsdiff ENV RAILS_ENV development -ENV RUBOCOP_OPTS --server + +# [Optional] Uncomment this line to install additional gems. +RUN su vscode -c "gem install bundler -v '2.4.1'" && \ + su vscode -c "gem install rake -v '13.0.6'" && \ + su vscode -c "gem install ruby-lsp -v '0.22.1'" && \ + su vscode -c "gem install rubocop -v '1.69.2'" + +# install exo +RUN su vscode -c "curl -sL https://exo.deref.io/install | bash" +ENV PATH "/home/vscode/.exo/bin:$PATH" + +RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && nvm install 18 && nvm use 18 && npm install -g yarn" 2>&1 +ENV PATH /usr/local/share/nvm/current/bin:$PATH diff --git a/.devcontainer/Dockerfile.postgres b/.devcontainer/Dockerfile.postgres new file mode 100644 index 00000000..6b3cccc0 --- /dev/null +++ b/.devcontainer/Dockerfile.postgres @@ -0,0 +1,7 @@ +FROM postgres:17 + +RUN apt-get update && apt-get install -y \ + postgresql-17-pgvector \ + && rm -rf /var/lib/apt/lists/* + +RUN echo "CREATE EXTENSION pgvector;" >> /docker-entrypoint-initdb.d/01-pgvector.sql diff --git a/.devcontainer/create-db-user.sql b/.devcontainer/create-db-user.sql new file mode 100644 index 00000000..0be34bb2 --- /dev/null +++ b/.devcontainer/create-db-user.sql @@ -0,0 +1,5 @@ +CREATE USER vscode CREATEDB; +CREATE DATABASE vscode WITH OWNER vscode; + +CREATE DATABASE redux_test WITH OWNER vscode; +CREATE DATABASE legacy_test WITH OWNER vscode; diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f0b98660..b29ae223 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,42 +1,25 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +// README at: https://github.com/devcontainers/templates/tree/main/src/ruby-rails-postgres { - "name": "Redux Scraper", - // "build": { - // // Sets the run context to one level up instead of the .devcontainer folder. - // "context": "..", - // // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - // "dockerfile": "Dockerfile" - // }, + "name": "Ruby on Rails & Postgres", "dockerComposeFile": "docker-compose.yml", "service": "app", "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", - // "mounts": [ - // { - // "type": "volume", - // "source": "${devcontainerId}-gems", - // "target": "/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems" - // } - // ], "features": { "ghcr.io/meaningful-ooo/devcontainer-features/fish:1": {}, - "ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {}, - "ghcr.io/devcontainers/features/node:1": { - "installYarnUsingApt": true, - "version": "18", - "pnpmVersion": "none", - "nvmVersion": "latest" - } + "ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {} }, - // "runArgs": [ - // "--network=host" - // ], // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - // Uncomment the next line to run commands after the container is created. 
- "postCreateCommand": "${containerWorkspaceFolder}/.devcontainer/post-create.sh" + // This can be used to network with other containers or the host. + // "forwardPorts": [3000, 5432], + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "bundle install && rake db:setup", + "postCreateCommand": ".devcontainer/post-create.sh", + "forwardPorts": [ + 8080, // pgadmin + 3000 // rails + ] // Configure tool-specific properties. - // "customizations": {}, - // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. - // "remoteUser": "devcontainer" + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" } \ No newline at end of file diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index f04f9a86..18e691a0 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -1,33 +1,45 @@ -version: '3.8' - -volumes: - devcontainer-redux-postgres-data: - devcontainer-redux-gem-cache: +version: '3' services: app: build: context: .. - dockerfile: Dockerfile.devcontainer + dockerfile: .devcontainer/Dockerfile.devcontainer + volumes: - # - ../..:/workspaces:cached - - devcontainer-redux-gem-cache:/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems + - ../..:/workspaces:cached + - devcontainer-redux-gem-cache:/usr/local/rvm/gems + + # Overrides default command so things don't shut down after the process ends. command: sleep infinity + + # Runs app on the same network as the database container, allows "forwardPorts" in devcontainer.json function. network_mode: service:db - env_file: - - .env db: - image: pgvector/pgvector:pg17 + build: + context: . 
+ dockerfile: Dockerfile.postgres restart: unless-stopped volumes: - - devcontainer-redux-postgres-data:/var/lib/postgresql/data - env_file: - - .env + - postgres-data:/var/lib/postgresql/data + - ./create-db-user.sql:/docker-entrypoint-initdb.d/create-db-user.sql + environment: + POSTGRES_USER: postgres + POSTGRES_DB: postgres + POSTGRES_PASSWORD: postgres pgadmin: image: dpage/pgadmin4:8.13.0 restart: unless-stopped network_mode: service:db - env_file: - - .env + environment: + PGADMIN_DEFAULT_EMAIL: admin@example.com + PGADMIN_DEFAULT_PASSWORD: password + PGADMIN_LISTEN_PORT: 8080 + PGADMIN_CONFIG_SERVER_MODE: False + PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: False + +volumes: + postgres-data: + devcontainer-redux-gem-cache: diff --git a/.devcontainer/install-extensions.sh b/.devcontainer/install-extensions.sh new file mode 100755 index 00000000..1476e8a4 --- /dev/null +++ b/.devcontainer/install-extensions.sh @@ -0,0 +1,11 @@ +#!/bin/bash -ex + +# EDITOR_BIN="$(ls /vscode/cursor-server/bin/*/*/bin/remote-cli/cursor | head -n 1)" +EDITOR_BIN=cursor +function install_extension() { + $EDITOR_BIN --install-extension "$1" +} + +install_extension Shopify.ruby-extensions-pack +install_extension dbaeumer.vscode-eslint +install_extension aliariff.vscode-erb-beautify diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh index 6b918b3b..d883f6c0 100755 --- a/.devcontainer/post-create.sh +++ b/.devcontainer/post-create.sh @@ -6,14 +6,9 @@ echo "Gem: $(which gem)" echo "Bundler: $(which bundler)" echo "Rake: $(which rake)" -gem install bundler -v '2.4.1' -gem install rake -v '13.0.6' -gem install ruby-lsp -v '0.22.1' -gem install rubocop -v '1.69.2' bundle install --jobs $(getconf _NPROCESSORS_ONLN) rbenv rehash -rubocop --start-server npm install yarn bin/rails yarn:install yarn diff --git a/.ruby-version b/.ruby-version index 944880fa..5b820117 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -3.2.0 +system \ No newline at end of file diff --git a/.vscode/extensions.json b/.vscode/extensions.json deleted file mode 100644 index 6ebfd8bd..00000000 --- a/.vscode/extensions.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "recommendations": [ - "Shopify.ruby-extensions-pack", - "LoranKloeze.ruby-rubocop-revived", - "CraigMaslowski.erb", - "dbaeumer.vscode-eslint" - ] -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index eaf65559..c0bb2661 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,9 +4,11 @@ "window.title": "${activeEditorMedium}${separator}${rootName}${separator}${profileName}", "workbench.preferredDarkColorTheme": "Spinel", "workbench.preferredLightColorTheme": "Spinel Light", - "rubyLsp.formatter": "auto", + "rubyLsp.formatter": "syntax_tree", "[ruby]": { - "editor.formatOnSave": true, - "editor.defaultFormatter": "LoranKloeze.ruby-rubocop-revived" + "editor.defaultFormatter": "Shopify.ruby-lsp" + }, + "[erb]": { + "editor.defaultFormatter": "aliariff.vscode-erb-beautify" } } \ No newline at end of file diff --git a/Gemfile b/Gemfile index a458df87..71f4620e 100644 --- a/Gemfile +++ b/Gemfile @@ -1,7 +1,7 @@ source 'https://rubygems.org' git_source(:github) { |repo| "https://github.com/#{repo}.git" } -ruby '3.2.0' +ruby '3.2.6' # ruby "3.0.3" # Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main" @@ -98,7 +98,6 @@ gem 'daemons' gem 'discard' gem 'good_job', '3.17.2' gem 'http-cookie' -gem 'influxdb-client' gem 'kaminari' gem 'nokogiri' gem 'pluck_each' @@ -131,3 +130,5 @@ gem 'rack-cors' 
gem 'react_on_rails' gem 'sanitize' gem 'shakapacker' + +gem "syntax_tree", "~> 6.2" diff --git a/Gemfile.lock b/Gemfile.lock index bef8dd7e..73b29f7d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -154,7 +154,6 @@ GEM domain_name (~> 0.5) i18n (1.12.0) concurrent-ruby (~> 1.0) - influxdb-client (2.9.0) io-console (0.6.0) irb (1.6.2) reline (>= 0.3.0) @@ -216,6 +215,7 @@ GEM pluck_each (0.2.0) activerecord (> 3.2.0) activesupport (> 3.0.0) + prettier_print (1.2.1) progressbar (1.13.0) pry (0.14.2) coderay (~> 1.1) @@ -341,6 +341,8 @@ GEM stackprof (0.2.24) stimulus-rails (1.2.1) railties (>= 6.0.0) + syntax_tree (6.2.0) + prettier_print (>= 1.2.0) table_print (1.5.7) tailwindcss-rails (2.0.26) railties (>= 6.0.0) @@ -394,7 +396,6 @@ DEPENDENCIES good_job (= 3.17.2) htmlbeautifier http-cookie - influxdb-client jbuilder kaminari memory_profiler @@ -429,6 +430,7 @@ DEPENDENCIES sqlite3 (~> 1.4) stackprof stimulus-rails + syntax_tree (~> 6.2) table_print tailwindcss-rails (~> 2.0) turbo-rails @@ -439,7 +441,7 @@ DEPENDENCIES zstd-ruby RUBY VERSION - ruby 3.2.0p0 + ruby 3.2.6p234 BUNDLED WITH 2.4.6 diff --git a/Procfile.dev b/Procfile.dev index 3cda44fa..3c8250cc 100644 --- a/Procfile.dev +++ b/Procfile.dev @@ -1,6 +1,6 @@ # Procfile for development using HMR # You can run these commands in separate shells -rails: bundle exec rails s -p 3000 -tailwind: bundle exec rake tailwindcss:watch -wp-client: HMR=true bin/webpacker-dev-server -wp-server: HMR=true SERVER_BUNDLE_ONLY=yes bin/webpacker --watch \ No newline at end of file +rails: RAILS_ENV=development bundle exec rails s -p 3000 +tailwind: RAILS_ENV=development bundle exec rake tailwindcss:watch +wp-client: HMR=true RAILS_ENV=development bin/webpacker-dev-server +wp-server: HMR=true RAILS_ENV=development SERVER_BUNDLE_ONLY=yes bin/webpacker --watch diff --git a/Procfile.staging b/Procfile.staging index d1102ffc..cebb823b 100644 --- a/Procfile.staging +++ b/Procfile.staging @@ -1,4 +1,4 @@ rails: RAILS_ENV=staging ./bin/rails s -p 3000 -tailwind: ./bin/rake rake tailwindcss:watch +tailwind: RAILS_ENV=staging ./bin/rake rake tailwindcss:watch wp-client: RAILS_ENV=staging HMR=true ./bin/webpacker-dev-server -wp-server: RAILS_ENV=staging HMR=true SERVER_BUNDLE_ONLY=yes ./bin/webpacker --watch \ No newline at end of file +wp-server: RAILS_ENV=staging HMR=true SERVER_BUNDLE_ONLY=yes ./bin/webpacker --watch diff --git a/Rakefile b/Rakefile index adccc204..a7cbf8dd 100644 --- a/Rakefile +++ b/Rakefile @@ -14,21 +14,20 @@ Rake.application.rake_require "e621" Rake.application.rake_require "twitter" Rake.application.rake_require "ib" -task :set_ar_stdout => :environment do +task set_ar_stdout: :environment do ActiveRecord::Base.logger = Logger.new($stdout) end -task :set_logger_stdout => :environment do +task set_logger_stdout: :environment do Rails.logger = Logger.new($stdout) - Rails.logger.formatter = proc do |severity, datetime, progname, msg| - "#{severity}: #{msg}\n" - end + Rails.logger.formatter = + proc { |severity, datetime, progname, msg| "#{severity}: #{msg}\n" } ActiveRecord::Base.logger = nil ActiveJob::Base.logger = nil GoodJob.logger = Rails.logger end -task :periodic_tasks => [:environment, :set_logger_stdout] do +task periodic_tasks: %i[environment set_logger_stdout] do Thread.new do loop do Rake::Task["pghero:capture_space_stats"].execute @@ -55,22 +54,21 @@ task :periodic_tasks => [:environment, :set_logger_stdout] do end end - loop do - sleep 10 - end + loop { sleep 10 } end namespace :blob_entries do - task :export_samples => 
:environment do + task export_samples: :environment do limit = ENV["limit"]&.to_i || raise("need 'limit' (num)") - outfile = ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)") + outfile = + ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)") BlobEntrySampleExporter.new.export_samples(limit, outfile) end - task :import_samples => :environment do + task import_samples: :environment do infile = ENV["infile"] || raise("need 'infile' (file path, .json encoded)") BlobEntrySampleExporter.new.import_samples(infile) end - task :migrate_entries => :environment do + task migrate_entries: :environment do start_at = ENV["start_at"] batch_size = ENV["batch_size"]&.to_i || 64 BlobEntrySampleExporter.new.migrate_blob_entries(start_at, batch_size) @@ -78,7 +76,7 @@ namespace :blob_entries do end namespace :db_sampler do - task :export => :environment do + task export: :environment do url_names = ENV["url_names"] || raise("need 'url_names' (comma-separated)") # outfile_path = ENV["outfile"] || raise("need 'outfile' (file path)") # outfile = File.open(outfile_path, "wb") @@ -88,7 +86,7 @@ namespace :db_sampler do outfile.close if outfile end - task :import => [:environment] do + task import: [:environment] do # infile_path = ENV["infile"] || raise("need 'infile' (file path)") # infile = File.open(infile_path, "rb") infile = $stdin @@ -98,23 +96,16 @@ namespace :db_sampler do end end -task :good_job do - proxies = ["direct", "proxy-1", "dedipath-1", "serverhost-1"] - proxy = ENV["proxy"] - raise("'proxy' must be set") unless proxy - raise("'proxy' must be one of #{proxies}") unless proxies.include?(proxy) - +task good_job: %i[environment set_ar_stdout set_logger_stdout] do env_hash = { "RAILS_ENV" => "worker", "GOOD_JOB_POLL_INTERVAL" => "5", "GOOD_JOB_MAX_CACHE" => "10000", "GOOD_JOB_QUEUE_SELECT_LIMIT" => "4096", "GOOD_JOB_MAX_THREADS" => "4", - "GOOD_JOB_QUEUES" => [ - "manual:4", - "fa_post,e621:2", - "*:6", - ].reject(&:nil?).join(";"), + "GOOD_JOB_QUEUES" => + ENV["GOOD_JOB_QUEUES"] || + %w[manual:4 fa_post,e621:2 *:6].reject(&:nil?).join(";") } env_hash.each do |key, value| @@ -127,7 +118,7 @@ task :good_job do exec(cmd) end -task :recompute_job_signatures => :environment do +task recompute_job_signatures: :environment do ActiveRecord::Base.logger = Logger.new(STDOUT) ActiveRecord::Base.logger.level = :error @@ -148,21 +139,22 @@ task :recompute_job_signatures => :environment do end end -task :workoff_failed_jobs => [:environment, :set_ar_stdout, :set_logger_stdout] do +task workoff_failed_jobs: %i[environment set_ar_stdout set_logger_stdout] do worker = Delayed::Worker.new - Delayed::Job.where( - "last_error is not null and attempts <= 2" - ).find_each(batch_size: 1) do |job| - worker.run(job) - end + Delayed::Job + .where("last_error is not null and attempts <= 2") + .find_each(batch_size: 1) { |job| worker.run(job) } end task :reverse_csv do file = ENV["file"] || raise("need 'file' (file path)") in_csv = CSV.parse(File.open(file, "r+"), headers: true) - out_csv = CSV.new(File.open("rev_" + file, "w"), write_headers: true, headers: in_csv.headers) - in_csv.reverse_each do |row| - out_csv << row.map(&:second) - end + out_csv = + CSV.new( + File.open("rev_" + file, "w"), + write_headers: true, + headers: in_csv.headers + ) + in_csv.reverse_each { |row| out_csv << row.map(&:second) } out_csv.close end diff --git a/app/jobs/domain/e621/job/scan_post_job.rb b/app/jobs/domain/e621/job/scan_post_job.rb index b22acd31..fc0a7e90 100644 --- 
a/app/jobs/domain/e621/job/scan_post_job.rb +++ b/app/jobs/domain/e621/job/scan_post_job.rb @@ -3,7 +3,8 @@ module Domain::E621::Job queue_as :e621 ignore_signature_args :caused_by_entry - def perform(post:, caused_by_entry:) + def perform(**args) + logger.info("ScanPostJob args: #{args.inspect}") end end end diff --git a/app/jobs/domain/fa/job/browse_page_job.rb b/app/jobs/domain/fa/job/browse_page_job.rb index e873898c..049d835a 100644 --- a/app/jobs/domain/fa/job/browse_page_job.rb +++ b/app/jobs/domain/fa/job/browse_page_job.rb @@ -2,7 +2,7 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base queue_as :fa_browse_page ignore_signature_args :caused_by_entry - def perform(args) + def perform(**args) @caused_by_entry = args[:caused_by_entry] @first_browse_page_entry = nil @@ -16,7 +16,9 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base @page_number += 1 end - logger.info("finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages") + logger.info( + "finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages" + ) end private @@ -28,21 +30,30 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base url = "https://www.furaffinity.net/browse/#{@page_number}/" end - response = http_client.get(url, caused_by_entry: @first_browse_page_entry || @caused_by_entry) + response = + http_client.get( + url, + caused_by_entry: @first_browse_page_entry || @caused_by_entry + ) log_entry = response.log_entry @first_browse_page_entry ||= log_entry if response.status_code != 200 - fatal_error("non 200 response for /browse: #{response.status_code.to_s.underline}") + fatal_error( + "non 200 response for /browse: #{response.status_code.to_s.underline}" + ) end page = Domain::Fa::Parser::Page.new(response.body) - listing_page_stats = update_and_enqueue_posts_from_listings_page( - :browse_page, page, log_entry, - enqueue_posts_pri: :high, - page_desc: "Browse@#{@page_number}", - fill_id_gaps: true, - ) + listing_page_stats = + update_and_enqueue_posts_from_listings_page( + :browse_page, + page, + log_entry, + enqueue_posts_pri: :high, + page_desc: "Browse@#{@page_number}", + fill_id_gaps: true + ) @total_num_new_posts_seen += listing_page_stats.new_seen @total_num_posts_seen += listing_page_stats.total_seen diff --git a/app/jobs/scraper/job_base.rb b/app/jobs/scraper/job_base.rb index 8a670685..fe95c0c3 100644 --- a/app/jobs/scraper/job_base.rb +++ b/app/jobs/scraper/job_base.rb @@ -1,5 +1,6 @@ class Scraper::JobBase < ApplicationJob - class JobError < RuntimeError; end + class JobError < RuntimeError + end def initialize(...) 
@deferred_jobs = [] @@ -20,26 +21,28 @@ class Scraper::JobBase < ApplicationJob good_job_control_concurrency_with( total_limit: 1, - key: proc do - raise("too many arguments") if arguments.size > 1 - first_argument = arguments || [] - first_argument = first_argument[0] || {} + key: + proc do + raise("too many arguments") if arguments.size > 1 + first_argument = arguments || [] + first_argument = first_argument[0] || {} - ignore_signature_args = self.class.ignore_signature_args - ignore_signature_args << :_aj_symbol_keys - ignore_signature_args << :_aj_ruby2_keywords - sig_arguments = first_argument.reject do |key, value| - ignore_signature_args.include?(key.to_sym) - end.to_h + ignore_signature_args = self.class.ignore_signature_args + ignore_signature_args << :_aj_symbol_keys + ignore_signature_args << :_aj_ruby2_keywords + sig_arguments = + first_argument + .reject { |key, value| ignore_signature_args.include?(key.to_sym) } + .to_h - sig = [] - sig << self.class.name || raise - sig << self.queue_name || "*" - sig << self.priority || "*" - sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16] - sig = sig.join("|") - sig - end, + sig = [] + sig << self.class.name || raise + sig << self.queue_name || "*" + sig << self.priority || "*" + sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16] + sig = sig.join("|") + sig + end ) # make the concurrency config threadlocal so it can be modified @@ -50,16 +53,15 @@ class Scraper::JobBase < ApplicationJob @@gjcc_tl.value end - PERMITTED_CONTENT_TYPES = [ - /text\/html/, - /application\/json/, - ] + PERMITTED_CONTENT_TYPES = [%r{text/html}, %r{application/json}] def enqueue_jobs_from_found_links(log_entry, suppress_jobs: []) return unless ENV["enqueue_jobs_from_found_links"] == "1" start_time = Time.now - unless PERMITTED_CONTENT_TYPES.any? { |ct| ct.match(log_entry.content_type) } + unless PERMITTED_CONTENT_TYPES.any? 
{ |ct| + ct.match(log_entry.content_type) + } raise("unsupported content type: #{log_entry.content_type}") end document = log_entry.response.contents @@ -68,30 +70,39 @@ class Scraper::JobBase < ApplicationJob links = link_finder.find_links job_defs = [] - filter_mapper = proc { |list, type, key| - list.filter { |item| item[:type] == type }.map { |item| item[key] } - } + filter_mapper = + proc do |list, type, key| + list.filter { |item| item[:type] == type }.map { |item| item[key] } + end - attr_to_mapper = proc { |model_class, attr_sym, in_values| - model_class.where({ attr_sym => in_values }).map do |model| - [model.send(attr_sym), model] - end.to_h - } + attr_to_mapper = + proc do |model_class, attr_sym, in_values| + model_class + .where({ attr_sym => in_values }) + .map { |model| [model.send(attr_sym), model] } + .to_h + end - url_name_to_fa_user = attr_to_mapper.call( - Domain::Fa::User, :url_name, - filter_mapper.call(links, :fa_user, :url_name) - ) + url_name_to_fa_user = + attr_to_mapper.call( + Domain::Fa::User, + :url_name, + filter_mapper.call(links, :fa_user, :url_name) + ) - fa_id_to_fa_post = attr_to_mapper.call( - Domain::Fa::Post, :fa_id, - filter_mapper.call(links, :fa_post, :fa_id) - ) + fa_id_to_fa_post = + attr_to_mapper.call( + Domain::Fa::Post, + :fa_id, + filter_mapper.call(links, :fa_post, :fa_id) + ) - name_to_twitter_user = attr_to_mapper.call( - Domain::Twitter::User, :name, - filter_mapper.call(links, :twitter_user, :name) - ) + name_to_twitter_user = + attr_to_mapper.call( + Domain::Twitter::User, + :name, + filter_mapper.call(links, :twitter_user, :name) + ) links.each do |link| link_type = link[:type] @@ -99,57 +110,52 @@ class Scraper::JobBase < ApplicationJob when :fa_user url_name = link[:url_name] user = url_name_to_fa_user[url_name] - params = if user - { user: user } - else - { url_name: url_name } - end + params = (user ? { user: user } : { url_name: url_name }) - job_def = ({ params: params, desc: "fa user " + (user&.name || url_name).bold }) + job_def = + ({ params: params, desc: "fa user " + (user&.name || url_name).bold }) if !user || user.due_for_page_scan? - job_defs << job_def.merge( - job: Domain::Fa::Job::UserPageJob, - ) + job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob) elsif !user || user.due_for_gallery_scan? - job_defs << job_def.merge( - job: Domain::Fa::Job::UserGalleryJob, - ) + job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob) end when :fa_post fa_id = link[:fa_id] post = fa_id_to_fa_post[fa_id] - params = if post - { post: post } - else - { fa_id: fa_id } - end + params = (post ? { post: post } : { fa_id: fa_id }) job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" }) - job_defs << job_def.merge( - job: Domain::Fa::Job::ScanPostJob, - ) if !post || (post.state == "ok" && !post.scanned?) + if !post || (post.state == "ok" && !post.scanned?) + job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob) + end - job_defs << job_def.merge( - job: Domain::Fa::Job::ScanFileJob, - ) if post && post.state == "ok" && post.scanned? && !post.have_file? + if post && post.state == "ok" && post.scanned? && !post.have_file? + job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob) + end when :twitter_user name = link[:name] user = name_to_twitter_user[name] - params = if user - { user: user } - else - { name: name } - end + params = (user ? 
{ user: user } : { name: name }) - job_def = ({ params: params, desc: "twitter user " + (user&.name || name).bold }) + job_def = + ( + { + params: params, + desc: "twitter user " + (user&.name || name).bold + } + ) - job_defs << job_def.merge( - job: Domain::Twitter::Job::UserTimelineTweetsJob, - ) if !user || user.due_for_timeline_tweets_scan? + if !user || user.due_for_timeline_tweets_scan? + job_defs << job_def.merge( + job: Domain::Twitter::Job::UserTimelineTweetsJob + ) + end else - logger.warn("unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}") + logger.warn( + "unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}" + ) end end @@ -163,19 +169,20 @@ class Scraper::JobBase < ApplicationJob job_class = job_def[:job] params = job_def[:params] desc = job_def[:desc] - logger.debug([ - "link finder -", - job_class.name.split("::").last.to_s.ljust(22).bold.light_black, - desc, - ].join(" ")) - - defer_job( - job_class, - params.merge({ caused_by_entry: log_entry }), + logger.debug( + [ + "link finder -", + job_class.name.split("::").last.to_s.ljust(22).bold.light_black, + desc + ].join(" ") ) + + defer_job(job_class, params.merge({ caused_by_entry: log_entry })) end duration_ms = (1000 * (Time.now - start_time)).to_i.to_s - logger.info("link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)") + logger.info( + "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)" + ) end around_perform do |job, block| @@ -194,27 +201,9 @@ class Scraper::JobBase < ApplicationJob rescue => e error = e raise e - ensure - Metrics::Client.singleton.write_point( - self, - "job_performed", - tags: { - job_queue: job.queue_name, - job_class: job.class.name, - success: error.nil?, - error_class: error&.class&.name, - }, - fields: { - duration_ms: duration_ms, - }, - ) end - after_perform do |job| - ColorLogger.quiet do - job.enqueue_deferred_jobs! - end - end + after_perform { |job| ColorLogger.quiet { job.enqueue_deferred_jobs! } } def defer_job(job_class, params, set_args = {}) @deferred_jobs << [job_class, params, set_args] diff --git a/app/lib/color_logger.rb b/app/lib/color_logger.rb index b17828e7..958aea48 100644 --- a/app/lib/color_logger.rb +++ b/app/lib/color_logger.rb @@ -24,40 +24,42 @@ class ColorLogger klass_name = "(Anonymous)" end - logger = Logger.new(sink).tap do |logger| - prefix = "" - logger.define_singleton_method :prefix= do |p| - prefix = p - end - logger.define_singleton_method :sink do - sink - end - - logger.formatter = proc do |severity, datetime, progname, msg| - color = case severity - when "ERROR" then :red - when "WARN" then :yellow - else :light_blue + logger = + Logger + .new(sink) + .tap do |logger| + prefix = "" + logger.define_singleton_method :prefix= do |p| + prefix = p + end + logger.define_singleton_method :sink do + sink end - klass_name_str = "[#{klass_name.send(color)}]".ljust(32) - if prefix.is_a?(Proc) - prefix = prefix.call + logger.formatter = + proc do |severity, datetime, progname, msg| + color = + case severity + when "ERROR" + :red + when "WARN" + :yellow + else + :light_blue + end + + klass_name_str = "[#{klass_name.send(color)}]".ljust(32) + prefix = prefix.call if prefix.is_a?(Proc) + + if @quiet.value > 0 + "" + else + [klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n" + end + end end - if @quiet.value > 0 - "" - else - [klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n" - end - end - end - - if Rails.env.production? 
- ActiveSupport::TaggedLogging.new(logger) - else - logger - end + ActiveSupport::TaggedLogging.new(logger) end def self.klass_name_from_instance(instance) @@ -66,40 +68,7 @@ class ColorLogger else klass_name = instance.class.name.dup end - klass_name.delete_prefix!("Domain::") - - prefixes = [ - { - if_prefix: "Fa::Scraper::", - replace_with: "Fa::", - }, - { - if_prefix: "Fa::Job::", - replace_with: "Fa::", - delete_suffix: "Job", - }, - { - if_prefix: "Twitter::Job::", - replace_with: "Twitter::", - delete_suffix: "Job", - }, - { - if_prefix: "E621::Job::", - replace_with: "E621::", - delete_suffix: "Job", - }, - ].each do |conf| - if klass_name.start_with?(conf[:if_prefix]) - klass_name.delete_prefix!(conf[:if_prefix]) - if (suffix = conf[:delete_suffix]) - klass_name.delete_suffix!(suffix) - end - klass_name = conf[:replace_with] + klass_name - break - end - end - klass_name end end diff --git a/app/lib/metrics/client.rb b/app/lib/metrics/client.rb deleted file mode 100644 index 9fd64511..00000000 --- a/app/lib/metrics/client.rb +++ /dev/null @@ -1,54 +0,0 @@ -class Metrics::Client - include HasColorLogger - - REPORT = !Rails.env.test? - - def self.singleton - @singleton ||= Metrics::Client.new - end - def self.singleton=(instance) - @singleton = instance - end - - private - - def initialize(default_tags: {}) - unless REPORT - logger.warn "not reporting metrics for '#{Rails.env.to_s.bold}' environment" - return - end - - host = Rails.application.config.x.influxdb.host || raise("no host") - bucket = Rails.application.config.x.influxdb.bucket || raise("no bucket") - - @client = InfluxDB2::Client.new( - host, "", - org: "", - bucket: bucket, - precision: InfluxDB2::WritePrecision::MILLISECOND, - use_ssl: false, - ) - - write_options = InfluxDB2::WriteOptions.new( - write_type: InfluxDB2::WriteType::BATCHING, - batch_size: 100, flush_interval: 5_000, - max_retries: 3, max_retry_delay: 15_000, - exponential_base: 2, - ) - - point_settings = InfluxDB2::PointSettings.new(default_tags: default_tags) - @writer = @client.create_write_api( - write_options: write_options, - point_settings: point_settings, - ) - end - - public - - def write_point(caller, name, tags: {}, fields: {}) - return unless REPORT - @writer.write(data: { name: name, tags: tags, fields: fields }) - rescue => e - logger.error("error writing influxdb metric: #{e}") - end -end diff --git a/app/lib/metrics/estimate_db_rows_reporter.rb b/app/lib/metrics/estimate_db_rows_reporter.rb deleted file mode 100644 index 2fab1448..00000000 --- a/app/lib/metrics/estimate_db_rows_reporter.rb +++ /dev/null @@ -1,30 +0,0 @@ -class Metrics::EstimateDbRowsReporter < Metrics::Reporter - def initialize - log_writes! 
- super - end - - def report - extra_tables = [ - "http_log_entries", - "http_log_entry_headers", - "versions", - "delayed_jobs", - "blob_entries_p", - ] - extra_tables_sql = extra_tables.map { |t| "'#{t}'" }.join(",") - row_estimates = ReduxApplicationRecord.connection.exec_query( - [ - "SELECT relname, n_live_tup", - "FROM pg_stat_all_tables", - "WHERE relname IN (#{extra_tables_sql})", - "OR relname like 'domain_%'", - "OR relname like 'blob_entries_p_%'", - ].join(" ") - ).rows.to_h - - write_point( - "estimate_db_rows", fields: row_estimates, - ) - end -end diff --git a/app/lib/metrics/good_job_reporter.rb b/app/lib/metrics/good_job_reporter.rb deleted file mode 100644 index e77ba36b..00000000 --- a/app/lib/metrics/good_job_reporter.rb +++ /dev/null @@ -1,30 +0,0 @@ -class Metrics::GoodJobReporter < Metrics::Reporter - def initialize - super - end - - def report - total = GoodJob::Job.count - by_queue = GoodJob::Job.group("queue_name").count - by_state = GoodJob::JobsFilter.new({}).states - logger.info "job queue metrics: total=#{total}, by_queue=#{by_queue}, by_state=#{by_state}" - - write_point( - "job_queues", - tags: { aggregation: "total" }, - fields: { "total" => total }, - ) - - write_point( - "job_queues", - tags: { aggregation: "queue" }, - fields: by_queue, - ) - - write_point( - "job_queues", - tags: { aggregation: "state" }, - fields: by_state, - ) - end -end diff --git a/app/lib/metrics/reporter.rb b/app/lib/metrics/reporter.rb deleted file mode 100644 index 2fba44cf..00000000 --- a/app/lib/metrics/reporter.rb +++ /dev/null @@ -1,20 +0,0 @@ -class Metrics::Reporter - include HasColorLogger - - def initialize - @client = Metrics::Client.singleton - end - - def log_writes! - @log_writes = true - end - - def write_point(name, tags: {}, fields: {}) - @client.write_point(self, name, tags: tags, fields: fields) - logger.info("reporter wrote point '#{name.bold}'") if @log_writes - end - - def report - raise NotImplementedError, "implement in subclass" - end -end diff --git a/app/lib/scraper/client_factory.rb b/app/lib/scraper/client_factory.rb index e4399650..19e75259 100644 --- a/app/lib/scraper/client_factory.rb +++ b/app/lib/scraper/client_factory.rb @@ -58,21 +58,29 @@ class Scraper::ClientFactory end def self._gallery_dl_client_impl - @gallery_dl_clients.value ||= begin - proxy_config = Rails.application.config.x.proxy || raise("no proxy config") + @gallery_dl_clients.value ||= + begin + proxy_config = + Rails.application.config.x.proxy || raise("no proxy config") if proxy_config[:gallery_dl].blank? 
- raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}") + raise( + "no gallery_dl host defined for proxy config #{proxy_config[:name]}" + ) end - Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl]) + Scraper::GalleryDlClient.new( + proxy_config[:name], + proxy_config[:gallery_dl] + ) end end def self._http_client_impl(key, config_klass) @http_clients.value[key] ||= begin - proxy_config = Rails.application.config.x.proxy || raise("no proxy config") - performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http]) - Scraper::HttpClient.new(config_klass.new, performer) - end + # proxy_config = Rails.application.config.x.proxy || raise("no proxy config") + # performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http]) + performer = Scraper::CurlHttpPerformer.new + Scraper::HttpClient.new(config_klass.new, performer) + end end def self._get_ib_client_sid(client) diff --git a/app/lib/scraper/curl_http_performer.rb b/app/lib/scraper/curl_http_performer.rb new file mode 100644 index 00000000..26947abe --- /dev/null +++ b/app/lib/scraper/curl_http_performer.rb @@ -0,0 +1,106 @@ +class Scraper::CurlHttpPerformer + Response = + Struct.new(:response_code, :response_headers, :response_time_ms, :body) + + def get(url, request_headers) + do_request(:http_get, url, request_headers) + end + + def post(url, request_headers) + do_request(:http_post, url, request_headers) + end + + def name + "direct" + end + + METHOD_MAP = { http_get: "GET", http_post: "POST" } + + def do_request(method, url, request_headers) + t, curl = get_curl + start_at = Time.now + curl.url = Addressable::URI.encode url + curl.follow_location = true + request_headers.each { |key, value| curl.headers[key.to_s] = value } + curl.headers["User-Agent"] = "FurryArchiver/1.0 / dhelta" + curl.send(method) + response_time_ms = ((Time.now - start_at) * 1000).to_i + response_code = curl.response_code + body_str = curl.body_str + response_headers = self.class.parse_header_str(curl.header_str).to_h + + rc_color = + case response_code / 100 + when 2 + [:green] + when 3 + [:cyan] + when 4 + [:red] + when 5 + %i[red bold] + else + [:white] + end + + rc_string = response_code.to_s + rc_color.each { |m| rc_string = rc_string.send(m) } + + cf_cache_status = + if response_headers["cf-cache-status"] + "(#{response_headers["cf-cache-status"].light_blue})" + else + nil + end + + puts [ + "[#{Process.pid.to_s.black.bold} / #{t.name.to_s.black.bold}]", + "[#{METHOD_MAP[method]} #{rc_string}", + "#{self.class.humansize(body_str.size).to_s.bold}", + (response_time_ms.to_s + "ms").light_blue.bold + "]", + cf_cache_status, + "#{url.bold}" + ].reject(&:nil?).join(" ") + + Response.new(response_code, response_headers, response_time_ms, body_str) + end + + private + + def get_curl + t = Thread.current + unless t.thread_variable?(:curl) + t.thread_variable_set(:curl, Curl::Easy.new) + end + curl = t.thread_variable_get(:curl) + curl.headers = {} + [t, curl] + end + + def self.humansize(size) + units = %w[B KiB MiB GiB TiB Pib EiB ZiB] + return "0.0 B" if size == 0 + exp = (Math.log(size) / Math.log(1024)).to_i + exp += 1 if (size.to_f / 1024**exp >= 1024 - 0.05) + exp = units.size - 1 if exp > units.size - 1 + unit = units[exp] + if unit == "B" + "%d B" % [size] + else + "%.1f %s" % [size.to_f / 1024**exp, unit] + end + end + + def self.parse_header_str(header_str) + header_str + .split("\r\n") + .each_with_index + .map do |str, idx| + next nil if idx == 0 + idx = str.index(": ") + next 
nil unless idx + [str[0...idx], str[idx + 2..-1]] + end + .reject(&:nil?) + end +end diff --git a/app/lib/scraper/fa_http_client_config.rb b/app/lib/scraper/fa_http_client_config.rb index 5a703f1f..2bb4ca3d 100644 --- a/app/lib/scraper/fa_http_client_config.rb +++ b/app/lib/scraper/fa_http_client_config.rb @@ -1,23 +1,13 @@ class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig - DEFAULT_ALLOWED_DOMAINS = [ - "*.furaffinity.net", - "*.facdn.net", - "ipinfo.io", - ] + DEFAULT_ALLOWED_DOMAINS = %w[*.furaffinity.net *.facdn.net ipinfo.io] def cookies - proxy_name = Rails.application.config.x.proxy_name - Rails.application.config.x.cookies.fa[ - proxy_name.to_s - ] || raise("no fa cookies defined for #{proxy_name}") + Rails.application.config.x.cookies.fa["direct"] || + raise("no fa cookies defined for direct") end def ratelimit - [ - ["d.furaffinity.net", :none], - ["*.facdn.net", :none], - ["*", 0.25], - ] + [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 0.25]] end def allowed_domains diff --git a/app/lib/scraper/http_client.rb b/app/lib/scraper/http_client.rb index b38b65ba..b3d7fafb 100644 --- a/app/lib/scraper/http_client.rb +++ b/app/lib/scraper/http_client.rb @@ -2,13 +2,10 @@ class Scraper::HttpClient include HasColorLogger attr_reader :config - Response = Struct.new( - :status_code, - :body, - :log_entry, - ) + Response = Struct.new(:status_code, :body, :log_entry) - class InvalidURLError < ArgumentError; end + class InvalidURLError < ArgumentError + end def initialize(config, http_performer) @config = config @@ -17,9 +14,10 @@ class Scraper::HttpClient init_cookie_jar # TODO - populate AdaptiveCache with domains this client cares about @max_cache_size = 8 - @blob_entry_cache = Hash.new do |hash, key| - hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1) - end + @blob_entry_cache = + Hash.new do |hash, key| + hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1) + end logger.level = :info @config.do_login(http_performer) end @@ -46,12 +44,14 @@ class Scraper::HttpClient url = uri.to_s unless allowed_domain?(uri.host) - raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}") + raise InvalidURLError.new( + "domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}" + ) end - ratelimit_conf = config.ratelimit.find do |conf| - File.fnmatch? conf[0], uri.host - end || ["*", :none] + ratelimit_conf = + config.ratelimit.find { |conf| File.fnmatch? 
conf[0], uri.host } || + ["*", :none] ratelimit_delay = ratelimit_conf[1] if ratelimit_delay == :none logger.debug("no rate limit for #{uri.host}") @@ -68,7 +68,10 @@ class Scraper::HttpClient end request_headers = { - "cookie" => HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url)), + "cookie" => + HTTP::Cookie.cookie_value( + @cookie_jar.cookies(Addressable::URI.encode url) + ) } requested_at = Time.now response = @http_performer.do_request(method, url, request_headers) @@ -77,47 +80,57 @@ class Scraper::HttpClient response_headers = response.response_headers response_time_ms = response.response_time_ms response_body = response.body - @domain_last_requested_at[uri.host] = requested_at + (response_time_ms.to_f / 1000) + @domain_last_requested_at[uri.host] = requested_at + + (response_time_ms.to_f / 1000) - content_type = response_headers["Content-Type"] || - response_headers["content-type"] || - "none/none" + content_type = + response_headers["Content-Type"] || response_headers["content-type"] || + "none/none" cache_key = "#{uri.host}|#{content_type}" blob_entry_cache = @blob_entry_cache[cache_key] - candidates = if blob_entry_cache.at_capacity? && rand(0..100) >= 5 + candidates = + if blob_entry_cache.at_capacity? && rand(0..100) >= 5 blob_entry_cache.candidates else [] end candidates << caused_by_entry.response if caused_by_entry&.response - candidates << caused_by_entry.response.base if caused_by_entry&.response&.base + if caused_by_entry&.response&.base + candidates << caused_by_entry.response.base + end retries = 0 begin - response_blob_entry = BlobEntryP.find_or_build( - content_type: content_type, - contents: response_body, - candidates: candidates, - ) + response_blob_entry = + BlobEntryP.find_or_build( + content_type: content_type, + contents: response_body, + candidates: candidates + ) scrubbed_uri = @config.scrub_stored_uri(uri) - log_entry = HttpLogEntry.new({ - uri: scrubbed_uri.to_s, - verb: method, - content_type: content_type, - status_code: response_code, - request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers), - response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers), - response: response_blob_entry, - response_time_ms: response_time_ms, - requested_at: requested_at, - caused_by_entry: caused_by_entry, - performed_by: @http_performer.name, - }) + log_entry = + HttpLogEntry.new( + { + uri: scrubbed_uri.to_s, + verb: method, + content_type: content_type, + status_code: response_code, + request_headers: + HttpLogEntryHeader.find_or_build(headers: request_headers), + response_headers: + HttpLogEntryHeader.find_or_build(headers: response_headers), + response: response_blob_entry, + response_time_ms: response_time_ms, + requested_at: requested_at, + caused_by_entry: caused_by_entry, + performed_by: @http_performer.name + } + ) log_entry.save! 
- rescue + rescue StandardError retries += 1 retry if retries < 2 raise @@ -129,11 +142,14 @@ class Scraper::HttpClient ) else blob_entry_cache.insert( - HexUtil.bin2hex(response_blob_entry.sha256)[0..8], response_blob_entry, scrubbed_uri + HexUtil.bin2hex(response_blob_entry.sha256)[0..8], + response_blob_entry, + scrubbed_uri ) end - response_code_colorized = if response_code == 200 + response_code_colorized = + if response_code == 200 response_code.to_s.light_green elsif (response_code / 100) == 4 || (response_code / 100) == 5 response_code.to_s.red.bold @@ -142,51 +158,30 @@ class Scraper::HttpClient end total_time_ms = ((Time.now - requested_at) * 1000).round(0) - logger.info([ - "[entry #{log_entry.id.to_s.bold} /", - "GET #{response_code_colorized} /", - "#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]", - "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]", - scrubbed_uri.to_s.black, - ].reject(&:nil?).join(" ")) + logger.info( + [ + "[entry #{log_entry.id.to_s.bold} /", + "GET #{response_code_colorized} /", + "#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]", + "[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]", + scrubbed_uri.to_s.black + ].reject(&:nil?).join(" ") + ) - if response_code == 524 || response_code == 502 || response_code == 503 || response_code == 403 + if response_code == 524 || response_code == 502 || response_code == 503 || + response_code == 403 logger.error("HTTP #{response_code} - sleeping to not throttle server") sleep 15 end - if content_type =~ /text\/plain/ && response_body =~ /Your IP address has been temporarily banned/ + if content_type =~ %r{text/plain} && + response_body =~ /Your IP address has been temporarily banned/ logger.error("IP temp banned, sleeping & raising...") sleep 60 * 5 raise end - Metrics::Client.singleton.write_point( - self, - "http_client_response", - tags: { - method: "GET", - host: uri.host, - status_code: response_code, - content_type: clean_content_type(content_type), - performed_by: @http_performer.name, - }, - fields: { - response_time_ms: response_time_ms, - total_time_ms: total_time_ms, - content_size: response_blob_entry.size, - content_stored: response_blob_entry.bytes_stored, - uri: scrubbed_uri.to_s, - status_code: response_code, - content_type: clean_content_type(content_type), - }, - ) - - Response.new( - response_code, - response_body, - log_entry - ) + Response.new(response_code, response_body, log_entry) end def allowed_domain?(domain) @@ -206,7 +201,7 @@ class Scraper::HttpClient name: cookie_key_conf[:name], value: cookie_key_conf[:value], path: cookie_key_conf[:path] || "", - for_domain: true, + for_domain: true } @cookie_jar.add(HTTP::Cookie.new(conf)) end diff --git a/app/lib/scraper/http_performer.rb b/app/lib/scraper/http_performer.rb deleted file mode 100644 index aac60d27..00000000 --- a/app/lib/scraper/http_performer.rb +++ /dev/null @@ -1,56 +0,0 @@ -require "base64" - -class Scraper::HttpPerformer - include HasColorLogger - - Response = Struct.new( - :response_code, - :response_headers, - :response_time_ms, - :body - ) - - def initialize(name, host) - name || raise("must provide name") - host || raise("must provide host") - logger.info("#{Thread.name.to_s.light_black} - build performer #{name.to_s.green} - #{host.to_s.green.bold}") - @name = name - @client = Ripcord::Client.new(host) - end - - def name - @name - end - - def get(uri, 
headers) - do_request("get", uri, headers) - end - - def post(uri, headers) - do_request("post", uri, headers) - end - - METHODS = %i[get post].freeze - - def do_request(method, uri, headers) - raise "invalid method #{method}" unless METHODS.include?(method) - send_token = SecureRandom.uuid - rpc_result = @client.call(method.to_s, [uri, headers, send_token]) - raise "rpc error: #{rpc_result.error.inspect}" unless rpc_result.successful? - recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(rpc_result.result)) - if send_token != recv_token - raise "token mismatch: #{send_token} != #{recv_token}" - end - Scraper::HttpPerformer::Response.new(code, headers, time_ms, body) - end - - def close! - begin - @client.instance_variable_get("@http_client")&.finish - rescue IOError => e - Rails.logger.error("Failed to close http client: #{e.inspect}") - end - end - - private -end diff --git a/app/models/domain/fa/user.rb b/app/models/domain/fa/user.rb index 397be162..a95829e1 100644 --- a/app/models/domain/fa/user.rb +++ b/app/models/domain/fa/user.rb @@ -1,59 +1,54 @@ class Domain::Fa::User < ReduxApplicationRecord - self.table_name = 'domain_fa_users' + self.table_name = "domain_fa_users" has_lite_trail(schema_version: 1) has_many :posts, - class_name: '::Domain::Fa::Post', + class_name: "::Domain::Fa::Post", inverse_of: :creator, foreign_key: :creator_id has_one :disco, - class_name: '::Domain::Fa::UserFactor', + class_name: "::Domain::Fa::UserFactor", inverse_of: :user, foreign_key: :user_id, dependent: :destroy has_one :avatar, - class_name: '::Domain::Fa::UserAvatar', + class_name: "::Domain::Fa::UserAvatar", inverse_of: :user, dependent: :destroy - enum :state, [ - :ok, # so far so good, user may not yet be scanned - :scan_error # user has been removed or otherwise, see state_detail - ] + enum :state, + [ + :ok, # so far so good, user may not yet be scanned + :scan_error # user has been removed or otherwise, see state_detail + ] # Who this user follows (join table) has_many :follower_joins, - class_name: '::Domain::Fa::Follow', + class_name: "::Domain::Fa::Follow", foreign_key: :follower_id, inverse_of: :follower, dependent: :destroy # Who this user follows (User model) - has_many :follows, - through: :follower_joins, - source: :followed + has_many :follows, through: :follower_joins, source: :followed # Who follows this user (join table) has_many :followed_joins, - class_name: '::Domain::Fa::Follow', + class_name: "::Domain::Fa::Follow", foreign_key: :followed_id, inverse_of: :followed, dependent: :destroy # Who follows this user (User model) - has_many :followed_by, - through: :followed_joins, - source: :follower + has_many :followed_by, through: :followed_joins, source: :follower - has_many :fav_post_joins, - class_name: '::Domain::Fa::Fav', - inverse_of: :user + has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :user has_many :fav_posts, - class_name: '::Domain::Fa::Post', + class_name: "::Domain::Fa::Post", through: :fav_post_joins, source: :post @@ -64,11 +59,12 @@ class Domain::Fa::User < ReduxApplicationRecord if name && url_name expected = self.class.name_to_url_name(name) - matches = if name.length >= 30 - url_name.starts_with?(expected) - else - url_name == expected - end + matches = + if name.length >= 30 + url_name.starts_with?(expected) + else + url_name == expected + end unless matches errors.add( @@ -79,17 +75,11 @@ class Domain::Fa::User < ReduxApplicationRecord end if url_name && url_name =~ /[A-Z]/ - errors.add( - :url_name, - "url_name 
'#{url_name}' contains uppercase" - ) + errors.add(:url_name, "url_name '#{url_name}' contains uppercase") end if url_name && url_name =~ /\s/ - errors.add( - :url_name, - "url_name '#{url_name}' contains whitespace" - ) + errors.add(:url_name, "url_name '#{url_name}' contains whitespace") end end @@ -99,9 +89,7 @@ class Domain::Fa::User < ReduxApplicationRecord self.log_entry_detail ||= {} end - before_destroy do - throw :abort if posts.any? - end + before_destroy { throw :abort if posts.any? } SCAN_TYPES = { page: 1.month, @@ -120,9 +108,7 @@ class Domain::Fa::User < ReduxApplicationRecord } SCAN_TYPES.keys.each do |scan_type| - define_method(:"due_for_#{scan_type}_scan?") do - scan_due?(scan_type) - end + define_method(:"due_for_#{scan_type}_scan?") { scan_due?(scan_type) } define_method(:"time_ago_for_#{scan_type}_scan") do scanned_ago_in_words(scan_type) @@ -143,14 +129,15 @@ class Domain::Fa::User < ReduxApplicationRecord def scanned_ago_in_words(scan_type) if (timestamp = get_scanned_at_value(scan_type)) - DATE_HELPER.time_ago_in_words(timestamp) + ' ago' + DATE_HELPER.time_ago_in_words(timestamp) + " ago" else - 'never' + "never" end end def scan_due?(scan_type) - duration = SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'") + duration = + SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'") timestamp = get_scanned_at_value(scan_type) timestamp.nil? || timestamp <= duration.ago end @@ -165,14 +152,14 @@ class Domain::Fa::User < ReduxApplicationRecord end def avatar_or_create - self.class.transaction do - avatar || create_avatar! - end + self.class.transaction { avatar || create_avatar! } end def self.find_or_build_from_submission_parser(submission_parser) - unless submission_parser.is_a?(Domain::Fa::Parser::ListedSubmissionParserHelper) || - submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper) + unless submission_parser.is_a?( + Domain::Fa::Parser::ListedSubmissionParserHelper + ) || + submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper) raise ArgumentError end @@ -181,16 +168,11 @@ class Domain::Fa::User < ReduxApplicationRecord end end - URL_NAME_EXCEPTIONS = { - 'Kammiu' => 'rammiu' - } + URL_NAME_EXCEPTIONS = { "Kammiu" => "rammiu" } def self.name_to_url_name(name) name = name.strip - URL_NAME_EXCEPTIONS[name] || name - .delete('_') - .gsub(/\s/, '') - .downcase + URL_NAME_EXCEPTIONS[name] || name.delete("_").gsub(/\s/, "").downcase end # TODO: - write method for getting suggested users to follow @@ -215,20 +197,22 @@ class Domain::Fa::User < ReduxApplicationRecord end def guess_user_page_log_entry - for_path = proc { |uri_path| - HttpLogEntry.where( - uri_scheme: 'https', - uri_host: 'www.furaffinity.net', - uri_path: uri_path - ).order(created_at: :desc).first - } + for_path = + proc do |uri_path| + HttpLogEntry + .where( + uri_scheme: "https", + uri_host: "www.furaffinity.net", + uri_path: uri_path + ) + .order(created_at: :desc) + .first + end - for_hle_id = proc { |hle_id| - hle_id && HttpLogEntry.find_by(id: hle_id) - } + for_hle_id = proc { |hle_id| hle_id && HttpLogEntry.find_by(id: hle_id) } # older versions don't end in a trailing slash - hle_id = self.log_entry_detail && self.log_entry_detail['last_user_page_id'] + hle_id = self.log_entry_detail && self.log_entry_detail["last_user_page_id"] # first try the last scanned user page (present on most fa user models) for_hle_id.call(hle_id) || @@ -247,18 +231,19 @@ class Domain::Fa::User < ReduxApplicationRecord private def similar_users_by(factor_col, 
exclude_followed_by) - query = - disco - .nearest_neighbors(factor_col, distance: 'euclidean') + query = disco.nearest_neighbors(factor_col, distance: "euclidean") - query = query.where.not(user_id: exclude_followed_by.follows.select(:followed_id)) if exclude_followed_by + query = + query.where.not( + user_id: exclude_followed_by.follows.select(:followed_id) + ) if exclude_followed_by users_from_disco_query(query) end def users_from_disco_query(disco_query) Domain::Fa::User - .select('domain_fa_users.*', disco_query.select_values.last) + .select("domain_fa_users.*", disco_query.select_values.last) .joins(:disco) .merge(disco_query.reselect(:user_id)) end @@ -270,7 +255,8 @@ class Domain::Fa::User < ReduxApplicationRecord when :state_detail str = state_detail["scanned_#{scan_type}_at"] Time.parse(str) if str - else raise("invalid scan type '#{scan_type}'") + else + raise("invalid scan type '#{scan_type}'") end end @@ -280,7 +266,8 @@ class Domain::Fa::User < ReduxApplicationRecord send(:"scanned_#{scan_type}_at=", value) when :state_detail state_detail["scanned_#{scan_type}_at"] = value.iso8601 - else raise("invalid scan type '#{scan_type}'") + else + raise("invalid scan type '#{scan_type}'") end end end diff --git a/config/application.rb b/config/application.rb index 1e74c4a0..3ab2554e 100644 --- a/config/application.rb +++ b/config/application.rb @@ -25,12 +25,5 @@ module ReduxScraper config.time_zone = "Pacific Time (US & Canada)" # config.eager_load_paths << Rails.root.join("extras") config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa") - config.x.proxies = ReduxScraper::Application.config_for("proxies") - config.x.influxdb = ReduxScraper::Application.config_for("influxdb") - - proxy_name = (ENV["proxy"] || "direct").to_sym - config.x.proxy_name = proxy_name - config.x.proxy = Rails.application.config.x.proxies[proxy_name] || raise("no proxy config for '#{proxy_name}'") - config.x.proxy[:name] = proxy_name end end diff --git a/config/database.yml b/config/database.yml index dbcacb32..6dff8fa9 100644 --- a/config/database.yml +++ b/config/database.yml @@ -28,9 +28,12 @@ redux_dev: &redux_dev adapter: postgresql host: localhost port: 5432 - database: redux_development - username: scraper_redux - password: pdkFLqRmQwPUPaDDC4pX + # database: redux_development + # username: scraper_redux + # password: pdkFLqRmQwPUPaDDC4pX + database: postgres + username: postgres + password: postgres migrations_paths: db/redux_migrate pool: 4 @@ -53,21 +56,25 @@ legacy_staging: &legacy_staging local_redux_test: &local_redux_test adapter: postgresql - host: localhost + host: db port: 5432 database: redux_test - username: scraper_redux - password: pdkFLqRmQwPUPaDDC4pX + # username: scraper_redux + # password: pdkFLqRmQwPUPaDDC4pX + username: postgres + password: postgres migrations_paths: db/redux_migrate pool: 4 local_legacy_test: &local_legacy_test adapter: postgresql - host: localhost + host: db port: 5432 database: legacy_test - username: scraper_redux - password: pdkFLqRmQwPUPaDDC4pX + # username: scraper_redux + # password: pdkFLqRmQwPUPaDDC4pX + username: postgres + password: postgres migrations_paths: db/legacy_migrate pool: 4 diff --git a/config/initializers/good_job.rb b/config/initializers/good_job.rb index 7b729230..babf92f1 100644 --- a/config/initializers/good_job.rb +++ b/config/initializers/good_job.rb @@ -12,11 +12,13 @@ Rails.application.configure do config.good_job.cleanup_interval_jobs = 100_000 config.good_job.cleanup_interval_seconds = 4.hours config.good_job.logger = 
Logger.new(STDOUT) - config.good_job.logger.level = :warn + config.good_job.logger.level = :info if Rails.env.worker? config.good_job.execution_mode = :async - config.good_job.on_thread_error = ->(exception) { Rails.logger.error("GoodJob exception: #{exception}") } + config.good_job.on_thread_error = ->(exception) do + Rails.logger.error("GoodJob exception: #{exception}") + end else config.good_job.execution_mode = :external end diff --git a/config/initializers/influxdb_rails.rb b/config/initializers/influxdb_rails.rb deleted file mode 100644 index d524b21d..00000000 --- a/config/initializers/influxdb_rails.rb +++ /dev/null @@ -1,41 +0,0 @@ -client = InfluxDB2::Client.new( - Rails.application.config.x.influxdb.host || raise("no host"), - "", - bucket: Rails.application.config.x.influxdb.bucket || raise("no bucket"), - org: "", - precision: InfluxDB2::WritePrecision::NANOSECOND, - use_ssl: false, -) -write_api = client.create_write_api - -ActiveSupport::Notifications.subscribe "process_action.action_controller" do |name, started, finished, unique_id, data| - request = data[:request] - api_token = request.params[:api_token] - api_user = api_token ? ApplicationController::API_TOKENS[api_token] : nil - ip = request.headers["HTTP_CF_CONNECTING_IP"] || request.remote_ip - - hash = { - name: "process_action.action_controller", - tags: { - method: "#{data[:controller]}##{data[:action]}", - format: data[:format], - http_method: data[:method], - status: data[:status], - exception: data[:exception]&.first, - }, - fields: { - ip: ip, - api_token: api_token, - api_user: api_user, - time_in_controller: (finished - started) * 1000, - time_in_view: (data[:view_runtime] || 0).ceil, - time_in_db: (data[:db_runtime] || 0).ceil, - request_id: unique_id, - }, - time: started, - } - - write_api.write(data: hash) -rescue => e - Rails.logger.error("error writing influxdb metric: #{e}") -end diff --git a/config/proxies.yml b/config/proxies.yml deleted file mode 100644 index a610721e..00000000 --- a/config/proxies.yml +++ /dev/null @@ -1,29 +0,0 @@ -default: &default - direct: - http: http://127.0.0.1:9292 - gallery_dl: http://127.0.0.1:5001 - - proxy-1: - http: http://127.0.0.1:9292 - gallery_dl: http://127.0.0.1:5001 - -production: - <<: *default - -staging: - <<: *default - -development: - direct: {} - proxy-1: {} - dedipath-1: {} - serverhost-1: {} - -worker: - <<: *default - -test: - direct: {} - proxy-1: {} - dedipath-1: {} - serverhost-1: {} diff --git a/rake/metrics.rake b/rake/metrics.rake index 1a0fb7cd..d6e86b44 100644 --- a/rake/metrics.rake +++ b/rake/metrics.rake @@ -1,9 +1,9 @@ namespace :metrics do desc "run reporters periodically" - task :report_all => [:environment, :set_logger_stdout] do + task report_all: %i[environment set_logger_stdout] do schedule = { Rake::Task["metrics:jobs"] => 60.seconds, - Rake::Task["metrics:estimate_db_rows"] => 60.seconds, + Rake::Task["metrics:estimate_db_rows"] => 60.seconds } last_ran = {} @@ -24,14 +24,4 @@ namespace :metrics do sleep sleep_amt end end - - desc "Report job queue metrics" - task :jobs => :environment do - Metrics::GoodJobReporter.new.report - end - - desc "Report estimated db row metrics" - task :estimate_db_rows => :environment do - Metrics::EstimateDbRowsReporter.new.report - end end
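
Usage note (illustrative sketch, not part of the patch): one way the Scraper::CurlHttpPerformer introduced above could be exercised from this app's Rails console. The URL and extra header are placeholders; in the application itself the performer is wrapped by Scraper::HttpClient via Scraper::ClientFactory, which layers on cookies, rate limiting, and HttpLogEntry persistence.

    # Assumes the app's Rails console, where the Curl/Addressable constants and
    # the string-color helpers used inside the performer are already loaded.
    performer = Scraper::CurlHttpPerformer.new

    response = performer.get(
      "https://www.furaffinity.net/browse/1/",  # placeholder URL for the example
      { "Accept" => "text/html" }               # placeholder extra request headers
    )

    # Response is a Struct of (response_code, response_headers, response_time_ms, body),
    # as defined at the top of curl_http_performer.rb.
    puts response.response_code
    puts "#{response.response_time_ms} ms, #{response.body.bytesize} bytes"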