remove proxy code
@@ -1,9 +0,0 @@
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=password
PGADMIN_LISTEN_PORT=8080
# no password required
PGADMIN_CONFIG_SERVER_MODE=False
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED=False

RAILS_ENV=development
NODE_ENV=development
@@ -19,7 +19,7 @@ RUN bundle install
RUN rake compile

# Primary image
FROM mcr.microsoft.com/devcontainers/base:bookworm
FROM mcr.microsoft.com/devcontainers/ruby:1-3.2-bookworm

# apt caching & install packages
RUN rm -f /etc/apt/apt.conf.d/docker-clean; \
@@ -48,21 +48,34 @@ RUN \
libvips42 \
libyaml-dev \
patch \
rbenv \
# rbenv \
iputils-ping \
rustc \
uuid-dev \
zlib1g-dev

USER vscode
ENV PATH=/home/vscode/.rbenv/shims:$PATH
ENV CONFIGURE_OPTS="--disable-install-rdoc"
# USER vscode
# ENV PATH=/home/vscode/.rbenv/shims:$PATH
# ENV CONFIGURE_OPTS="--disable-install-rdoc"

RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build
RUN rbenv install 3.2.0
RUN rbenv global 3.2.0
# RUN git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build
# RUN rbenv install 3.2.0
# RUN rbenv global 3.2.0

COPY --from=native-gems /usr/src/app/gems/xdiff-rb /gems/xdiff-rb
COPY --from=native-gems /usr/src/app/gems/rb-bsdiff /gems/rb-bsdiff

ENV RAILS_ENV development
ENV RUBOCOP_OPTS --server

# [Optional] Uncomment this line to install additional gems.
RUN su vscode -c "gem install bundler -v '2.4.1'" && \
su vscode -c "gem install rake -v '13.0.6'" && \
su vscode -c "gem install ruby-lsp -v '0.22.1'" && \
su vscode -c "gem install rubocop -v '1.69.2'"

# install exo
RUN su vscode -c "curl -sL https://exo.deref.io/install | bash"
ENV PATH "/home/vscode/.exo/bin:$PATH"

RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && nvm install 18 && nvm use 18 && npm install -g yarn" 2>&1
ENV PATH /usr/local/share/nvm/current/bin:$PATH
7 .devcontainer/Dockerfile.postgres Normal file
@@ -0,0 +1,7 @@
FROM postgres:17

RUN apt-get update && apt-get install -y \
postgresql-17-pgvector \
&& rm -rf /var/lib/apt/lists/*
RUN echo "CREATE EXTENSION vector;" >> /docker-entrypoint-initdb.d/01-pgvector.sql
5 .devcontainer/create-db-user.sql Normal file
@@ -0,0 +1,5 @@
CREATE USER vscode CREATEDB;
CREATE DATABASE vscode WITH OWNER vscode;

CREATE DATABASE redux_test WITH OWNER vscode;
CREATE DATABASE legacy_test WITH OWNER vscode;
@@ -1,42 +1,25 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
// README at: https://github.com/devcontainers/templates/tree/main/src/ruby-rails-postgres
{
"name": "Redux Scraper",
// "build": {
// // Sets the run context to one level up instead of the .devcontainer folder.
// "context": "..",
// // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
// "dockerfile": "Dockerfile"
// },
"name": "Ruby on Rails & Postgres",
"dockerComposeFile": "docker-compose.yml",
"service": "app",
"workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",
// "mounts": [
// {
// "type": "volume",
// "source": "${devcontainerId}-gems",
// "target": "/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems"
// }
// ],
"features": {
"ghcr.io/meaningful-ooo/devcontainer-features/fish:1": {},
"ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {},
"ghcr.io/devcontainers/features/node:1": {
"installYarnUsingApt": true,
"version": "18",
"pnpmVersion": "none",
"nvmVersion": "latest"
}
"ghcr.io/nikobockerman/devcontainer-features/fish-persistent-data:2": {}
},
// "runArgs": [
// "--network=host"
// ],
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Uncomment the next line to run commands after the container is created.
"postCreateCommand": "${containerWorkspaceFolder}/.devcontainer/post-create.sh"
// This can be used to network with other containers or the host.
// "forwardPorts": [3000, 5432],
// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "bundle install && rake db:setup",
"postCreateCommand": ".devcontainer/post-create.sh",
"forwardPorts": [
8080, // pgadmin
3000 // rails
]
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "devcontainer"
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
@@ -1,33 +1,45 @@
version: '3.8'

volumes:
devcontainer-redux-postgres-data:
devcontainer-redux-gem-cache:
version: '3'

services:
app:
build:
context: ..
dockerfile: Dockerfile.devcontainer
dockerfile: .devcontainer/Dockerfile.devcontainer

volumes:
# - ../..:/workspaces:cached
- devcontainer-redux-gem-cache:/home/vscode/.rbenv/versions/3.2.0/lib/ruby/gems
- ../..:/workspaces:cached
- devcontainer-redux-gem-cache:/usr/local/rvm/gems

# Overrides default command so things don't shut down after the process ends.
command: sleep infinity

# Runs app on the same network as the database container, allows "forwardPorts" in devcontainer.json function.
network_mode: service:db
env_file:
- .env

db:
image: pgvector/pgvector:pg17
build:
context: .
dockerfile: Dockerfile.postgres
restart: unless-stopped
volumes:
- devcontainer-redux-postgres-data:/var/lib/postgresql/data
env_file:
- .env
- postgres-data:/var/lib/postgresql/data
- ./create-db-user.sql:/docker-entrypoint-initdb.d/create-db-user.sql
environment:
POSTGRES_USER: postgres
POSTGRES_DB: postgres
POSTGRES_PASSWORD: postgres

pgadmin:
image: dpage/pgadmin4:8.13.0
restart: unless-stopped
network_mode: service:db
env_file:
- .env
environment:
PGADMIN_DEFAULT_EMAIL: admin@example.com
PGADMIN_DEFAULT_PASSWORD: password
PGADMIN_LISTEN_PORT: 8080
PGADMIN_CONFIG_SERVER_MODE: False
PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: False

volumes:
postgres-data:
devcontainer-redux-gem-cache:
11 .devcontainer/install-extensions.sh Executable file
@@ -0,0 +1,11 @@
#!/bin/bash -ex

# EDITOR_BIN="$(ls /vscode/cursor-server/bin/*/*/bin/remote-cli/cursor | head -n 1)"
EDITOR_BIN=cursor
function install_extension() {
$EDITOR_BIN --install-extension "$1"
}

install_extension Shopify.ruby-extensions-pack
install_extension dbaeumer.vscode-eslint
install_extension aliariff.vscode-erb-beautify
@@ -6,14 +6,9 @@ echo "Gem: $(which gem)"
echo "Bundler: $(which bundler)"
echo "Rake: $(which rake)"

gem install bundler -v '2.4.1'
gem install rake -v '13.0.6'
gem install ruby-lsp -v '0.22.1'
gem install rubocop -v '1.69.2'
bundle install --jobs $(getconf _NPROCESSORS_ONLN)
rbenv rehash

rubocop --start-server
npm install yarn
bin/rails yarn:install
yarn
@@ -1 +1 @@
3.2.0
system
8 .vscode/extensions.json vendored
@@ -1,8 +0,0 @@
{
"recommendations": [
"Shopify.ruby-extensions-pack",
"LoranKloeze.ruby-rubocop-revived",
"CraigMaslowski.erb",
"dbaeumer.vscode-eslint"
]
}
8 .vscode/settings.json vendored
@@ -4,9 +4,11 @@
"window.title": "${activeEditorMedium}${separator}${rootName}${separator}${profileName}",
"workbench.preferredDarkColorTheme": "Spinel",
"workbench.preferredLightColorTheme": "Spinel Light",
"rubyLsp.formatter": "auto",
"rubyLsp.formatter": "syntax_tree",
"[ruby]": {
"editor.formatOnSave": true,
"editor.defaultFormatter": "LoranKloeze.ruby-rubocop-revived"
"editor.defaultFormatter": "Shopify.ruby-lsp"
},
"[erb]": {
"editor.defaultFormatter": "aliariff.vscode-erb-beautify"
}
}
5 Gemfile
@@ -1,7 +1,7 @@
source 'https://rubygems.org'
git_source(:github) { |repo| "https://github.com/#{repo}.git" }

ruby '3.2.0'
ruby '3.2.6'
# ruby "3.0.3"

# Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main"
@@ -98,7 +98,6 @@ gem 'daemons'
gem 'discard'
gem 'good_job', '3.17.2'
gem 'http-cookie'
gem 'influxdb-client'
gem 'kaminari'
gem 'nokogiri'
gem 'pluck_each'
@@ -131,3 +130,5 @@ gem 'rack-cors'
gem 'react_on_rails'
gem 'sanitize'
gem 'shakapacker'

gem "syntax_tree", "~> 6.2"
@@ -154,7 +154,6 @@ GEM
domain_name (~> 0.5)
i18n (1.12.0)
concurrent-ruby (~> 1.0)
influxdb-client (2.9.0)
io-console (0.6.0)
irb (1.6.2)
reline (>= 0.3.0)
@@ -216,6 +215,7 @@ GEM
pluck_each (0.2.0)
activerecord (> 3.2.0)
activesupport (> 3.0.0)
prettier_print (1.2.1)
progressbar (1.13.0)
pry (0.14.2)
coderay (~> 1.1)
@@ -341,6 +341,8 @@ GEM
stackprof (0.2.24)
stimulus-rails (1.2.1)
railties (>= 6.0.0)
syntax_tree (6.2.0)
prettier_print (>= 1.2.0)
table_print (1.5.7)
tailwindcss-rails (2.0.26)
railties (>= 6.0.0)
@@ -394,7 +396,6 @@ DEPENDENCIES
good_job (= 3.17.2)
htmlbeautifier
http-cookie
influxdb-client
jbuilder
kaminari
memory_profiler
@@ -429,6 +430,7 @@ DEPENDENCIES
sqlite3 (~> 1.4)
stackprof
stimulus-rails
syntax_tree (~> 6.2)
table_print
tailwindcss-rails (~> 2.0)
turbo-rails
@@ -439,7 +441,7 @@ DEPENDENCIES
zstd-ruby

RUBY VERSION
ruby 3.2.0p0
ruby 3.2.6p234

BUNDLED WITH
2.4.6
@@ -1,6 +1,6 @@
# Procfile for development using HMR
# You can run these commands in separate shells
rails: bundle exec rails s -p 3000
tailwind: bundle exec rake tailwindcss:watch
wp-client: HMR=true bin/webpacker-dev-server
wp-server: HMR=true SERVER_BUNDLE_ONLY=yes bin/webpacker --watch
rails: RAILS_ENV=development bundle exec rails s -p 3000
tailwind: RAILS_ENV=development bundle exec rake tailwindcss:watch
wp-client: HMR=true RAILS_ENV=development bin/webpacker-dev-server
wp-server: HMR=true RAILS_ENV=development SERVER_BUNDLE_ONLY=yes bin/webpacker --watch

@@ -1,4 +1,4 @@
rails: RAILS_ENV=staging ./bin/rails s -p 3000
tailwind: ./bin/rake rake tailwindcss:watch
tailwind: RAILS_ENV=staging ./bin/rake rake tailwindcss:watch
wp-client: RAILS_ENV=staging HMR=true ./bin/webpacker-dev-server
wp-server: RAILS_ENV=staging HMR=true SERVER_BUNDLE_ONLY=yes ./bin/webpacker --watch
wp-server: RAILS_ENV=staging HMR=true SERVER_BUNDLE_ONLY=yes ./bin/webpacker --watch
66 Rakefile
@@ -14,21 +14,20 @@ Rake.application.rake_require "e621"
|
||||
Rake.application.rake_require "twitter"
|
||||
Rake.application.rake_require "ib"
|
||||
|
||||
task :set_ar_stdout => :environment do
|
||||
task set_ar_stdout: :environment do
|
||||
ActiveRecord::Base.logger = Logger.new($stdout)
|
||||
end
|
||||
|
||||
task :set_logger_stdout => :environment do
|
||||
task set_logger_stdout: :environment do
|
||||
Rails.logger = Logger.new($stdout)
|
||||
Rails.logger.formatter = proc do |severity, datetime, progname, msg|
|
||||
"#{severity}: #{msg}\n"
|
||||
end
|
||||
Rails.logger.formatter =
|
||||
proc { |severity, datetime, progname, msg| "#{severity}: #{msg}\n" }
|
||||
ActiveRecord::Base.logger = nil
|
||||
ActiveJob::Base.logger = nil
|
||||
GoodJob.logger = Rails.logger
|
||||
end
|
||||
|
||||
task :periodic_tasks => [:environment, :set_logger_stdout] do
|
||||
task periodic_tasks: %i[environment set_logger_stdout] do
|
||||
Thread.new do
|
||||
loop do
|
||||
Rake::Task["pghero:capture_space_stats"].execute
|
||||
@@ -55,22 +54,21 @@ task :periodic_tasks => [:environment, :set_logger_stdout] do
|
||||
end
|
||||
end
|
||||
|
||||
loop do
|
||||
sleep 10
|
||||
end
|
||||
loop { sleep 10 }
|
||||
end
|
||||
|
||||
namespace :blob_entries do
|
||||
task :export_samples => :environment do
|
||||
task export_samples: :environment do
|
||||
limit = ENV["limit"]&.to_i || raise("need 'limit' (num)")
|
||||
outfile = ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
|
||||
outfile =
|
||||
ENV["outfile"] || raise("need 'outfile' (file path, .json encoded)")
|
||||
BlobEntrySampleExporter.new.export_samples(limit, outfile)
|
||||
end
|
||||
task :import_samples => :environment do
|
||||
task import_samples: :environment do
|
||||
infile = ENV["infile"] || raise("need 'infile' (file path, .json encoded)")
|
||||
BlobEntrySampleExporter.new.import_samples(infile)
|
||||
end
|
||||
task :migrate_entries => :environment do
|
||||
task migrate_entries: :environment do
|
||||
start_at = ENV["start_at"]
|
||||
batch_size = ENV["batch_size"]&.to_i || 64
|
||||
BlobEntrySampleExporter.new.migrate_blob_entries(start_at, batch_size)
|
||||
@@ -78,7 +76,7 @@ namespace :blob_entries do
|
||||
end
|
||||
|
||||
namespace :db_sampler do
|
||||
task :export => :environment do
|
||||
task export: :environment do
|
||||
url_names = ENV["url_names"] || raise("need 'url_names' (comma-separated)")
|
||||
# outfile_path = ENV["outfile"] || raise("need 'outfile' (file path)")
|
||||
# outfile = File.open(outfile_path, "wb")
|
||||
@@ -88,7 +86,7 @@ namespace :db_sampler do
|
||||
outfile.close if outfile
|
||||
end
|
||||
|
||||
task :import => [:environment] do
|
||||
task import: [:environment] do
|
||||
# infile_path = ENV["infile"] || raise("need 'infile' (file path)")
|
||||
# infile = File.open(infile_path, "rb")
|
||||
infile = $stdin
|
||||
@@ -98,23 +96,16 @@ namespace :db_sampler do
|
||||
end
|
||||
end
|
||||
|
||||
task :good_job do
|
||||
proxies = ["direct", "proxy-1", "dedipath-1", "serverhost-1"]
|
||||
proxy = ENV["proxy"]
|
||||
raise("'proxy' must be set") unless proxy
|
||||
raise("'proxy' must be one of #{proxies}") unless proxies.include?(proxy)
|
||||
|
||||
task good_job: %i[environment set_ar_stdout set_logger_stdout] do
|
||||
env_hash = {
|
||||
"RAILS_ENV" => "worker",
|
||||
"GOOD_JOB_POLL_INTERVAL" => "5",
|
||||
"GOOD_JOB_MAX_CACHE" => "10000",
|
||||
"GOOD_JOB_QUEUE_SELECT_LIMIT" => "4096",
|
||||
"GOOD_JOB_MAX_THREADS" => "4",
|
||||
"GOOD_JOB_QUEUES" => [
|
||||
"manual:4",
|
||||
"fa_post,e621:2",
|
||||
"*:6",
|
||||
].reject(&:nil?).join(";"),
|
||||
"GOOD_JOB_QUEUES" =>
|
||||
ENV["GOOD_JOB_QUEUES"] ||
|
||||
%w[manual:4 fa_post,e621:2 *:6].reject(&:nil?).join(";")
|
||||
}
|
||||
|
||||
env_hash.each do |key, value|
|
||||
@@ -127,7 +118,7 @@ task :good_job do
|
||||
exec(cmd)
|
||||
end
|
||||
|
||||
task :recompute_job_signatures => :environment do
|
||||
task recompute_job_signatures: :environment do
|
||||
ActiveRecord::Base.logger = Logger.new(STDOUT)
|
||||
ActiveRecord::Base.logger.level = :error
|
||||
|
||||
@@ -148,21 +139,22 @@ task :recompute_job_signatures => :environment do
|
||||
end
|
||||
end
|
||||
|
||||
task :workoff_failed_jobs => [:environment, :set_ar_stdout, :set_logger_stdout] do
|
||||
task workoff_failed_jobs: %i[environment set_ar_stdout set_logger_stdout] do
|
||||
worker = Delayed::Worker.new
|
||||
Delayed::Job.where(
|
||||
"last_error is not null and attempts <= 2"
|
||||
).find_each(batch_size: 1) do |job|
|
||||
worker.run(job)
|
||||
end
|
||||
Delayed::Job
|
||||
.where("last_error is not null and attempts <= 2")
|
||||
.find_each(batch_size: 1) { |job| worker.run(job) }
|
||||
end
|
||||
|
||||
task :reverse_csv do
|
||||
file = ENV["file"] || raise("need 'file' (file path)")
|
||||
in_csv = CSV.parse(File.open(file, "r+"), headers: true)
|
||||
out_csv = CSV.new(File.open("rev_" + file, "w"), write_headers: true, headers: in_csv.headers)
|
||||
in_csv.reverse_each do |row|
|
||||
out_csv << row.map(&:second)
|
||||
end
|
||||
out_csv =
|
||||
CSV.new(
|
||||
File.open("rev_" + file, "w"),
|
||||
write_headers: true,
|
||||
headers: in_csv.headers
|
||||
)
|
||||
in_csv.reverse_each { |row| out_csv << row.map(&:second) }
|
||||
out_csv.close
|
||||
end
|
||||
|
||||
@@ -3,7 +3,8 @@ module Domain::E621::Job
queue_as :e621
ignore_signature_args :caused_by_entry

def perform(post:, caused_by_entry:)
def perform(**args)
logger.info("ScanPostJob args: #{args.inspect}")
end
end
end
@@ -2,7 +2,7 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
|
||||
queue_as :fa_browse_page
|
||||
ignore_signature_args :caused_by_entry
|
||||
|
||||
def perform(args)
|
||||
def perform(**args)
|
||||
@caused_by_entry = args[:caused_by_entry]
|
||||
@first_browse_page_entry = nil
|
||||
|
||||
@@ -16,7 +16,9 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
|
||||
@page_number += 1
|
||||
end
|
||||
|
||||
logger.info("finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages")
|
||||
logger.info(
|
||||
"finished, #{@total_num_new_posts_seen.to_s.bold} new, #{@total_num_posts_seen.to_s.bold} total posts across #{@page_number.to_s.bold} pages"
|
||||
)
|
||||
end
|
||||
|
||||
private
|
||||
@@ -28,21 +30,30 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
|
||||
url = "https://www.furaffinity.net/browse/#{@page_number}/"
|
||||
end
|
||||
|
||||
response = http_client.get(url, caused_by_entry: @first_browse_page_entry || @caused_by_entry)
|
||||
response =
|
||||
http_client.get(
|
||||
url,
|
||||
caused_by_entry: @first_browse_page_entry || @caused_by_entry
|
||||
)
|
||||
log_entry = response.log_entry
|
||||
@first_browse_page_entry ||= log_entry
|
||||
|
||||
if response.status_code != 200
|
||||
fatal_error("non 200 response for /browse: #{response.status_code.to_s.underline}")
|
||||
fatal_error(
|
||||
"non 200 response for /browse: #{response.status_code.to_s.underline}"
|
||||
)
|
||||
end
|
||||
|
||||
page = Domain::Fa::Parser::Page.new(response.body)
|
||||
listing_page_stats = update_and_enqueue_posts_from_listings_page(
|
||||
:browse_page, page, log_entry,
|
||||
enqueue_posts_pri: :high,
|
||||
page_desc: "Browse@#{@page_number}",
|
||||
fill_id_gaps: true,
|
||||
)
|
||||
listing_page_stats =
|
||||
update_and_enqueue_posts_from_listings_page(
|
||||
:browse_page,
|
||||
page,
|
||||
log_entry,
|
||||
enqueue_posts_pri: :high,
|
||||
page_desc: "Browse@#{@page_number}",
|
||||
fill_id_gaps: true
|
||||
)
|
||||
|
||||
@total_num_new_posts_seen += listing_page_stats.new_seen
|
||||
@total_num_posts_seen += listing_page_stats.total_seen
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
class Scraper::JobBase < ApplicationJob
|
||||
class JobError < RuntimeError; end
|
||||
class JobError < RuntimeError
|
||||
end
|
||||
|
||||
def initialize(...)
|
||||
@deferred_jobs = []
|
||||
@@ -20,26 +21,28 @@ class Scraper::JobBase < ApplicationJob
|
||||
|
||||
good_job_control_concurrency_with(
|
||||
total_limit: 1,
|
||||
key: proc do
|
||||
raise("too many arguments") if arguments.size > 1
|
||||
first_argument = arguments || []
|
||||
first_argument = first_argument[0] || {}
|
||||
key:
|
||||
proc do
|
||||
raise("too many arguments") if arguments.size > 1
|
||||
first_argument = arguments || []
|
||||
first_argument = first_argument[0] || {}
|
||||
|
||||
ignore_signature_args = self.class.ignore_signature_args
|
||||
ignore_signature_args << :_aj_symbol_keys
|
||||
ignore_signature_args << :_aj_ruby2_keywords
|
||||
sig_arguments = first_argument.reject do |key, value|
|
||||
ignore_signature_args.include?(key.to_sym)
|
||||
end.to_h
|
||||
ignore_signature_args = self.class.ignore_signature_args
|
||||
ignore_signature_args << :_aj_symbol_keys
|
||||
ignore_signature_args << :_aj_ruby2_keywords
|
||||
sig_arguments =
|
||||
first_argument
|
||||
.reject { |key, value| ignore_signature_args.include?(key.to_sym) }
|
||||
.to_h
|
||||
|
||||
sig = []
|
||||
sig << self.class.name || raise
|
||||
sig << self.queue_name || "*"
|
||||
sig << self.priority || "*"
|
||||
sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
|
||||
sig = sig.join("|")
|
||||
sig
|
||||
end,
|
||||
sig = []
|
||||
sig << self.class.name || raise
|
||||
sig << self.queue_name || "*"
|
||||
sig << self.priority || "*"
|
||||
sig << Digest::SHA256.hexdigest(sig_arguments.inspect)[0...16]
|
||||
sig = sig.join("|")
|
||||
sig
|
||||
end
|
||||
)
|
||||
|
||||
# make the concurrency config threadlocal so it can be modified
|
||||
@@ -50,16 +53,15 @@ class Scraper::JobBase < ApplicationJob
|
||||
@@gjcc_tl.value
|
||||
end
|
||||
|
||||
PERMITTED_CONTENT_TYPES = [
|
||||
/text\/html/,
|
||||
/application\/json/,
|
||||
]
|
||||
PERMITTED_CONTENT_TYPES = [%r{text/html}, %r{application/json}]
|
||||
|
||||
def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
|
||||
return unless ENV["enqueue_jobs_from_found_links"] == "1"
|
||||
|
||||
start_time = Time.now
|
||||
unless PERMITTED_CONTENT_TYPES.any? { |ct| ct.match(log_entry.content_type) }
|
||||
unless PERMITTED_CONTENT_TYPES.any? { |ct|
|
||||
ct.match(log_entry.content_type)
|
||||
}
|
||||
raise("unsupported content type: #{log_entry.content_type}")
|
||||
end
|
||||
document = log_entry.response.contents
|
||||
@@ -68,30 +70,39 @@ class Scraper::JobBase < ApplicationJob
|
||||
links = link_finder.find_links
|
||||
job_defs = []
|
||||
|
||||
filter_mapper = proc { |list, type, key|
|
||||
list.filter { |item| item[:type] == type }.map { |item| item[key] }
|
||||
}
|
||||
filter_mapper =
|
||||
proc do |list, type, key|
|
||||
list.filter { |item| item[:type] == type }.map { |item| item[key] }
|
||||
end
|
||||
|
||||
attr_to_mapper = proc { |model_class, attr_sym, in_values|
|
||||
model_class.where({ attr_sym => in_values }).map do |model|
|
||||
[model.send(attr_sym), model]
|
||||
end.to_h
|
||||
}
|
||||
attr_to_mapper =
|
||||
proc do |model_class, attr_sym, in_values|
|
||||
model_class
|
||||
.where({ attr_sym => in_values })
|
||||
.map { |model| [model.send(attr_sym), model] }
|
||||
.to_h
|
||||
end
|
||||
|
||||
url_name_to_fa_user = attr_to_mapper.call(
|
||||
Domain::Fa::User, :url_name,
|
||||
filter_mapper.call(links, :fa_user, :url_name)
|
||||
)
|
||||
url_name_to_fa_user =
|
||||
attr_to_mapper.call(
|
||||
Domain::Fa::User,
|
||||
:url_name,
|
||||
filter_mapper.call(links, :fa_user, :url_name)
|
||||
)
|
||||
|
||||
fa_id_to_fa_post = attr_to_mapper.call(
|
||||
Domain::Fa::Post, :fa_id,
|
||||
filter_mapper.call(links, :fa_post, :fa_id)
|
||||
)
|
||||
fa_id_to_fa_post =
|
||||
attr_to_mapper.call(
|
||||
Domain::Fa::Post,
|
||||
:fa_id,
|
||||
filter_mapper.call(links, :fa_post, :fa_id)
|
||||
)
|
||||
|
||||
name_to_twitter_user = attr_to_mapper.call(
|
||||
Domain::Twitter::User, :name,
|
||||
filter_mapper.call(links, :twitter_user, :name)
|
||||
)
|
||||
name_to_twitter_user =
|
||||
attr_to_mapper.call(
|
||||
Domain::Twitter::User,
|
||||
:name,
|
||||
filter_mapper.call(links, :twitter_user, :name)
|
||||
)
|
||||
|
||||
links.each do |link|
|
||||
link_type = link[:type]
|
||||
@@ -99,57 +110,52 @@ class Scraper::JobBase < ApplicationJob
|
||||
when :fa_user
|
||||
url_name = link[:url_name]
|
||||
user = url_name_to_fa_user[url_name]
|
||||
params = if user
|
||||
{ user: user }
|
||||
else
|
||||
{ url_name: url_name }
|
||||
end
|
||||
params = (user ? { user: user } : { url_name: url_name })
|
||||
|
||||
job_def = ({ params: params, desc: "fa user " + (user&.name || url_name).bold })
|
||||
job_def =
|
||||
({ params: params, desc: "fa user " + (user&.name || url_name).bold })
|
||||
|
||||
if !user || user.due_for_page_scan?
|
||||
job_defs << job_def.merge(
|
||||
job: Domain::Fa::Job::UserPageJob,
|
||||
)
|
||||
job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob)
|
||||
elsif !user || user.due_for_gallery_scan?
|
||||
job_defs << job_def.merge(
|
||||
job: Domain::Fa::Job::UserGalleryJob,
|
||||
)
|
||||
job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob)
|
||||
end
|
||||
when :fa_post
|
||||
fa_id = link[:fa_id]
|
||||
post = fa_id_to_fa_post[fa_id]
|
||||
params = if post
|
||||
{ post: post }
|
||||
else
|
||||
{ fa_id: fa_id }
|
||||
end
|
||||
params = (post ? { post: post } : { fa_id: fa_id })
|
||||
|
||||
job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" })
|
||||
|
||||
job_defs << job_def.merge(
|
||||
job: Domain::Fa::Job::ScanPostJob,
|
||||
) if !post || (post.state == "ok" && !post.scanned?)
|
||||
if !post || (post.state == "ok" && !post.scanned?)
|
||||
job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob)
|
||||
end
|
||||
|
||||
job_defs << job_def.merge(
|
||||
job: Domain::Fa::Job::ScanFileJob,
|
||||
) if post && post.state == "ok" && post.scanned? && !post.have_file?
|
||||
if post && post.state == "ok" && post.scanned? && !post.have_file?
|
||||
job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob)
|
||||
end
|
||||
when :twitter_user
|
||||
name = link[:name]
|
||||
user = name_to_twitter_user[name]
|
||||
params = if user
|
||||
{ user: user }
|
||||
else
|
||||
{ name: name }
|
||||
end
|
||||
params = (user ? { user: user } : { name: name })
|
||||
|
||||
job_def = ({ params: params, desc: "twitter user " + (user&.name || name).bold })
|
||||
job_def =
|
||||
(
|
||||
{
|
||||
params: params,
|
||||
desc: "twitter user " + (user&.name || name).bold
|
||||
}
|
||||
)
|
||||
|
||||
job_defs << job_def.merge(
|
||||
job: Domain::Twitter::Job::UserTimelineTweetsJob,
|
||||
) if !user || user.due_for_timeline_tweets_scan?
|
||||
if !user || user.due_for_timeline_tweets_scan?
|
||||
job_defs << job_def.merge(
|
||||
job: Domain::Twitter::Job::UserTimelineTweetsJob
|
||||
)
|
||||
end
|
||||
else
|
||||
logger.warn("unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}")
|
||||
logger.warn(
|
||||
"unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}"
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -163,19 +169,20 @@ class Scraper::JobBase < ApplicationJob
|
||||
job_class = job_def[:job]
|
||||
params = job_def[:params]
|
||||
desc = job_def[:desc]
|
||||
logger.debug([
|
||||
"link finder -",
|
||||
job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
|
||||
desc,
|
||||
].join(" "))
|
||||
|
||||
defer_job(
|
||||
job_class,
|
||||
params.merge({ caused_by_entry: log_entry }),
|
||||
logger.debug(
|
||||
[
|
||||
"link finder -",
|
||||
job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
|
||||
desc
|
||||
].join(" ")
|
||||
)
|
||||
|
||||
defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
|
||||
end
|
||||
duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
|
||||
logger.info("link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)")
|
||||
logger.info(
|
||||
"link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)"
|
||||
)
|
||||
end
|
||||
|
||||
around_perform do |job, block|
|
||||
@@ -194,27 +201,9 @@ class Scraper::JobBase < ApplicationJob
|
||||
rescue => e
|
||||
error = e
|
||||
raise e
|
||||
ensure
|
||||
Metrics::Client.singleton.write_point(
|
||||
self,
|
||||
"job_performed",
|
||||
tags: {
|
||||
job_queue: job.queue_name,
|
||||
job_class: job.class.name,
|
||||
success: error.nil?,
|
||||
error_class: error&.class&.name,
|
||||
},
|
||||
fields: {
|
||||
duration_ms: duration_ms,
|
||||
},
|
||||
)
|
||||
end
|
||||
|
||||
after_perform do |job|
|
||||
ColorLogger.quiet do
|
||||
job.enqueue_deferred_jobs!
|
||||
end
|
||||
end
|
||||
after_perform { |job| ColorLogger.quiet { job.enqueue_deferred_jobs! } }
|
||||
|
||||
def defer_job(job_class, params, set_args = {})
|
||||
@deferred_jobs << [job_class, params, set_args]
|
||||
|
||||
@@ -24,40 +24,42 @@ class ColorLogger
|
||||
klass_name = "(Anonymous)"
|
||||
end
|
||||
|
||||
logger = Logger.new(sink).tap do |logger|
|
||||
prefix = ""
|
||||
logger.define_singleton_method :prefix= do |p|
|
||||
prefix = p
|
||||
end
|
||||
logger.define_singleton_method :sink do
|
||||
sink
|
||||
end
|
||||
|
||||
logger.formatter = proc do |severity, datetime, progname, msg|
|
||||
color = case severity
|
||||
when "ERROR" then :red
|
||||
when "WARN" then :yellow
|
||||
else :light_blue
|
||||
logger =
|
||||
Logger
|
||||
.new(sink)
|
||||
.tap do |logger|
|
||||
prefix = ""
|
||||
logger.define_singleton_method :prefix= do |p|
|
||||
prefix = p
|
||||
end
|
||||
logger.define_singleton_method :sink do
|
||||
sink
|
||||
end
|
||||
|
||||
klass_name_str = "[#{klass_name.send(color)}]".ljust(32)
|
||||
if prefix.is_a?(Proc)
|
||||
prefix = prefix.call
|
||||
logger.formatter =
|
||||
proc do |severity, datetime, progname, msg|
|
||||
color =
|
||||
case severity
|
||||
when "ERROR"
|
||||
:red
|
||||
when "WARN"
|
||||
:yellow
|
||||
else
|
||||
:light_blue
|
||||
end
|
||||
|
||||
klass_name_str = "[#{klass_name.send(color)}]".ljust(32)
|
||||
prefix = prefix.call if prefix.is_a?(Proc)
|
||||
|
||||
if @quiet.value > 0
|
||||
""
|
||||
else
|
||||
[klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if @quiet.value > 0
|
||||
""
|
||||
else
|
||||
[klass_name_str, prefix, msg].reject(&:blank?).join(" ") + "\n"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if Rails.env.production?
|
||||
ActiveSupport::TaggedLogging.new(logger)
|
||||
else
|
||||
logger
|
||||
end
|
||||
ActiveSupport::TaggedLogging.new(logger)
|
||||
end
|
||||
|
||||
def self.klass_name_from_instance(instance)
|
||||
@@ -66,40 +68,7 @@ class ColorLogger
|
||||
else
|
||||
klass_name = instance.class.name.dup
|
||||
end
|
||||
|
||||
klass_name.delete_prefix!("Domain::")
|
||||
|
||||
prefixes = [
|
||||
{
|
||||
if_prefix: "Fa::Scraper::",
|
||||
replace_with: "Fa::",
|
||||
},
|
||||
{
|
||||
if_prefix: "Fa::Job::",
|
||||
replace_with: "Fa::",
|
||||
delete_suffix: "Job",
|
||||
},
|
||||
{
|
||||
if_prefix: "Twitter::Job::",
|
||||
replace_with: "Twitter::",
|
||||
delete_suffix: "Job",
|
||||
},
|
||||
{
|
||||
if_prefix: "E621::Job::",
|
||||
replace_with: "E621::",
|
||||
delete_suffix: "Job",
|
||||
},
|
||||
].each do |conf|
|
||||
if klass_name.start_with?(conf[:if_prefix])
|
||||
klass_name.delete_prefix!(conf[:if_prefix])
|
||||
if (suffix = conf[:delete_suffix])
|
||||
klass_name.delete_suffix!(suffix)
|
||||
end
|
||||
klass_name = conf[:replace_with] + klass_name
|
||||
break
|
||||
end
|
||||
end
|
||||
|
||||
klass_name
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
class Metrics::Client
|
||||
include HasColorLogger
|
||||
|
||||
REPORT = !Rails.env.test?
|
||||
|
||||
def self.singleton
|
||||
@singleton ||= Metrics::Client.new
|
||||
end
|
||||
def self.singleton=(instance)
|
||||
@singleton = instance
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def initialize(default_tags: {})
|
||||
unless REPORT
|
||||
logger.warn "not reporting metrics for '#{Rails.env.to_s.bold}' environment"
|
||||
return
|
||||
end
|
||||
|
||||
host = Rails.application.config.x.influxdb.host || raise("no host")
|
||||
bucket = Rails.application.config.x.influxdb.bucket || raise("no bucket")
|
||||
|
||||
@client = InfluxDB2::Client.new(
|
||||
host, "",
|
||||
org: "",
|
||||
bucket: bucket,
|
||||
precision: InfluxDB2::WritePrecision::MILLISECOND,
|
||||
use_ssl: false,
|
||||
)
|
||||
|
||||
write_options = InfluxDB2::WriteOptions.new(
|
||||
write_type: InfluxDB2::WriteType::BATCHING,
|
||||
batch_size: 100, flush_interval: 5_000,
|
||||
max_retries: 3, max_retry_delay: 15_000,
|
||||
exponential_base: 2,
|
||||
)
|
||||
|
||||
point_settings = InfluxDB2::PointSettings.new(default_tags: default_tags)
|
||||
@writer = @client.create_write_api(
|
||||
write_options: write_options,
|
||||
point_settings: point_settings,
|
||||
)
|
||||
end
|
||||
|
||||
public
|
||||
|
||||
def write_point(caller, name, tags: {}, fields: {})
|
||||
return unless REPORT
|
||||
@writer.write(data: { name: name, tags: tags, fields: fields })
|
||||
rescue => e
|
||||
logger.error("error writing influxdb metric: #{e}")
|
||||
end
|
||||
end
|
||||
@@ -1,30 +0,0 @@
|
||||
class Metrics::EstimateDbRowsReporter < Metrics::Reporter
|
||||
def initialize
|
||||
log_writes!
|
||||
super
|
||||
end
|
||||
|
||||
def report
|
||||
extra_tables = [
|
||||
"http_log_entries",
|
||||
"http_log_entry_headers",
|
||||
"versions",
|
||||
"delayed_jobs",
|
||||
"blob_entries_p",
|
||||
]
|
||||
extra_tables_sql = extra_tables.map { |t| "'#{t}'" }.join(",")
|
||||
row_estimates = ReduxApplicationRecord.connection.exec_query(
|
||||
[
|
||||
"SELECT relname, n_live_tup",
|
||||
"FROM pg_stat_all_tables",
|
||||
"WHERE relname IN (#{extra_tables_sql})",
|
||||
"OR relname like 'domain_%'",
|
||||
"OR relname like 'blob_entries_p_%'",
|
||||
].join(" ")
|
||||
).rows.to_h
|
||||
|
||||
write_point(
|
||||
"estimate_db_rows", fields: row_estimates,
|
||||
)
|
||||
end
|
||||
end
|
||||
@@ -1,30 +0,0 @@
|
||||
class Metrics::GoodJobReporter < Metrics::Reporter
|
||||
def initialize
|
||||
super
|
||||
end
|
||||
|
||||
def report
|
||||
total = GoodJob::Job.count
|
||||
by_queue = GoodJob::Job.group("queue_name").count
|
||||
by_state = GoodJob::JobsFilter.new({}).states
|
||||
logger.info "job queue metrics: total=#{total}, by_queue=#{by_queue}, by_state=#{by_state}"
|
||||
|
||||
write_point(
|
||||
"job_queues",
|
||||
tags: { aggregation: "total" },
|
||||
fields: { "total" => total },
|
||||
)
|
||||
|
||||
write_point(
|
||||
"job_queues",
|
||||
tags: { aggregation: "queue" },
|
||||
fields: by_queue,
|
||||
)
|
||||
|
||||
write_point(
|
||||
"job_queues",
|
||||
tags: { aggregation: "state" },
|
||||
fields: by_state,
|
||||
)
|
||||
end
|
||||
end
|
||||
@@ -1,20 +0,0 @@
|
||||
class Metrics::Reporter
|
||||
include HasColorLogger
|
||||
|
||||
def initialize
|
||||
@client = Metrics::Client.singleton
|
||||
end
|
||||
|
||||
def log_writes!
|
||||
@log_writes = true
|
||||
end
|
||||
|
||||
def write_point(name, tags: {}, fields: {})
|
||||
@client.write_point(self, name, tags: tags, fields: fields)
|
||||
logger.info("reporter wrote point '#{name.bold}'") if @log_writes
|
||||
end
|
||||
|
||||
def report
|
||||
raise NotImplementedError, "implement in subclass"
|
||||
end
|
||||
end
|
||||
@@ -58,21 +58,29 @@ class Scraper::ClientFactory
|
||||
end
|
||||
|
||||
def self._gallery_dl_client_impl
|
||||
@gallery_dl_clients.value ||= begin
|
||||
proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
|
||||
@gallery_dl_clients.value ||=
|
||||
begin
|
||||
proxy_config =
|
||||
Rails.application.config.x.proxy || raise("no proxy config")
|
||||
if proxy_config[:gallery_dl].blank?
|
||||
raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}")
|
||||
raise(
|
||||
"no gallery_dl host defined for proxy config #{proxy_config[:name]}"
|
||||
)
|
||||
end
|
||||
Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl])
|
||||
Scraper::GalleryDlClient.new(
|
||||
proxy_config[:name],
|
||||
proxy_config[:gallery_dl]
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
def self._http_client_impl(key, config_klass)
|
||||
@http_clients.value[key] ||= begin
|
||||
proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
|
||||
performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
|
||||
Scraper::HttpClient.new(config_klass.new, performer)
|
||||
end
|
||||
# proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
|
||||
# performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
|
||||
performer = Scraper::CurlHttpPerformer.new
|
||||
Scraper::HttpClient.new(config_klass.new, performer)
|
||||
end
|
||||
end
|
||||
|
||||
def self._get_ib_client_sid(client)
|
||||
|
||||
106 app/lib/scraper/curl_http_performer.rb Normal file
@@ -0,0 +1,106 @@
|
||||
class Scraper::CurlHttpPerformer
|
||||
Response =
|
||||
Struct.new(:response_code, :response_headers, :response_time_ms, :body)
|
||||
|
||||
def get(url, request_headers)
|
||||
do_request(:http_get, url, request_headers)
|
||||
end
|
||||
|
||||
def post(url, request_headers)
|
||||
do_request(:http_post, url, request_headers)
|
||||
end
|
||||
|
||||
def name
|
||||
"direct"
|
||||
end
|
||||
|
||||
METHOD_MAP = { http_get: "GET", http_post: "POST" }
|
||||
|
||||
def do_request(method, url, request_headers)
|
||||
t, curl = get_curl
|
||||
start_at = Time.now
|
||||
curl.url = Addressable::URI.encode url
|
||||
curl.follow_location = true
|
||||
request_headers.each { |key, value| curl.headers[key.to_s] = value }
|
||||
curl.headers["User-Agent"] = "FurryArchiver/1.0 / dhelta"
|
||||
curl.send(method)
|
||||
response_time_ms = ((Time.now - start_at) * 1000).to_i
|
||||
response_code = curl.response_code
|
||||
body_str = curl.body_str
|
||||
response_headers = self.class.parse_header_str(curl.header_str).to_h
|
||||
|
||||
rc_color =
|
||||
case response_code / 100
|
||||
when 2
|
||||
[:green]
|
||||
when 3
|
||||
[:cyan]
|
||||
when 4
|
||||
[:red]
|
||||
when 5
|
||||
%i[red bold]
|
||||
else
|
||||
[:white]
|
||||
end
|
||||
|
||||
rc_string = response_code.to_s
|
||||
rc_color.each { |m| rc_string = rc_string.send(m) }
|
||||
|
||||
cf_cache_status =
|
||||
if response_headers["cf-cache-status"]
|
||||
"(#{response_headers["cf-cache-status"].light_blue})"
|
||||
else
|
||||
nil
|
||||
end
|
||||
|
||||
puts [
|
||||
"[#{Process.pid.to_s.black.bold} / #{t.name.to_s.black.bold}]",
|
||||
"[#{METHOD_MAP[method]} #{rc_string}",
|
||||
"#{self.class.humansize(body_str.size).to_s.bold}",
|
||||
(response_time_ms.to_s + "ms").light_blue.bold + "]",
|
||||
cf_cache_status,
|
||||
"#{url.bold}"
|
||||
].reject(&:nil?).join(" ")
|
||||
|
||||
Response.new(response_code, response_headers, response_time_ms, body_str)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def get_curl
|
||||
t = Thread.current
|
||||
unless t.thread_variable?(:curl)
|
||||
t.thread_variable_set(:curl, Curl::Easy.new)
|
||||
end
|
||||
curl = t.thread_variable_get(:curl)
|
||||
curl.headers = {}
|
||||
[t, curl]
|
||||
end
|
||||
|
||||
def self.humansize(size)
|
||||
units = %w[B KiB MiB GiB TiB PiB EiB ZiB]
|
||||
return "0.0 B" if size == 0
|
||||
exp = (Math.log(size) / Math.log(1024)).to_i
|
||||
exp += 1 if (size.to_f / 1024**exp >= 1024 - 0.05)
|
||||
exp = units.size - 1 if exp > units.size - 1
|
||||
unit = units[exp]
|
||||
if unit == "B"
|
||||
"%d B" % [size]
|
||||
else
|
||||
"%.1f %s" % [size.to_f / 1024**exp, unit]
|
||||
end
|
||||
end
|
||||
|
||||
def self.parse_header_str(header_str)
|
||||
header_str
|
||||
.split("\r\n")
|
||||
.each_with_index
|
||||
.map do |str, idx|
|
||||
next nil if idx == 0
|
||||
idx = str.index(": ")
|
||||
next nil unless idx
|
||||
[str[0...idx], str[idx + 2..-1]]
|
||||
end
|
||||
.reject(&:nil?)
|
||||
end
|
||||
end
|
||||
@@ -1,23 +1,13 @@
|
||||
class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig
|
||||
DEFAULT_ALLOWED_DOMAINS = [
|
||||
"*.furaffinity.net",
|
||||
"*.facdn.net",
|
||||
"ipinfo.io",
|
||||
]
|
||||
DEFAULT_ALLOWED_DOMAINS = %w[*.furaffinity.net *.facdn.net ipinfo.io]
|
||||
|
||||
def cookies
|
||||
proxy_name = Rails.application.config.x.proxy_name
|
||||
Rails.application.config.x.cookies.fa[
|
||||
proxy_name.to_s
|
||||
] || raise("no fa cookies defined for #{proxy_name}")
|
||||
Rails.application.config.x.cookies.fa["direct"] ||
|
||||
raise("no fa cookies defined for direct")
|
||||
end
|
||||
|
||||
def ratelimit
|
||||
[
|
||||
["d.furaffinity.net", :none],
|
||||
["*.facdn.net", :none],
|
||||
["*", 0.25],
|
||||
]
|
||||
[["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 0.25]]
|
||||
end
|
||||
|
||||
def allowed_domains
|
||||
|
||||
@@ -2,13 +2,10 @@ class Scraper::HttpClient
|
||||
include HasColorLogger
|
||||
attr_reader :config
|
||||
|
||||
Response = Struct.new(
|
||||
:status_code,
|
||||
:body,
|
||||
:log_entry,
|
||||
)
|
||||
Response = Struct.new(:status_code, :body, :log_entry)
|
||||
|
||||
class InvalidURLError < ArgumentError; end
|
||||
class InvalidURLError < ArgumentError
|
||||
end
|
||||
|
||||
def initialize(config, http_performer)
|
||||
@config = config
|
||||
@@ -17,9 +14,10 @@ class Scraper::HttpClient
|
||||
init_cookie_jar
|
||||
# TODO - populate AdaptiveCache with domains this client cares about
|
||||
@max_cache_size = 8
|
||||
@blob_entry_cache = Hash.new do |hash, key|
|
||||
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
|
||||
end
|
||||
@blob_entry_cache =
|
||||
Hash.new do |hash, key|
|
||||
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
|
||||
end
|
||||
logger.level = :info
|
||||
@config.do_login(http_performer)
|
||||
end
|
||||
@@ -46,12 +44,14 @@ class Scraper::HttpClient
|
||||
url = uri.to_s
|
||||
|
||||
unless allowed_domain?(uri.host)
|
||||
raise InvalidURLError.new("domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}")
|
||||
raise InvalidURLError.new(
|
||||
"domain not permitted: #{uri.host} - allowed: #{config.allowed_domains}"
|
||||
)
|
||||
end
|
||||
|
||||
ratelimit_conf = config.ratelimit.find do |conf|
|
||||
File.fnmatch? conf[0], uri.host
|
||||
end || ["*", :none]
|
||||
ratelimit_conf =
|
||||
config.ratelimit.find { |conf| File.fnmatch? conf[0], uri.host } ||
|
||||
["*", :none]
|
||||
ratelimit_delay = ratelimit_conf[1]
|
||||
if ratelimit_delay == :none
|
||||
logger.debug("no rate limit for #{uri.host}")
|
||||
@@ -68,7 +68,10 @@ class Scraper::HttpClient
|
||||
end
|
||||
|
||||
request_headers = {
|
||||
"cookie" => HTTP::Cookie.cookie_value(@cookie_jar.cookies(Addressable::URI.encode url)),
|
||||
"cookie" =>
|
||||
HTTP::Cookie.cookie_value(
|
||||
@cookie_jar.cookies(Addressable::URI.encode url)
|
||||
)
|
||||
}
|
||||
requested_at = Time.now
|
||||
response = @http_performer.do_request(method, url, request_headers)
|
||||
@@ -77,47 +80,57 @@ class Scraper::HttpClient
|
||||
response_headers = response.response_headers
|
||||
response_time_ms = response.response_time_ms
|
||||
response_body = response.body
|
||||
@domain_last_requested_at[uri.host] = requested_at + (response_time_ms.to_f / 1000)
|
||||
@domain_last_requested_at[uri.host] = requested_at +
|
||||
(response_time_ms.to_f / 1000)
|
||||
|
||||
content_type = response_headers["Content-Type"] ||
|
||||
response_headers["content-type"] ||
|
||||
"none/none"
|
||||
content_type =
|
||||
response_headers["Content-Type"] || response_headers["content-type"] ||
|
||||
"none/none"
|
||||
|
||||
cache_key = "#{uri.host}|#{content_type}"
|
||||
blob_entry_cache = @blob_entry_cache[cache_key]
|
||||
candidates = if blob_entry_cache.at_capacity? && rand(0..100) >= 5
|
||||
candidates =
|
||||
if blob_entry_cache.at_capacity? && rand(0..100) >= 5
|
||||
blob_entry_cache.candidates
|
||||
else
|
||||
[]
|
||||
end
|
||||
candidates << caused_by_entry.response if caused_by_entry&.response
|
||||
candidates << caused_by_entry.response.base if caused_by_entry&.response&.base
|
||||
if caused_by_entry&.response&.base
|
||||
candidates << caused_by_entry.response.base
|
||||
end
|
||||
|
||||
retries = 0
|
||||
begin
|
||||
response_blob_entry = BlobEntryP.find_or_build(
|
||||
content_type: content_type,
|
||||
contents: response_body,
|
||||
candidates: candidates,
|
||||
)
|
||||
response_blob_entry =
|
||||
BlobEntryP.find_or_build(
|
||||
content_type: content_type,
|
||||
contents: response_body,
|
||||
candidates: candidates
|
||||
)
|
||||
|
||||
scrubbed_uri = @config.scrub_stored_uri(uri)
|
||||
log_entry = HttpLogEntry.new({
|
||||
uri: scrubbed_uri.to_s,
|
||||
verb: method,
|
||||
content_type: content_type,
|
||||
status_code: response_code,
|
||||
request_headers: HttpLogEntryHeader.find_or_build(headers: request_headers),
|
||||
response_headers: HttpLogEntryHeader.find_or_build(headers: response_headers),
|
||||
response: response_blob_entry,
|
||||
response_time_ms: response_time_ms,
|
||||
requested_at: requested_at,
|
||||
caused_by_entry: caused_by_entry,
|
||||
performed_by: @http_performer.name,
|
||||
})
|
||||
log_entry =
|
||||
HttpLogEntry.new(
|
||||
{
|
||||
uri: scrubbed_uri.to_s,
|
||||
verb: method,
|
||||
content_type: content_type,
|
||||
status_code: response_code,
|
||||
request_headers:
|
||||
HttpLogEntryHeader.find_or_build(headers: request_headers),
|
||||
response_headers:
|
||||
HttpLogEntryHeader.find_or_build(headers: response_headers),
|
||||
response: response_blob_entry,
|
||||
response_time_ms: response_time_ms,
|
||||
requested_at: requested_at,
|
||||
caused_by_entry: caused_by_entry,
|
||||
performed_by: @http_performer.name
|
||||
}
|
||||
)
|
||||
|
||||
log_entry.save!
|
||||
rescue
|
||||
rescue StandardError
|
||||
retries += 1
|
||||
retry if retries < 2
|
||||
raise
|
||||
@@ -129,11 +142,14 @@ class Scraper::HttpClient
|
||||
)
|
||||
else
|
||||
blob_entry_cache.insert(
|
||||
HexUtil.bin2hex(response_blob_entry.sha256)[0..8], response_blob_entry, scrubbed_uri
|
||||
HexUtil.bin2hex(response_blob_entry.sha256)[0..8],
|
||||
response_blob_entry,
|
||||
scrubbed_uri
|
||||
)
|
||||
end
|
||||
|
||||
response_code_colorized = if response_code == 200
|
||||
response_code_colorized =
|
||||
if response_code == 200
|
||||
response_code.to_s.light_green
|
||||
elsif (response_code / 100) == 4 || (response_code / 100) == 5
|
||||
response_code.to_s.red.bold
|
||||
@@ -142,51 +158,30 @@ class Scraper::HttpClient
|
||||
end
|
||||
|
||||
total_time_ms = ((Time.now - requested_at) * 1000).round(0)
|
||||
logger.info([
|
||||
"[entry #{log_entry.id.to_s.bold} /",
|
||||
"GET #{response_code_colorized} /",
|
||||
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
|
||||
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
|
||||
scrubbed_uri.to_s.black,
|
||||
].reject(&:nil?).join(" "))
|
||||
logger.info(
|
||||
[
|
||||
"[entry #{log_entry.id.to_s.bold} /",
|
||||
"GET #{response_code_colorized} /",
|
||||
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
|
||||
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
|
||||
scrubbed_uri.to_s.black
|
||||
].reject(&:nil?).join(" ")
|
||||
)
|
||||
|
||||
if response_code == 524 || response_code == 502 || response_code == 503 || response_code == 403
|
||||
if response_code == 524 || response_code == 502 || response_code == 503 ||
|
||||
response_code == 403
|
||||
logger.error("HTTP #{response_code} - sleeping to not throttle server")
|
||||
sleep 15
|
||||
end
|
||||
|
||||
if content_type =~ /text\/plain/ && response_body =~ /Your IP address has been temporarily banned/
|
||||
if content_type =~ %r{text/plain} &&
|
||||
response_body =~ /Your IP address has been temporarily banned/
|
||||
logger.error("IP temp banned, sleeping & raising...")
|
||||
sleep 60 * 5
|
||||
raise
|
||||
end
|
||||
|
||||
Metrics::Client.singleton.write_point(
|
||||
self,
|
||||
"http_client_response",
|
||||
tags: {
|
||||
method: "GET",
|
||||
host: uri.host,
|
||||
status_code: response_code,
|
||||
content_type: clean_content_type(content_type),
|
||||
performed_by: @http_performer.name,
|
||||
},
|
||||
fields: {
|
||||
response_time_ms: response_time_ms,
|
||||
total_time_ms: total_time_ms,
|
||||
content_size: response_blob_entry.size,
|
||||
content_stored: response_blob_entry.bytes_stored,
|
||||
uri: scrubbed_uri.to_s,
|
||||
status_code: response_code,
|
||||
content_type: clean_content_type(content_type),
|
||||
},
|
||||
)
|
||||
|
||||
Response.new(
|
||||
response_code,
|
||||
response_body,
|
||||
log_entry
|
||||
)
|
||||
Response.new(response_code, response_body, log_entry)
|
||||
end
|
||||
|
||||
def allowed_domain?(domain)
|
||||
@@ -206,7 +201,7 @@ class Scraper::HttpClient
|
||||
name: cookie_key_conf[:name],
|
||||
value: cookie_key_conf[:value],
|
||||
path: cookie_key_conf[:path] || "",
|
||||
for_domain: true,
|
||||
for_domain: true
|
||||
}
|
||||
@cookie_jar.add(HTTP::Cookie.new(conf))
|
||||
end
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
require "base64"
|
||||
|
||||
class Scraper::HttpPerformer
|
||||
include HasColorLogger
|
||||
|
||||
Response = Struct.new(
|
||||
:response_code,
|
||||
:response_headers,
|
||||
:response_time_ms,
|
||||
:body
|
||||
)
|
||||
|
||||
def initialize(name, host)
|
||||
name || raise("must provide name")
|
||||
host || raise("must provide host")
|
||||
logger.info("#{Thread.name.to_s.light_black} - build performer #{name.to_s.green} - #{host.to_s.green.bold}")
|
||||
@name = name
|
||||
@client = Ripcord::Client.new(host)
|
||||
end
|
||||
|
||||
def name
|
||||
@name
|
||||
end
|
||||
|
||||
def get(uri, headers)
|
||||
do_request("get", uri, headers)
|
||||
end
|
||||
|
||||
def post(uri, headers)
|
||||
do_request("post", uri, headers)
|
||||
end
|
||||
|
||||
METHODS = %i[get post].freeze
|
||||
|
||||
def do_request(method, uri, headers)
|
||||
raise "invalid method #{method}" unless METHODS.include?(method)
|
||||
send_token = SecureRandom.uuid
|
||||
rpc_result = @client.call(method.to_s, [uri, headers, send_token])
|
||||
raise "rpc error: #{rpc_result.error.inspect}" unless rpc_result.successful?
|
||||
recv_token, code, headers, time_ms, body = Marshal.load(Base64.decode64(rpc_result.result))
|
||||
if send_token != recv_token
|
||||
raise "token mismatch: #{send_token} != #{recv_token}"
|
||||
end
|
||||
Scraper::HttpPerformer::Response.new(code, headers, time_ms, body)
|
||||
end
|
||||
|
||||
def close!
|
||||
begin
|
||||
@client.instance_variable_get("@http_client")&.finish
|
||||
rescue IOError => e
|
||||
Rails.logger.error("Failed to close http client: #{e.inspect}")
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
end
|
||||
@@ -1,59 +1,54 @@
|
||||
class Domain::Fa::User < ReduxApplicationRecord
|
||||
self.table_name = 'domain_fa_users'
|
||||
self.table_name = "domain_fa_users"
|
||||
|
||||
has_lite_trail(schema_version: 1)
|
||||
|
||||
has_many :posts,
|
||||
class_name: '::Domain::Fa::Post',
|
||||
class_name: "::Domain::Fa::Post",
|
||||
inverse_of: :creator,
|
||||
foreign_key: :creator_id
|
||||
|
||||
has_one :disco,
|
||||
class_name: '::Domain::Fa::UserFactor',
|
||||
class_name: "::Domain::Fa::UserFactor",
|
||||
inverse_of: :user,
|
||||
foreign_key: :user_id,
|
||||
dependent: :destroy
|
||||
|
||||
has_one :avatar,
|
||||
class_name: '::Domain::Fa::UserAvatar',
|
||||
class_name: "::Domain::Fa::UserAvatar",
|
||||
inverse_of: :user,
|
||||
dependent: :destroy
|
||||
|
||||
enum :state, [
|
||||
:ok, # so far so good, user may not yet be scanned
|
||||
:scan_error # user has been removed or otherwise, see state_detail
|
||||
]
|
||||
enum :state,
|
||||
[
|
||||
:ok, # so far so good, user may not yet be scanned
|
||||
:scan_error # user has been removed or otherwise, see state_detail
|
||||
]
|
||||
|
||||
# Who this user follows (join table)
|
||||
has_many :follower_joins,
|
||||
class_name: '::Domain::Fa::Follow',
|
||||
class_name: "::Domain::Fa::Follow",
|
||||
foreign_key: :follower_id,
|
||||
inverse_of: :follower,
|
||||
dependent: :destroy
|
||||
|
||||
# Who this user follows (User model)
|
||||
has_many :follows,
|
||||
through: :follower_joins,
|
||||
source: :followed
|
||||
has_many :follows, through: :follower_joins, source: :followed
|
||||
|
||||
# Who follows this user (join table)
|
||||
has_many :followed_joins,
|
||||
class_name: '::Domain::Fa::Follow',
|
||||
class_name: "::Domain::Fa::Follow",
|
||||
foreign_key: :followed_id,
|
||||
inverse_of: :followed,
|
||||
dependent: :destroy
|
||||
|
||||
# Who follows this user (User model)
|
||||
has_many :followed_by,
|
||||
through: :followed_joins,
|
||||
source: :follower
|
||||
has_many :followed_by, through: :followed_joins, source: :follower
|
||||
|
||||
has_many :fav_post_joins,
|
||||
class_name: '::Domain::Fa::Fav',
|
||||
inverse_of: :user
|
||||
has_many :fav_post_joins, class_name: "::Domain::Fa::Fav", inverse_of: :user
|
||||
|
||||
has_many :fav_posts,
|
||||
class_name: '::Domain::Fa::Post',
|
||||
class_name: "::Domain::Fa::Post",
|
||||
through: :fav_post_joins,
|
||||
source: :post
|
||||
|
||||
@@ -64,11 +59,12 @@ class Domain::Fa::User < ReduxApplicationRecord
|
||||
if name && url_name
|
||||
expected = self.class.name_to_url_name(name)
|
||||
|
||||
matches = if name.length >= 30
|
||||
url_name.starts_with?(expected)
|
||||
else
|
||||
url_name == expected
|
||||
end
|
||||
matches =
|
||||
if name.length >= 30
|
||||
url_name.starts_with?(expected)
|
||||
else
|
||||
url_name == expected
|
||||
end
|
||||
|
||||
unless matches
|
||||
errors.add(
|
||||
@@ -79,17 +75,11 @@ class Domain::Fa::User < ReduxApplicationRecord
|
||||
end
|
||||
|
||||
if url_name && url_name =~ /[A-Z]/
|
||||
errors.add(
|
||||
:url_name,
|
||||
"url_name '#{url_name}' contains uppercase"
|
||||
)
|
||||
errors.add(:url_name, "url_name '#{url_name}' contains uppercase")
|
||||
end
|
||||
|
||||
if url_name && url_name =~ /\s/
|
||||
errors.add(
|
||||
:url_name,
|
||||
"url_name '#{url_name}' contains whitespace"
|
||||
)
|
||||
errors.add(:url_name, "url_name '#{url_name}' contains whitespace")
|
||||
end
|
||||
end
|
||||
|
||||
@@ -99,9 +89,7 @@ class Domain::Fa::User < ReduxApplicationRecord
|
||||
self.log_entry_detail ||= {}
|
||||
end
|
||||
|
||||
before_destroy do
|
||||
throw :abort if posts.any?
|
||||
end
|
||||
before_destroy { throw :abort if posts.any? }
|
||||
|
||||
SCAN_TYPES = {
|
||||
page: 1.month,
|
||||
@@ -120,9 +108,7 @@ class Domain::Fa::User < ReduxApplicationRecord
|
||||
}
|
||||
|
||||
SCAN_TYPES.keys.each do |scan_type|
|
||||
define_method(:"due_for_#{scan_type}_scan?") do
|
||||
scan_due?(scan_type)
|
||||
end
|
||||
define_method(:"due_for_#{scan_type}_scan?") { scan_due?(scan_type) }
|
||||
|
||||
define_method(:"time_ago_for_#{scan_type}_scan") do
|
||||
scanned_ago_in_words(scan_type)
|
||||
@@ -143,14 +129,15 @@ class Domain::Fa::User < ReduxApplicationRecord
|
||||
|
||||
def scanned_ago_in_words(scan_type)
|
||||
if (timestamp = get_scanned_at_value(scan_type))
|
||||
DATE_HELPER.time_ago_in_words(timestamp) + ' ago'
|
||||
DATE_HELPER.time_ago_in_words(timestamp) + " ago"
|
||||
else
|
||||
'never'
|
||||
"never"
|
||||
end
|
||||
end
|
||||
|
||||
def scan_due?(scan_type)
|
||||
duration = SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'")
|
||||
duration =
|
||||
SCAN_TYPES[scan_type] || raise("invalid scan type '#{scan_type}'")
|
||||
timestamp = get_scanned_at_value(scan_type)
|
||||
timestamp.nil? || timestamp <= duration.ago
|
||||
end
|
||||
@@ -165,14 +152,14 @@ class Domain::Fa::User < ReduxApplicationRecord
end

def avatar_or_create
self.class.transaction do
avatar || create_avatar!
end
self.class.transaction { avatar || create_avatar! }
end

def self.find_or_build_from_submission_parser(submission_parser)
unless submission_parser.is_a?(Domain::Fa::Parser::ListedSubmissionParserHelper) ||
submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper)
unless submission_parser.is_a?(
Domain::Fa::Parser::ListedSubmissionParserHelper
) ||
submission_parser.is_a?(Domain::Fa::Parser::SubmissionParserHelper)
raise ArgumentError
end

@@ -181,16 +168,11 @@ class Domain::Fa::User < ReduxApplicationRecord
end
end

URL_NAME_EXCEPTIONS = {
'Kammiu' => 'rammiu'
}
URL_NAME_EXCEPTIONS = { "Kammiu" => "rammiu" }

def self.name_to_url_name(name)
name = name.strip
URL_NAME_EXCEPTIONS[name] || name
.delete('_')
.gsub(/\s/, '')
.downcase
URL_NAME_EXCEPTIONS[name] || name.delete("_").gsub(/\s/, "").downcase
end

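A few illustrative inputs for name_to_url_name (the Kammiu mapping is the exception from the hash above; the other names are made up):

Domain::Fa::User.name_to_url_name("Some Artist")  # => "someartist"
Domain::Fa::User.name_to_url_name("snow_leopard") # => "snowleopard"
Domain::Fa::User.name_to_url_name("Kammiu")       # => "rammiu"
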
# TODO: - write method for getting suggested users to follow
@@ -215,20 +197,22 @@ class Domain::Fa::User < ReduxApplicationRecord
end

def guess_user_page_log_entry
for_path = proc { |uri_path|
HttpLogEntry.where(
uri_scheme: 'https',
uri_host: 'www.furaffinity.net',
uri_path: uri_path
).order(created_at: :desc).first
}
for_path =
proc do |uri_path|
HttpLogEntry
.where(
uri_scheme: "https",
uri_host: "www.furaffinity.net",
uri_path: uri_path
)
.order(created_at: :desc)
.first
end

for_hle_id = proc { |hle_id|
hle_id && HttpLogEntry.find_by(id: hle_id)
}
for_hle_id = proc { |hle_id| hle_id && HttpLogEntry.find_by(id: hle_id) }

# older versions don't end in a trailing slash
hle_id = self.log_entry_detail && self.log_entry_detail['last_user_page_id']
hle_id = self.log_entry_detail && self.log_entry_detail["last_user_page_id"]

# first try the last scanned user page (present on most fa user models)
for_hle_id.call(hle_id) ||
@@ -247,18 +231,19 @@ class Domain::Fa::User < ReduxApplicationRecord
private

def similar_users_by(factor_col, exclude_followed_by)
query =
disco
.nearest_neighbors(factor_col, distance: 'euclidean')
query = disco.nearest_neighbors(factor_col, distance: "euclidean")

query = query.where.not(user_id: exclude_followed_by.follows.select(:followed_id)) if exclude_followed_by
query =
query.where.not(
user_id: exclude_followed_by.follows.select(:followed_id)
) if exclude_followed_by

users_from_disco_query(query)
end

def users_from_disco_query(disco_query)
Domain::Fa::User
.select('domain_fa_users.*', disco_query.select_values.last)
.select("domain_fa_users.*", disco_query.select_values.last)
.joins(:disco)
.merge(disco_query.reselect(:user_id))
end
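
nearest_neighbors here is the instance API exposed on the disco association's vector column (it matches the interface of ankane's neighbor gem, though the gem itself is not shown in this diff); a hedged sketch of driving the private helper from a console, with an assumed :factors column name:

user = Domain::Fa::User.first
user.send(:similar_users_by, :factors, nil)  # nearest users by factor distance
user.send(:similar_users_by, :factors, user) # same, minus users already followed by `user`
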
@@ -270,7 +255,8 @@ class Domain::Fa::User < ReduxApplicationRecord
when :state_detail
str = state_detail["scanned_#{scan_type}_at"]
Time.parse(str) if str
else raise("invalid scan type '#{scan_type}'")
else
raise("invalid scan type '#{scan_type}'")
end
end

@@ -280,7 +266,8 @@ class Domain::Fa::User < ReduxApplicationRecord
send(:"scanned_#{scan_type}_at=", value)
when :state_detail
state_detail["scanned_#{scan_type}_at"] = value.iso8601
else raise("invalid scan type '#{scan_type}'")
else
raise("invalid scan type '#{scan_type}'")
end
end
end

@@ -25,12 +25,5 @@ module ReduxScraper
config.time_zone = "Pacific Time (US & Canada)"
# config.eager_load_paths << Rails.root.join("extras")
config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
config.x.proxies = ReduxScraper::Application.config_for("proxies")
config.x.influxdb = ReduxScraper::Application.config_for("influxdb")

proxy_name = (ENV["proxy"] || "direct").to_sym
config.x.proxy_name = proxy_name
config.x.proxy = Rails.application.config.x.proxies[proxy_name] || raise("no proxy config for '#{proxy_name}'")
config.x.proxy[:name] = proxy_name
end
end

@@ -28,9 +28,12 @@ redux_dev: &redux_dev
adapter: postgresql
host: localhost
port: 5432
database: redux_development
username: scraper_redux
password: pdkFLqRmQwPUPaDDC4pX
# database: redux_development
# username: scraper_redux
# password: pdkFLqRmQwPUPaDDC4pX
database: postgres
username: postgres
password: postgres
migrations_paths: db/redux_migrate
pool: 4

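With the devcontainer defaults above, development now rides on the stock postgres superuser; a quick connectivity check from a Rails console, using only the values shown in the YAML:

ActiveRecord::Base.establish_connection(
  adapter: "postgresql", host: "localhost", port: 5432,
  database: "postgres", username: "postgres", password: "postgres"
)
ActiveRecord::Base.connection.execute("SELECT 1")
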
@@ -53,21 +56,25 @@ legacy_staging: &legacy_staging

local_redux_test: &local_redux_test
adapter: postgresql
host: localhost
host: db
port: 5432
database: redux_test
username: scraper_redux
password: pdkFLqRmQwPUPaDDC4pX
# username: scraper_redux
# password: pdkFLqRmQwPUPaDDC4pX
username: postgres
password: postgres
migrations_paths: db/redux_migrate
pool: 4

local_legacy_test: &local_legacy_test
adapter: postgresql
host: localhost
host: db
port: 5432
database: legacy_test
username: scraper_redux
password: pdkFLqRmQwPUPaDDC4pX
# username: scraper_redux
# password: pdkFLqRmQwPUPaDDC4pX
username: postgres
password: postgres
migrations_paths: db/legacy_migrate
pool: 4

@@ -12,11 +12,13 @@ Rails.application.configure do
config.good_job.cleanup_interval_jobs = 100_000
config.good_job.cleanup_interval_seconds = 4.hours
config.good_job.logger = Logger.new(STDOUT)
config.good_job.logger.level = :warn
config.good_job.logger.level = :info

if Rails.env.worker?
config.good_job.execution_mode = :async
config.good_job.on_thread_error = ->(exception) { Rails.logger.error("GoodJob exception: #{exception}") }
config.good_job.on_thread_error = ->(exception) do
Rails.logger.error("GoodJob exception: #{exception}")
end
else
config.good_job.execution_mode = :external
end

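Rails.env.worker? implies a dedicated "worker" Rails environment (the env inquirer responds to any name); in that environment GoodJob executes jobs in-process, everywhere else execution is handed off to an external process. A sketch, with the worker environment assumed:

Rails.env.worker? # => true only when RAILS_ENV=worker (ActiveSupport::StringInquirer)
Rails.application.config.good_job.execution_mode
# => :async in the worker environment, :external everywhere else
# one common way to run the external side: RAILS_ENV=worker bundle exec good_job start
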
@@ -1,41 +0,0 @@
client = InfluxDB2::Client.new(
Rails.application.config.x.influxdb.host || raise("no host"),
"",
bucket: Rails.application.config.x.influxdb.bucket || raise("no bucket"),
org: "",
precision: InfluxDB2::WritePrecision::NANOSECOND,
use_ssl: false,
)
write_api = client.create_write_api

ActiveSupport::Notifications.subscribe "process_action.action_controller" do |name, started, finished, unique_id, data|
request = data[:request]
api_token = request.params[:api_token]
api_user = api_token ? ApplicationController::API_TOKENS[api_token] : nil
ip = request.headers["HTTP_CF_CONNECTING_IP"] || request.remote_ip

hash = {
name: "process_action.action_controller",
tags: {
method: "#{data[:controller]}##{data[:action]}",
format: data[:format],
http_method: data[:method],
status: data[:status],
exception: data[:exception]&.first,
},
fields: {
ip: ip,
api_token: api_token,
api_user: api_user,
time_in_controller: (finished - started) * 1000,
time_in_view: (data[:view_runtime] || 0).ceil,
time_in_db: (data[:db_runtime] || 0).ceil,
request_id: unique_id,
},
time: started,
}

write_api.write(data: hash)
rescue => e
Rails.logger.error("error writing influxdb metric: #{e}")
end
@@ -1,29 +0,0 @@
default: &default
direct:
http: http://127.0.0.1:9292
gallery_dl: http://127.0.0.1:5001

proxy-1:
http: http://127.0.0.1:9292
gallery_dl: http://127.0.0.1:5001

production:
<<: *default

staging:
<<: *default

development:
direct: {}
proxy-1: {}
dedipath-1: {}
serverhost-1: {}

worker:
<<: *default

test:
direct: {}
proxy-1: {}
dedipath-1: {}
serverhost-1: {}
@@ -1,9 +1,9 @@
namespace :metrics do
desc "run reporters periodically"
task :report_all => [:environment, :set_logger_stdout] do
task report_all: %i[environment set_logger_stdout] do
schedule = {
Rake::Task["metrics:jobs"] => 60.seconds,
Rake::Task["metrics:estimate_db_rows"] => 60.seconds,
Rake::Task["metrics:estimate_db_rows"] => 60.seconds
}

last_ran = {}
@@ -24,14 +24,4 @@ namespace :metrics do
sleep sleep_amt
end
end

desc "Report job queue metrics"
task :jobs => :environment do
Metrics::GoodJobReporter.new.report
end

desc "Report estimated db row metrics"
task :estimate_db_rows => :environment do
Metrics::EstimateDbRowsReporter.new.report
end
end
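
The body elided between the schedule hash and the sleep call presumably walks the schedule and re-invokes each task once its interval has lapsed; purely as an illustration (this is not the code hidden by the hunk), such a loop can be written as:

loop do
  now = Time.now
  schedule.each do |task, interval|
    next if last_ran[task] && (now - last_ran[task]) < interval
    task.reenable # allow the Rake task to run again on the next pass
    task.invoke
    last_ran[task] = now
  end
  sleep_amt = 5 # polling interval in seconds (assumed)
  sleep sleep_amt
end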