basic fa browse job scaffolding
Gemfile (6 changed lines)
@@ -63,7 +63,7 @@ group :development do
  gem "web-console"

  # Add speed badges [https://github.com/MiniProfiler/rack-mini-profiler]
- # gem "rack-mini-profiler"
+ gem "rack-mini-profiler", require: ["enable_rails_patches", "rack-mini-profiler"]

  # Speed up commands on slow machines / big apps [https://github.com/rails/spring]
  # gem "spring"
@@ -90,6 +90,10 @@ gem "addressable"
  gem "nokogiri"
  gem "http-cookie"
  gem "curb"
+ gem "kaminari"
+ gem "delayed_job_active_record"
+ # gem "delayed-web"
+ gem "delayed_job_web"
  # gem "concurrent-ruby-ext", require: "concurrent"
  # gem 'cli-ui'
  # gem "paper_trail"
Gemfile.lock (39 changed lines)
@@ -102,6 +102,16 @@ GEM
      irb (>= 1.5.0)
      reline (>= 0.3.1)
    debug_inspector (1.1.0)
+   delayed_job (4.1.11)
+     activesupport (>= 3.0, < 8.0)
+   delayed_job_active_record (4.1.7)
+     activerecord (>= 3.0, < 8.0)
+     delayed_job (>= 3.0, < 5)
+   delayed_job_web (1.4.4)
+     activerecord (> 3.0.0)
+     delayed_job (> 2.0.3)
+     rack-protection (>= 1.5.5)
+     sinatra (>= 1.4.4)
    diffy (3.4.2)
    domain_name (0.5.20190701)
      unf (>= 0.0.5, < 1.0.0)
@@ -121,6 +131,18 @@ GEM
    jbuilder (2.11.5)
      actionview (>= 5.0.0)
      activesupport (>= 5.0.0)
+   kaminari (1.2.2)
+     activesupport (>= 4.1.0)
+     kaminari-actionview (= 1.2.2)
+     kaminari-activerecord (= 1.2.2)
+     kaminari-core (= 1.2.2)
+   kaminari-actionview (1.2.2)
+     actionview
+     kaminari-core (= 1.2.2)
+   kaminari-activerecord (1.2.2)
+     activerecord
+     kaminari-core (= 1.2.2)
+   kaminari-core (1.2.2)
    loofah (2.19.1)
      crass (~> 1.0.2)
      nokogiri (>= 1.5.9)
@@ -136,6 +158,8 @@ GEM
    mini_portile2 (2.8.1)
    minitest (5.17.0)
    msgpack (1.6.0)
+   mustermann (3.0.0)
+     ruby2_keywords (~> 0.0.1)
    net-imap (0.3.4)
      date
      net-protocol
@@ -161,6 +185,10 @@ GEM
      nio4r (~> 2.0)
    racc (1.6.2)
    rack (2.2.6.2)
+   rack-mini-profiler (3.0.0)
+     rack (>= 1.2.0)
+   rack-protection (3.0.5)
+     rack
    rack-test (2.0.2)
      rack (>= 1.3)
    rails (7.0.4.2)
@@ -197,11 +225,17 @@ GEM
    ruby-prof (1.4.5)
    ruby-prof-speedscope (0.3.0)
      ruby-prof (~> 1.0)
+   ruby2_keywords (0.0.5)
    rubyzip (2.3.2)
    selenium-webdriver (4.8.0)
      rexml (~> 3.2, >= 3.2.5)
      rubyzip (>= 1.2.2, < 3.0)
      websocket (~> 1.0)
+   sinatra (3.0.5)
+     mustermann (~> 3.0)
+     rack (~> 2.2, >= 2.2.4)
+     rack-protection (= 3.0.5)
+     tilt (~> 2.0)
    sprockets (4.2.0)
      concurrent-ruby (~> 1.0)
      rack (>= 2.2.4, < 4)
@@ -215,6 +249,7 @@ GEM
      railties (>= 6.0.0)
    table_print (1.5.7)
    thor (1.2.1)
+   tilt (2.1.0)
    timeout (0.3.1)
    turbo-rails (1.3.3)
      actionpack (>= 6.0.0)
@@ -251,15 +286,19 @@ DEPENDENCIES
  capybara
  curb
  debug
+ delayed_job_active_record
+ delayed_job_web
  diffy
  http-cookie
  importmap-rails
  jbuilder
+ kaminari
  nokogiri
  pg
  pry
  pry-stack_explorer
  puma (~> 5.0)
+ rack-mini-profiler
  rails (~> 7.0.4, >= 7.0.4.2)
  rb-bsdiff!
  ruby-prof
Rakefile (17 changed lines)
@@ -5,10 +5,19 @@ require "rake/testtask"
  require_relative "config/application"

  Rails.application.load_tasks
- require_relative "rake/sst"
- require_relative "rake/log_entry"
- require_relative "rake/fa_post"
+ $LOAD_PATH << Rails.root.join("rake")
+ Rake.application.rake_require "sst"
+ Rake.application.rake_require "log_entry"
+ Rake.application.rake_require "fa_post"

- task :log => :environment do
+ task :set_ar_stdout => :environment do
    ActiveRecord::Base.logger = Logger.new(STDOUT)
  end
+
+ task :set_logger_stdout => :environment do
+   Rails.logger = Logger.new(STDOUT)
+   Rails.logger.formatter = proc do |severity, datetime, progname, msg|
+     "#{severity}: #{msg}\n"
+   end
+   ActiveRecord::Base.logger = nil
+ end
app/controllers/log_entries_controller.rb (29 lines, new file)
@@ -0,0 +1,29 @@
class LogEntriesController < ApplicationController
  def index
    @log_entries = HttpLogEntry.page(params[:page]).per(50).order(id: :desc)
  end

  def show
    @log_entry = HttpLogEntry.find(params[:id])
  end

  # Serves the stored response body back out, inline where the browser can render it.
  def contents
    log_entry = HttpLogEntry.find(params[:id])
    response = log_entry.response

    if helpers.is_renderable_image_type?(response.content_type)
      send_data(
        response.contents,
        type: response.content_type,
        disposition: "inline",
        filename: log_entry.uri.path,
      )
    elsif response.content_type =~ /text\/plain/
      render plain: response.contents
    elsif response.content_type =~ /text\/html/
      # Rendered unescaped; the show view embeds this in a sandboxed iframe.
      render html: response.contents.html_safe
    else
      render plain: "no renderer for #{response.content_type}"
    end
  end
end
app/helpers/log_entries_helper.rb (5 lines, new file)
@@ -0,0 +1,5 @@
module LogEntriesHelper
  def is_renderable_image_type?(content_type)
    [/image\/jpeg/, /image\/jpg/, /image\/png/, /image\/gif/].any? { |ct| content_type =~ ct }
  end
end
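For reference, the helper matches by regex rather than exact string, so charset parameters on a content type are tolerated; a quick sketch from a controller or console:

    helpers.is_renderable_image_type?("image/png")                 # => true
    helpers.is_renderable_image_type?("text/html; charset=UTF-8")  # => false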
app/jobs/domain/fa/job/browse_page_job.rb (57 lines, new file)
@@ -0,0 +1,57 @@
class Domain::Fa::Job::BrowsePageJob < ApplicationJob
  queue_as :default

  # The client is injectable so tests can supply a stubbed curl handle.
  def initialize(http_client = nil)
    @http_client = http_client || Domain::Fa::Scraper::FaHttpClient.new(Curl::Easy.new)
    super
  end

  def perform(args)
    response = @http_client.get("https://www.furaffinity.net/browse/")
    if response.status_code != 200
      raise("non 200 response for /browse: #{response.status_code}")
    end
    page = Domain::Fa::Parser::Page.new(response.body)
    raise "not a listings page" unless page.probably_listings_page?
    submissions = page.submissions_parsed
    Rails.logger.info "have #{submissions.count} submissions on page"

    submissions.each do |submission|
      post = Domain::Fa::Post.find_or_initialize_by(fa_id: submission.id)
      post.state ||= :seen_post

      case post.state.to_sym
      when :seen_post
        Rails.logger.info "enqueue post scan for fa_id #{post.fa_id}"
        saw_post(post, submission, response.log_entry)
      when :scanned_initial
        Rails.logger.info "(todo) enqueue post download for fa_id #{post.fa_id}"
      when :scanned_file
        Rails.logger.info "(todo) post has been scanned / downloaded for fa_id #{post.fa_id}"
      when :removed
        Rails.logger.info "(todo) removed post seen in listing page, enqueue scan for fa_id #{post.fa_id}"
      when :error
        Rails.logger.info "(todo) error'd post seen in listing page, enqueue scan for fa_id #{post.fa_id}"
      else
        Rails.logger.info "unknown post state `#{post.state}` for fa_id #{post.fa_id}"
      end
    end
  end

  # First sighting of a post in the listings: persist it and enqueue the detail scan.
  def saw_post(post, submission, log_entry)
    if post.file_id
      Rails.logger.info " -> already have file, marking as :scanned_file"
      post.state = :scanned_file
    end

    if post.creator.nil?
      post.creator = Domain::Fa::User.find_or_initialize_by(url_name: submission.artist_url_name) do |user|
        user.name = submission.artist
      end
    end
    post.title = submission.title || raise("should have a title")
    post.save!

    Domain::Fa::Job::ScanPostJob.perform_later({ post: post, caused_by: log_entry })
  end
end
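A minimal sketch of driving the job by hand with the mocked curl handle used elsewhere in the test suite (the listings_html body is an assumed fixture, not part of this commit):

    client = Domain::Fa::Scraper::FaHttpClient.new(TestUtil.mock_curl_easy(
      "https://www.furaffinity.net/browse/",
      response_code: 200,
      body_str: listings_html,
    ))
    Domain::Fa::Job::BrowsePageJob.new(client).perform({})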
app/jobs/domain/fa/job/scan_post_job.rb (15 lines, new file)
@@ -0,0 +1,15 @@
class Domain::Fa::Job::ScanPostJob < ApplicationJob
  queue_as :default

  def initialize(http_client = nil)
    @http_client = http_client || Domain::Fa::Scraper::FaHttpClient.new(Curl::Easy.new)
    super
  end

  def perform(args)
    post = args[:post]
    caused_by = args[:caused_by]

    Rails.logger.info "[Domain::Fa::Job::ScanPostJob] Performing scan for fa_id #{post.fa_id} (triggered by #{caused_by&.id})"
  end
end
app/jobs/scraper/job_base.rb (2 lines, new file)
@@ -0,0 +1,2 @@
class Scraper::JobBase < ApplicationJob
end
@@ -27,6 +27,10 @@ class Domain::Fa::Parser::ListedSubmissionParserHelper
      end
    end

+   def artist_url_name
+     artist_path.split("/").last
+   end
+
    def title
      @title ||= if !@new_parse_mode
        @elem.css("> span").first["title"]
app/lib/domain/fa/scraper/fa_http_client.rb (19 lines, new file)
@@ -0,0 +1,19 @@
class Domain::Fa::Scraper::FaHttpClient < Scraper::BaseHttpClient
  DEFAULT_ALLOWED_DOMAINS = [
    "*.furaffinity.net",
  ]

  def initialize(curl_client, cookies: nil, allowed_domains: nil)
    @cookies = cookies || Rails.application.config.x.cookies.fa
    @allowed_domains = allowed_domains || DEFAULT_ALLOWED_DOMAINS
    super(curl_client)
  end

  def cookies
    @cookies
  end

  def allowed_domains
    @allowed_domains
  end
end
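Since both collaborators are keyword overrides, a test or console session can swap them out; a sketch (the empty cookie list is an assumption, e.g. to scrape logged-out):

    client = Domain::Fa::Scraper::FaHttpClient.new(
      Curl::Easy.new,
      cookies: [],  # skip config.x.cookies.fa
    )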
@@ -1,13 +0,0 @@
class Domain::Fa::Scraper::HttpClient < Scraper::HttpClientBase
  def cookies
    Rails.application.config.x.cookies.fa
  end

  ALLOWED_DOMAINS = [
    "*.furaffinity.net",
  ]

  def allowed_domains
    ALLOWED_DOMAINS
  end
end
@@ -1,9 +0,0 @@
class Domain::Fa::Scraper::Job::BrowsePageJob < Scraper::JobBase
  def initialize(http_client)
    @http_client = http_client
  end

  def perform
    content = @http_client.get("https://www.furaffinity.net/browse/")
  end
end
@@ -1,4 +1,4 @@
- class Scraper::HttpClientBase
+ class Scraper::BaseHttpClient
    Response = Struct.new(
      :status_code,
      :request_headers,
@@ -9,7 +9,10 @@ class Scraper::HttpClientBase

    class InvalidURLError < ArgumentError; end

-   def initialize(curl_client)
+   def initialize(curl_client = nil)
+     cookies || raise("Subclass must define non-nil #cookies")
+     allowed_domains || raise("Subclass must define non-nil #allowed_domains")
+
      # @cookie_jar
      init_cookie_jar
      # TODO - populate AdaptiveCache with domains this client cares about
@@ -59,13 +62,15 @@ class Scraper::HttpClientBase

    requested_at = Time.now
    @curl_client.perform
-   response_time_ms = Time.now - requested_at
+   response_time_ms = ((Time.now - requested_at) * 1000).to_i

    response_code = @curl_client.response_code
    body_str = @curl_client.body_str
    response_headers = self.class.parse_header_str(@curl_client.header_str).to_h
    request_headers = @curl_client.headers
-   content_type = response_headers["Content-Type"] || response_headers["content-type"] || raise("no content type provided: #{response_headers}")
+   content_type = response_headers["Content-Type"] ||
+     response_headers["content-type"] ||
+     raise("no content type provided: #{response_headers}")

    log_entry = HttpLogEntry.new({
      uri: url,
@@ -80,6 +85,12 @@ class Scraper::HttpClientBase
    })
    log_entry.save!

+   Rails.logger.info(
+     "[HttpClient] #{log_entry.id} - GET #{uri.to_s}: HTTP #{response_code} " +
+     "(#{HexUtil.humansize(body_str.length)}) " +
+     "took #{response_time_ms} ms"
+   )
+
    Response.new(
      response_code,
      request_headers,
@@ -1,2 +0,0 @@
class Scraper::JobBase
end
@@ -8,7 +8,15 @@ class Domain::Fa::Post < ReduxApplicationRecord
    },
  )

- validates_presence_of(:fa_id)
+ enum :state, [
+   :seen_post,       # post has been observed e.g. in the listings / browse page, but not yet scanned
+   :scanned_initial, # post page has been scanned, but media not downloaded
+   :scanned_file,    # post page has been scanned, and media has been downloaded
+   :removed,         # post has been removed
+   :error,           # other error scanning the post
+ ]
+ validates_inclusion_of(:state, in: self.states.keys)
+ validates_presence_of(:fa_id, :state)

  belongs_to :creator,
    class_name: "::Domain::Fa::User",
@@ -103,6 +111,14 @@ class Domain::Fa::Post < ReduxApplicationRecord
      end
    end

+   post.state = case legacy_post.state.to_sym
+     when :seen_listing then :seen_post
+     when :scanned_submission then :scanned_initial
+     when :scan_error then :error
+     when :have_static then :scanned_file
+     else raise("unhandled state: #{legacy_post.state}")
+   end
+
    fields_to_copy = [
      :fa_id,
      :title,
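For reference, Rails persists the enum in the integer column added by the migration below and generates a predicate helper per state; a sketch with a hypothetical fa_id:

    post = Domain::Fa::Post.new(fa_id: 123, state: :seen_post)
    post.seen_post?                # => true
    post.state = :scanned_initial
    post.scanned_initial?          # => true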
@@ -14,6 +14,14 @@ class HttpLogEntry < ReduxApplicationRecord

  validates :response_sha256, length: { is: 32 }

+ belongs_to :caused_by,
+   class_name: "::HttpLogEntry",
+   optional: true
+
+ has_many :triggered,
+   class_name: "::HttpLogEntry",
+   foreign_key: :caused_by_id
+
  validates_inclusion_of(:verb, in: ::HttpLogEntry.verbs.keys)
  validates_presence_of(
    :uri_scheme,
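The self-join lets a scrape chain be walked in both directions; a sketch, assuming persisted entries:

    entry = HttpLogEntry.find(some_id)  # some_id is hypothetical
    entry.caused_by                     # the request that triggered this one, or nil
    entry.triggered                     # requests this one fanned out to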
app/views/log_entries/_log_entry_table_row.html.erb (7 lines, new file)
@@ -0,0 +1,7 @@
<tr>
  <td><%= link_to log_entry.id, log_entry_path(log_entry.id) %></td>
  <td><%= log_entry.uri.to_s %></td>
  <td><%= log_entry.status_code %></td>
  <td><%= log_entry.content_type %></td>
  <td><%= log_entry.response_time_ms %></td>
</tr>
app/views/log_entries/index.html.erb (15 lines, new file)
@@ -0,0 +1,15 @@
<h1>Http Request Log</h1>
<table>
  <tr>
    <th>id</th>
    <th>url</th>
    <th>status code</th>
    <th>content type</th>
    <th>response time</th>
  </tr>
  <% @log_entries.each do |log_entry| %>
    <%= render partial: "log_entry_table_row", locals: { log_entry: log_entry } %>
  <% end %>
</table>

<%= link_to_next_page @log_entries, "Next Page" %>
app/views/log_entries/show.html.erb (20 lines, new file)
@@ -0,0 +1,20 @@
<%= link_to "← Log Entries".html_safe, log_entries_path %>
<h3><%= @log_entry.uri.to_s %> - <%= @log_entry.status_code %></h3>
<table>
  <tr>
    <% rtms = @log_entry.response_time_ms %>
    <td>response time</td><td><%= rtms == -1 ? "(not recorded)" : "#{rtms}ms" %></td>
  </tr>
  <tr>
    <td>content type</td><td><%= @log_entry.content_type %></td>
  </tr>
  <tr>
    <td>size</td><td><%= HexUtil.humansize(@log_entry.response.size) %></td>
  </tr>
</table>

<% if is_renderable_image_type?(@log_entry.content_type) %>
  <img alt="image" src="<%= contents_log_entry_path %>"/>
<% else %>
  <iframe sandbox title="log entry contents" width="100%" height="100%" src="<%= contents_log_entry_path %>"></iframe>
<% end %>
bin/delayed_job (5 lines, new executable file)
@@ -0,0 +1,5 @@
#!/usr/bin/env ruby

require File.expand_path(File.join(File.dirname(__FILE__), '..', 'config', 'environment'))
require 'delayed/command'
Delayed::Command.new(ARGV).daemonize
@@ -8,10 +8,12 @@ Bundler.require(*Rails.groups)

  module ReduxScraper
    class Application < Rails::Application
+     config.assets.precompile << "delayed/web/application.css"
      # Initialize configuration defaults for originally generated Rails version.
      config.load_defaults 7.0
      config.active_record.legacy_connection_handling = false
      config.autoload_paths << config.root.join("app/lib")
+     config.active_job.queue_adapter = :delayed_job

      # Configuration for the application, engines, and railties goes here.
      #
@@ -1,5 +1,6 @@
  require "active_support/core_ext/integer/time"

+ Rack::MiniProfiler.config.position = "top-right"
  Rails.application.configure do
    # Settings specified here will take precedence over those in config/application.rb.
@@ -1,6 +1,11 @@
  Rails.application.routes.draw do
    # Define your application routes per the DSL in https://guides.rubyonrails.org/routing.html
+   # mount Delayed::Web::Engine, at: "/jobs"
+   match "/jobs" => DelayedJobWeb, :anchor => false, :via => [:get, :post]

    # Defines the root path route ("/")
    # root "articles#index"
+   resources :log_entries, only: [:index, :show] do
+     get :contents, on: :member
+   end
  end
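For reference, the member route yields the URL helpers the views above rely on:

    log_entries_path                # GET /log_entries
    log_entry_path(entry)           # GET /log_entries/:id
    contents_log_entry_path(entry)  # GET /log_entries/:id/contents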
@@ -0,0 +1,9 @@
class CreateHttpLogEntryTriggers < ActiveRecord::Migration[7.0]
  def change
    change_table :http_log_entries do |t|
      t.references :caused_by
    end

    add_foreign_key :http_log_entries, :http_log_entries, column: :caused_by_id, validate: true
  end
end
db/redux_migrate/20230220055733_add_state_to_fa_posts.rb (7 lines, new file)
@@ -0,0 +1,7 @@
class AddStateToFaPosts < ActiveRecord::Migration[7.0]
  def change
    change_table :domain_fa_posts do |t|
      t.integer :state
    end
  end
end
db/redux_migrate/20230220062108_create_delayed_jobs.rb (22 lines, new file)
@@ -0,0 +1,22 @@
class CreateDelayedJobs < ActiveRecord::Migration[7.0]
  def self.up
    create_table :delayed_jobs do |table|
      table.integer :priority, default: 0, null: false # Allows some jobs to jump to the front of the queue
      table.integer :attempts, default: 0, null: false # Provides for retries, but still fail eventually.
      table.text :handler, null: false # YAML-encoded string of the object that will do work
      table.text :last_error # reason for last failure (See Note below)
      table.datetime :run_at # When to run. Could be Time.zone.now for immediately, or sometime in the future.
      table.datetime :locked_at # Set when a client is working on this object
      table.datetime :failed_at # Set when all retries have failed (actually, by default, the record is deleted instead)
      table.string :locked_by # Who is working on this object (if locked)
      table.string :queue # The name of the queue this job is in
      table.timestamps null: true
    end

    add_index :delayed_jobs, [:priority, :run_at], name: "delayed_jobs_priority"
  end

  def self.down
    drop_table :delayed_jobs
  end
end
db/schema.rb (21 changed lines, generated)
@@ -10,7 +10,7 @@
  #
  # It's strongly recommended that you check this file into your version control system.

- ActiveRecord::Schema[7.0].define(version: 2023_02_13_013141) do
+ ActiveRecord::Schema[7.0].define(version: 2023_02_20_062108) do
    # These are extensions that must be enabled in order to support this database
    enable_extension "pg_stat_statements"
    enable_extension "plpgsql"
@@ -26,6 +26,21 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_13_013141) do
    t.index ["sha256"], name: "index_blob_entries_on_sha256", unique: true
  end

+ create_table "delayed_jobs", force: :cascade do |t|
+   t.integer "priority", default: 0, null: false
+   t.integer "attempts", default: 0, null: false
+   t.text "handler", null: false
+   t.text "last_error"
+   t.datetime "run_at"
+   t.datetime "locked_at"
+   t.datetime "failed_at"
+   t.string "locked_by"
+   t.string "queue"
+   t.datetime "created_at"
+   t.datetime "updated_at"
+   t.index ["priority", "run_at"], name: "delayed_jobs_priority"
+ end
+
  create_table "domain_fa_post_http_log_entry_joins", force: :cascade do |t|
    t.bigint "post_id", null: false
    t.bigint "entry_id", null: false
@@ -52,6 +67,7 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_13_013141) do
    t.bigint "file_id"
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
+   t.integer "state"
    t.index ["creator_id"], name: "index_domain_fa_posts_on_creator_id"
    t.index ["fa_id"], name: "index_domain_fa_posts_on_fa_id", unique: true
    t.index ["file_id"], name: "index_domain_fa_posts_on_file_id"
@@ -109,6 +125,8 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_13_013141) do
    t.datetime "requested_at", null: false
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
+   t.bigint "caused_by_id"
+   t.index ["caused_by_id"], name: "index_http_log_entries_on_caused_by_id"
    t.index ["request_headers_id"], name: "index_http_log_entries_on_request_headers_id"
    t.index ["response_headers_id"], name: "index_http_log_entries_on_response_headers_id"
  end
@@ -147,6 +165,7 @@ ActiveRecord::Schema[7.0].define(version: 2023_02_13_013141) do
  add_foreign_key "domain_fa_user_http_log_entry_joins", "domain_fa_users", column: "post_id"
  add_foreign_key "domain_fa_user_http_log_entry_joins", "http_log_entries", column: "entry_id"
  add_foreign_key "http_log_entries", "blob_entries", column: "response_sha256", primary_key: "sha256"
+ add_foreign_key "http_log_entries", "http_log_entries", column: "caused_by_id"
  add_foreign_key "http_log_entries", "http_log_entry_headers", column: "request_headers_id"
  add_foreign_key "http_log_entries", "http_log_entry_headers", column: "response_headers_id"
end
@@ -9,4 +9,11 @@ namespace :fa_post do
      new(batch_size: batch_size, forks: forks, start_at: start_at).
      run
    end
+
+   desc "run a single browse page job"
+   task :browse_page_job => [:set_logger_stdout, :environment] do
+     Domain::Fa::Job::BrowsePageJob.perform_later({})
+   rescue
+     binding.pry
+   end
  end
test/controllers/log_entries_controller_test.rb (7 lines, new file)
@@ -0,0 +1,7 @@
require "test_helper"

class LogEntriesControllerTest < ActionDispatch::IntegrationTest
  # test "the truth" do
  #   assert true
  # end
end
@@ -1,6 +1,6 @@
- class Domain::Fa::Scraper::HttpClientTest < ActiveSupport::TestCase
+ class Domain::Fa::Scraper::FaHttpClientTest < ActiveSupport::TestCase
    test "creates an http log entry" do
-     client = Domain::Fa::Scraper::HttpClient.new(TestUtil.mock_curl_easy(
+     client = Domain::Fa::Scraper::FaHttpClient.new(TestUtil.mock_curl_easy(
        "https://www.furaffinity.net/",
        response_code: 200,
        body_str: "a plain text body",
@@ -1,11 +1,5 @@
- class Scraper::HttpClientBaseTest < ActiveSupport::TestCase
-   test "can parse header string" do
-     headers = Scraper::HttpClientBase.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
-     assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
-     assert_equal ["content-encoding", "gzip"], headers[8]
-   end
-
-   class Target < Scraper::HttpClientBase
+ class Scraper::BaseHttpClientTest < ActiveSupport::TestCase
+   class TestHttpClient < Scraper::BaseHttpClient
      def cookies
        []
      end
@@ -15,16 +9,22 @@ class Scraper::HttpClientBaseTest < ActiveSupport::TestCase
      end
    end

+   test "can parse header string" do
+     headers = Scraper::BaseHttpClient.parse_header_str("HTTP/2 200 \r\ndate: Mon, 20 Feb 2023 00:57:22 GMT\r\ncontent-type: text/html; charset=UTF-8\r\ncache-control: no-cache\r\nexpires: Thu, 01 Jan 1970 00:00:01 GMT\r\nreferrer-policy: strict-origin-when-cross-origin\r\nx-frame-options: SAMEORIGIN\r\ncontent-security-policy: frame-ancestors 'self';\r\nfa-upstream: mainsite-01\r\ncontent-encoding: gzip\r\ncf-cache-status: DYNAMIC\r\nserver: cloudflare\r\ncf-ray: 79c349abd918abd2-CPH\r\n\r\n")
+     assert_equal ["date", "Mon, 20 Feb 2023 00:57:22 GMT"], headers[0]
+     assert_equal ["content-encoding", "gzip"], headers[8]
+   end
+
    test "throws on unallowed domain" do
-     client = Target.new(TestUtil.mock_curl_easy(""))
-     assert_raises(Scraper::HttpClientBase::InvalidURLError) do
+     client = TestHttpClient.new(TestUtil.mock_curl_easy(""))
+     assert_raises(Scraper::BaseHttpClient::InvalidURLError) do
        client.get("https://foobar.com")
      end
    end

    test "creates an http log entry" do
      body_str = "the response body " + TestUtil.random_string(16)
-     client = Target.new(TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str))
+     client = TestHttpClient.new(TestUtil.mock_curl_easy("https://www.example.com/", body_str: body_str))
      # note the lack of trailing slash - http client should set path to '/'

      response = client.get("https://www.example.com")
@@ -112,7 +112,7 @@ class Domain::Fa::PostTest < ActiveSupport::TestCase
  end

  test "can manipulate associated log entries" do
-   post = ::Domain::Fa::Post.new(fa_id: 12345, creator: TestUtil.build_fa_user)
+   post = ::Domain::Fa::Post.new(fa_id: 12345, creator: TestUtil.build_fa_user, state: :seen_post)
    assert post.valid?, post.errors.full_messages

    le1 = TestUtil.build_http_log_entry
@@ -133,8 +133,9 @@ class Domain::Fa::PostTest < ActiveSupport::TestCase
  test "will update the creator if it didn't already have one" do
    legacy_post = ::Legacy::Fa::Post.find(4936259)
    # and that's all we create
-   post = ::Domain::Fa::Post.create({
+   post = ::Domain::Fa::Post.create!({
      fa_id: legacy_post.fa_id,
+     state: :seen_post,
    })
    assert_nil post.creator
    assert_nil ::Domain::Fa::User.find_by(name: "Rodrick-Dragon")