enqueue from links

Author: Dylan Knutson
Date: 2025-02-25 05:47:44 +00:00
parent d4ffac3496
commit 3490e28cb4
16 changed files with 1504 additions and 172 deletions


@@ -10,6 +10,8 @@ class Domain::PostsController < DomainController
   before_action :set_post_group!, only: %i[posts_in_group]
 
   class PostsIndexViewConfig < T::ImmutableStruct
+    include T::Struct::ActsAsComparable
+
     const :show_domain_filters, T::Boolean
     const :show_creator_links, T::Boolean
     const :index_type_header, String
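Note: several structs in this commit gain `include T::Struct::ActsAsComparable`. A minimal sketch of the assumed effect (value-based `==` provided by sorbet-runtime's mixin), using a hypothetical struct rather than one from this repository:

```ruby
# Assumed semantics: ActsAsComparable makes two T::Struct instances with the
# same prop values compare equal, instead of falling back to identity.
require "sorbet-runtime"

class ExampleConfig < T::ImmutableStruct
  include T::Struct::ActsAsComparable

  const :show_domain_filters, T::Boolean
  const :index_type_header, String
end

a = ExampleConfig.new(show_domain_filters: true, index_type_header: "Posts")
b = ExampleConfig.new(show_domain_filters: true, index_type_header: "Posts")
a == b # => true under the assumed value-equality semantics
```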


@@ -5,6 +5,8 @@ class DomainController < ApplicationController
   abstract!
 
   class DomainParamConfig < T::ImmutableStruct
+    include T::Struct::ActsAsComparable
+
     const :post_id_param, Symbol
     const :user_id_param, Symbol
     const :post_group_id_param, Symbol


@@ -91,6 +91,8 @@ module Domain::UsersHelper
   end
 
   class StatRow < T::ImmutableStruct
+    include T::Struct::ActsAsComparable
+
     const :name, String
     const :value,
           T.nilable(


@@ -12,6 +12,11 @@ class Domain::Fa::Job::Base < Scraper::JobBase
 
   protected
 
+  sig { returns(T::Boolean) }
+  def skip_enqueue_found_links?
+    !!arguments[0][:skip_enqueue_found_links]
+  end
+
   sig { params(build_post: T::Boolean).returns(Domain::Post::FaPost) }
   def post_from_args!(build_post: false)
     args = arguments[0]
@@ -394,4 +399,106 @@ class Domain::Fa::Job::Base < Scraper::JobBase
       user.association(:avatar).reload
     end
   end
 
+  FoundLink = Scraper::LinkFinder::FoundLink
+
+  sig do
+    params(log_entry: HttpLogEntry, suppress_jobs: T::Array[T.untyped]).void
+  end
+  def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
+    return if skip_enqueue_found_links?
+
+    start_time = Time.now
+    unless PERMITTED_CONTENT_TYPES.any? { |ct|
+             ct.match(log_entry.content_type)
+           }
+      raise("unsupported content type: #{log_entry.content_type}")
+    end
+
+    document = log_entry.response&.contents || return
+    link_finder = Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
+    link_finder.logger.level = :error
+    links = link_finder.find_links
+
+    job_defs = []
+    url_names =
+      links.filter_map do |link|
+        link.is_a?(FoundLink::FaUser) ? link.url_name : nil
+      end
+    url_name_to_fa_user =
+      T.let(
+        Domain::User::FaUser.where(url_name: url_names).index_by(&:url_name),
+        T::Hash[String, Domain::User::FaUser],
+      )
+
+    fa_ids =
+      links.filter_map do |link|
+        link.is_a?(FoundLink::FaPost) ? link.fa_id : nil
+      end
+    fa_id_to_fa_post =
+      T.cast(
+        Domain::Post::FaPost.where(fa_id: fa_ids).index_by(&:fa_id),
+        T::Hash[Integer, Domain::Post::FaPost],
+      )
+
+    links
+      .filter_map do |link|
+        if link.is_a?(FoundLink::FaUser) || link.is_a?(FoundLink::FaPost)
+          link
+        else
+          nil
+        end
+      end
+      .each do |link|
+        case link
+        when FoundLink::FaUser
+          url_name = link.url_name
+          user =
+            url_name_to_fa_user[url_name] ||
+              Domain::User::FaUser.create!(url_name:) do |user|
+                user.name ||= url_name
+              end
+          enqueue_user_scan(user)
+        when FoundLink::FaPost
+          fa_id = link.fa_id
+          post =
+            fa_id_to_fa_post[fa_id] ||
+              Domain::Post::FaPost.build(fa_id:) do |post|
+                post.first_seen_entry_id = log_entry.id
+              end
+          if post.new_record?
+            post.save!
+            defer_job(Domain::Fa::Job::ScanPostJob, { post: })
+          end
+        end
+      end
+
+    job_defs.uniq!
+    job_defs.reject! do |job_def|
+      suppress_jobs.any? do |suppress|
+        suppress == job_def.slice(*suppress.keys)
+      end
+    end
+
+    job_defs.each do |job_def|
+      job_class = job_def[:job]
+      params = job_def[:params]
+      desc = job_def[:desc]
+      logger.debug(
+        [
+          "link finder -",
+          job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
+          desc,
+        ].join(" "),
+      )
+      defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
+    end
+
+    duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
+    logger.info(
+      "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
+    )
+  rescue StandardError => e
+    logger.error(
+      "link finder - error enqueuing jobs: #{e.class.name} - #{e.message}",
+    )
+  end
 end
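The `suppress_jobs` filter retained in this method compares each suppression hash against the matching slice of a job definition. A standalone sketch of that comparison, using hypothetical hashes rather than the commit's real job shapes:

```ruby
# A suppression entry matches a job definition when the job_def agrees with
# the suppression on every key the suppression specifies (Hash#slice).
job_defs = [
  { job: :user_gallery, url_name: "foo", desc: "fa user foo" },
  { job: :user_gallery, url_name: "bar", desc: "fa user bar" },
]
suppress_jobs = [{ job: :user_gallery, url_name: "foo" }]

job_defs.reject! do |job_def|
  suppress_jobs.any? { |suppress| suppress == job_def.slice(*suppress.keys) }
end
# job_defs now contains only the "bar" entry
```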


@@ -57,6 +57,8 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
       )
     end
 
+    enqueue_jobs_from_found_links(response.log_entry)
+
     page = Domain::Fa::Parser::Page.new(response.body)
     listing_page_stats =
       update_and_enqueue_posts_from_listings_page(


@@ -93,15 +93,8 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
       page_url =
         "https://www.furaffinity.net#{folder_href}#{page_number}?perpage=72"
       response = http_client.get(page_url)
-      log_entry = response.log_entry
       fatal_error("failed to scan folder page") if response.status_code != 200
 
-      enqueue_jobs_from_found_links(
-        log_entry,
-        suppress_jobs: [{ job: self.class, url_name: user.url_name }],
-      )
-
       if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(
            user,
            response,


@@ -2,13 +2,13 @@
 class Domain::Inkbunny::Job::UpdatePostsJob < Domain::Inkbunny::Job::Base
   sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
   def perform(args)
-    ib_post_ids = args[:ib_post_ids]
+    ib_post_ids = T.cast(args[:ib_post_ids], T.nilable(T::Array[Integer]))
     missing_pool_post_ib_ids = T::Set[Integer].new
     pools_to_update = T::Set[Domain::PostGroup::InkbunnyPool].new
 
-    if ib_post_ids.empty?
-      logger.warn "empty ib_post_ids"
+    if ib_post_ids.blank? || ib_post_ids.empty?
+      logger.error("empty ib_post_ids")
       return
     end


@@ -35,7 +35,8 @@ module Domain::StaticFileJobHelper
       begin
         response = http_client.get(file_url_str)
-      rescue Scraper::HttpClient::InvalidURLError => e
+      rescue Scraper::HttpClient::InvalidURLError,
+             Curl::Err::HostResolutionError => e
         post_file.state_terminal_error!
         post_file.error_message = e.message
         logger.error(


@@ -174,139 +174,6 @@ class Scraper::JobBase < ApplicationJob
   PERMITTED_CONTENT_TYPES =
     T.let([%r{text/html}, %r{application/json}], T::Array[Regexp])
 
-  sig do
-    params(log_entry: HttpLogEntry, suppress_jobs: T::Array[T.untyped]).void
-  end
-  def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
-    return unless ENV["enqueue_jobs_from_found_links"] == "1"
-
-    start_time = Time.now
-    unless PERMITTED_CONTENT_TYPES.any? { |ct|
-             ct.match(log_entry.content_type)
-           }
-      raise("unsupported content type: #{log_entry.content_type}")
-    end
-
-    document = log_entry.response&.contents || return
-    link_finder = Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
-    link_finder.logger.level = :error
-    links = link_finder.find_links
-
-    job_defs = []
-
-    filter_mapper =
-      proc do |list, type, key|
-        list.filter { |item| item[:type] == type }.map { |item| item[key] }
-      end
-    attr_to_mapper =
-      proc do |model_class, attr_sym, in_values|
-        model_class
-          .where({ attr_sym => in_values })
-          .map { |model| [model.send(attr_sym), model] }
-          .to_h
-      end
-
-    url_name_to_fa_user =
-      attr_to_mapper.call(
-        Domain::Fa::User,
-        :url_name,
-        filter_mapper.call(links, :fa_user, :url_name),
-      )
-    fa_id_to_fa_post =
-      attr_to_mapper.call(
-        Domain::Fa::Post,
-        :fa_id,
-        filter_mapper.call(links, :fa_post, :fa_id),
-      )
-    name_to_twitter_user =
-      attr_to_mapper.call(
-        Domain::Twitter::User,
-        :name,
-        filter_mapper.call(links, :twitter_user, :name),
-      )
-
-    links.each do |link|
-      link_type = link[:type]
-      case link_type
-      when :fa_user
-        url_name = link[:url_name]
-        user = url_name_to_fa_user[url_name]
-        params = (user ? { user: user } : { url_name: url_name })
-        job_def =
-          ({ params: params, desc: "fa user " + (user&.name || url_name).bold })
-        if !user || user.due_for_page_scan?
-          job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob)
-        elsif !user || user.due_for_gallery_scan?
-          job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob)
-        end
-      when :fa_post
-        fa_id = link[:fa_id]
-        post = fa_id_to_fa_post[fa_id]
-        params = (post ? { post: post } : { fa_id: fa_id })
-        job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" })
-        if !post || (post.state == "ok" && !post.scanned?)
-          job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob)
-        end
-        if post && post.state == "ok" && post.scanned? && !post.have_file?
-          job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob)
-        end
-      when :twitter_user
-        name = link[:name]
-        user = name_to_twitter_user[name]
-        params = (user ? { user: user } : { name: name })
-        job_def =
-          (
-            {
-              params: params,
-              desc: "twitter user " + (user&.name || name).bold,
-            }
-          )
-        if !user || user.due_for_timeline_tweets_scan?
-          job_defs << job_def.merge(
-            job: Domain::Twitter::Job::UserTimelineTweetsJob,
-          )
-        end
-      else
-        logger.warn(
-          "unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}",
-        )
-      end
-    end
-
-    job_defs.uniq!
-    job_defs.reject! do |job_def|
-      suppress_jobs.any? do |suppress|
-        suppress == job_def.slice(*suppress.keys)
-      end
-    end
-
-    job_defs.each do |job_def|
-      job_class = job_def[:job]
-      params = job_def[:params]
-      desc = job_def[:desc]
-      logger.debug(
-        [
-          "link finder -",
-          job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
-          desc,
-        ].join(" "),
-      )
-      defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
-    end
-
-    duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
-    logger.info(
-      "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
-    )
-  end
-
   # Delay a little bit on Net::ReadTimeout or Errno::ECONNREFUSED
   around_perform do |job, block|
     block.call


@@ -31,7 +31,7 @@ class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig
   def ratelimit
     # number represents minimum delay in seconds between requests to the same domain
-    [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1.5]]
+    [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1]]
   end
 
   def allowed_domains


@@ -22,7 +22,26 @@ class Scraper::LinkFinder
         T::Array[Regexp],
       )
 
-  sig { returns(T::Array[T.untyped]) }
+  module FoundLink
+    class FaUser < T::ImmutableStruct
+      include T::Struct::ActsAsComparable
+      const :url_name, String
+    end
+
+    class FaPost < T::ImmutableStruct
+      include T::Struct::ActsAsComparable
+      const :fa_id, Integer
+    end
+
+    class TwitterUser < T::ImmutableStruct
+      include T::Struct::ActsAsComparable
+      const :name, String
+    end
+
+    Type = T.type_alias { T.any(FaUser, FaPost, TwitterUser) }
+  end
+
+  sig { returns(T::Array[FoundLink::Type]) }
   def find_links
     from_text_uris =
       RAW_REGEXES
@@ -64,7 +83,7 @@ class Scraper::LinkFinder
             process_twitter(uri)
           end
         end
-        .reject(&:nil?)
+        .compact
         .uniq
   end
@@ -74,22 +93,18 @@ class Scraper::LinkFinder
   FA_POST_PREFIXES = %w[/view/ /full/]
 
-  sig do
-    params(uri: Addressable::URI).returns(T.nilable(T::Hash[Symbol, T.untyped]))
-  end
+  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
   def process_fa(uri)
     if prefixed_with?(FA_USER_PREFIXES, uri.path)
       url_name = path_parts(uri.path)[1]&.downcase
-      { type: :fa_user, url_name: url_name } if url_name
+      FoundLink::FaUser.new(url_name: url_name) if url_name
     elsif prefixed_with?(FA_POST_PREFIXES, uri.path)
       fa_id = path_parts(uri.path)[1]&.to_i
-      { type: :fa_post, fa_id: fa_id } if fa_id
+      FoundLink::FaPost.new(fa_id: fa_id) if fa_id
     end
   end
 
-  sig do
-    params(uri: Addressable::URI).returns(T.nilable(T::Hash[Symbol, T.untyped]))
-  end
+  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
   def process_e621(uri)
     nil
   end
@@ -111,15 +126,13 @@ class Scraper::LinkFinder
     support.twitter.com
   ]
 
-  sig do
-    params(uri: Addressable::URI).returns(T.nilable(T::Hash[Symbol, T.untyped]))
-  end
+  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
   def process_twitter(uri)
     return if TWITTER_IGNORE_HOSTS.include?(uri.host)
     return if TWITTER_IGNORE_FIRST_PATH_PART.include?(path_parts(uri.path)[0])
 
     name = path_parts(uri.path)[0]
-    { type: :twitter_user, name: name } if name
+    FoundLink::TwitterUser.new(name: name) if name
   end
 
   sig { params(prefixes: T::Array[String], path: String).returns(T::Boolean) }
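With the hash returns replaced by the `FoundLink` structs above, a caller can dispatch on the struct class. A rough usage sketch; the host and HTML snippet are illustrative assumptions, not fixtures from this commit, and exactly which links get extracted depends on the parser internals:

```ruby
# Build a finder over a small HTML snippet and branch on each typed result.
finder =
  Scraper::LinkFinder.new(
    "www.furaffinity.net",
    '<a href="/user/somebody">a user</a> <a href="/view/12345">a post</a>',
  )

finder.find_links.each do |link|
  case link
  when Scraper::LinkFinder::FoundLink::FaUser
    puts "fa user: #{link.url_name}"
  when Scraper::LinkFinder::FoundLink::FaPost
    puts "fa post: #{link.fa_id}"
  when Scraper::LinkFinder::FoundLink::TwitterUser
    puts "twitter: #{link.name}"
  end
end
```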


@@ -12,7 +12,10 @@ module PerformJobHelpers
     ).returns(T.untyped)
   end
   def perform_now(params, should_raise: false)
-    ret = described_class.perform_now(params)
+    ret =
+      described_class.perform_now(
+        { skip_enqueue_found_links: true }.merge(params),
+      )
 
     bt_printer =
       Kernel.proc do
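The helper now merges a default of `skip_enqueue_found_links: true` underneath each spec's params; because `Hash#merge` gives precedence to its argument, an individual spec can opt back into link enqueueing (as the UserPageJob spec below does). A small sketch with hypothetical values:

```ruby
# Hash#merge: the argument's entries override the receiver's defaults.
defaults = { skip_enqueue_found_links: true }

defaults.merge({ url_name: "someuser" })
# => { skip_enqueue_found_links: true, url_name: "someuser" }

defaults.merge({ url_name: "someuser", skip_enqueue_found_links: false })
# => { skip_enqueue_found_links: false, url_name: "someuser" }
```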


@@ -64,4 +64,31 @@ describe Domain::Fa::Job::UserPageJob do
       expect(user.num_favorites).to eq(0)
     end
   end
+
+  context "user with page that links to unseen users" do
+    let(:client_mock_config) do
+      [
+        {
+          uri: "https://www.furaffinity.net/user/angelpawqt/",
+          status_code: 200,
+          content_type: "text/html",
+          contents:
+            SpecUtil.read_fixture_file(
+              "domain/fa/user_page/user_page_angelpawqt.html",
+            ),
+        },
+      ]
+    end
+
+    it "enqueues jobs for the unseen users" do
+      perform_now({ url_name: "angelpawqt", skip_enqueue_found_links: false })
+      expect(
+        SpecUtil.enqueued_job_args(Domain::Fa::Job::UserPageJob),
+      ).to include(
+        hash_including(
+          user: Domain::User::FaUser.find_by(url_name: "8bitstarshon1"),
+        ),
+      )
+    end
+  end
 end


@@ -109,7 +109,7 @@ RSpec.describe Scraper::FaHttpClientConfig do
it "returns the configured rate limits" do it "returns the configured rate limits" do
config = described_class.new config = described_class.new
expect(config.ratelimit).to eq( expect(config.ratelimit).to eq(
[["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1.5]], [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1]],
) )
end end
end end


@@ -1,11 +1,84 @@
 # typed: false
+require "rails_helper"
 
 describe Scraper::LinkFinder do
   def find_links(origin_host, document)
     sort(Scraper::LinkFinder.new(origin_host, document).find_links)
   end
 
   def sort(links)
-    links.sort_by { |l| l.to_s }
+    links.uniq.sort_by { |l| l.to_s }
   end
 
+  it "finds links on FA user pages" do
+    document =
+      File.read(
+        "test/fixtures/files/domain/fa/user_page/user_page_angelpawqt.html",
+      )
+    links = find_links("www.furaffinity.net", document)
+
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaPost.new(fa_id: 54_212_636),
+    )
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "8bitstarshon1"),
+    )
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "furaffinity"),
+    )
+
+    # { type: :fa_post, fa_id: 44_848_118 },
+    # { type: :fa_post, fa_id: 50_619_594 },
+    # { type: :fa_post, fa_id: 51_321_948 },
+    # { type: :fa_post, fa_id: 51_379_769 },
+    # { type: :fa_post, fa_id: 52_782_367 },
+    # { type: :fa_post, fa_id: 54_212_636 },
+    # { type: :fa_post, fa_id: 54_717_098 },
+    # { type: :fa_post, fa_id: 57_492_887 },
+    # { type: :fa_post, fa_id: 57_567_803 },
+    # { type: :fa_post, fa_id: 57_736_082 },
+    # { type: :fa_post, fa_id: 57_744_156 },
+    # { type: :fa_post, fa_id: 57_889_649 },
+    # { type: :fa_post, fa_id: 57_898_215 },
+    # { type: :fa_post, fa_id: 58_080_112 },
+    # { type: :fa_post, fa_id: 58_196_925 },
+    # { type: :fa_post, fa_id: 58_233_811 },
+    # { type: :fa_post, fa_id: 59_515_926 },
+    # { type: :fa_user, url_name: "8bitstarshon1" },
+    # { type: :fa_user, url_name: "angelpawqt" },
+    # { type: :fa_user, url_name: "arilace" },
+    # { type: :fa_user, url_name: "chamomilearts" },
+    # { type: :fa_user, url_name: "cheshirkas" },
+    # { type: :fa_user, url_name: "cinnabunnyart" },
+    # { type: :fa_user, url_name: "cloudsen" },
+    # { type: :fa_user, url_name: "eyeswings" },
+    # { type: :fa_user, url_name: "fender" },
+    # { type: :fa_user, url_name: "feztiz" },
+    # { type: :fa_user, url_name: "grau" },
+    # { type: :fa_user, url_name: "huibiyyy" },
+    # { type: :fa_user, url_name: "hyongatayro" },
+    # { type: :fa_user, url_name: "i-luv-tentacles" },
+    # { type: :fa_user, url_name: "innoinatamm" },
+    # { type: :fa_user, url_name: "likara" },
+    # { type: :fa_user, url_name: "limoncella" },
+    # { type: :fa_user, url_name: "nacpanylis" },
+    # { type: :fa_user, url_name: "oseledetts" },
+    # { type: :fa_user, url_name: "pinxi" },
+    # { type: :fa_user, url_name: "plazmafox" },
+    # { type: :fa_user, url_name: "rosebyki1" },
+    # { type: :fa_user, url_name: "rosebyki2" },
+    # { type: :fa_user, url_name: "samarthanie" },
+    # { type: :fa_user, url_name: "sekaitoroto" },
+    # { type: :fa_user, url_name: "snakeki" },
+    # { type: :fa_user, url_name: "tailung" },
+    # { type: :fa_user, url_name: "tentastic" },
+    # { type: :fa_user, url_name: "toks" },
+    # { type: :fa_user, url_name: "vestel" },
+    # { type: :fa_user, url_name: "yabewenyx" },
+    # { type: :fa_user, url_name: "yoyikori" },
+    # { type: :fa_user, url_name: "zzreg" },
+    # { type: :fa_user, url_name: "~fikanori" },
+    # { type: :twitter_user, name: "furaffinity" },
+    # )
+  end
it "finds links on pages from furaffinity" do it "finds links on pages from furaffinity" do
@@ -13,14 +86,16 @@ describe Scraper::LinkFinder do
Hello, world! This is my document. <a href="/user/foobar">Foobar</a> Hello, world! This is my document. <a href="/user/foobar">Foobar</a>
visit me at www.furaffinity.net/user/baz or twitter.com/foobartw visit me at www.furaffinity.net/user/baz or twitter.com/foobartw
HTML HTML
assert_equal sort( links = find_links("www.furaffinity.net", document)
[ expect(links).to include(
{ type: :fa_user, url_name: "foobar" }, Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "foobar"),
{ type: :fa_user, url_name: "baz" }, )
{ type: :twitter_user, name: "foobartw" } expect(links).to include(
] Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "baz"),
), )
find_links("www.furaffinity.net", document) expect(links).to include(
Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "foobartw"),
)
end end
it "ignores raw links ending in an ellipsis" do it "ignores raw links ending in an ellipsis" do
@@ -38,7 +113,9 @@ describe Scraper::LinkFinder do
document = <<-HTML document = <<-HTML
www.furaffinity.net/user/Some-User www.furaffinity.net/user/Some-User
HTML HTML
assert_equal sort([{ type: :fa_user, url_name: "some-user" }]), links = find_links("www.furaffinity.net", document)
find_links("www.furaffinity.net", document) expect(links).to include(
Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "some-user"),
)
end end
end end

File diff suppressed because it is too large.