enqueue from links
@@ -10,6 +10,8 @@ class Domain::PostsController < DomainController
   before_action :set_post_group!, only: %i[posts_in_group]
 
   class PostsIndexViewConfig < T::ImmutableStruct
+    include T::Struct::ActsAsComparable
+
     const :show_domain_filters, T::Boolean
     const :show_creator_links, T::Boolean
     const :index_type_header, String
@@ -5,6 +5,8 @@ class DomainController < ApplicationController
   abstract!
 
   class DomainParamConfig < T::ImmutableStruct
+    include T::Struct::ActsAsComparable
+
     const :post_id_param, Symbol
     const :user_id_param, Symbol
     const :post_group_id_param, Symbol
@@ -91,6 +91,8 @@ module Domain::UsersHelper
   end
 
   class StatRow < T::ImmutableStruct
+    include T::Struct::ActsAsComparable
+
     const :name, String
     const :value,
           T.nilable(
@@ -12,6 +12,11 @@ class Domain::Fa::Job::Base < Scraper::JobBase
 
   protected
 
+  sig { returns(T::Boolean) }
+  def skip_enqueue_found_links?
+    !!arguments[0][:skip_enqueue_found_links]
+  end
+
   sig { params(build_post: T::Boolean).returns(Domain::Post::FaPost) }
   def post_from_args!(build_post: false)
     args = arguments[0]
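For context, the helper added above reads an opt-out flag from the job's single argument hash. A minimal usage sketch, not part of the diff; the job class and the url_name parameter are taken from other hunks in this commit and are illustrative only:

# Hypothetical call site: passing skip_enqueue_found_links: true in the
# argument hash makes skip_enqueue_found_links? return true, so
# enqueue_jobs_from_found_links below returns early and defers no link jobs.
Domain::Fa::Job::UserPageJob.perform_later(
  { url_name: "some-user", skip_enqueue_found_links: true },
)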
@@ -394,4 +399,106 @@ class Domain::Fa::Job::Base < Scraper::JobBase
       user.association(:avatar).reload
     end
   end
+
+  FoundLink = Scraper::LinkFinder::FoundLink
+
+  sig do
+    params(log_entry: HttpLogEntry, suppress_jobs: T::Array[T.untyped]).void
+  end
+  def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
+    return if skip_enqueue_found_links?
+
+    start_time = Time.now
+    unless PERMITTED_CONTENT_TYPES.any? { |ct|
+             ct.match(log_entry.content_type)
+           }
+      raise("unsupported content type: #{log_entry.content_type}")
+    end
+    document = log_entry.response&.contents || return
+    link_finder = Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
+    link_finder.logger.level = :error
+    links = link_finder.find_links
+    job_defs = []
+
+    url_names =
+      links.filter_map do |link|
+        link.is_a?(FoundLink::FaUser) ? link.url_name : nil
+      end
+    url_name_to_fa_user =
+      T.let(
+        Domain::User::FaUser.where(url_name: url_names).index_by(&:url_name),
+        T::Hash[String, Domain::User::FaUser],
+      )
+
+    fa_ids =
+      links.filter_map do |link|
+        link.is_a?(FoundLink::FaPost) ? link.fa_id : nil
+      end
+    fa_id_to_fa_post =
+      T.cast(
+        Domain::Post::FaPost.where(fa_id: fa_ids).index_by(&:fa_id),
+        T::Hash[Integer, Domain::Post::FaPost],
+      )
+
+    links
+      .filter_map do |link|
+        if link.is_a?(FoundLink::FaUser) || link.is_a?(FoundLink::FaPost)
+          link
+        else
+          nil
+        end
+      end
+      .each do |link|
+        case link
+        when FoundLink::FaUser
+          url_name = link.url_name
+          user =
+            url_name_to_fa_user[url_name] ||
+              Domain::User::FaUser.create!(url_name:) do |user|
+                user.name ||= url_name
+              end
+          enqueue_user_scan(user)
+        when FoundLink::FaPost
+          fa_id = link.fa_id
+          post =
+            fa_id_to_fa_post[fa_id] ||
+              Domain::Post::FaPost.build(fa_id:) do |post|
+                post.first_seen_entry_id = log_entry.id
+              end
+          if post.new_record?
+            post.save!
+            defer_job(Domain::Fa::Job::ScanPostJob, { post: })
+          end
+        end
+      end
+
+    job_defs.uniq!
+    job_defs.reject! do |job_def|
+      suppress_jobs.any? do |suppress|
+        suppress == job_def.slice(*suppress.keys)
+      end
+    end
+    job_defs.each do |job_def|
+      job_class = job_def[:job]
+      params = job_def[:params]
+      desc = job_def[:desc]
+      logger.debug(
+        [
+          "link finder -",
+          job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
+          desc,
+        ].join(" "),
+      )
+
+      defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
+    end
+    duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
+    logger.info(
+      "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
+    )
+  rescue StandardError => e
+    logger.error(
+      "link finder - error enqueuing jobs: #{e.class.name} - #{e.message}",
+    )
+  end
 end
@@ -57,6 +57,8 @@ class Domain::Fa::Job::BrowsePageJob < Domain::Fa::Job::Base
       )
     end
 
+    enqueue_jobs_from_found_links(response.log_entry)
+
     page = Domain::Fa::Parser::Page.new(response.body)
     listing_page_stats =
       update_and_enqueue_posts_from_listings_page(
@@ -93,15 +93,8 @@ class Domain::Fa::Job::UserGalleryJob < Domain::Fa::Job::Base
       page_url =
         "https://www.furaffinity.net#{folder_href}#{page_number}?perpage=72"
       response = http_client.get(page_url)
-      log_entry = response.log_entry
-
       fatal_error("failed to scan folder page") if response.status_code != 200
 
-      enqueue_jobs_from_found_links(
-        log_entry,
-        suppress_jobs: [{ job: self.class, url_name: user.url_name }],
-      )
-
       if Domain::Fa::Job::ScanUserUtils.user_disabled_or_not_found?(
            user,
            response,
@@ -2,13 +2,13 @@
 class Domain::Inkbunny::Job::UpdatePostsJob < Domain::Inkbunny::Job::Base
   sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
   def perform(args)
-    ib_post_ids = args[:ib_post_ids]
+    ib_post_ids = T.cast(args[:ib_post_ids], T.nilable(T::Array[Integer]))
 
     missing_pool_post_ib_ids = T::Set[Integer].new
     pools_to_update = T::Set[Domain::PostGroup::InkbunnyPool].new
 
-    if ib_post_ids.empty?
-      logger.warn "empty ib_post_ids"
+    if ib_post_ids.blank? || ib_post_ids.empty?
+      logger.error("empty ib_post_ids")
       return
     end
 
@@ -35,7 +35,8 @@ module Domain::StaticFileJobHelper
 
     begin
       response = http_client.get(file_url_str)
-    rescue Scraper::HttpClient::InvalidURLError => e
+    rescue Scraper::HttpClient::InvalidURLError,
+           Curl::Err::HostResolutionError => e
       post_file.state_terminal_error!
      post_file.error_message = e.message
      logger.error(
@@ -174,139 +174,6 @@ class Scraper::JobBase < ApplicationJob
   PERMITTED_CONTENT_TYPES =
     T.let([%r{text/html}, %r{application/json}], T::Array[Regexp])
 
-  sig do
-    params(log_entry: HttpLogEntry, suppress_jobs: T::Array[T.untyped]).void
-  end
-  def enqueue_jobs_from_found_links(log_entry, suppress_jobs: [])
-    return unless ENV["enqueue_jobs_from_found_links"] == "1"
-
-    start_time = Time.now
-    unless PERMITTED_CONTENT_TYPES.any? { |ct|
-             ct.match(log_entry.content_type)
-           }
-      raise("unsupported content type: #{log_entry.content_type}")
-    end
-    document = log_entry.response&.contents || return
-    link_finder = Scraper::LinkFinder.new(T.must(log_entry.uri_host), document)
-    link_finder.logger.level = :error
-    links = link_finder.find_links
-    job_defs = []
-
-    filter_mapper =
-      proc do |list, type, key|
-        list.filter { |item| item[:type] == type }.map { |item| item[key] }
-      end
-
-    attr_to_mapper =
-      proc do |model_class, attr_sym, in_values|
-        model_class
-          .where({ attr_sym => in_values })
-          .map { |model| [model.send(attr_sym), model] }
-          .to_h
-      end
-
-    url_name_to_fa_user =
-      attr_to_mapper.call(
-        Domain::Fa::User,
-        :url_name,
-        filter_mapper.call(links, :fa_user, :url_name),
-      )
-
-    fa_id_to_fa_post =
-      attr_to_mapper.call(
-        Domain::Fa::Post,
-        :fa_id,
-        filter_mapper.call(links, :fa_post, :fa_id),
-      )
-
-    name_to_twitter_user =
-      attr_to_mapper.call(
-        Domain::Twitter::User,
-        :name,
-        filter_mapper.call(links, :twitter_user, :name),
-      )
-
-    links.each do |link|
-      link_type = link[:type]
-      case link_type
-      when :fa_user
-        url_name = link[:url_name]
-        user = url_name_to_fa_user[url_name]
-        params = (user ? { user: user } : { url_name: url_name })
-
-        job_def =
-          ({ params: params, desc: "fa user " + (user&.name || url_name).bold })
-
-        if !user || user.due_for_page_scan?
-          job_defs << job_def.merge(job: Domain::Fa::Job::UserPageJob)
-        elsif !user || user.due_for_gallery_scan?
-          job_defs << job_def.merge(job: Domain::Fa::Job::UserGalleryJob)
-        end
-      when :fa_post
-        fa_id = link[:fa_id]
-        post = fa_id_to_fa_post[fa_id]
-        params = (post ? { post: post } : { fa_id: fa_id })
-
-        job_def = ({ params: params, desc: "fa post #{fa_id.to_s.bold}" })
-
-        if !post || (post.state == "ok" && !post.scanned?)
-          job_defs << job_def.merge(job: Domain::Fa::Job::ScanPostJob)
-        end
-
-        if post && post.state == "ok" && post.scanned? && !post.have_file?
-          job_defs << job_def.merge(job: Domain::Fa::Job::ScanFileJob)
-        end
-      when :twitter_user
-        name = link[:name]
-        user = name_to_twitter_user[name]
-        params = (user ? { user: user } : { name: name })
-
-        job_def =
-          (
-            {
-              params: params,
-              desc: "twitter user " + (user&.name || name).bold,
-            }
-          )
-
-        if !user || user.due_for_timeline_tweets_scan?
-          job_defs << job_def.merge(
-            job: Domain::Twitter::Job::UserTimelineTweetsJob,
-          )
-        end
-      else
-        logger.warn(
-          "unknown link type #{link_type.to_s.bold}: #{link.inspect.bold}",
-        )
-      end
-    end
-
-    job_defs.uniq!
-    job_defs.reject! do |job_def|
-      suppress_jobs.any? do |suppress|
-        suppress == job_def.slice(*suppress.keys)
-      end
-    end
-    job_defs.each do |job_def|
-      job_class = job_def[:job]
-      params = job_def[:params]
-      desc = job_def[:desc]
-      logger.debug(
-        [
-          "link finder -",
-          job_class.name.split("::").last.to_s.ljust(22).bold.light_black,
-          desc,
-        ].join(" "),
-      )
-
-      defer_job(job_class, params.merge({ caused_by_entry: log_entry }))
-    end
-    duration_ms = (1000 * (Time.now - start_time)).to_i.to_s
-    logger.info(
-      "link finder - enqueue #{job_defs.size.to_s.light_white.bold} jobs (#{duration_ms.bold} ms)",
-    )
-  end
-
   # Delay a little bit on Net::ReadTimeout or Errno::ECONNREFUSED
   around_perform do |job, block|
     block.call
@@ -31,7 +31,7 @@ class Scraper::FaHttpClientConfig < Scraper::HttpClientConfig
 
   def ratelimit
     # number represents minimum delay in seconds between requests to the same domain
-    [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1.5]]
+    [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1]]
   end
 
   def allowed_domains
@@ -22,7 +22,26 @@ class Scraper::LinkFinder
       T::Array[Regexp],
     )
 
-  sig { returns(T::Array[T.untyped]) }
+  module FoundLink
+    class FaUser < T::ImmutableStruct
+      include T::Struct::ActsAsComparable
+      const :url_name, String
+    end
+
+    class FaPost < T::ImmutableStruct
+      include T::Struct::ActsAsComparable
+      const :fa_id, Integer
+    end
+
+    class TwitterUser < T::ImmutableStruct
+      include T::Struct::ActsAsComparable
+      const :name, String
+    end
+
+    Type = T.type_alias { T.any(FaUser, FaPost, TwitterUser) }
+  end
+
+  sig { returns(T::Array[FoundLink::Type]) }
   def find_links
     from_text_uris =
       RAW_REGEXES
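For context, the new FoundLink structs mix in T::Struct::ActsAsComparable, which defines == over the struct's declared props, so two links with the same values compare equal regardless of object identity; that is what lets the specs later in this diff match found links by value. A minimal standalone sketch of the assumed sorbet-runtime behavior, not part of the diff, with a hypothetical ExampleLink struct:

# Assumes sorbet-runtime is available; ExampleLink is illustrative only.
require "sorbet-runtime"

class ExampleLink < T::ImmutableStruct
  include T::Struct::ActsAsComparable
  const :fa_id, Integer
end

# Equal props make equal structs, so an RSpec expectation like
# expect(links).to include(ExampleLink.new(fa_id: 123)) can match by value.
ExampleLink.new(fa_id: 123) == ExampleLink.new(fa_id: 123) # => true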
@@ -64,7 +83,7 @@ class Scraper::LinkFinder
           process_twitter(uri)
         end
       end
-      .reject(&:nil?)
+      .compact
       .uniq
   end
 
@@ -74,22 +93,18 @@ class Scraper::LinkFinder
 
   FA_POST_PREFIXES = %w[/view/ /full/]
 
-  sig do
-    params(uri: Addressable::URI).returns(T.nilable(T::Hash[Symbol, T.untyped]))
-  end
+  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
   def process_fa(uri)
     if prefixed_with?(FA_USER_PREFIXES, uri.path)
       url_name = path_parts(uri.path)[1]&.downcase
-      { type: :fa_user, url_name: url_name } if url_name
+      FoundLink::FaUser.new(url_name: url_name) if url_name
    elsif prefixed_with?(FA_POST_PREFIXES, uri.path)
      fa_id = path_parts(uri.path)[1]&.to_i
-      { type: :fa_post, fa_id: fa_id } if fa_id
+      FoundLink::FaPost.new(fa_id: fa_id) if fa_id
     end
   end
 
-  sig do
-    params(uri: Addressable::URI).returns(T.nilable(T::Hash[Symbol, T.untyped]))
-  end
+  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
   def process_e621(uri)
     nil
   end
@@ -111,15 +126,13 @@ class Scraper::LinkFinder
     support.twitter.com
   ]
 
-  sig do
-    params(uri: Addressable::URI).returns(T.nilable(T::Hash[Symbol, T.untyped]))
-  end
+  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
   def process_twitter(uri)
     return if TWITTER_IGNORE_HOSTS.include?(uri.host)
     return if TWITTER_IGNORE_FIRST_PATH_PART.include?(path_parts(uri.path)[0])
 
     name = path_parts(uri.path)[0]
-    { type: :twitter_user, name: name } if name
+    FoundLink::TwitterUser.new(name: name) if name
   end
 
   sig { params(prefixes: T::Array[String], path: String).returns(T::Boolean) }
@@ -12,7 +12,10 @@ module PerformJobHelpers
     ).returns(T.untyped)
   end
   def perform_now(params, should_raise: false)
-    ret = described_class.perform_now(params)
+    ret =
+      described_class.perform_now(
+        { skip_enqueue_found_links: true }.merge(params),
+      )
 
     bt_printer =
       Kernel.proc do
@@ -64,4 +64,31 @@ describe Domain::Fa::Job::UserPageJob do
       expect(user.num_favorites).to eq(0)
     end
   end
+
+  context "user with page that links to unseen users" do
+    let(:client_mock_config) do
+      [
+        {
+          uri: "https://www.furaffinity.net/user/angelpawqt/",
+          status_code: 200,
+          content_type: "text/html",
+          contents:
+            SpecUtil.read_fixture_file(
+              "domain/fa/user_page/user_page_angelpawqt.html",
+            ),
+        },
+      ]
+    end
+
+    it "enqueues jobs for the unseen users" do
+      perform_now({ url_name: "angelpawqt", skip_enqueue_found_links: false })
+      expect(
+        SpecUtil.enqueued_job_args(Domain::Fa::Job::UserPageJob),
+      ).to include(
+        hash_including(
+          user: Domain::User::FaUser.find_by(url_name: "8bitstarshon1"),
+        ),
+      )
+    end
+  end
 end
@@ -109,7 +109,7 @@ RSpec.describe Scraper::FaHttpClientConfig do
   it "returns the configured rate limits" do
     config = described_class.new
     expect(config.ratelimit).to eq(
-      [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1.5]],
+      [["d.furaffinity.net", :none], ["*.facdn.net", :none], ["*", 1]],
    )
  end
end
@@ -1,11 +1,84 @@
 # typed: false
+require "rails_helper"
+
 describe Scraper::LinkFinder do
   def find_links(origin_host, document)
     sort(Scraper::LinkFinder.new(origin_host, document).find_links)
   end
 
   def sort(links)
-    links.sort_by { |l| l.to_s }
+    links.uniq.sort_by { |l| l.to_s }
+  end
+
+  it "finds links on FA user pages" do
+    document =
+      File.read(
+        "test/fixtures/files/domain/fa/user_page/user_page_angelpawqt.html",
+      )
+    links = find_links("www.furaffinity.net", document)
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaPost.new(fa_id: 54_212_636),
+    )
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "8bitstarshon1"),
+    )
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "furaffinity"),
+    )
+
+    # { type: :fa_post, fa_id: 44_848_118 },
+    # { type: :fa_post, fa_id: 50_619_594 },
+    # { type: :fa_post, fa_id: 51_321_948 },
+    # { type: :fa_post, fa_id: 51_379_769 },
+    # { type: :fa_post, fa_id: 52_782_367 },
+    # { type: :fa_post, fa_id: 54_212_636 },
+    # { type: :fa_post, fa_id: 54_717_098 },
+    # { type: :fa_post, fa_id: 57_492_887 },
+    # { type: :fa_post, fa_id: 57_567_803 },
+    # { type: :fa_post, fa_id: 57_736_082 },
+    # { type: :fa_post, fa_id: 57_744_156 },
+    # { type: :fa_post, fa_id: 57_889_649 },
+    # { type: :fa_post, fa_id: 57_898_215 },
+    # { type: :fa_post, fa_id: 58_080_112 },
+    # { type: :fa_post, fa_id: 58_196_925 },
+    # { type: :fa_post, fa_id: 58_233_811 },
+    # { type: :fa_post, fa_id: 59_515_926 },
+    # { type: :fa_user, url_name: "8bitstarshon1" },
+    # { type: :fa_user, url_name: "angelpawqt" },
+    # { type: :fa_user, url_name: "arilace" },
+    # { type: :fa_user, url_name: "chamomilearts" },
+    # { type: :fa_user, url_name: "cheshirkas" },
+    # { type: :fa_user, url_name: "cinnabunnyart" },
+    # { type: :fa_user, url_name: "cloudsen" },
+    # { type: :fa_user, url_name: "eyeswings" },
+    # { type: :fa_user, url_name: "fender" },
+    # { type: :fa_user, url_name: "feztiz" },
+    # { type: :fa_user, url_name: "grau" },
+    # { type: :fa_user, url_name: "huibiyyy" },
+    # { type: :fa_user, url_name: "hyongatayro" },
+    # { type: :fa_user, url_name: "i-luv-tentacles" },
+    # { type: :fa_user, url_name: "innoinatamm" },
+    # { type: :fa_user, url_name: "likara" },
+    # { type: :fa_user, url_name: "limoncella" },
+    # { type: :fa_user, url_name: "nacpanylis" },
+    # { type: :fa_user, url_name: "oseledetts" },
+    # { type: :fa_user, url_name: "pinxi" },
+    # { type: :fa_user, url_name: "plazmafox" },
+    # { type: :fa_user, url_name: "rosebyki1" },
+    # { type: :fa_user, url_name: "rosebyki2" },
+    # { type: :fa_user, url_name: "samarthanie" },
+    # { type: :fa_user, url_name: "sekaitoroto" },
+    # { type: :fa_user, url_name: "snakeki" },
+    # { type: :fa_user, url_name: "tailung" },
+    # { type: :fa_user, url_name: "tentastic" },
+    # { type: :fa_user, url_name: "toks" },
+    # { type: :fa_user, url_name: "vestel" },
+    # { type: :fa_user, url_name: "yabewenyx" },
+    # { type: :fa_user, url_name: "yoyikori" },
+    # { type: :fa_user, url_name: "zzreg" },
+    # { type: :fa_user, url_name: "~fikanori" },
+    # { type: :twitter_user, name: "furaffinity" },
+    # )
   end
 
   it "finds links on pages from furaffinity" do
@@ -13,14 +86,16 @@ describe Scraper::LinkFinder do
       Hello, world! This is my document. <a href="/user/foobar">Foobar</a>
       visit me at www.furaffinity.net/user/baz or twitter.com/foobartw
     HTML
-    assert_equal sort(
-                   [
-                     { type: :fa_user, url_name: "foobar" },
-                     { type: :fa_user, url_name: "baz" },
-                     { type: :twitter_user, name: "foobartw" }
-                   ]
-                 ),
-                 find_links("www.furaffinity.net", document)
+    links = find_links("www.furaffinity.net", document)
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "foobar"),
+    )
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "baz"),
+    )
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "foobartw"),
+    )
   end
 
   it "ignores raw links ending in an ellipsis" do
@@ -38,7 +113,9 @@ describe Scraper::LinkFinder do
     document = <<-HTML
      www.furaffinity.net/user/Some-User
    HTML
-    assert_equal sort([{ type: :fa_user, url_name: "some-user" }]),
-                 find_links("www.furaffinity.net", document)
+    links = find_links("www.furaffinity.net", document)
+    expect(links).to include(
+      Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "some-user"),
+    )
   end
 end
test/fixtures/files/domain/fa/user_page/user_page_angelpawqt.html (new vendored file, 1236 lines; file diff suppressed because it is too large)