Files
redux-scraper/app/lib/domain/fa/parser/page.rb
2025-08-13 08:20:32 +00:00

349 lines
8.8 KiB
Ruby

# typed: true
# frozen_string_literal: true
require "nokogiri"
class Domain::Fa::Parser::Page < Domain::Fa::Parser::Base
extend T::Sig
# Page layout generations. #initialize picks one from markers found in the HTML.
# Oldest layout ("old, old" - predates legacy); the fallback when no marker matches.
VERSION_0 = :old_old
# Legacy layout.
VERSION_1 = :legacy
# Redux layout; chosen when the page links the beta theme favicon.
VERSION_2 = :redux
sig do
  params(log_entry: HttpLogEntry, require_logged_in: T::Boolean).returns(
    Domain::Fa::Parser::Page,
  )
end
# Builds a page parser from the response bytes recorded on +log_entry+.
# +T.must+ rejects a nil +response_bytes+; +require_logged_in+ is forwarded
# to #initialize unchanged.
def self.from_log_entry(log_entry, require_logged_in: true)
  response_body = T.must(log_entry.response_bytes)
  new(response_body, require_logged_in: require_logged_in)
end
sig do
  params(
    page_html: T.any(String, Nokogiri::HTML4::Document),
    require_logged_in: T::Boolean,
  ).void
end
# Wraps +page_html+ (raw HTML or an already parsed document), detects which
# page layout version it uses and hands that to the superclass.
#
# When +require_logged_in+ is true and the page is not a "submission not
# found" page, raises Domain::Fa::Parser::NotLoggedInError unless a logged-in
# user element is present.
#
# Note: @phtml is only assigned when a String is given; it stays unset when a
# parsed document is passed in.
def initialize(page_html, require_logged_in: true)
  case page_html
  when Nokogiri::HTML::Document
    @page = page_html
  else
    # NUL characters are removed from the markup before it is parsed.
    @phtml = page_html.delete("\u0000")
    @page = T.cast(Nokogiri.HTML(@phtml), Nokogiri::HTML4::Document)
  end

  marker_present = ->(selector) { !@page.css(selector).first.nil? }
  detected_version =
    if marker_present.call("link[href='/themes/beta/img/favicon.ico']")
      VERSION_2
    elsif marker_present.call(".submission-list section") ||
          marker_present.call("meta[property='twitter:site']")
      VERSION_1
    else
      VERSION_0
    end

  super(page_version: detected_version)

  return unless require_logged_in
  return if submission_not_found?

  raise Domain::Fa::Parser::NotLoggedInError unless logged_in?
end
sig { returns(T::Boolean) }
# True when the page markup contains the notice shown for accounts whose
# owner has voluntarily disabled access.
def account_disabled?
  disabled_notice =
    "has voluntarily disabled access to their account and all of its contents"
  @page.inner_html.include?(disabled_notice)
end
sig { void }
# Raises Domain::Fa::Parser::NotLoggedInError unless the page shows a
# logged-in user.
#
# Delegates to #logged_in? so the check uses the element that matches the
# detected page version, and so the error is raised only when the user
# element is absent (never when it is present).
def require_logged_in!
  raise Domain::Fa::Parser::NotLoggedInError unless logged_in?
end
sig { returns(T::Boolean) }
# True when the page is the "submission not in our database" notice.
# Always false when a logged-in user element exists, because that element is
# never shown on a "not found" page.
def submission_not_found?
  return false if logged_in_user_elem

  notice_selector =
    case @page_version
    when VERSION_2
      "body .section-body"
    else
      "table.maintable td.alt1 font[size=1]"
    end

  notice_text = @page.css(notice_selector)&.first&.text
  return false if notice_text.nil?

  notice_text.include?(
    "The submission you are trying to find is not in our database",
  )
end
sig { returns(T::Boolean) }
# True when the page contains the logged-in user element.
def logged_in?
  !!logged_in_user_elem
end
sig { returns(String) }
def server_local_time_string
case @page_version
when VERSION_0, VERSION_1
match =
@page.inner_html.match(/Server Local Time:\s+(.+)\s+</) ||
raise("unable to find server local time marker")
match[1]
when VERSION_2
@page
.css("div.footnote")
.first
&.text
&.match(/Server Time: (.+)/)
&.[](1) || raise("unable to find server local time marker")
else
unimplemented_version!
end&.strip
end
sig { returns(Time) }
# Parses #server_local_time_string in the America/Los_Angeles time zone.
def server_local_time
  pacific_zone = ActiveSupport::TimeZone.new("America/Los_Angeles")
  pacific_zone.parse(server_local_time_string)
end
sig { returns(ActiveSupport::TimeZone) }
# Time zone in which dates on the page are rendered for the current viewer.
#
# Logged-out pages use America/New_York (server default for unauthenticated
# requests). Logged-in pages use a fixed zone per known account name; an
# unknown account raises RuntimeError.
def logged_in_user_tz
  zone_name =
    if logged_in?
      username = logged_in_user
      case username
      when "zzreg", "cottoniq"
        "America/Los_Angeles"
      when "ddwhatnow", "vipvillageworker"
        "America/New_York"
      when "blazeandwish"
        "America/Chicago"
      else
        # server default?
        raise("unknown logged in user #{username}")
      end
    else
      "America/New_York"
    end

  ActiveSupport::TimeZone.new(zone_name)
end
sig { returns(T::Array[Domain::Fa::Parser::ListedSubmissionParserHelper]) }
# Wraps every element from #submission_elems in a
# ListedSubmissionParserHelper for the current page version. Memoized.
def submissions_parsed
  return @submissions_hashes if @submissions_hashes

  helper_class = Domain::Fa::Parser::ListedSubmissionParserHelper
  @submissions_hashes =
    submission_elems.map { |figure| helper_class.new(figure, @page_version) }
end
sig { returns(T.nilable(String)) }
# Returns the numeric id embedded in the favorites "next page" URL, or nil
# when the page offers no next page.
#
# Looks first at the pagination button link, then at the first
# gallery-navigation form whose action matches the next-page pattern.
# Raises RuntimeError when a next button exists but its href does not match.
def favorites_next_button_id
  next_page_pattern = %r{/favorites/.+/(\d+)/next/?}

  next_link = @page.css(".gallery-section .pagination a.button.right").first
  unless next_link.nil?
    link_target = next_link["href"]
    link_match = next_page_pattern.match(link_target)
    raise("invalid favs button uri #{link_target}") if link_match.nil?
    return link_match[1]
  end

  next_form =
    @page
      .css(".gallery-navigation form")
      .to_a
      .find { |candidate| next_page_pattern.match(candidate["action"]) }
  return nil if next_form.nil?

  form_target = next_form["action"]
  form_match = next_page_pattern.match(form_target)
  raise("invalid favs next action #{form_target}") if form_match.nil?
  form_match[1]
end
sig { returns(T::Array[T::Hash[Symbol, String]]) }
# Lists the folder links on the page as { href:, title: } hashes, built from
# each ".folder-list a.dotted" anchor's href attribute and text. Memoized.
def submission_folders
  return @submission_folders if @submission_folders

  folder_anchors = @page.css(".folder-list a.dotted")
  @submission_folders =
    folder_anchors.map do |anchor|
      { href: anchor["href"], title: anchor.text }
    end
end
sig { returns(T.nilable(Time)) }
# Parses the title of the first span.popup_date (a date such as
# `May 11, 2023 10:54 AM`) in the logged-in user's time zone.
# Returns nil when no such span or title exists. Redux layout only; other
# versions go to unimplemented_version!.
def most_recent_faved_at_time
  case @page_version
  when VERSION_2
    popup_date = @page.css("span.popup_date").first
    faved_at_text = popup_date&.attr("title")
    return nil unless faved_at_text

    logged_in_user_tz.parse(faved_at_text)
  else
    unimplemented_version!
  end
end
sig { returns(T::Array[Nokogiri::XML::Node]) }
# Collects the elements that represent listed submissions, using the
# selectors appropriate for the page version. Memoized.
#
# For the redux layout several gallery containers are queried in order and
# the matches of every non-empty one are concatenated.
def submission_elems
  return @submission_elems if @submission_elems

  matched =
    case @page_version
    when VERSION_0
      @page.css(".t-image")
    when VERSION_1
      @page.css(".submission-list > .gallery > figure")
    when VERSION_2
      redux_selectors = [
        # user gallery pages
        ".submission-list > .gallery > figure",
        # browse list
        "#gallery-browse > figure",
        # favorites list
        "#gallery-favorites > figure",
        # home page
        "#gallery-frontpage-submissions > figure",
        "#gallery-frontpage-writing > figure",
        "#gallery-frontpage-music > figure",
        "#gallery-frontpage-crafts > figure",
      ]
      redux_selectors
        .map { |selector| @page.css(selector) }
        .reject(&:empty?)
        .flatten
    else
      unimplemented_version!
    end

  @submission_elems = matched.to_a
end
# Name of the logged-in user, or nil when no user element is present.
# Redux: the "alt" attribute of the user element.
# Old/legacy: the element text with "~" characters removed.
def logged_in_user
  case @page_version
  when VERSION_0, VERSION_1
    username_link = logged_in_user_elem
    username_link&.text&.delete("~")
  when VERSION_2
    avatar = logged_in_user_elem
    avatar && avatar["alt"]
  else
    unimplemented_version!
  end
end
# Element that identifies the logged-in user, or nil when absent.
# Old/legacy: first "li a#my-username"; redux: first
# "img.loggedin_user_avatar". A found element is memoized.
def logged_in_user_elem
  return @logged_in_user_elem if @logged_in_user_elem

  @logged_in_user_elem =
    case @page_version
    when VERSION_2
      @page.css("img.loggedin_user_avatar").first
    when VERSION_0, VERSION_1
      @page.css("li a#my-username").first
    else
      unimplemented_version!
    end
end
# Heuristic: does the page contain one of the containers that hold lists of
# submissions for the detected page version?
def probably_listings_page?
  case @page_version
  when VERSION_0
    !!first_matching_css(@page, [".submission-list", "td > section.gallery"])
  when VERSION_1
    !!@page.css(".submission-list").first
  when VERSION_2
    gallery_containers = %w[
      .submission-list
      #gallery-browse
      #gallery-favorites
      #gallery-frontpage-submissions
    ]
    !!first_matching_css(@page, gallery_containers)
  else
    unimplemented_version!
  end
end
sig { returns(Domain::Fa::Parser::UserPageHelper) }
# Memoized UserPageHelper built from the parsed page, the page version and
# this parser.
def user_page
  return @user_page if @user_page

  @user_page =
    Domain::Fa::Parser::UserPageHelper.new(@page, @page_version, self)
end
def probably_user_page?
case @page_version
when VERSION_2
@page.css("userpage-nav-links a.current").first&.text&.strip == "Home"
else
%w[@ ~ ! ∞ -].include?(
@page.css("td.addpad.lead b").first&.text.try(:[], 0),
)
end
end
# Heuristic for submission pages: old/legacy layouts contain #submissionImg,
# the redux layout contains .submission-content.
def probably_submission?
  case @page_version
  when VERSION_2
    !!@page.css(".submission-content").first
  when VERSION_0, VERSION_1
    !!@page.css("#submissionImg").first
  else
    unimplemented_version!
  end
end
sig { returns(Domain::Fa::Parser::SubmissionParserHelper) }
# Memoized SubmissionParserHelper built from the parsed page, the stored
# HTML string (@phtml), the page version and this parser.
def submission
  return @submission if @submission

  helper_class = Domain::Fa::Parser::SubmissionParserHelper
  @submission = helper_class.new(@page, @phtml, @page_version, self)
end
sig { returns(T::Array[Domain::Fa::Parser::UserListParserHelper::User]) }
# Memoized result of UserListParserHelper.user_list for the parsed page.
def user_list
  return @user_list if @user_list

  @user_list = Domain::Fa::Parser::UserListParserHelper.user_list(@page)
end
private
sig do
params(
children: T.any(Nokogiri::XML::Node, Nokogiri::XML::NodeSet),
regex: Regexp,
).returns(T.nilable(Nokogiri::XML::Node))
end
# Returns the child immediately following the first child whose text matches
# +regex+, or nil when nothing matches (or the match is the last child).
#
# NOTE: a bare `private` does not change the visibility of `def self.`
# methods, so this remains callable from outside the class.
def self.elem_after_text_match(children, regex)
  matched_at = elem_idx_after_text_match(children, regex)
  matched_at && children[matched_at + 1]
end
sig do
params(
children: T.any(Nokogiri::XML::Node, Nokogiri::XML::NodeSet),
regex: Regexp,
).returns(T.nilable(Integer))
end
# Returns the index of the first child whose text matches +regex+, or nil
# when no child matches.
#
# NOTE: a bare `private` does not change the visibility of `def self.`
# methods, so this remains callable from outside the class.
def self.elem_idx_after_text_match(children, regex)
  children.find_index { |node| regex.match(node.text) }
end
end