redux-scraper/app/lib/scraper/link_finder.rb

# typed: strict
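
# Extracts links from an HTML document: bare URLs found in the text as well as
# <a> hrefs are collected, then mapped onto typed FoundLink value objects for
# FurAffinity users/posts and Twitter users.
#
# Illustrative usage (a sketch; the host and variable names are assumed):
#
#   finder = Scraper::LinkFinder.new("www.furaffinity.net", page_html)
#   links = finder.find_links
#   # => e.g. [FoundLink::FaUser(url_name: "someartist"), FoundLink::FaPost(fa_id: 12345)]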
class Scraper::LinkFinder
  extend T::Sig
  include HasColorLogger

  sig { params(origin_host: String, document: String).void }
  def initialize(origin_host, document)
    @origin_host = T.let(origin_host, String)
    @document =
      T.let(
        document.encode("UTF-8", invalid: :replace, undef: :replace),
        String,
      )
  end
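
  # Patterns for pulling bare URLs out of free text: URI.regexp catches fully
  # qualified URIs, the second pattern catches scheme-less host/path strings.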
  RAW_REGEXES =
    T.let(
      [
        URI.regexp,
        %r{(?:(?:https?://)?(?:[-\w\.]+)\.\w+(?:/(?:[\w\-/_\.…]*(?:\?\S+)?)?))},
      ],
      T::Array[Regexp],
    )
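
  # Typed value objects for each kind of link the scraper recognises.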
  module FoundLink
    class FaUser < T::ImmutableStruct
      include T::Struct::ActsAsComparable

      const :url_name, String
    end

    class FaPost < T::ImmutableStruct
      include T::Struct::ActsAsComparable

      const :fa_id, Integer
    end

    class TwitterUser < T::ImmutableStruct
      include T::Struct::ActsAsComparable

      const :name, String
    end

    Type = T.type_alias { T.any(FaUser, FaPost, TwitterUser) }
  end
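
  # Collects candidate URIs from both the raw document text and parsed <a>
  # tags, then maps each onto a FoundLink struct, discarding hosts we do not
  # handle.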
  sig { returns(T::Array[FoundLink::Type]) }
  def find_links
    from_text_uris =
      RAW_REGEXES
        .map { |regex| @document.scan(regex) }
        .flatten
        .reject(&:blank?)
        # Assumed: reject matches containing the U+FFFD replacement character
        # introduced by the encode call in #initialize.
        .reject { |str| str.include?("\uFFFD") }
        .map do |str|
          uri = Addressable::URI.heuristic_parse(str)
          uri.host ? uri : nil
        rescue StandardError
          logger.warn("invalid parsed uri detected - #{str}")
          nil
        end
        .reject(&:blank?)

    parsed_uris =
      Nokogiri
        .HTML(@document)
        .css("a")
        .map do |link|
          uri = Addressable::URI.parse(link[:href])
          uri.host ||= @origin_host
          uri
        rescue StandardError
          logger.warn("invalid parsed uri detected - #{link[:href]}")
          nil
        end
        .reject(&:blank?)

    all_uris = from_text_uris + parsed_uris

    all_uris
      .map do |uri|
        if uri.host.end_with?("furaffinity.net")
          process_fa(uri)
        elsif uri.host.end_with?("e621.net")
          process_e621(uri)
        elsif uri.host.end_with?("twitter.com")
          process_twitter(uri)
        end
      end
      .compact
      .uniq
  end

  private

  FA_USER_PREFIXES = %w[/user/ /gallery/ /scraps/ /journals/ /favorites/]
  FA_POST_PREFIXES = %w[/view/ /full/]
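
  # FurAffinity paths: the user-style prefixes identify a user page (the URL
  # name is the second path segment), while /view/ and /full/ identify a
  # submission by numeric id.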
  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def process_fa(uri)
    if prefixed_with?(FA_USER_PREFIXES, uri.path)
      url_name = path_parts(uri.path)[1]&.downcase
      FoundLink::FaUser.new(url_name: url_name) if url_name
    elsif prefixed_with?(FA_POST_PREFIXES, uri.path)
      fa_id = path_parts(uri.path)[1]&.to_i
      FoundLink::FaPost.new(fa_id: fa_id) if fa_id
    end
  end
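
  # e621 links are matched by host but currently produce no FoundLink.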
  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def process_e621(uri)
    nil
  end
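
  # Twitter paths and subdomains that never correspond to a user profile.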
  TWITTER_IGNORE_FIRST_PATH_PART = %w[
    messages
    explore
    home
    i
    notifications
    privacy
    search
    tos
  ]
  TWITTER_IGNORE_HOSTS = %w[
    business.twitter.com
    help.twitter.com
    support.twitter.com
  ]

  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def process_twitter(uri)
    return if TWITTER_IGNORE_HOSTS.include?(uri.host)
    return if TWITTER_IGNORE_FIRST_PATH_PART.include?(path_parts(uri.path)[0])

    name = path_parts(uri.path)[0]
    FoundLink::TwitterUser.new(name: name) if name
  end

  sig { params(prefixes: T::Array[String], path: String).returns(T::Boolean) }
  def prefixed_with?(prefixes, path)
    prefixes.any? { |prefix| path.start_with?(prefix) }
  end

  sig { params(path: String).returns(T::Array[String]) }
  def path_parts(path)
    path.split("/").reject(&:blank?)
  end
end