148 lines
3.6 KiB
Ruby
148 lines
3.6 KiB
Ruby
# typed: strict
|
|
# Extracts links to known sites from a document.
#
# URIs are collected from two places: URL-like substrings of the raw text and
# the `href` attributes of `<a>` elements. Each URI is then dispatched on its
# host to a site-specific handler that turns it into a `FoundLink` value (or
# nil when the URI does not point at something recognised).
class Scraper::LinkFinder
  extend T::Sig
  include HasColorLogger

  # Patterns used to pull URL-like strings out of the raw document text.
  #
  # NOTE(review): `URI.regexp` is marked obsolete in Ruby's `uri` library and
  # contains capture groups; see the note on `uris_from_text` for what that
  # means for `String#scan`.
  RAW_REGEXES =
    T.let(
      [
        URI.regexp,
        %r{(?:(?:https?://)?(?:[-\w\.]+)\.\w+(?:/(?:[\w\-/_\.…]*(?:\?\S+)?)?))},
      ].freeze,
      T::Array[Regexp],
    )

  # Path prefixes on furaffinity.net whose second segment is a user name.
  FA_USER_PREFIXES =
    T.let(
      %w[/user/ /gallery/ /scraps/ /journals/ /favorites/].freeze,
      T::Array[String],
    )

  # Path prefixes on furaffinity.net whose second segment is a post id.
  FA_POST_PREFIXES = T.let(%w[/view/ /full/].freeze, T::Array[String])

  # First path segments on twitter.com that are never treated as a user name.
  TWITTER_IGNORE_FIRST_PATH_PART =
    T.let(
      %w[messages explore home i notifications privacy search tos].freeze,
      T::Array[String],
    )

  # twitter.com subdomains whose paths are never treated as user names.
  TWITTER_IGNORE_HOSTS =
    T.let(
      %w[business.twitter.com help.twitter.com support.twitter.com].freeze,
      T::Array[String],
    )

  # Value objects returned by `find_links`. Each includes
  # `T::Struct::ActsAsComparable`, which `find_links` relies on for `uniq`.
  module FoundLink
    # A FurAffinity user; `url_name` is the lowercased name segment of the URL.
    class FaUser < T::ImmutableStruct
      include T::Struct::ActsAsComparable
      const :url_name, String
    end

    # A FurAffinity post; `fa_id` is the numeric id segment of the URL.
    class FaPost < T::ImmutableStruct
      include T::Struct::ActsAsComparable
      const :fa_id, Integer
    end

    # A Twitter user; `name` is the first path segment of the URL, as written.
    class TwitterUser < T::ImmutableStruct
      include T::Struct::ActsAsComparable
      const :name, String
    end

    Type = T.type_alias { T.any(FaUser, FaPost, TwitterUser) }
  end

  # @param origin_host host assigned to anchor hrefs that carry no host of
  #   their own (e.g. "/view/123/")
  # @param document text/HTML to search; it is transcoded to UTF-8 with
  #   invalid and undefined byte sequences replaced
  sig { void.params(origin_host: String, document: String) }
  def initialize(origin_host, document)
    @origin_host = T.let(origin_host, String)
    @document =
      T.let(
        document.encode("UTF-8", invalid: :replace, undef: :replace),
        String,
      )
  end

  # Returns the distinct recognised links found in the document, from both
  # raw text and anchor hrefs.
  sig { returns(T::Array[FoundLink::Type]) }
  def find_links
    (uris_from_text + uris_from_anchors).filter_map { |uri| link_for(uri) }.uniq
  end

  private

  # Collects URIs from URL-like substrings of the raw document text.
  #
  # Strings containing an ellipsis ("…") are skipped, and strings that parse
  # to a URI without a host are dropped. Parse failures are logged and skipped.
  #
  # NOTE(review): `String#scan` returns capture groups rather than whole
  # matches when the pattern contains groups, which `URI.regexp` does. The
  # first pattern therefore appears to contribute URI pieces (scheme, host,
  # path, ...) rather than full URLs; whole URLs come from the second pattern.
  # Left unchanged to preserve current results - confirm intent.
  sig { returns(T::Array[Addressable::URI]) }
  def uris_from_text
    RAW_REGEXES
      .map { |regex| @document.scan(regex) }
      .flatten
      .reject(&:blank?)
      .reject { |str| str.include?("…") }
      .filter_map do |str|
        uri = Addressable::URI.heuristic_parse(str)
        uri if uri.host
      rescue StandardError
        logger.warn("invalid parsed uri detected - #{str}")
        nil
      end
  end

  # Collects URIs from the `href` of every `<a>` element in the document.
  # Hrefs without a host are given `@origin_host`. Hrefs that fail to parse
  # (or that cannot take a host) are logged and skipped.
  sig { returns(T::Array[Addressable::URI]) }
  def uris_from_anchors
    Nokogiri
      .HTML(@document)
      .css("a")
      .filter_map do |link|
        href = link[:href]
        # An anchor without an href carries no link; skip it up front instead
        # of letting a nil parse result raise and be logged as an invalid URI.
        next if href.blank?

        uri = Addressable::URI.parse(href)
        uri.host ||= @origin_host
        uri
      rescue StandardError
        logger.warn("invalid parsed uri detected - #{href}")
        nil
      end
  end

  # Dispatches a URI to the handler for its site, or returns nil when the
  # host belongs to none of the known sites.
  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def link_for(uri)
    host = uri.host.to_s.downcase
    if host_within?(host, "furaffinity.net")
      process_fa(uri)
    elsif host_within?(host, "e621.net")
      process_e621(uri)
    elsif host_within?(host, "twitter.com")
      process_twitter(uri)
    end
  end

  # True when `host` is `domain` itself or one of its subdomains. A bare
  # suffix check would also accept unrelated hosts such as
  # "notfuraffinity.net", so the match is anchored on a "." boundary.
  # `host` is expected to be lowercased already.
  sig { params(host: String, domain: String).returns(T::Boolean) }
  def host_within?(host, domain)
    host == domain || host.end_with?(".#{domain}")
  end

  # Maps a furaffinity.net URI to a user (second path segment, lowercased) or
  # a post (second path segment, numeric). Returns nil for any other path,
  # when the segment is missing, or when a post id does not start with digits.
  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def process_fa(uri)
    if prefixed_with?(FA_USER_PREFIXES, uri.path)
      url_name = path_parts(uri.path)[1]&.downcase
      FoundLink::FaUser.new(url_name: url_name) if url_name
    elsif prefixed_with?(FA_POST_PREFIXES, uri.path)
      # Require leading digits: `to_i` alone turns "abc" into 0, which would
      # fabricate a post with id 0. Trailing junk ("123.") is still tolerated.
      id_digits = path_parts(uri.path)[1]&.slice(/\A\d+/)
      FoundLink::FaPost.new(fa_id: id_digits.to_i) if id_digits
    end
  end

  # e621.net hosts are recognised, but nothing is extracted from them here.
  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def process_e621(uri)
    nil
  end

  # Maps a twitter.com URI to a user named by its first path segment.
  # Returns nil for ignored hosts, ignored first segments, and empty paths.
  sig { params(uri: Addressable::URI).returns(T.nilable(FoundLink::Type)) }
  def process_twitter(uri)
    return if TWITTER_IGNORE_HOSTS.include?(uri.host.to_s.downcase)

    name = path_parts(uri.path).first
    return if name.nil? || TWITTER_IGNORE_FIRST_PATH_PART.include?(name)

    FoundLink::TwitterUser.new(name: name)
  end

  # True when `path` starts with any of `prefixes`.
  sig { params(prefixes: T::Array[String], path: String).returns(T::Boolean) }
  def prefixed_with?(prefixes, path)
    prefixes.any? { |prefix| path.start_with?(prefix) }
  end

  # Splits `path` on "/" and drops blank segments, so "/view/123/" becomes
  # ["view", "123"].
  sig { params(path: String).returns(T::Array[String]) }
  def path_parts(path)
    path.split("/").reject(&:blank?)
  end
end
|