# typed: false

# Scans a document for links to a fixed set of sites (FurAffinity, e621,
# Twitter) and converts each recognised link into a small descriptor hash.
#
# Links are collected from two places: URL-like strings anywhere in the raw
# text, and the +href+ attribute of every <a> element.
class Scraper::LinkFinder
  # NOTE(review): `logger` is not defined in this class; presumably it is
  # supplied by HasColorLogger - confirm.
  include HasColorLogger

  # Patterns run over the raw document text to find URL-like strings.
  #
  # NOTE(review): URI.regexp contains capture groups, so String#scan most
  # likely yields arrays of URI components (scheme, host, path, ...) for it
  # rather than whole URLs; #text_uris flattens those fragments into the same
  # pipeline. Whole URLs are still picked up by the second, group-free
  # pattern. Confirm this is intended before changing it.
  RAW_REGEXES = [
    URI.regexp,
    %r{(?:(?:https?://)?(?:[-\w\.]+)\.\w+(?:/(?:[\w\-/_\.…]*(?:\?\S+)?)?))},
  ].freeze

  # First path segments that mark a FurAffinity URL as pointing at a user.
  FA_USER_PREFIXES = %w[/user/ /gallery/ /scraps/ /journals/ /favorites/].freeze

  # First path segments that mark a FurAffinity URL as pointing at a post.
  FA_POST_PREFIXES = %w[/view/ /full/].freeze

  # First path segments on twitter.com that are never treated as a user name.
  TWITTER_IGNORE_FIRST_PATH_PART = %w[
    messages
    explore
    home
    i
    notifications
    privacy
    search
    tos
  ].freeze

  # Twitter hosts whose links are ignored entirely.
  TWITTER_IGNORE_HOSTS = %w[
    business.twitter.com
    help.twitter.com
    support.twitter.com
  ].freeze

  # @param origin_host [String] host assigned to <a> links whose href carries
  #   no host of its own (e.g. relative links)
  # @param document [String] raw text / HTML to search
  def initialize(origin_host, document)
    @origin_host = origin_host
    # String#encode is a no-op when the input is already tagged UTF-8, so
    # invalid bytes would survive it and make String#scan raise
    # ArgumentError. #scrub replaces whatever is left.
    @document =
      document.encode("UTF-8", invalid: :replace, undef: :replace).scrub
  end

  # Finds every recognised link in the document.
  #
  # @return [Array<Hash>] unique descriptors, text-derived links first, each
  #   one of:
  #   - <tt>{ type: :fa_user, url_name: String }</tt>
  #   - <tt>{ type: :fa_post, fa_id: Integer }</tt>
  #   - <tt>{ type: :twitter_user, name: String }</tt>
  def find_links
    (text_uris + anchor_uris).map { |uri| classify(uri) }.compact.uniq
  end

  private

  # URIs parsed from URL-like strings found in the raw document text.
  def text_uris
    RAW_REGEXES
      .flat_map { |regex| @document.scan(regex).flatten }
      .reject(&:blank?)
      # Candidates containing an ellipsis are dropped; presumably they are
      # URLs that were visually truncated in the page text.
      .reject { |str| str.include?("…") }
      .map { |str| parse_text_uri(str) }
      .compact
  end

  # URIs parsed from the href of every <a> element in the document.
  def anchor_uris
    Nokogiri
      .HTML(@document)
      .css("a")
      .map { |link| link[:href] }
      # Anchors without an href (or with an empty one) cannot yield a link;
      # skipping them avoids a rescued NoMethodError and a bogus warning.
      .reject(&:blank?)
      .map { |href| parse_anchor_uri(href) }
      .compact
  end

  # Parses a free-text candidate; returns nil (and logs) when it cannot be
  # parsed, or nil when the result has no host.
  def parse_text_uri(str)
    uri = Addressable::URI.heuristic_parse(str)
    uri if uri.host
  rescue StandardError
    logger.warn("invalid parsed uri detected - #{str}")
    nil
  end

  # Parses an href, filling in the origin host when the href has none.
  # Returns nil (and logs) when parsing or assigning the host fails.
  def parse_anchor_uri(href)
    uri = Addressable::URI.parse(href)
    uri.host ||= @origin_host
    uri
  rescue StandardError
    logger.warn("invalid parsed uri detected - #{href}")
    nil
  end

  # Dispatches a URI to the handler for its site; nil for unknown sites and
  # for URIs that have no host at all.
  def classify(uri)
    host = uri.host.to_s.downcase
    return if host.empty?

    if on_domain?(host, "furaffinity.net")
      process_fa(uri)
    elsif on_domain?(host, "e621.net")
      process_e621(uri)
    elsif on_domain?(host, "twitter.com")
      process_twitter(uri)
    end
  end

  # True when +host+ is +domain+ itself or one of its subdomains. A plain
  # end_with? would also accept unrelated hosts such as "nottwitter.com".
  def on_domain?(host, domain)
    host == domain || host.end_with?(".#{domain}")
  end

  # Builds a descriptor for a FurAffinity user or post URL, or nil when the
  # path matches neither shape.
  def process_fa(uri)
    path = uri.path
    second_part = path_parts(path)[1]

    if prefixed_with?(FA_USER_PREFIXES, path)
      url_name = second_part&.downcase
      { type: :fa_user, url_name: url_name } if url_name
    elsif prefixed_with?(FA_POST_PREFIXES, path)
      # Require a leading run of digits: String#to_i alone turns "abc" into
      # 0, which is truthy and would be emitted as a post id.
      digits = second_part && second_part[/\A\d+/]
      { type: :fa_post, fa_id: digits.to_i } if digits
    end
  end

  # e621 links are recognised but not yet turned into descriptors.
  def process_e621(_uri)
    nil
  end

  # Builds a descriptor for a Twitter user URL, or nil for ignored hosts,
  # ignored first path segments, and URLs with an empty path.
  def process_twitter(uri)
    return if TWITTER_IGNORE_HOSTS.include?(uri.host.to_s.downcase)

    name = path_parts(uri.path).first
    return if name.nil? || TWITTER_IGNORE_FIRST_PATH_PART.include?(name)

    { type: :twitter_user, name: name }
  end

  # True when +path+ starts with any of +prefixes+.
  def prefixed_with?(prefixes, path)
    prefixes.any? { |prefix| path.start_with?(prefix) }
  end

  # Splits a path on "/" and drops blank segments.
  def path_parts(path)
    path.split("/").reject(&:blank?)
  end
end