Files
redux-scraper/app/lib/scraper/link_finder.rb
2025-01-01 03:29:53 +00:00

113 lines
2.6 KiB
Ruby

# typed: false
# Finds links to known sites inside a document and turns them into
# descriptor hashes.
#
# Candidates come from two places: URL-looking strings found by scanning the
# raw text with RAW_REGEXES, and the `href` of every `<a>` element in the
# document parsed as HTML. Each candidate whose host belongs to a supported
# site is converted to a hash such as `{ type: :fa_user, url_name: "name" }`.
class Scraper::LinkFinder
  include HasColorLogger

  # Patterns used to pull URL-like strings out of the raw document text.
  RAW_REGEXES = [
    URI.regexp,
    %r{(?:(?:https?://)?(?:[-\w\.]+)\.\w+(?:/(?:[\w\-/_\.…]*(?:\?\S+)?)?))},
  ].freeze

  # Text candidates containing any of these characters are discarded.
  # NOTE(review): "…" presumably marks a URL that was shortened for display,
  # and U+FFFD is what `String#encode(..., invalid: :replace)` substitutes for
  # undecodable bytes. The previous filter tested for the empty string, which
  # matches everything; confirm which marker(s) were actually intended.
  UNUSABLE_MARKERS = ["…", "\uFFFD"].freeze

  FA_USER_PREFIXES = %w[/user/ /gallery/ /scraps/ /journals/ /favorites/].freeze
  FA_POST_PREFIXES = %w[/view/ /full/].freeze

  TWITTER_IGNORE_FIRST_PATH_PART = %w[
    messages
    explore
    home
    i
    notifications
    privacy
    search
    tos
  ].freeze
  TWITTER_IGNORE_HOSTS = %w[
    business.twitter.com
    help.twitter.com
    support.twitter.com
  ].freeze

  # @param origin_host [String] host assigned to hrefs that carry no host
  # @param document [String] document text; re-encoded to UTF-8 with invalid
  #   or undefined characters replaced
  def initialize(origin_host, document)
    @origin_host = origin_host
    @document = document.encode("UTF-8", invalid: :replace, undef: :replace)
  end

  # Collects every recognised link in the document.
  #
  # @return [Array<Hash>] unique descriptors, each one of
  #   `{ type: :fa_user, url_name: String }`,
  #   `{ type: :fa_post, fa_id: Integer }` or
  #   `{ type: :twitter_user, name: String }`
  def find_links
    (text_uris + anchor_uris)
      .map { |uri| process_uri(uri) }
      .compact
      .uniq
  end

  private

  # URIs recovered by regex-scanning the raw document text. Only candidates
  # that parse to a URI with a host are kept.
  def text_uris
    RAW_REGEXES
      .flat_map { |regex| whole_matches(regex) }
      .reject(&:blank?)
      .reject { |str| UNUSABLE_MARKERS.any? { |marker| str.include?(marker) } }
      .map { |str| parse_text_candidate(str) }
      .compact
  end

  # Returns the full text of every match of +regex+ in the document.
  # A plain `scan` would return capture groups instead of the whole match for
  # patterns that contain groups (URI.regexp does), so the block form is used.
  def whole_matches(regex)
    matches = []
    @document.scan(regex) { matches << Regexp.last_match(0) }
    matches
  end

  # Parses a text candidate; returns nil (after logging) when parsing raises,
  # and nil when the parsed URI has no host.
  def parse_text_candidate(str)
    uri = Addressable::URI.heuristic_parse(str)
    uri if uri.host
  rescue StandardError
    logger.warn("invalid parsed uri detected - #{str}")
    nil
  end

  # URIs taken from `<a>` elements. Anchors without an `href` attribute are
  # skipped up front so they do not produce "invalid uri" warnings.
  def anchor_uris
    Nokogiri
      .HTML(@document)
      .css("a[href]")
      .map { |link| parse_href(link[:href]) }
      .compact
  end

  # Parses an href, filling in the origin host when the href has none.
  # Returns nil (after logging) when parsing or host assignment raises.
  def parse_href(href)
    uri = Addressable::URI.parse(href)
    uri.host ||= @origin_host
    uri
  rescue StandardError
    logger.warn("invalid parsed uri detected - #{href}")
    nil
  end

  # Routes a URI to the handler for its site; nil for unsupported or
  # host-less URIs.
  def process_uri(uri)
    host = uri.host&.downcase
    return if host.blank?

    if host_within?(host, "furaffinity.net")
      process_fa(uri)
    elsif host_within?(host, "e621.net")
      process_e621(uri)
    elsif host_within?(host, "twitter.com")
      process_twitter(uri)
    end
  end

  # True when +host+ is +domain+ itself or one of its subdomains. A bare
  # suffix check would also accept unrelated hosts like "notfuraffinity.net".
  def host_within?(host, domain)
    host == domain || host.end_with?(".#{domain}")
  end

  # Maps a FurAffinity URI to a user or post descriptor, using the second
  # path segment as the user name or numeric post id.
  def process_fa(uri)
    segment = path_parts(uri.path)[1]
    return unless segment

    if prefixed_with?(FA_USER_PREFIXES, uri.path)
      { type: :fa_user, url_name: segment.downcase }
    elsif prefixed_with?(FA_POST_PREFIXES, uri.path)
      # Require leading digits: String#to_i would turn "abc" into id 0.
      digits = segment[/\A\d+/]
      { type: :fa_post, fa_id: digits.to_i } if digits
    end
  end

  # e621 links are recognised but currently yield nothing.
  def process_e621(uri)
    nil
  end

  # Maps a Twitter URI to a user descriptor, skipping the ignored hosts and
  # first path segments listed in the TWITTER_IGNORE_* constants.
  def process_twitter(uri)
    return if TWITTER_IGNORE_HOSTS.include?(uri.host.to_s.downcase)

    name = path_parts(uri.path).first
    return if name.nil? || TWITTER_IGNORE_FIRST_PATH_PART.include?(name)

    { type: :twitter_user, name: name }
  end

  # True when +path+ starts with any of +prefixes+.
  def prefixed_with?(prefixes, path)
    prefixes.any? { |prefix| path.start_with?(prefix) }
  end

  # Splits a path on "/" and drops blank segments.
  def path_parts(path)
    path.split("/").reject(&:blank?)
  end
end