Files
redux-scraper/spec/scraper/link_finder_spec.rb
2025-02-25 05:47:44 +00:00

122 lines
4.8 KiB
Ruby

# typed: false
require "rails_helper"
describe Scraper::LinkFinder do
# Runs Scraper::LinkFinder over +document+ for the given +origin_host+ and
# returns whatever it finds, passed through the #sort helper.
def find_links(origin_host, document)
  finder = Scraper::LinkFinder.new(origin_host, document)
  sort(finder.find_links)
end
# De-duplicates +links+ and orders the result by each element's string
# representation (+to_s+). The input collection is not modified.
def sort(links)
  unique_links = links.uniq
  unique_links.sort_by(&:to_s)
end
it "finds links on FA user pages" do
  # Fixture file read from disk for this example (the path suggests a saved
  # FA user page).
  fixture_path =
    "test/fixtures/files/domain/fa/user_page/user_page_angelpawqt.html"
  links = find_links("www.furaffinity.net", File.read(fixture_path))

  # Spot-check one found link of each kind rather than the full result set.
  expect(links).to include(
    Scraper::LinkFinder::FoundLink::FaPost.new(fa_id: 54_212_636),
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "8bitstarshon1"),
    Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "furaffinity"),
  )

  # NOTE(review): the entries below look like a leftover from an earlier
  # exhaustive assertion over this fixture. They are NOT checked by this
  # example; confirm them against the fixture before re-enabling any.
  # { type: :fa_post, fa_id: 44_848_118 },
  # { type: :fa_post, fa_id: 50_619_594 },
  # { type: :fa_post, fa_id: 51_321_948 },
  # { type: :fa_post, fa_id: 51_379_769 },
  # { type: :fa_post, fa_id: 52_782_367 },
  # { type: :fa_post, fa_id: 54_212_636 },
  # { type: :fa_post, fa_id: 54_717_098 },
  # { type: :fa_post, fa_id: 57_492_887 },
  # { type: :fa_post, fa_id: 57_567_803 },
  # { type: :fa_post, fa_id: 57_736_082 },
  # { type: :fa_post, fa_id: 57_744_156 },
  # { type: :fa_post, fa_id: 57_889_649 },
  # { type: :fa_post, fa_id: 57_898_215 },
  # { type: :fa_post, fa_id: 58_080_112 },
  # { type: :fa_post, fa_id: 58_196_925 },
  # { type: :fa_post, fa_id: 58_233_811 },
  # { type: :fa_post, fa_id: 59_515_926 },
  # { type: :fa_user, url_name: "8bitstarshon1" },
  # { type: :fa_user, url_name: "angelpawqt" },
  # { type: :fa_user, url_name: "arilace" },
  # { type: :fa_user, url_name: "chamomilearts" },
  # { type: :fa_user, url_name: "cheshirkas" },
  # { type: :fa_user, url_name: "cinnabunnyart" },
  # { type: :fa_user, url_name: "cloudsen" },
  # { type: :fa_user, url_name: "eyeswings" },
  # { type: :fa_user, url_name: "fender" },
  # { type: :fa_user, url_name: "feztiz" },
  # { type: :fa_user, url_name: "grau" },
  # { type: :fa_user, url_name: "huibiyyy" },
  # { type: :fa_user, url_name: "hyongatayro" },
  # { type: :fa_user, url_name: "i-luv-tentacles" },
  # { type: :fa_user, url_name: "innoinatamm" },
  # { type: :fa_user, url_name: "likara" },
  # { type: :fa_user, url_name: "limoncella" },
  # { type: :fa_user, url_name: "nacpanylis" },
  # { type: :fa_user, url_name: "oseledetts" },
  # { type: :fa_user, url_name: "pinxi" },
  # { type: :fa_user, url_name: "plazmafox" },
  # { type: :fa_user, url_name: "rosebyki1" },
  # { type: :fa_user, url_name: "rosebyki2" },
  # { type: :fa_user, url_name: "samarthanie" },
  # { type: :fa_user, url_name: "sekaitoroto" },
  # { type: :fa_user, url_name: "snakeki" },
  # { type: :fa_user, url_name: "tailung" },
  # { type: :fa_user, url_name: "tentastic" },
  # { type: :fa_user, url_name: "toks" },
  # { type: :fa_user, url_name: "vestel" },
  # { type: :fa_user, url_name: "yabewenyx" },
  # { type: :fa_user, url_name: "yoyikori" },
  # { type: :fa_user, url_name: "zzreg" },
  # { type: :fa_user, url_name: "~fikanori" },
  # { type: :twitter_user, name: "furaffinity" },
end
it "finds links on pages from furaffinity" do
  # The document mentions users three ways: a relative href ("/user/foobar"),
  # a bare furaffinity.net URL in the text, and a bare twitter.com URL in the
  # text.
  document = <<~HTML
    Hello, world! This is my document. <a href="/user/foobar">Foobar</a>
    visit me at www.furaffinity.net/user/baz or twitter.com/foobartw
  HTML
  links = find_links("www.furaffinity.net", document)

  expect(links).to include(
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "foobar"),
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "baz"),
    Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "foobartw"),
  )
end
it "ignores raw links ending in an ellipsis" do
  # Links from twitter may be truncated, so we should ignore those which end
  # in an ellipsis.
  #
  # NOTE(review): as it appears here, the anchor text below contains no
  # ellipsis character -- confirm a trailing "…" was not lost from this
  # fixture, otherwise the example does not exercise what its name says.
  # NOTE(review): the origin host is "twitter.co", not "twitter.com" --
  # confirm this is intentional.
  document = <<~HTML
    <a dir="ltr" href="https://t.co/U66eFH987e" rel="noopener noreferrer nofollow" target="_blank" role="none" class="css-4rbku5 css-18t94o4 css-901oao css-16my406 r-1loqt21 r-4qtqp9 r-poiln3 r-1b7u577 r-bcqeeo r-qvutc0" data-testid="UserUrl" style="color: rgb(29, 155, 240);">
    <span class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0">furaffinity.net/user/draconics</span>
    </a>
  HTML

  # No links at all are expected. Written as an RSpec expectation (rather
  # than Minitest's assert_equal) to match the other examples in this spec.
  expect(find_links("twitter.co", document)).to eq([])
end
it "downcases fa url names correctly" do
  # The raw URL carries the mixed-case name "Some-User"; the found link is
  # expected to carry the lower-cased url_name.
  mixed_case_document = <<~HTML
    www.furaffinity.net/user/Some-User
  HTML
  expected_user =
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "some-user")

  found = find_links("www.furaffinity.net", mixed_case_document)
  expect(found).to include(expected_user)
end
end