# typed: false

require "rails_helper"
|
|
|
|
describe Scraper::LinkFinder do
|
|
# Runs Scraper::LinkFinder over +document+ for the given +origin_host+ and
# returns the links it reports, normalized through #sort so examples can
# compare results without caring about discovery order or duplicates.
def find_links(origin_host, document)
  finder = Scraper::LinkFinder.new(origin_host, document)
  sort(finder.find_links)
end
|
|
|
|
# Normalizes a list of links for comparison in examples.
#
# @param links [Array] link objects (anything responding to #to_s)
# @return [Array] a new array with duplicates removed, ordered by each
#   element's string representation; the input array is not modified
def sort(links)
  links.uniq.sort_by(&:to_s)
end
|
|
|
|
it "finds links on FA user pages" do
  # Anchor the fixture at the application root so the example does not depend
  # on the directory rspec happens to be launched from.
  fixture_path =
    Rails.root.join(
      "test/fixtures/files/domain/fa/user_page/user_page_angelpawqt.html",
    )
  document = File.read(fixture_path)
  links = find_links("www.furaffinity.net", document)

  # Spot-check one link of each kind found on the saved user page.
  expect(links).to include(
    Scraper::LinkFinder::FoundLink::FaPost.new(fa_id: 54_212_636),
  )
  expect(links).to include(
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "8bitstarshon1"),
  )
  expect(links).to include(
    Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "furaffinity"),
  )

  # NOTE(review): the commented-out list below looks like an older, exhaustive
  # expectation for this fixture written in hash form (its opening call is
  # gone, only the closing paren remains). Kept for reference - confirm
  # whether it should be restored as FoundLink objects or removed.
  # { type: :fa_post, fa_id: 44_848_118 },
  # { type: :fa_post, fa_id: 50_619_594 },
  # { type: :fa_post, fa_id: 51_321_948 },
  # { type: :fa_post, fa_id: 51_379_769 },
  # { type: :fa_post, fa_id: 52_782_367 },
  # { type: :fa_post, fa_id: 54_212_636 },
  # { type: :fa_post, fa_id: 54_717_098 },
  # { type: :fa_post, fa_id: 57_492_887 },
  # { type: :fa_post, fa_id: 57_567_803 },
  # { type: :fa_post, fa_id: 57_736_082 },
  # { type: :fa_post, fa_id: 57_744_156 },
  # { type: :fa_post, fa_id: 57_889_649 },
  # { type: :fa_post, fa_id: 57_898_215 },
  # { type: :fa_post, fa_id: 58_080_112 },
  # { type: :fa_post, fa_id: 58_196_925 },
  # { type: :fa_post, fa_id: 58_233_811 },
  # { type: :fa_post, fa_id: 59_515_926 },
  # { type: :fa_user, url_name: "8bitstarshon1" },
  # { type: :fa_user, url_name: "angelpawqt" },
  # { type: :fa_user, url_name: "arilace" },
  # { type: :fa_user, url_name: "chamomilearts" },
  # { type: :fa_user, url_name: "cheshirkas" },
  # { type: :fa_user, url_name: "cinnabunnyart" },
  # { type: :fa_user, url_name: "cloudsen" },
  # { type: :fa_user, url_name: "eyeswings" },
  # { type: :fa_user, url_name: "fender" },
  # { type: :fa_user, url_name: "feztiz" },
  # { type: :fa_user, url_name: "grau" },
  # { type: :fa_user, url_name: "huibiyyy" },
  # { type: :fa_user, url_name: "hyongatayro" },
  # { type: :fa_user, url_name: "i-luv-tentacles" },
  # { type: :fa_user, url_name: "innoinatamm" },
  # { type: :fa_user, url_name: "likara" },
  # { type: :fa_user, url_name: "limoncella" },
  # { type: :fa_user, url_name: "nacpanylis" },
  # { type: :fa_user, url_name: "oseledetts" },
  # { type: :fa_user, url_name: "pinxi" },
  # { type: :fa_user, url_name: "plazmafox" },
  # { type: :fa_user, url_name: "rosebyki1" },
  # { type: :fa_user, url_name: "rosebyki2" },
  # { type: :fa_user, url_name: "samarthanie" },
  # { type: :fa_user, url_name: "sekaitoroto" },
  # { type: :fa_user, url_name: "snakeki" },
  # { type: :fa_user, url_name: "tailung" },
  # { type: :fa_user, url_name: "tentastic" },
  # { type: :fa_user, url_name: "toks" },
  # { type: :fa_user, url_name: "vestel" },
  # { type: :fa_user, url_name: "yabewenyx" },
  # { type: :fa_user, url_name: "yoyikori" },
  # { type: :fa_user, url_name: "zzreg" },
  # { type: :fa_user, url_name: "~fikanori" },
  # { type: :twitter_user, name: "furaffinity" },
  # )
end
|
|
|
|
it "finds links on pages from furaffinity" do
  # One anchor with a site-relative href plus two bare, scheme-less URLs in
  # the surrounding text.
  html = <<-HTML
    Hello, world! This is my document. <a href="/user/foobar">Foobar</a>
    visit me at www.furaffinity.net/user/baz or twitter.com/foobartw
  HTML

  found = find_links("www.furaffinity.net", html)

  # All three links must be present; `include` with several arguments
  # requires every one of them.
  expect(found).to include(
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "foobar"),
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "baz"),
    Scraper::LinkFinder::FoundLink::TwitterUser.new(name: "foobartw"),
  )
end
|
|
|
|
it "ignores raw links ending in an ellipsis" do
  # Links copied from Twitter may be truncated for display, so raw link text
  # that ends in an ellipsis should be ignored rather than reported.
  document = <<-HTML
    <a dir="ltr" href="https://t.co/U66eFH987e" rel="noopener noreferrer nofollow" target="_blank" role="none" class="css-4rbku5 css-18t94o4 css-901oao css-16my406 r-1loqt21 r-4qtqp9 r-poiln3 r-1b7u577 r-bcqeeo r-qvutc0" data-testid="UserUrl" style="color: rgb(29, 155, 240);">
      <span class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0">furaffinity.net/user/draconics…</span>
    </a>
  HTML

  # NOTE(review): the origin host "twitter.co" may be a typo for
  # "twitter.com" - left unchanged here; confirm against the intent.
  # Uses an RSpec expectation (rather than Minitest's assert_equal) to match
  # the other examples in this group.
  expect(find_links("twitter.co", document)).to eq([])
end
|
|
|
|
it "downcases fa url names correctly" do
  # The raw URL carries a mixed-case user name ...
  html = <<-HTML
    www.furaffinity.net/user/Some-User
  HTML

  # ... while the link reported for it is expected to use the lower-cased
  # url_name.
  expected =
    Scraper::LinkFinder::FoundLink::FaUser.new(url_name: "some-user")

  found = find_links("www.furaffinity.net", html)
  expect(found).to include(expected)
end
|
|
end
|