Fix posted-date parsing for legacy scanned posts (strip ordinal day suffixes and return a DateTime)

This commit is contained in:
Dylan Knutson
2024-12-28 17:33:29 +00:00
parent 652463a273
commit b1b8a6c780
3 changed files with 25 additions and 19 deletions

View File

@@ -114,7 +114,13 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
when VERSION_0, VERSION_1
idx = elem_idx_after_text_match(info_children, /Posted/)
child = info_children[idx..idx + 5].find { |ic| ic.name == "span" }
child.try(:[], "title").try(:strip)
date_str = child.try(:[], "title").try(:strip)
if date_str
DateTime.strptime(
date_str.gsub(/(\d+)(st|nd|rd|th)/, '\1'),
"%b %d, %Y %I:%M %p",
)
end
when VERSION_2
date_str = @elem.css(".popup_date").first["title"]
# e.g. 'Feb 18, 2023 06:47 PM'
@@ -246,7 +252,7 @@ class Domain::Fa::Parser::SubmissionParserHelper < Domain::Fa::Parser::Base
case @page_version
when VERSION_0
elem_after_text_match(info_children, /Resolution/).try(:text).try(
:strip
:strip,
)
when VERSION_1
idx = elem_idx_after_text_match(info_children, /Resolution/)

View File

@@ -8,13 +8,13 @@ class Domain::Fa::PostFactorCalculator
end
def fit
limit = 100_000_000
limit = 10_000_000
dataset =
measure(
->(r) do
proc do |r|
r && "loaded #{r.length.to_s.bold} favs" ||
"loading up to #{limit} favs"
end
"loading up to #{limit.to_s.bold} favs"
end,
) { Domain::Fa::Fav.all.limit(limit).pluck(:user_id, :post_id).to_a }
measure("convert to hash") do
@@ -51,13 +51,13 @@ class Domain::Fa::PostFactorCalculator
.each_slice(20_000) do |chunk|
total += chunk.size
measure(
" -> wrote chunk of #{chunk.size.to_s.bold} - (#{total.to_s.bold} total)"
" -> wrote chunk of #{chunk.size.to_s.bold} - (#{total.to_s.bold} total)",
) do
Domain::Fa::PostFactor.upsert_all(
chunk,
unique_by: :post_id,
update_only: factors_col_name,
returning: :id
returning: :id,
)
end
end

View File

@@ -115,20 +115,20 @@ describe Domain::Fa::Parser::Page do
assert_equal(
{
href: "/gallery/feretta/folder/60236/Tale-of-Tails",
title: "Tale of Tails"
title: "Tale of Tails",
},
folders[1]
folders[1],
)
assert_equal(
{
href: "/gallery/feretta/folder/60234/Illustrations",
title: "Illustrations"
title: "Illustrations",
},
folders[2]
folders[2],
)
assert_equal(
{ href: "/gallery/feretta/folder/229520/Marketing", title: "Marketing" },
folders.last
folders.last,
)
end
@@ -177,7 +177,7 @@ describe Domain::Fa::Parser::Page do
assert_equal "Miles-DF", sub.artist
assert_equal "//d.facdn.net/art/miles-df/1455889648/1455889648.miles-df_miles-df_stream.jpg",
sub.full_res_img
assert_equal "Feb 19th, 2016 08:47 AM", sub.posted_date
assert_equal DateTime.parse("Feb 19, 2016 08:47 AM"), sub.posted_date
assert_equal "All", sub.category
assert_equal "All", sub.theme
assert_equal "Unspecified / Any", sub.species
@@ -201,7 +201,7 @@ describe Domain::Fa::Parser::Page do
assert_equal 82, sub.num_views
assert_equal "Male", sub.gender
assert_equal "Dragon (Other)", sub.species
assert_equal "Feb 24th, 2016 12:51 AM", sub.posted_date
assert_equal DateTime.parse("Feb 24, 2016 12:51 AM"), sub.posted_date
assert_equal %w[kobold frog hunting male frog mighty hunter],
sub.keywords_array
end
@@ -219,7 +219,7 @@ describe Domain::Fa::Parser::Page do
assert sub.description_html =~ /Forgot to put this up/
assert_equal "//d.facdn.net/art/feretta/1479650817/1479650802.feretta_161004_space_vixen_hop_desktop.jpg",
sub.full_res_img
assert_equal "Nov 20th, 2016 09:06 AM", sub.posted_date
assert_equal DateTime.parse("Nov 20, 2016 09:06 AM"), sub.posted_date
assert_equal "Artwork (Digital)", sub.category
assert_equal "Miscellaneous", sub.theme
assert_equal "Canid - Vulpine", sub.species
@@ -247,14 +247,14 @@ describe Domain::Fa::Parser::Page do
assert_equal({ href: "/scraps/salkitten/", title: "Scraps" }, folders[0])
assert_equal(
{ href: "/gallery/salkitten/folder/51051/7up-Pup", title: "7up Pup" },
folders[2]
folders[2],
)
assert_equal(
{
href: "/gallery/salkitten/folder/93190/Animations",
title: "Animations"
title: "Animations",
},
folders.last
folders.last,
)
end