sofurry WIP for now, sofurry is down

This commit is contained in:
Dylan Knutson
2025-07-08 00:11:24 +00:00
parent 5a9efa798d
commit 967f78366e
8 changed files with 50 additions and 46 deletions

View File

@@ -1,8 +1,7 @@
source "https://rubygems.org"
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
ruby "3.2.6"
# ruby "3.0.3"
ruby "~> 3.2"
# Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main"
gem "rails", "~> 7.2"

View File

@@ -39,3 +39,9 @@
- [ ] Rich inline links to e621 e.g. https://refurrer.com/posts/fa@60070060
- [ ] Find FaPost records that have favs recorded but no scan / file, and enqueue a scan for them
- [x] Bunch of posts with empty responses: posts = Domain::Post.joins(files: :log_entry).where(files: { http_log_entries: { response_sha256: BlobFile::EMPTY_FILE_SHA256 }}).limit(10)
- [ ] Create GlobalState entries for the last FA ID seen on the browse page; run a periodic scan from the newest FA ID back to the stored one
- [ ] Create GlobalState entries for long-running backfill jobs, and automatically restart those jobs if they fail
- [ ] Add a flag that can be passed to jobs to log HTTP requests / responses to a directory, plus an HTTP mock helper that reads from that directory
- [ ] Fix incorrect IP address for Cloudflare-proxied requests
- [ ] SOCKS5 proxy for additional workers
- [ ] Backup FA scraper using foxbot & g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion

View File

@@ -22,6 +22,12 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
const :gallery_log_entry, T.nilable(HttpLogEntry)
end
# Constructs the job and sets up the per-instance set of URLs that have
# already been requested. `scan_gallery_posts` checks this set before each
# request and stops paginating when it would request the same URL again.
sig { override.params(args: T::Hash[Symbol, T.untyped]).void }
def initialize(*args)
# Zero-argument `super` forwards the received arguments unchanged to the
# parent class initializer.
super
# Starts empty; URLs are added by `scan_gallery_posts` as they are fetched.
@visited_urls = T.let(Set.new, T::Set[String])
end
sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
def perform(args)
user = user_from_args!
@@ -45,29 +51,26 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
media_types_with_posts =
T.let(Set.new, T::Set[Domain::Sofurry::GalleryPageParser::MediaType])
existing_folders =
user
.folders
.filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] }
.to_h
media_types.each do |media_type|
gallery_summary = gather_gallery_summary(user, media_type)
folders = gallery_summary.folders
gallery_posts = gallery_summary.posts
existing_folders =
user
.folders
.filter { |folder| folder.media_type == media_type.serialize }
.filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] }
.to_h
folders.each do |folder|
if (existing = existing_folders[folder.id])
if (existing = existing_folders[folder.sofurry_id])
existing.name = folder.name
else
folder_model =
user.folders.build(
{
sofurry_id: folder.id,
name: folder.name,
media_type: media_type.serialize,
},
{ sofurry_id: folder.sofurry_id, name: folder.name },
)
existing_folders[folder.sofurry_id] = folder_model
end
if gallery_posts.any?
@@ -97,7 +100,6 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
folder_posts =
scan_gallery_posts(
user_id,
T.must(folder.media_type_enum),
existing_posts.keys.to_set,
folder_id: folder.sofurry_id,
)
@@ -212,16 +214,21 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
def scan_gallery_posts(user_id, media_type, existing_post_ids, folder_id: nil)
posts = T.let(Set.new, T::Set[GalleryEntry])
page = 1
seen_post_ids = T.let(Set.new, T::Set[Integer])
loop do
url =
if folder_id
"https://api2.sofurry.com/browse/folder/#{media_type.serialize}" +
"?by=#{user_id}&folder=#{folder_id}&format=json"
"?by=#{user_id}&folder=#{folder_id}&#{media_type.serialize}-page=#{page}&format=json"
else
"https://api2.sofurry.com/browse/user/#{media_type.serialize}" +
"?uid=#{user_id}&#{media_type.serialize}-page=#{page}&format=json"
end
break if @visited_urls.include?(url)
@visited_urls << url
response = http_client.get(url)
if response.status_code != 200
@@ -240,6 +247,9 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
.compact
.to_set
break if post_ids.subset?(seen_post_ids)
seen_post_ids.merge(post_ids)
posts.merge(
posts_json.map do |post_json|
GalleryEntry.new(

View File

@@ -5,7 +5,7 @@ class Domain::Sofurry::GalleryPageParser
class Folder < T::Struct
include T::Struct::ActsAsComparable
const :name, String
const :id, Integer
const :sofurry_id, Integer
end
class MediaType < T::Enum
@@ -20,7 +20,7 @@ class Domain::Sofurry::GalleryPageParser
class ShortGalleryEntry < T::Struct
include T::Struct::ActsAsComparable
const :id, Integer
const :sofurry_id, Integer
const :title, String
end
@@ -46,8 +46,8 @@ class Domain::Sofurry::GalleryPageParser
raise "user_id mismatch: #{user_id} != #{@user_id}"
end
name = elem.text
id = href_params["folder"]&.to_i
Folder.new(name:, id:)
sofurry_id = href_params["folder"]&.to_i
Folder.new(name:, sofurry_id:)
end
end
@@ -60,19 +60,19 @@ class Domain::Sofurry::GalleryPageParser
.css(".sf-browse-shortlist,.sf-browse-shortlist-zebra")
.map do |elem|
title_id = elem.css(".sf-browse-shortlist-title a").first || next
id = title_id&.attr("href").split("/")&.last&.to_i || next
sofurry_id = title_id&.attr("href").split("/")&.last&.to_i || next
title = title_id&.text || next
ShortGalleryEntry.new(id:, title:)
ShortGalleryEntry.new(sofurry_id:, title:)
end
.compact,
@doc
.css("a.sfArtworkSmallInner")
.map do |elem|
id = elem.attr("href").split("/")&.last&.to_i || next
sofurry_id = elem.attr("href").split("/")&.last&.to_i || next
img_elem = elem.css("img.sfArtworkItem").first || next
title =
img_elem.attr("alt")&.strip&.rpartition("|by")&.first || next
ShortGalleryEntry.new(id:, title:)
ShortGalleryEntry.new(sofurry_id:, title:)
end
.compact,
].flatten

View File

@@ -9,17 +9,11 @@ class Domain::PostGroup::SofurryFolder < Domain::PostGroup
attr_json :sofurry_id, :integer
attr_json :name, :string
attr_json :owner_id, :integer
attr_json :media_types, :string, array: true
attr_json_due_timestamp :scanned_folder_at, 3.months
belongs_to :owner, class_name: "::Domain::User::SofurryUser", optional: true
attr_json_enum(
:media_type,
:integer,
{ stories: 1, art: 2, music: 3, journals: 4, photos: 5 },
prefix: true,
)
sig { override.returns([String, Symbol]) }
def self.param_prefix_and_attribute
["sf", :sofurry_folder_id]
@@ -32,11 +26,4 @@ class Domain::PostGroup::SofurryFolder < Domain::PostGroup
"https://www.sofurry.com/browse/folder/#{type}?by=#{owner_id}&folder=#{sofurry_id}"
end
end
sig { returns(T.nilable(Domain::Sofurry::GalleryPageParser::MediaType)) }
def media_type_enum
Domain::Sofurry::GalleryPageParser::MediaType.try_deserialize(
self.media_type,
)
end
end

View File

@@ -2,6 +2,8 @@
class Domain::UserJobEvent::AddTrackedObject < Domain::UserJobEvent
self.table_name = "domain_user_job_event_add_tracked_objects"
belongs_to :log_entry, class_name: "HttpLogEntry"
validates :log_entry_id, uniqueness: { scope: :kind }
attr_json :requested_at, ActiveModelUtcTimeValue.new
enum :kind, %i[favs followed_by following gallery]

View File

@@ -52,7 +52,7 @@
"@types/lodash": "^4.14.192",
"@types/react": "^18.0.33",
"env-cmd": "^10.1.0",
"prettier": "^3.5.2",
"prettier": "^3.6.2",
"prettier-plugin-tailwindcss": "^0.6.9",
"react-refresh": "^0.14.0",
"typescript": "^5.0.3",

View File

@@ -4025,10 +4025,10 @@ prettier-plugin-tailwindcss@^0.6.9:
resolved "https://registry.npmjs.org/prettier-plugin-tailwindcss/-/prettier-plugin-tailwindcss-0.6.9.tgz"
integrity sha512-r0i3uhaZAXYP0At5xGfJH876W3HHGHDp+LCRUJrs57PBeQ6mYHMwr25KH8NPX44F2yGTvdnH7OqCshlQx183Eg==
prettier@^3.5.2:
version "3.5.2"
resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.5.2.tgz#d066c6053200da0234bf8fa1ef45168abed8b914"
integrity sha512-lc6npv5PH7hVqozBR7lkBNOGXV9vMwROAPlumdBkX0wTbbzPu/U1hk5yL8p2pt4Xoc+2mkT8t/sow2YrV/M5qg==
prettier@^3.6.2:
version "3.6.2"
resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.6.2.tgz#ccda02a1003ebbb2bfda6f83a074978f608b9393"
integrity sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==
process-nextick-args@~2.0.0:
version "2.0.1"