sofurry WIP for now, sofurry is down

This commit is contained in:
Dylan Knutson
2025-07-08 00:11:24 +00:00
parent 5a9efa798d
commit 967f78366e
8 changed files with 50 additions and 46 deletions

View File

@@ -1,8 +1,7 @@
source "https://rubygems.org" source "https://rubygems.org"
git_source(:github) { |repo| "https://github.com/#{repo}.git" } git_source(:github) { |repo| "https://github.com/#{repo}.git" }
ruby "3.2.6" ruby "~> 3.2"
# ruby "3.0.3"
# Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main" # Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main"
gem "rails", "~> 7.2" gem "rails", "~> 7.2"

View File

@@ -39,3 +39,9 @@
- [ ] Rich inline links to e621 e.g. https://refurrer.com/posts/fa@60070060 - [ ] Rich inline links to e621 e.g. https://refurrer.com/posts/fa@60070060
- [ ] Find FaPost that have favs recorded but no scan / file, enqueue scan - [ ] Find FaPost that have favs recorded but no scan / file, enqueue scan
- [x] Bunch of posts with empty responses: posts = Domain::Post.joins(files: :log_entry).where(files: { http_log_entries: { response_sha256: BlobFile::EMPTY_FILE_SHA256 }}).limit(10) - [x] Bunch of posts with empty responses: posts = Domain::Post.joins(files: :log_entry).where(files: { http_log_entries: { response_sha256: BlobFile::EMPTY_FILE_SHA256 }}).limit(10)
- [ ] Create GlobalState entries for the last FA ID on the browse page; add a periodic scan that scans from the newest FA ID to the stored one
- [ ] GlobalState entries for long-running backfill jobs; automatically restart them if they fail
- [ ] Flag to pass to jobs to log HTTP requests / responses to a directory, HTTP mock helper to read from that directory
- [ ] Fix incorrect IP address for Cloudflare-proxied requests
- [ ] SOCKS5 proxy for additional workers
- [ ] Backup FA scraper using foxbot & g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion

View File

@@ -22,6 +22,12 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
const :gallery_log_entry, T.nilable(HttpLogEntry) const :gallery_log_entry, T.nilable(HttpLogEntry)
end end
# Sets up per-job state: runs the parent initializer, then starts with an
# empty set of visited URLs. scan_gallery_posts consults this set and stops
# paginating when it would request a URL that was already recorded.
# NOTE(review): the sig types `args` as a Hash while the method takes a
# splat (`*args`) — confirm this matches the parent signature.
sig { override.params(args: T::Hash[Symbol, T.untyped]).void }
def initialize(*args)
super
@visited_urls = T.let(Set.new, T::Set[String])
end
sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) } sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) }
def perform(args) def perform(args)
user = user_from_args! user = user_from_args!
@@ -45,29 +51,26 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
media_types_with_posts = media_types_with_posts =
T.let(Set.new, T::Set[Domain::Sofurry::GalleryPageParser::MediaType]) T.let(Set.new, T::Set[Domain::Sofurry::GalleryPageParser::MediaType])
existing_folders =
user
.folders
.filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] }
.to_h
media_types.each do |media_type| media_types.each do |media_type|
gallery_summary = gather_gallery_summary(user, media_type) gallery_summary = gather_gallery_summary(user, media_type)
folders = gallery_summary.folders folders = gallery_summary.folders
gallery_posts = gallery_summary.posts gallery_posts = gallery_summary.posts
existing_folders =
user
.folders
.filter { |folder| folder.media_type == media_type.serialize }
.filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] }
.to_h
folders.each do |folder| folders.each do |folder|
if (existing = existing_folders[folder.id]) if (existing = existing_folders[folder.sofurry_id])
existing.name = folder.name existing.name = folder.name
else else
folder_model =
user.folders.build( user.folders.build(
{ { sofurry_id: folder.sofurry_id, name: folder.name },
sofurry_id: folder.id,
name: folder.name,
media_type: media_type.serialize,
},
) )
existing_folders[folder.sofurry_id] = folder_model
end end
if gallery_posts.any? if gallery_posts.any?
@@ -97,7 +100,6 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
folder_posts = folder_posts =
scan_gallery_posts( scan_gallery_posts(
user_id, user_id,
T.must(folder.media_type_enum),
existing_posts.keys.to_set, existing_posts.keys.to_set,
folder_id: folder.sofurry_id, folder_id: folder.sofurry_id,
) )
@@ -212,16 +214,21 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
def scan_gallery_posts(user_id, media_type, existing_post_ids, folder_id: nil) def scan_gallery_posts(user_id, media_type, existing_post_ids, folder_id: nil)
posts = T.let(Set.new, T::Set[GalleryEntry]) posts = T.let(Set.new, T::Set[GalleryEntry])
page = 1 page = 1
seen_post_ids = T.let(Set.new, T::Set[Integer])
loop do loop do
url = url =
if folder_id if folder_id
"https://api2.sofurry.com/browse/folder/#{media_type.serialize}" + "https://api2.sofurry.com/browse/folder/#{media_type.serialize}" +
"?by=#{user_id}&folder=#{folder_id}&format=json" "?by=#{user_id}&folder=#{folder_id}&#{media_type.serialize}-page=#{page}&format=json"
else else
"https://api2.sofurry.com/browse/user/#{media_type.serialize}" + "https://api2.sofurry.com/browse/user/#{media_type.serialize}" +
"?uid=#{user_id}&#{media_type.serialize}-page=#{page}&format=json" "?uid=#{user_id}&#{media_type.serialize}-page=#{page}&format=json"
end end
break if @visited_urls.include?(url)
@visited_urls << url
response = http_client.get(url) response = http_client.get(url)
if response.status_code != 200 if response.status_code != 200
@@ -240,6 +247,9 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base
.compact .compact
.to_set .to_set
break if post_ids.subset?(seen_post_ids)
seen_post_ids.merge(post_ids)
posts.merge( posts.merge(
posts_json.map do |post_json| posts_json.map do |post_json|
GalleryEntry.new( GalleryEntry.new(

View File

@@ -5,7 +5,7 @@ class Domain::Sofurry::GalleryPageParser
class Folder < T::Struct class Folder < T::Struct
include T::Struct::ActsAsComparable include T::Struct::ActsAsComparable
const :name, String const :name, String
const :id, Integer const :sofurry_id, Integer
end end
class MediaType < T::Enum class MediaType < T::Enum
@@ -20,7 +20,7 @@ class Domain::Sofurry::GalleryPageParser
class ShortGalleryEntry < T::Struct class ShortGalleryEntry < T::Struct
include T::Struct::ActsAsComparable include T::Struct::ActsAsComparable
const :id, Integer const :sofurry_id, Integer
const :title, String const :title, String
end end
@@ -46,8 +46,8 @@ class Domain::Sofurry::GalleryPageParser
raise "user_id mismatch: #{user_id} != #{@user_id}" raise "user_id mismatch: #{user_id} != #{@user_id}"
end end
name = elem.text name = elem.text
id = href_params["folder"]&.to_i sofurry_id = href_params["folder"]&.to_i
Folder.new(name:, id:) Folder.new(name:, sofurry_id:)
end end
end end
@@ -60,19 +60,19 @@ class Domain::Sofurry::GalleryPageParser
.css(".sf-browse-shortlist,.sf-browse-shortlist-zebra") .css(".sf-browse-shortlist,.sf-browse-shortlist-zebra")
.map do |elem| .map do |elem|
title_id = elem.css(".sf-browse-shortlist-title a").first || next title_id = elem.css(".sf-browse-shortlist-title a").first || next
id = title_id&.attr("href").split("/")&.last&.to_i || next sofurry_id = title_id&.attr("href").split("/")&.last&.to_i || next
title = title_id&.text || next title = title_id&.text || next
ShortGalleryEntry.new(id:, title:) ShortGalleryEntry.new(sofurry_id:, title:)
end end
.compact, .compact,
@doc @doc
.css("a.sfArtworkSmallInner") .css("a.sfArtworkSmallInner")
.map do |elem| .map do |elem|
id = elem.attr("href").split("/")&.last&.to_i || next sofurry_id = elem.attr("href").split("/")&.last&.to_i || next
img_elem = elem.css("img.sfArtworkItem").first || next img_elem = elem.css("img.sfArtworkItem").first || next
title = title =
img_elem.attr("alt")&.strip&.rpartition("|by")&.first || next img_elem.attr("alt")&.strip&.rpartition("|by")&.first || next
ShortGalleryEntry.new(id:, title:) ShortGalleryEntry.new(sofurry_id:, title:)
end end
.compact, .compact,
].flatten ].flatten

View File

@@ -9,17 +9,11 @@ class Domain::PostGroup::SofurryFolder < Domain::PostGroup
attr_json :sofurry_id, :integer attr_json :sofurry_id, :integer
attr_json :name, :string attr_json :name, :string
attr_json :owner_id, :integer attr_json :owner_id, :integer
attr_json :media_types, :string, array: true
attr_json_due_timestamp :scanned_folder_at, 3.months attr_json_due_timestamp :scanned_folder_at, 3.months
belongs_to :owner, class_name: "::Domain::User::SofurryUser", optional: true belongs_to :owner, class_name: "::Domain::User::SofurryUser", optional: true
attr_json_enum(
:media_type,
:integer,
{ stories: 1, art: 2, music: 3, journals: 4, photos: 5 },
prefix: true,
)
sig { override.returns([String, Symbol]) } sig { override.returns([String, Symbol]) }
def self.param_prefix_and_attribute def self.param_prefix_and_attribute
["sf", :sofurry_folder_id] ["sf", :sofurry_folder_id]
@@ -32,11 +26,4 @@ class Domain::PostGroup::SofurryFolder < Domain::PostGroup
"https://www.sofurry.com/browse/folder/#{type}?by=#{owner_id}&folder=#{sofurry_id}" "https://www.sofurry.com/browse/folder/#{type}?by=#{owner_id}&folder=#{sofurry_id}"
end end
end end
sig { returns(T.nilable(Domain::Sofurry::GalleryPageParser::MediaType)) }
def media_type_enum
Domain::Sofurry::GalleryPageParser::MediaType.try_deserialize(
self.media_type,
)
end
end end

View File

@@ -2,6 +2,8 @@
class Domain::UserJobEvent::AddTrackedObject < Domain::UserJobEvent class Domain::UserJobEvent::AddTrackedObject < Domain::UserJobEvent
self.table_name = "domain_user_job_event_add_tracked_objects" self.table_name = "domain_user_job_event_add_tracked_objects"
belongs_to :log_entry, class_name: "HttpLogEntry" belongs_to :log_entry, class_name: "HttpLogEntry"
validates :log_entry_id, uniqueness: { scope: :kind }
attr_json :requested_at, ActiveModelUtcTimeValue.new attr_json :requested_at, ActiveModelUtcTimeValue.new
enum :kind, %i[favs followed_by following gallery] enum :kind, %i[favs followed_by following gallery]

View File

@@ -52,7 +52,7 @@
"@types/lodash": "^4.14.192", "@types/lodash": "^4.14.192",
"@types/react": "^18.0.33", "@types/react": "^18.0.33",
"env-cmd": "^10.1.0", "env-cmd": "^10.1.0",
"prettier": "^3.5.2", "prettier": "^3.6.2",
"prettier-plugin-tailwindcss": "^0.6.9", "prettier-plugin-tailwindcss": "^0.6.9",
"react-refresh": "^0.14.0", "react-refresh": "^0.14.0",
"typescript": "^5.0.3", "typescript": "^5.0.3",

View File

@@ -4025,10 +4025,10 @@ prettier-plugin-tailwindcss@^0.6.9:
resolved "https://registry.npmjs.org/prettier-plugin-tailwindcss/-/prettier-plugin-tailwindcss-0.6.9.tgz" resolved "https://registry.npmjs.org/prettier-plugin-tailwindcss/-/prettier-plugin-tailwindcss-0.6.9.tgz"
integrity sha512-r0i3uhaZAXYP0At5xGfJH876W3HHGHDp+LCRUJrs57PBeQ6mYHMwr25KH8NPX44F2yGTvdnH7OqCshlQx183Eg== integrity sha512-r0i3uhaZAXYP0At5xGfJH876W3HHGHDp+LCRUJrs57PBeQ6mYHMwr25KH8NPX44F2yGTvdnH7OqCshlQx183Eg==
prettier@^3.5.2: prettier@^3.6.2:
version "3.5.2" version "3.6.2"
resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.5.2.tgz#d066c6053200da0234bf8fa1ef45168abed8b914" resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.6.2.tgz#ccda02a1003ebbb2bfda6f83a074978f608b9393"
integrity sha512-lc6npv5PH7hVqozBR7lkBNOGXV9vMwROAPlumdBkX0wTbbzPu/U1hk5yL8p2pt4Xoc+2mkT8t/sow2YrV/M5qg== integrity sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==
process-nextick-args@~2.0.0: process-nextick-args@~2.0.0:
version "2.0.1" version "2.0.1"