From 967f78366ef730151085dd43df987fcae90be4c0 Mon Sep 17 00:00:00 2001 From: Dylan Knutson Date: Tue, 8 Jul 2025 00:11:24 +0000 Subject: [PATCH] sofurry WIP for now, sofurry is down --- Gemfile | 3 +- TODO.md | 6 +++ .../domain/sofurry/job/scan_gallery_job.rb | 44 ++++++++++++------- app/lib/domain/sofurry/gallery_page_parser.rb | 16 +++---- .../domain/post_group/sofurry_folder.rb | 15 +------ .../user_job_event/add_tracked_object.rb | 2 + package.json | 2 +- yarn.lock | 8 ++-- 8 files changed, 50 insertions(+), 46 deletions(-) diff --git a/Gemfile b/Gemfile index 3fc574a1..c1cf5de1 100644 --- a/Gemfile +++ b/Gemfile @@ -1,8 +1,7 @@ source "https://rubygems.org" git_source(:github) { |repo| "https://github.com/#{repo}.git" } -ruby "3.2.6" -# ruby "3.0.3" +ruby "~> 3.2" # Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main" gem "rails", "~> 7.2" diff --git a/TODO.md b/TODO.md index 39b9c252..228adbcb 100644 --- a/TODO.md +++ b/TODO.md @@ -39,3 +39,9 @@ - [ ] Rich inline links to e621 e.g. https://refurrer.com/posts/fa@60070060 - [ ] Find FaPost that have favs recorded but no scan / file, enqueue scan - [x] Bunch of posts with empty responses: posts = Domain::Post.joins(files: :log_entry).where(files: { http_log_entries: { response_sha256: BlobFile::EMPTY_FILE_SHA256 }}).limit(10) +- [ ] Create GlobalState entries for last FA id on browse page, periodic scan to scan from the newest FA ID to the stored one +- [ ] GlobalState entries for long running backfill jobs, automatically restart them if they fail +- [ ] Flag to pass to jobs to log HTTP requests / responses to a directory, HTTP mock helper to read from that directory +- [ ] fix IP address incorrect for Cloudflare proxied requests +- [ ] SOCKS5 proxy for additional workers +- [ ] Backup FA scraper using foxbot & g6jy5jkx466lrqojcngbnksugrcfxsl562bzuikrka5rv7srgguqbjid.onion diff --git a/app/jobs/domain/sofurry/job/scan_gallery_job.rb b/app/jobs/domain/sofurry/job/scan_gallery_job.rb index 26a76431..3772765b 100644 --- a/app/jobs/domain/sofurry/job/scan_gallery_job.rb +++ b/app/jobs/domain/sofurry/job/scan_gallery_job.rb @@ -22,6 +22,12 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base const :gallery_log_entry, T.nilable(HttpLogEntry) end + sig { override.params(args: T::Hash[Symbol, T.untyped]).void } + def initialize(*args) + super + @visited_urls = T.let(Set.new, T::Set[String]) + end + sig { override.params(args: T::Hash[Symbol, T.untyped]).returns(T.untyped) } def perform(args) user = user_from_args! @@ -45,29 +51,26 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base media_types_with_posts = T.let(Set.new, T::Set[Domain::Sofurry::GalleryPageParser::MediaType]) + existing_folders = + user + .folders + .filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] } + .to_h + media_types.each do |media_type| gallery_summary = gather_gallery_summary(user, media_type) folders = gallery_summary.folders gallery_posts = gallery_summary.posts - existing_folders = - user - .folders - .filter { |folder| folder.media_type == media_type.serialize } - .filter_map { |folder| (sfid = folder.sofurry_id) && [sfid, folder] } - .to_h - folders.each do |folder| - if (existing = existing_folders[folder.id]) + if (existing = existing_folders[folder.sofurry_id]) existing.name = folder.name else - user.folders.build( - { - sofurry_id: folder.id, - name: folder.name, - media_type: media_type.serialize, - }, - ) + folder_model = + user.folders.build( + { sofurry_id: folder.sofurry_id, name: folder.name }, + ) + existing_folders[folder.sofurry_id] = folder_model end if gallery_posts.any? @@ -97,7 +100,6 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base folder_posts = scan_gallery_posts( user_id, - T.must(folder.media_type_enum), existing_posts.keys.to_set, folder_id: folder.sofurry_id, ) @@ -212,16 +214,21 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base def scan_gallery_posts(user_id, media_type, existing_post_ids, folder_id: nil) posts = T.let(Set.new, T::Set[GalleryEntry]) page = 1 + seen_post_ids = T.let(Set.new, T::Set[Integer]) loop do url = if folder_id "https://api2.sofurry.com/browse/folder/#{media_type.serialize}" + - "?by=#{user_id}&folder=#{folder_id}&format=json" + "?by=#{user_id}&folder=#{folder_id}&#{media_type.serialize}-page=#{page}&format=json" else "https://api2.sofurry.com/browse/user/#{media_type.serialize}" + "?uid=#{user_id}&#{media_type.serialize}-page=#{page}&format=json" end + + break if @visited_urls.include?(url) + @visited_urls << url + response = http_client.get(url) if response.status_code != 200 @@ -240,6 +247,9 @@ class Domain::Sofurry::Job::ScanGalleryJob < Domain::Sofurry::Job::Base .compact .to_set + break if post_ids.subset?(seen_post_ids) + seen_post_ids.merge(post_ids) + posts.merge( posts_json.map do |post_json| GalleryEntry.new( diff --git a/app/lib/domain/sofurry/gallery_page_parser.rb b/app/lib/domain/sofurry/gallery_page_parser.rb index 95edaef3..d153c828 100644 --- a/app/lib/domain/sofurry/gallery_page_parser.rb +++ b/app/lib/domain/sofurry/gallery_page_parser.rb @@ -5,7 +5,7 @@ class Domain::Sofurry::GalleryPageParser class Folder < T::Struct include T::Struct::ActsAsComparable const :name, String - const :id, Integer + const :sofurry_id, Integer end class MediaType < T::Enum @@ -20,7 +20,7 @@ class Domain::Sofurry::GalleryPageParser class ShortGalleryEntry < T::Struct include T::Struct::ActsAsComparable - const :id, Integer + const :sofurry_id, Integer const :title, String end @@ -46,8 +46,8 @@ class Domain::Sofurry::GalleryPageParser raise "user_id mismatch: #{user_id} != #{@user_id}" end name = elem.text - id = href_params["folder"]&.to_i - Folder.new(name:, id:) + sofurry_id = href_params["folder"]&.to_i + Folder.new(name:, sofurry_id:) end end @@ -60,19 +60,19 @@ class Domain::Sofurry::GalleryPageParser .css(".sf-browse-shortlist,.sf-browse-shortlist-zebra") .map do |elem| title_id = elem.css(".sf-browse-shortlist-title a").first || next - id = title_id&.attr("href").split("/")&.last&.to_i || next + sofurry_id = title_id&.attr("href").split("/")&.last&.to_i || next title = title_id&.text || next - ShortGalleryEntry.new(id:, title:) + ShortGalleryEntry.new(sofurry_id:, title:) end .compact, @doc .css("a.sfArtworkSmallInner") .map do |elem| - id = elem.attr("href").split("/")&.last&.to_i || next + sofurry_id = elem.attr("href").split("/")&.last&.to_i || next img_elem = elem.css("img.sfArtworkItem").first || next title = img_elem.attr("alt")&.strip&.rpartition("|by")&.first || next - ShortGalleryEntry.new(id:, title:) + ShortGalleryEntry.new(sofurry_id:, title:) end .compact, ].flatten diff --git a/app/models/domain/post_group/sofurry_folder.rb b/app/models/domain/post_group/sofurry_folder.rb index c77f12a7..43b4e9bc 100644 --- a/app/models/domain/post_group/sofurry_folder.rb +++ b/app/models/domain/post_group/sofurry_folder.rb @@ -9,17 +9,11 @@ class Domain::PostGroup::SofurryFolder < Domain::PostGroup attr_json :sofurry_id, :integer attr_json :name, :string attr_json :owner_id, :integer + attr_json :media_types, :string, array: true attr_json_due_timestamp :scanned_folder_at, 3.months belongs_to :owner, class_name: "::Domain::User::SofurryUser", optional: true - attr_json_enum( - :media_type, - :integer, - { stories: 1, art: 2, music: 3, journals: 4, photos: 5 }, - prefix: true, - ) - sig { override.returns([String, Symbol]) } def self.param_prefix_and_attribute ["sf", :sofurry_folder_id] @@ -32,11 +26,4 @@ class Domain::PostGroup::SofurryFolder < Domain::PostGroup "https://www.sofurry.com/browse/folder/#{type}?by=#{owner_id}&folder=#{sofurry_id}" end end - - sig { returns(T.nilable(Domain::Sofurry::GalleryPageParser::MediaType)) } - def media_type_enum - Domain::Sofurry::GalleryPageParser::MediaType.try_deserialize( - self.media_type, - ) - end end diff --git a/app/models/domain/user_job_event/add_tracked_object.rb b/app/models/domain/user_job_event/add_tracked_object.rb index bd841da1..7ec8e4f2 100644 --- a/app/models/domain/user_job_event/add_tracked_object.rb +++ b/app/models/domain/user_job_event/add_tracked_object.rb @@ -2,6 +2,8 @@ class Domain::UserJobEvent::AddTrackedObject < Domain::UserJobEvent self.table_name = "domain_user_job_event_add_tracked_objects" belongs_to :log_entry, class_name: "HttpLogEntry" + validates :log_entry_id, uniqueness: { scope: :kind } + attr_json :requested_at, ActiveModelUtcTimeValue.new enum :kind, %i[favs followed_by following gallery] diff --git a/package.json b/package.json index 2a457235..9465e11d 100644 --- a/package.json +++ b/package.json @@ -52,7 +52,7 @@ "@types/lodash": "^4.14.192", "@types/react": "^18.0.33", "env-cmd": "^10.1.0", - "prettier": "^3.5.2", + "prettier": "^3.6.2", "prettier-plugin-tailwindcss": "^0.6.9", "react-refresh": "^0.14.0", "typescript": "^5.0.3", diff --git a/yarn.lock b/yarn.lock index 2c8738a2..9be4a8a9 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4025,10 +4025,10 @@ prettier-plugin-tailwindcss@^0.6.9: resolved "https://registry.npmjs.org/prettier-plugin-tailwindcss/-/prettier-plugin-tailwindcss-0.6.9.tgz" integrity sha512-r0i3uhaZAXYP0At5xGfJH876W3HHGHDp+LCRUJrs57PBeQ6mYHMwr25KH8NPX44F2yGTvdnH7OqCshlQx183Eg== -prettier@^3.5.2: - version "3.5.2" - resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.5.2.tgz#d066c6053200da0234bf8fa1ef45168abed8b914" - integrity sha512-lc6npv5PH7hVqozBR7lkBNOGXV9vMwROAPlumdBkX0wTbbzPu/U1hk5yL8p2pt4Xoc+2mkT8t/sow2YrV/M5qg== +prettier@^3.6.2: + version "3.6.2" + resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.6.2.tgz#ccda02a1003ebbb2bfda6f83a074978f608b9393" + integrity sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ== process-nextick-args@~2.0.0: version "2.0.1"