Improvements for rich-text rendering and plain-text link embedding

This commit is contained in:
Dylan Knutson
2025-03-03 01:10:54 +00:00
parent 720a2ab1b8
commit c0ddef96f0
8 changed files with 396 additions and 162 deletions

View File

@@ -56,6 +56,17 @@ module Domain::DescriptionsHelper
end
end
WEAK_URL_MATCHER_REGEX =
%r{(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)}
ALLOWED_INFERRED_URL_DOMAINS =
T.let(
%w[furaffinity.net inkbunny.net e621.net]
.flat_map { |domain| [domain, "www.#{domain}"] }
.freeze,
T::Array[String],
)
sig { params(model: HasDescriptionHtmlForView).returns(T.nilable(String)) }
def sanitize_description_html(model)
html = model.description_html_for_view
@@ -75,107 +86,154 @@ module Domain::DescriptionsHelper
replacements = {}
# Transform bare URLs found in text nodes (not already inside an anchor tag) into anchor tags
text_link_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
return if env[:is_allowlisted]
next unless node.text?
next unless node.ancestors("a").empty?
next unless (node_text = T.cast(node.text, T.nilable(String)))
next unless (match = node_text.match(WEAK_URL_MATCHER_REGEX))
next unless (url_text = match[0])
unless (
uri =
try_parse_uri(model.description_html_base_domain, url_text)
)
next
end
unless ALLOWED_PLAIN_TEXT_URL_DOMAINS.any? { |domain|
url_matches_domain?(domain, uri.host)
}
next
end
before, after = node.text.split(url_text, 2)
new_node = "#{before}<a href=\"#{url_text}\">#{url_text}</a>#{after}"
node.replace(new_node)
end
tag_class_and_style_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
return if env[:is_allowlisted] || !node.element?
# Convert bbcode_center class to text-align: center style
# and remove all other styling
add_node_styles = []
if node["class"]&.include?("bbcode_center")
add_node_styles << "text-align: center"
end
node.name = "div" if node_name == "code"
node.remove_attribute("class")
# add to original styles
node["style"] = (node["style"] || "")
.split(";")
.map(&:strip)
.concat(add_node_styles)
.map { |s| s + ";" }
.join(" ")
end
link_to_model_link_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
next if env[:is_allowlisted] || !node.element?
# Allow only recognized links: known post/user models, or allowlisted external domains
if node_name == "a"
href_str = node["href"]&.downcase || ""
url = try_parse_uri(model.description_html_base_domain, href_str)
next { node_whitelist: [] } if url.nil?
found_link = link_for_source(url.to_s)
if found_link.present? && (found_model = found_link.model)
partial, locals =
case found_model
when Domain::Post
[
"domain/has_description_html/inline_link_domain_post",
{
post: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
when Domain::User
[
"domain/has_description_html/inline_link_domain_user",
{
user: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
else
raise "Unknown model type: #{found_link.model.class}"
end
replacements[node] = Nokogiri::HTML5.fragment(
render(partial:, locals:),
)
next { node_whitelist: [node] }
else
if ALLOWED_EXTERNAL_LINK_DOMAINS.any? { |domain|
url_matches_domain?(domain, url.host)
}
if node.text.blank? || text_same_as_url?(node.text, url.to_s)
title = title_for_url(url.to_s)
else
title = node.text
end
replacements[node] = Nokogiri::HTML5.fragment(
render(
partial: "domain/has_description_html/inline_link_external",
locals: {
url: url.to_s,
title:,
icon_path: icon_path_for_domain(url.host),
},
),
)
next { node_whitelist: [node] }
end
end
end
end
disallowed_link_transformer =
lambda do |env|
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
return if env[:is_allowlisted] || !node.element?
if node_name == "a"
# by the time we're here, we know this is not a valid link node,
# and it should be replaced with its text
node.replace(node.inner_html)
end
end
sanitizer =
Sanitize.new(
elements: %w[code div br img b i span strong hr p],
elements: %w[a code div br img b i span strong hr p],
attributes: {
all: %w[class style],
"a" => %w[href class],
:all => %w[class style],
},
css: {
properties: %w[font-size color text-align class],
},
transformers: [
lambda do |env|
node = env[:node]
node_name = env[:node_name]
return if env[:is_allowlisted] || !node.element?
# Convert bbcode_center class to text-align: center style
# and remove all other styling
add_node_styles = []
if node["class"]&.include?("bbcode_center")
add_node_styles << "text-align: center"
end
node.name = "div" if node_name == "code"
node.remove_attribute("class")
# add to original styles
node["style"] = (node["style"] || "")
.split(";")
.map(&:strip)
.concat(add_node_styles)
.map { |s| s + ";" }
.join(" ")
# Allow only recognized links: known post/user models, or allowlisted external domains
if node_name == "a"
href_str = node["href"]&.downcase || ""
url = try_parse_uri(model.description_html_base_domain, href_str)
next { node_whitelist: [] } if url.nil?
found_link = link_for_source(url.to_s)
if found_link.nil?
if ALLOWED_EXTERNAL_LINK_DOMAINS.any? { |domain|
url_matches_domain?(domain, url.host)
}
if node.text.blank? || text_same_as_url?(node.text, url.to_s)
title = title_for_url(url.to_s)
else
title = node.text
end
replacements[node] = Nokogiri::HTML5.fragment(
render(
partial:
"domain/has_description_html/inline_link_external",
locals: {
url: url.to_s,
title:,
icon_path: icon_path_for_domain(url.host),
},
),
)
next { node_whitelist: [node] }
else
next { node_whitelist: [] }
end
end
found_model = found_link.model
partial, locals =
case found_model
when Domain::Post
[
"domain/has_description_html/inline_link_domain_post",
{
post: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
when Domain::User
[
"domain/has_description_html/inline_link_domain_user",
{
user: found_model,
link_text: node.text,
visual_style: "description-section-link",
},
]
else
raise "Unknown model type: #{found_link.model.class}"
end
replacements[node] = Nokogiri::HTML5.fragment(
render(partial:, locals:),
)
next { node_whitelist: [node] }
end
# Else, don't allow any other nodes
{ node_whitelist: [] }
end,
text_link_transformer,
tag_class_and_style_transformer,
link_to_model_link_transformer,
disallowed_link_transformer,
],
)

View File

@@ -5,63 +5,81 @@ module Domain::DomainsHelper
include HelpersInterface
abstract!
ALLOWED_EXTERNAL_LINK_DOMAINS = %w[
youtube.com
x.com
weasyl.com
vimeo.com
twitter.com
twitch.tv
tumblr.com
t.me
spreadshirt.de
spreadshirt.com
redbubble.com
pixiv.net
pinterest.com
patreon.com
mstdn.social
livejournal.com
ko-fi.com
instagram.com
facebook.com
dribbble.com
discord.gg
deviantart.com
bsky.app
behance.net
gumroad.com
bigcartel.com
# If a URL is detected in plain text and is one of these domains,
# it will be converted to an anchor tag.
ALLOWED_PLAIN_TEXT_URL_DOMAINS = %w[
e621.net
furaffinity.net
boosty.to
hipolink.me
archiveofourown.org
inkbunny.net
].freeze
# If an anchor tag's href points at one of these domains,
# it is kept and rendered as an external link (via the inline_link_external partial).
ALLOWED_EXTERNAL_LINK_DOMAINS =
T.let(
(
%w[
archiveofourown.org
behance.net
bigcartel.com
boosty.to
bsky.app
deviantart.com
discord.gg
dribbble.com
e621.net
facebook.com
furaffinity.net
gumroad.com
hipolink.me
inkbunny.net
instagram.com
ko-fi.com
livejournal.com
mstdn.social
patreon.com
pinterest.com
pixiv.net
redbubble.com
spreadshirt.com
spreadshirt.de
t.me
tumblr.com
twitch.tv
twitter.com
vimeo.com
weasyl.com
x.com
youtube.com
] + ALLOWED_PLAIN_TEXT_URL_DOMAINS
).freeze,
T::Array[String],
)
DOMAIN_TO_ICON_PATH =
T.let(
{
"x.com" => "x-twitter.png",
"wixmp.com" => "deviantart.png",
"weasyl.com" => "weasyl.png",
"twitter.com" => "x-twitter.png",
"t.me" => "telegram.png",
"pixiv.net" => "pixiv.png",
"patreon.com" => "patreon.png",
"newgrounds.com" => "newgrounds.png",
"itaku.ee" => "itaku.png",
"inkbunny.net" => "inkbunny.png",
"ib.metapix.net" => "inkbunny.png",
"furaffinity.net" => "fa.png",
"e621.net" => "e621.png",
"deviantart.com" => "deviantart.png",
"bsky.app" => "bsky.png",
"redbubble.com" => "redbubble.png",
"spreadshirt.de" => "spreadshirt.png",
"spreadshirt.com" => "spreadshirt.png",
"boosty.to" => "boosty.png",
"tumblr.com" => "tumblr.png",
"bsky.app" => "bsky.png",
"deviantart.com" => "deviantart.png",
"e621.net" => "e621.png",
"furaffinity.net" => "fa.png",
"ib.metapix.net" => "inkbunny.png",
"inkbunny.net" => "inkbunny.png",
"itaku.ee" => "itaku.png",
"ko-fi.com" => "ko-fi.png",
"newgrounds.com" => "newgrounds.png",
"patreon.com" => "patreon.png",
"pixiv.net" => "pixiv.png",
"redbubble.com" => "redbubble.png",
"spreadshirt.com" => "spreadshirt.png",
"spreadshirt.de" => "spreadshirt.png",
"t.me" => "telegram.png",
"tumblr.com" => "tumblr.png",
"twitter.com" => "x-twitter.png",
"weasyl.com" => "weasyl.png",
"wixmp.com" => "deviantart.png",
"x.com" => "x-twitter.png",
}.freeze,
T::Hash[String, String],
)

View File

@@ -103,6 +103,8 @@ module Domain::PostsHelper
"Microsoft Word Document"
when %r{application/vnd\.openxmlformats-officedocument\.wordprocessingml\.document}
"Microsoft Word Document (OpenXML)"
when %r{application/vnd\.oasis\.opendocument\.text}
"OpenDocument Text"
when %r{application/rtf}
"Rich Text Document"
when %r{image/jpeg}
@@ -224,6 +226,7 @@ module Domain::PostsHelper
FA_CDN_HOSTS = %w[d.furaffinity.net *.facdn.net facdn.net]
IB_HOSTS = %w[*.inkbunny.net inkbunny.net]
IB_CDN_HOSTS = %w[*.ib.metapix.net ib.metapix.net]
E621_HOSTS = %w[www.e621.net e621.net]
URL_SUFFIX_QUERY = T.let(<<-SQL.strip.chomp.freeze, String)
lower(json_attributes->>'url_str') = lower(?)
@@ -235,7 +238,10 @@ module Domain::PostsHelper
# Furaffinity posts
SourceMatcher.new(
hosts: FA_HOSTS,
patterns: [%r{/view/(\d+)/?}],
patterns: [
%r{/view/(\d+)/?},
%r{/controls/submissions/changeinfo/(\d+)/?},
],
find_proc: ->(helper, match, _) do
if post = Domain::Post::FaPost.find_by(fa_id: match[1])
SourceResult.new(
@@ -248,7 +254,7 @@ module Domain::PostsHelper
# Furaffinity posts via direct file URL
SourceMatcher.new(
hosts: FA_CDN_HOSTS,
patterns: [//],
patterns: [/.+/],
find_proc: ->(helper, _, url) do
url = Addressable::URI.parse(url)
@@ -329,6 +335,32 @@ module Domain::PostsHelper
end
end,
),
# E621 posts
SourceMatcher.new(
hosts: E621_HOSTS,
patterns: [%r{/posts/(\d+)/?}],
find_proc: ->(helper, match, _) do
if post = Domain::Post::E621Post.find_by(e621_id: match[1])
SourceResult.new(
model: post,
title: helper.title_for_post_model(post),
)
end
end,
),
# E621 users
SourceMatcher.new(
hosts: E621_HOSTS,
patterns: [%r{/users/(\d+)/?}],
find_proc: ->(helper, match, _) do
if user = Domain::User::E621User.find_by(e621_id: match[1])
SourceResult.new(
model: user,
title: user.name_for_view || "unknown",
)
end
end,
),
],
T::Array[SourceMatcher],
)

View File

@@ -70,6 +70,8 @@ module LogEntriesHelper
"doc"
when %r{application/vnd\.openxmlformats-officedocument\.wordprocessingml\.document}
"docx"
when %r{application/vnd\.oasis\.opendocument\.text}
"odt"
else
nil
end
@@ -94,6 +96,7 @@ module LogEntriesHelper
application/rtf
application/msword
text/plain
application/vnd.oasis.opendocument.text
application/vnd.openxmlformats-officedocument.wordprocessingml.document
].any? { |ct| content_type.starts_with?(ct) }
end
@@ -248,7 +251,13 @@ module LogEntriesHelper
end
return nil if document_html.blank?
sanitize_rich_text_document_html(document_html, is_plain_text)
end
sig do
params(document_html: String, is_plain_text: T::Boolean).returns(String)
end
def sanitize_rich_text_document_html(document_html, is_plain_text)
quote_transformer =
Kernel.lambda do |env|
node = env[:node]
@@ -265,6 +274,7 @@ module LogEntriesHelper
clean_plain_text_node =
Kernel.lambda do |node|
node = T.cast(node, Nokogiri::XML::Node)
if node.text?
node_text = node.text.strip
if node_text.empty?
@@ -275,7 +285,7 @@ module LogEntriesHelper
# and the collected nodes
current_node = node
inline_elements = []
while (next_sibling = current_node&.next_sibling) &&
while (next_sibling = current_node.next_sibling) &&
(next_sibling.name != "br") && (next_sibling.name != "p")
inline_elements << next_sibling
current_node = next_sibling
@@ -294,29 +304,33 @@ module LogEntriesHelper
plain_text_transformer =
Kernel.lambda do |env|
# within a div, wrap bare text nodes in a <p>
node = env[:node]
node_name = env[:node_name]
node = T.cast(env[:node], Nokogiri::XML::Node)
node_name = T.cast(env[:node_name], String)
if node_name == "div"
node.children.each { |child| clean_plain_text_node.call(child) }
current_child = T.unsafe(node.children.first)
while current_child.present?
clean_plain_text_node.call(current_child)
current_child = current_child.next_sibling
end
elsif node.text? && node.parent&.name == "#document-fragment"
clean_plain_text_node.call(node)
end
{ node_allowlist: [node] }
end
remove_empty_newline_transformer =
Kernel.lambda do |env|
node = env[:node]
node.unlink if node.text? && node.text.strip.chomp.blank?
end
# remove_empty_newline_transformer =
# Kernel.lambda do |env|
# node = env[:node]
# node.unlink if node.text? && node.text.strip.chomp.blank?
# end
remove_multiple_br_transformer =
Kernel.lambda do |env|
node = env[:node]
if node.name == "br"
node.unlink if node.previous_sibling&.name == "br"
end
end
# remove_multiple_br_transformer =
# Kernel.lambda do |env|
# node = env[:node]
# if node.name == "br"
# node.unlink if node.previous_sibling&.name == "br"
# end
# end
sanitizer =
Sanitize.new(

View File

@@ -168,7 +168,8 @@ class Domain::Post::FaPost < Domain::Post
begin
contents = guess_last_submission_log_entry&.response_bytes
if contents
parser = Domain::Fa::Parser::Page.new(contents)
parser =
Domain::Fa::Parser::Page.new(contents, require_logged_in: false)
parser.submission.posted_date if parser.probably_submission?
end
end

View File

@@ -0,0 +1,17 @@
# typed: strict
class Nokogiri::XML::Node
sig { returns(Nokogiri::XML::NodeSet) }
def children
end
sig { returns(T.nilable(Nokogiri::XML::Node)) }
def next_sibling
end
end
class Nokogiri::XML::NodeSet
sig { params(n: T.nilable(Integer)).returns(T.nilable(Nokogiri::XML::Node)) }
def first(n = nil)
end
end

View File

@@ -67,6 +67,82 @@ RSpec.describe Domain::DescriptionsHelper, type: :helper do
end
end
describe "handling links that are bare text" do
it "does not replace links that are already wrapped in an anchor tag" do
html =
'<a href="https://e621.net/posts/5406941" title="https://e621.net/posts/5406941" class="auto_link">https://e621.net/posts/5406941</a><br />'
sanitized = sanitize_description_html(html)
# should render the external link partial because the e621 post model is not found
expect(sanitized).to include("_blank noopener noreferrer nofollow")
expect(sanitized).to include("https://e621.net/posts/5406941")
expect(sanitized).to include("<a")
end
it "does not make a link if the domain is not in the allowlist" do
html =
'before <a href="https://www.example.com/foo/" title="https://www.example.com/foo/" class="auto_link">https://www.example.com/foo/</a> after'
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include("https://www.example.com/foo/")
expect(sanitized).not_to include("<a")
end
it "does not make a link if anchor does not have href" do
html = "<a>No href</a>"
sanitized = sanitize_description_html(html)
expect(sanitized).to eq("No href")
end
it "does not allow nested links" do
html =
'<a href="https://www.example.com/foo">Link <a href="https://www.example.com/bar/">Nested Link</a></a>'
sanitized = sanitize_description_html(html)
expect(sanitized).to eq("Link Nested Link")
end
it "handles models that are not recorded in the database as external links" do
html = "before http://www.furaffinity.net/view/123456/ after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include("http://www.furaffinity.net/view/123456/")
# renders the inline_link_external partial because the FA post model is not found
expect(sanitized).to include("_blank noopener noreferrer nofollow")
end
it "handles models that are recorded in the database as inline hoverable links" do
post = create(:domain_post_e621_post, e621_id: 5_406_941)
html = "before https://e621.net/posts/5406941 after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include(
/PostHoverPreviewWrapper.+#{post.to_param}/,
)
end
it "handles text links that do not have a scheme" do
html = "before e621.net/posts/5406941 after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include("e621.net/posts/5406941")
expect(sanitized).to include("_blank noopener noreferrer nofollow")
end
it "deep links text links that do not have a scheme" do
post = create(:domain_post_e621_post, e621_id: 5_406_941)
html = "before e621.net/posts/5406941 after"
sanitized = sanitize_description_html(html)
expect(sanitized).to include("before ")
expect(sanitized).to include(" after")
expect(sanitized).to include(
/PostHoverPreviewWrapper.+#{post.to_param}/,
)
end
end
describe "FA post link handling" do
%w[
https://www.furaffinity.net/view/123456/
@@ -257,7 +333,7 @@ RSpec.describe Domain::DescriptionsHelper, type: :helper do
expect(sanitized).not_to include("FA User")
expect(sanitized).to include("Google")
expect(sanitized).to include("No href")
expect(sanitized.scan(/<a/).length).to eq(0)
expect(sanitized).not_to include("<a")
end
end

View File

@@ -80,4 +80,22 @@ RSpec.describe LogEntriesHelper, type: :helper do
expect(rendered).not_to be_nil
end
end
describe "#sanitize_rich_text_document_html" do
it "puts sequences of inline elements into a <p>" do
html =
"
<div>
Does that
<em>really</em>
make you...
</div>
"
rendered = helper.sanitize_rich_text_document_html(html, true)
expect(rendered).to eq_html(
"<div><p>Does that <em>really</em> make you...</p></div>",
)
end
end
end