detect encoding for plain text files

This commit is contained in:
Dylan Knutson
2025-06-19 16:07:26 +00:00
parent dc98e30f47
commit 9d2bde629a
7 changed files with 148 additions and 2 deletions

View File

@@ -135,6 +135,7 @@ gem "ruby-bbcode"
gem "dtext_rb",
git: "https://github.com/e621ng/dtext_rb",
ref: "5ef8fd7a5205c832f4c18197911717e7d491494e"
gem "charlock_holmes"
# gem "pghero", git: "https://github.com/dymk/pghero", ref: "e314f99"
gem "pghero", "~> 3.6"

View File

@@ -131,6 +131,7 @@ GEM
rack-test (>= 0.6.3)
regexp_parser (>= 1.5, < 3.0)
xpath (~> 3.2)
charlock_holmes (0.7.9)
cloudflare-rails (6.2.0)
actionpack (>= 7.1.0, < 8.1.0)
activesupport (>= 7.1.0, < 8.1.0)
@@ -573,6 +574,7 @@ DEPENDENCIES
attr_json
bootsnap
capybara
charlock_holmes
cloudflare-rails
colorize
concurrent-ruby-edge

View File

@@ -223,6 +223,13 @@ module LogEntriesHelper
tempfile&.close
end
sig { params(str: String).returns(String) }
# Converts +str+ from the encoding it is currently tagged with into UTF-8.
# The conversion is lossy: byte sequences that are invalid in the source
# encoding, or that have no UTF-8 equivalent, are dropped (replaced with "").
#
# @param str [String] text tagged with the encoding it should be read as
# @return [String] the text as UTF-8; +str+ itself is never mutated
def reencode_as_utf8_lossy(str)
  str.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
rescue StandardError
  # Transcoding can still fail even with the replace options, e.g. when Ruby
  # has no converter for the tagged encoding. Handing +str+ back untouched
  # would leak a non-UTF-8 string to the caller, so instead reinterpret the
  # raw bytes as UTF-8 and strip whatever is not valid.
  str.dup.force_encoding(Encoding::UTF_8).scrub("")
end
sig { params(rich_text_body: String).returns(String) }
def try_convert_bbcode_to_html(rich_text_body)
rich_text_body.bbcode_to_html(false)
@@ -230,6 +237,12 @@ module LogEntriesHelper
rich_text_body
end
sig { params(rich_text_body: String).returns(String) }
# Guesses the character encoding of +rich_text_body+ with CharlockHolmes and
# returns a name that is safe to pass to String#force_encoding.
#
# Falls back to "UTF-8" when:
# * the detector returns nil,
# * the result carries no String under :encoding (NOTE(review): the gem
#   README shows binary content reported as {type: :binary} without an
#   :encoding key - confirm against the installed version),
# * the detected name is not an encoding Ruby knows; the detector reports
#   ICU charset names, and an unknown name would make force_encoding raise
#   ArgumentError at the call site.
#
# @param rich_text_body [String] raw bytes of the document
# @return [String] a Ruby-resolvable encoding name
def try_detect_encoding(rich_text_body)
  detection = CharlockHolmes::EncodingDetector.detect(rich_text_body)
  detected_name = detection && detection[:encoding]
  return "UTF-8" unless detected_name.is_a?(String)

  begin
    # Encoding.find raises ArgumentError for names Ruby cannot resolve.
    Encoding.find(detected_name)
    detected_name
  rescue ArgumentError
    "UTF-8"
  end
end
sig { params(log_entry: HttpLogEntry).returns(T.nilable(String)) }
def render_rich_text_content(log_entry)
content_type = log_entry.content_type
@@ -239,8 +252,9 @@ module LogEntriesHelper
is_plain_text = content_type.starts_with?("text/plain")
if is_plain_text
# rich_text_body.gsub!(/(\r\n|\n|\r)+/, "<br />")
rich_text_body = rich_text_body.force_encoding("UTF-8")
encoding_name = try_detect_encoding(rich_text_body)
rich_text_body = rich_text_body.force_encoding(encoding_name)
rich_text_body = reencode_as_utf8_lossy(rich_text_body)
document_html = try_convert_bbcode_to_html(rich_text_body)
elsif content_type.starts_with?("application/pdf")
document_html = convert_with_pdftohtml(rich_text_body)

105
sorbet/rbi/gems/charlock_holmes@0.7.9.rbi generated Normal file
View File

@@ -0,0 +1,105 @@
# typed: true
# DO NOT EDIT MANUALLY
# This is an autogenerated file for types exported from the `charlock_holmes` gem.
# Please instead update this file by running `bin/tapioca gem charlock_holmes`.
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#1
module CharlockHolmes; end
class CharlockHolmes::Converter
class << self
def convert(_arg0, _arg1, _arg2); end
end
end
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#2
class CharlockHolmes::EncodingDetector
# @return [EncodingDetector] a new instance of EncodingDetector
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#11
def initialize(scan_len = T.unsafe(nil)); end
# Length for which to scan content for NULL bytes
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#7
def binary_scan_length; end
# Length for which to scan content for NULL bytes
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#7
def binary_scan_length=(_arg0); end
def detect(*_arg0); end
def detect_all(*_arg0); end
def is_binary?(_arg0); end
def strip_tags; end
def strip_tags=(_arg0); end
def strip_tags?; end
class << self
# Builds the ENCODING_TABLE hash by running through the list of supported encodings
# in the ICU detection API and trying to map them to supported encodings in Ruby.
# This is built dynamically so as to take advantage of ICU upgrades which may have
# support for more encodings in the future.
#
# Returns nothing.
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#65
def build_encoding_table; end
# Attempt to detect the encoding of this string
#
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
# as well as use the default binary scan length
#
# str - a String, what you want to detect the encoding of
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
# be used as an additional hint to the charset detector
#
# Returns: a Hash with :encoding, :language, :type and :confidence
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#25
def detect(str, hint_enc = T.unsafe(nil)); end
# Attempt to detect the encoding of this string, and return
# a list with all the possible encodings that match it.
#
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
# as well as use the default binary scan length
#
# str - a String, what you want to detect the encoding of
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
# be used as an additional hint to the charset detector
#
# Returns: an Array with zero or more Hashes,
# each one of them with with :encoding, :language, :type and :confidence
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#41
def detect_all(str, hint_enc = T.unsafe(nil)); end
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#53
def encoding_table; end
def supported_encodings; end
end
end
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#57
CharlockHolmes::EncodingDetector::BINARY = T.let(T.unsafe(nil), String)
# Default length for which to scan content for NULL bytes
#
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#4
CharlockHolmes::EncodingDetector::DEFAULT_BINARY_SCAN_LEN = T.let(T.unsafe(nil), Integer)
class CharlockHolmes::Transliterator
class << self
def id_list; end
def transliterate(_arg0, _arg1); end
end
end
# source://charlock_holmes//lib/charlock_holmes/version.rb#2
CharlockHolmes::VERSION = T.let(T.unsafe(nil), String)

View File

@@ -0,0 +1,12 @@
# typed: strict
module CharlockHolmes
  class EncodingDetector
    # Typed signature for the class-level detector.
    #
    # str      - the String whose encoding should be guessed
    # hint_enc - optional encoding name passed along as a hint
    #
    # The result is nilable, and the hash values are untyped, so callers
    # must check for a missing result before reading keys from it.
    sig do
      params(str: String, hint_enc: T.nilable(String))
        .returns(T.nilable(T::Hash[Symbol, T.untyped]))
    end
    def self.detect(str, hint_enc = nil); end
  end
end

View File

@@ -79,6 +79,18 @@ RSpec.describe LogEntriesHelper, type: :helper do
rendered = helper.render_rich_text_content(log_entry)
expect(rendered).not_to be_nil
end
it "works in txt files encoded with utf-16" do
  # Arrange: a text/plain log entry backed by the fixture file (UTF-16
  # encoded, going by this example's name).
  fixture_path = "test/fixtures/files/1321054359.ztf_tolartist.txt"
  entry = build_log_entry("text/plain", fixture_path)

  # Act
  html = helper.render_rich_text_content(entry)

  # Assert: something was rendered and a known phrase from the document is
  # present in the output.
  expect(html).not_to be_nil
  expect(html).to include("at the local cafe")
end
end
describe "#sanitize_rich_text_document_html" do

Binary file not shown.