detect encoding for plain text files
This commit is contained in:
1
Gemfile
1
Gemfile
@@ -135,6 +135,7 @@ gem "ruby-bbcode"
|
||||
gem "dtext_rb",
|
||||
git: "https://github.com/e621ng/dtext_rb",
|
||||
ref: "5ef8fd7a5205c832f4c18197911717e7d491494e"
|
||||
gem "charlock_holmes"
|
||||
|
||||
# gem "pghero", git: "https://github.com/dymk/pghero", ref: "e314f99"
|
||||
gem "pghero", "~> 3.6"
|
||||
|
||||
@@ -131,6 +131,7 @@ GEM
|
||||
rack-test (>= 0.6.3)
|
||||
regexp_parser (>= 1.5, < 3.0)
|
||||
xpath (~> 3.2)
|
||||
charlock_holmes (0.7.9)
|
||||
cloudflare-rails (6.2.0)
|
||||
actionpack (>= 7.1.0, < 8.1.0)
|
||||
activesupport (>= 7.1.0, < 8.1.0)
|
||||
@@ -573,6 +574,7 @@ DEPENDENCIES
|
||||
attr_json
|
||||
bootsnap
|
||||
capybara
|
||||
charlock_holmes
|
||||
cloudflare-rails
|
||||
colorize
|
||||
concurrent-ruby-edge
|
||||
|
||||
@@ -223,6 +223,13 @@ module LogEntriesHelper
|
||||
tempfile&.close
|
||||
end
|
||||
|
||||
sig { params(str: String).returns(String) }
|
||||
# Re-encodes +str+ from the encoding it is currently tagged with into UTF-8,
# dropping (replacing with an empty string) every byte sequence that is
# invalid in the source encoding or has no UTF-8 equivalent.
#
# The receiver is never mutated; a new string is returned.
#
# @param str [String] text tagged with the encoding it should be read as
# @return [String] a UTF-8 string with valid encoding
def reencode_as_utf8_lossy(str)
  str.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
rescue StandardError
  # Transcoding can fail outright, e.g. Encoding::ConverterNotFoundError when
  # Ruby has no converter for the tagged encoding. Returning `str` as-is would
  # hand callers a string that is still not UTF-8, so reinterpret the raw
  # bytes as UTF-8 and drop whatever does not decode.
  str.dup.force_encoding(Encoding::UTF_8).scrub("")
end
|
||||
|
||||
sig { params(rich_text_body: String).returns(String) }
|
||||
def try_convert_bbcode_to_html(rich_text_body)
|
||||
rich_text_body.bbcode_to_html(false)
|
||||
@@ -230,6 +237,12 @@ module LogEntriesHelper
|
||||
rich_text_body
|
||||
end
|
||||
|
||||
sig { params(rich_text_body: String).returns(String) }
|
||||
# Guesses the character encoding of +rich_text_body+ using
# CharlockHolmes::EncodingDetector and returns its name, falling back to
# "UTF-8" whenever no usable name is available.
#
# The fallback applies when:
# * the detector returns nil,
# * the result hash carries no :encoding entry (NOTE(review): charlock_holmes
#   is documented to report binary content with only :type and :confidence),
# * the detected name is one Ruby does not recognise, which would make a
#   later `force_encoding(name)` raise ArgumentError.
#
# @param rich_text_body [String] raw text whose encoding should be guessed
# @return [String] an encoding name that `Encoding.find` accepts
def try_detect_encoding(rich_text_body)
  detection = CharlockHolmes::EncodingDetector.detect(rich_text_body)
  detected_name = detection && detection[:encoding]
  return "UTF-8" unless detected_name

  begin
    # Used purely as a validity check; the detector's own spelling of the
    # name is what gets returned.
    Encoding.find(detected_name)
    detected_name
  rescue ArgumentError
    "UTF-8"
  end
end
|
||||
|
||||
sig { params(log_entry: HttpLogEntry).returns(T.nilable(String)) }
|
||||
def render_rich_text_content(log_entry)
|
||||
content_type = log_entry.content_type
|
||||
@@ -239,8 +252,9 @@ module LogEntriesHelper
|
||||
is_plain_text = content_type.starts_with?("text/plain")
|
||||
|
||||
if is_plain_text
|
||||
# rich_text_body.gsub!(/(\r\n|\n|\r)+/, "<br />")
|
||||
rich_text_body = rich_text_body.force_encoding("UTF-8")
|
||||
encoding_name = try_detect_encoding(rich_text_body)
|
||||
rich_text_body = rich_text_body.force_encoding(encoding_name)
|
||||
rich_text_body = reencode_as_utf8_lossy(rich_text_body)
|
||||
document_html = try_convert_bbcode_to_html(rich_text_body)
|
||||
elsif content_type.starts_with?("application/pdf")
|
||||
document_html = convert_with_pdftohtml(rich_text_body)
|
||||
|
||||
105
sorbet/rbi/gems/charlock_holmes@0.7.9.rbi
generated
Normal file
105
sorbet/rbi/gems/charlock_holmes@0.7.9.rbi
generated
Normal file
@@ -0,0 +1,105 @@
|
||||
# typed: true
|
||||
|
||||
# DO NOT EDIT MANUALLY
|
||||
# This is an autogenerated file for types exported from the `charlock_holmes` gem.
|
||||
# Please instead update this file by running `bin/tapioca gem charlock_holmes`.
|
||||
|
||||
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#1
|
||||
module CharlockHolmes; end
|
||||
|
||||
class CharlockHolmes::Converter
|
||||
class << self
|
||||
def convert(_arg0, _arg1, _arg2); end
|
||||
end
|
||||
end
|
||||
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#2
|
||||
class CharlockHolmes::EncodingDetector
|
||||
# @return [EncodingDetector] a new instance of EncodingDetector
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#11
|
||||
def initialize(scan_len = T.unsafe(nil)); end
|
||||
|
||||
# Length for which to scan content for NULL bytes
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#7
|
||||
def binary_scan_length; end
|
||||
|
||||
# Length for which to scan content for NULL bytes
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#7
|
||||
def binary_scan_length=(_arg0); end
|
||||
|
||||
def detect(*_arg0); end
|
||||
def detect_all(*_arg0); end
|
||||
def is_binary?(_arg0); end
|
||||
def strip_tags; end
|
||||
def strip_tags=(_arg0); end
|
||||
def strip_tags?; end
|
||||
|
||||
class << self
|
||||
# Builds the ENCODING_TABLE hash by running through the list of supported encodings
|
||||
# in the ICU detection API and trying to map them to supported encodings in Ruby.
|
||||
# This is built dynamically so as to take advantage of ICU upgrades which may have
|
||||
# support for more encodings in the future.
|
||||
#
|
||||
# Returns nothing.
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#65
|
||||
def build_encoding_table; end
|
||||
|
||||
# Attempt to detect the encoding of this string
|
||||
#
|
||||
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
|
||||
# as well as use the default binary scan length
|
||||
#
|
||||
# str - a String, what you want to detect the encoding of
|
||||
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
||||
# be used as an additional hint to the charset detector
|
||||
#
|
||||
# Returns: a Hash with :encoding, :language, :type and :confidence
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#25
|
||||
def detect(str, hint_enc = T.unsafe(nil)); end
|
||||
|
||||
# Attempt to detect the encoding of this string, and return
|
||||
# a list with all the possible encodings that match it.
|
||||
#
|
||||
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
|
||||
# as well as use the default binary scan length
|
||||
#
|
||||
# str - a String, what you want to detect the encoding of
|
||||
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
||||
# be used as an additional hint to the charset detector
|
||||
#
|
||||
# Returns: an Array with zero or more Hashes,
|
||||
# each one of them with with :encoding, :language, :type and :confidence
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#41
|
||||
def detect_all(str, hint_enc = T.unsafe(nil)); end
|
||||
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#53
|
||||
def encoding_table; end
|
||||
|
||||
def supported_encodings; end
|
||||
end
|
||||
end
|
||||
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#57
|
||||
CharlockHolmes::EncodingDetector::BINARY = T.let(T.unsafe(nil), String)
|
||||
|
||||
# Default length for which to scan content for NULL bytes
|
||||
#
|
||||
# source://charlock_holmes//lib/charlock_holmes/encoding_detector.rb#4
|
||||
CharlockHolmes::EncodingDetector::DEFAULT_BINARY_SCAN_LEN = T.let(T.unsafe(nil), Integer)
|
||||
|
||||
class CharlockHolmes::Transliterator
|
||||
class << self
|
||||
def id_list; end
|
||||
def transliterate(_arg0, _arg1); end
|
||||
end
|
||||
end
|
||||
|
||||
# source://charlock_holmes//lib/charlock_holmes/version.rb#2
|
||||
CharlockHolmes::VERSION = T.let(T.unsafe(nil), String)
|
||||
12
sorbet/rbi/shims/charlock_holmes.rbi
Normal file
12
sorbet/rbi/shims/charlock_holmes.rbi
Normal file
@@ -0,0 +1,12 @@
|
||||
# typed: strict
|
||||
# Sorbet shim that declares a typed signature for
# `CharlockHolmes::EncodingDetector.detect`. The method body is intentionally
# empty; only the signature is of interest here.
module CharlockHolmes
|
||||
class EncodingDetector
|
||||
# `str` is the String to inspect and `hint_enc` an optional encoding-name
# hint; the declared result is either nil or a Hash keyed by Symbols.
sig do
|
||||
params(str: String, hint_enc: T.nilable(String)).returns(
|
||||
T.nilable(T::Hash[Symbol, T.untyped]),
|
||||
)
|
||||
end
|
||||
def self.detect(str, hint_enc = nil)
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -79,6 +79,18 @@ RSpec.describe LogEntriesHelper, type: :helper do
|
||||
rendered = helper.render_rich_text_content(log_entry)
|
||||
expect(rendered).not_to be_nil
|
||||
end
|
||||
|
||||
# Renders a "text/plain" log entry built from the fixture file and checks that
# the result is non-nil and contains a known phrase from the text.
# NOTE(review): the fixture is presumably UTF-16 encoded, as the example name
# says; the file itself is not visible here, so confirm.
it "works in txt files encoded with utf-16" do
|
||||
log_entry =
|
||||
build_log_entry(
|
||||
"text/plain",
|
||||
"test/fixtures/files/1321054359.ztf_tolartist.txt",
|
||||
)
|
||||
|
||||
rendered = helper.render_rich_text_content(log_entry)
|
||||
expect(rendered).not_to be_nil
|
||||
expect(rendered).to include("at the local cafe")
|
||||
end
|
||||
end
|
||||
|
||||
describe "#sanitize_rich_text_document_html" do
|
||||
|
||||
BIN
test/fixtures/files/1321054359.ztf_tolartist.txt
vendored
Normal file
BIN
test/fixtures/files/1321054359.ztf_tolartist.txt
vendored
Normal file
Binary file not shown.
Reference in New Issue
Block a user