more strict typing

This commit is contained in:
Dylan Knutson
2025-01-01 22:30:55 +00:00
parent 3a14c6946f
commit fdd1200396
6 changed files with 153 additions and 58 deletions

View File

@@ -1,11 +1,21 @@
# typed: true
# typed: strict
class HexUtil
extend T::Sig
sig { params(str: String).returns(String) }
# Decodes a string of hexadecimal digits into the raw bytes it represents.
#
# @param str [String] hex digits, two per output byte
# @return [String] the packed binary string
def self.hex2bin(str)
  hex_digits = [str]
  hex_digits.pack("H*")
end
sig { params(bin: String).returns(String) }
# Encodes a binary string as hexadecimal, two digits per byte.
#
# @param bin [String] raw bytes to encode
# @return [String] hex representation of +bin+
# @raise [RuntimeError] if unpacking yields nil or a non-String value
def self.bin2hex(bin)
  # unpack1 yields the single decoded value directly, so the string is
  # unpacked exactly once; the guards below narrow the type for Sorbet.
  unpacked = bin.unpack1("H*")
  raise("invalid bin2hex: nil") if unpacked.nil?
  raise("invalid bin2hex: #{unpacked.class}") unless unpacked.is_a?(String)
  unpacked
end
sig { params(size: Integer).returns(String) }
def self.humansize(size)
units = %w[B KiB MiB GiB TiB Pib EiB ZiB]
return "0.0 B" if size == 0

View File

@@ -1,19 +1,23 @@
# typed: true
# typed: strict
class Scraper::E621HttpClientConfig < Scraper::HttpClientConfig
DEFAULT_ALLOWED_DOMAINS = %w[e621.net *.e621.net]
# Cookie configuration for this site: an empty list, i.e. no cookies are
# configured.
sig { override.returns(T.nilable(T::Array[T.untyped])) }
def cookies
[]
end
sig { override.returns(T::Array[[String, Numeric]]) }
# Rate limit table for this site, as [domain pattern, delay] pairs.
# NOTE(review): the delay is presumably in seconds and "*" presumably acts as
# the catch-all entry; confirm against the client's rate limit lookup.
#
# @return [Array<Array(String, Numeric)>]
def ratelimit
  {
    "static1.e621.net" => 1,
    "facdn.net" => 2,
    "*" => 0.25,
  }.to_a
end
# Domain patterns this configuration permits; returns the class-level
# DEFAULT_ALLOWED_DOMAINS list (e621.net and its subdomains).
sig { override.returns(T::Array[String]) }
def allowed_domains
DEFAULT_ALLOWED_DOMAINS
end
# Redirect limit for this site: 2.
# NOTE(review): presumably the maximum number of redirects the client will
# follow; the code that consumes this value is not visible here.
sig { override.returns(Integer) }
def redirect_limit
2
end

View File

@@ -1,9 +1,10 @@
# typed: true
# typed: strict
class Scraper::HttpClient
extend T::Sig
include HasColorLogger
sig { returns(Scraper::HttpClientConfig) }
attr_reader :config
# Value returned from a request: the HTTP status code, the response body, and
# the log entry associated with the request.
Response = Struct.new(:status_code, :body, :log_entry)
@@ -11,28 +12,40 @@ class Scraper::HttpClient
# Error type for invalid URLs; a subclass of ArgumentError.
# (Its raise sites are not visible in this excerpt.)
class InvalidURLError < ArgumentError
end
sig do
params(config: Scraper::HttpClientConfig, http_performer: T.untyped).void
end
# Builds a client around a site configuration and an HTTP performer.
#
# Each instance variable is declared exactly once, with its type, so that no
# untyped assignment precedes the typed declaration. After the state is set
# up, the cookie jar is populated and the configuration's login hook is run
# against the performer.
#
# @param config [Scraper::HttpClientConfig] site-specific settings
# @param http_performer [Object] object that performs the HTTP calls; it must
#   respond to +close!+ (see #close!)
def initialize(config, http_performer)
  @config = T.let(config, Scraper::HttpClientConfig)
  @http_performer = T.let(http_performer, T.untyped)
  # Last request time per host, consulted when applying rate limits.
  @domain_last_requested_at = T.let({}, T::Hash[String, Time])
  @cookie_jar = T.let(HTTP::CookieJar.new, HTTP::CookieJar)
  init_cookie_jar
  @config.do_login(http_performer)
end
# Closes the underlying HTTP performer by delegating to its close! method.
sig { void }
def close!
@http_performer.close!
end
# Performs a GET request for +url+ via do_request and returns its Response.
# +caused_by_entry+ is passed through unchanged.
# NOTE(review): presumably the log entry that triggered this request; confirm.
sig { params(url: String, caused_by_entry: T.untyped).returns(Response) }
def get(url, caused_by_entry: nil)
do_request(:get, url, caused_by_entry)
end
# Performs a POST request for +url+ via do_request and returns its Response.
# +caused_by_entry+ is passed through unchanged.
# NOTE(review): presumably the log entry that triggered this request; confirm.
sig { params(url: String, caused_by_entry: T.untyped).returns(Response) }
def post(url, caused_by_entry: nil)
do_request(:post, url, caused_by_entry)
end
private
sig do
params(method: Symbol, url: String, caused_by_entry: T.untyped).returns(
Response,
)
end
def do_request(method, url, caused_by_entry)
uri = Addressable::URI.parse(url)
uri.scheme = "https" if uri.scheme.blank?
@@ -52,7 +65,7 @@ class Scraper::HttpClient
ratelimit_delay = ratelimit_conf[1]
if ratelimit_delay == :none
logger.debug("no rate limit for #{uri.host}")
else
elsif ratelimit_delay.is_a?(Numeric)
last_requested_at = @domain_last_requested_at[uri.host]
if last_requested_at
@@ -62,6 +75,8 @@ class Scraper::HttpClient
sleep sleep_for
end
end
else
raise "unknown ratelimit delay: #{ratelimit_delay}"
end
request_headers = {
@@ -120,7 +135,7 @@ class Scraper::HttpClient
begin
response_blob_file.save unless response_blob_file.persisted?
rescue => e
puts "error saving blob file #{HexUtil.bin2hex(response_blob_file.sha256)}: #{e}"
puts "error saving blob file #{HexUtil.bin2hex(T.must(response_blob_file.sha256))}: #{e}"
end
rescue StandardError
retries += 1
@@ -142,7 +157,7 @@ class Scraper::HttpClient
[
"[entry #{log_entry.id.to_s.bold} /",
"GET #{response_code_colorized} /",
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
"#{HexUtil.humansize(T.must(response_blob_entry.bytes_stored)).bold} / #{HexUtil.humansize(T.must(response_blob_entry.size)).bold}]",
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
scrubbed_uri.to_s.black,
].reject(&:nil?).join(" "),
@@ -164,14 +179,15 @@ class Scraper::HttpClient
Response.new(response_code, response_body, log_entry)
end
sig { params(domain: String).returns(T::Boolean) }
# Reports whether +domain+ matches at least one of the configuration's allowed
# domain patterns. Patterns are shell-style globs matched with File.fnmatch?.
#
# @param domain [String] host name to check
# @return [Boolean]
def allowed_domain?(domain)
  patterns = config.allowed_domains
  patterns.any? { |glob| File.fnmatch?(glob, domain) }
end
sig { void }
def init_cookie_jar
@cookie_jar = HTTP::CookieJar.new
cookies = config.cookies || []
cookies.each do |cookie_domain_conf|
domain = cookie_domain_conf[:domain]
@@ -188,7 +204,8 @@ class Scraper::HttpClient
end
end
sig { params(ct: String).returns(String) }
# Extracts the bare media type from a Content-Type value by dropping any
# parameters after the first ";" and trimming surrounding whitespace.
#
# An empty header value yields an empty string rather than raising, since
# splitting "" produces no segments.
#
# @param ct [String] raw Content-Type value, e.g. "text/html; charset=utf-8"
# @return [String] the media type, e.g. "text/html"
def clean_content_type(ct)
  # Limit 2: only the segment before the first ";" is needed.
  media_type = ct.split(";", 2).first || ""
  media_type.strip
end
end

View File

@@ -1,40 +1,39 @@
# typed: true
# typed: strict
# Abstract base class for per-site HTTP client settings: cookies, login,
# URI rewriting, rate limits, allowed domains and the redirect limit.
#
# Methods declared with an `abstract` sig must be implemented by subclasses and
# therefore have empty bodies here; the `overridable` hooks default to no-ops.
class Scraper::HttpClientConfig
  extend T::Sig
  extend T::Helpers
  abstract!
  include HasColorLogger

  # Cookie configuration for the site, or nil / an empty list when none is
  # needed.
  sig { abstract.returns(T.nilable(T::Array[T.untyped])) }
  def cookies
  end

  # Hook invoked with the HTTP performer when a client is constructed.
  # Does nothing by default.
  sig { overridable.params(performer: T.untyped).void }
  def do_login(performer)
    # nop
  end

  # Hook for rewriting a URI; returns it unchanged by default.
  # NOTE(review): presumably applied before the request is issued; confirm at
  # the call site.
  sig { overridable.params(uri: Addressable::URI).returns(Addressable::URI) }
  def map_uri(uri)
    # nop
    uri
  end

  # Hook for scrubbing a URI; returns it unchanged by default.
  # NOTE(review): presumably applied before the URI is stored or logged;
  # confirm at the call site.
  sig { overridable.params(uri: Addressable::URI).returns(Addressable::URI) }
  def scrub_stored_uri(uri)
    # nop
    uri
  end

  # Rate limit table as [domain pattern, delay] pairs.
  sig { abstract.returns(T::Array[[String, Numeric]]) }
  def ratelimit
  end

  # Domain patterns that requests are allowed to target.
  sig { abstract.returns(T::Array[String]) }
  def allowed_domains
  end

  # Redirect limit for the site.
  sig { abstract.returns(Integer) }
  def redirect_limit
  end
end

View File

@@ -1,4 +1,4 @@
# typed: true
# typed: strict
class BlobEntry < ReduxApplicationRecord
self.table_name = "blob_entries_p"
@@ -7,8 +7,11 @@ class BlobEntry < ReduxApplicationRecord
self.primary_key = :sha256
# Binary SHA-256 digest of empty (zero-length) content.
EMPTY_FILE_SHA256 =
  T.let(
    HexUtil.hex2bin(
      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
    ),
    String,
  )
belongs_to :base,
@@ -21,8 +24,14 @@ class BlobEntry < ReduxApplicationRecord
raise("digest mismatch for BlobEntry") if sha256 != actual_sha256
end
sig { returns(T.nilable(BlobEntry)) }
# Returns the entry this blob is stored as a diff against, or nil when it has
# none. The result is memoised in @base_model.
#
# The association is tried first; if it yields nothing and a base_sha256 is
# set, the base is looked up with BlobEntry.ensure, which raises when the
# referenced blob does not exist.
#
# @return [BlobEntry, nil]
def base
  @base_model = T.let(@base_model, T.nilable(BlobEntry))
  @base_model ||= super || begin
    # Use a distinct local name so the record's own sha256 attribute is not
    # shadowed.
    parent_sha256 = base_sha256
    self.class.ensure(parent_sha256) if parent_sha256
  end
end
validates_presence_of(:sha256, :content_type, :size)
@@ -35,27 +44,42 @@ class BlobEntry < ReduxApplicationRecord
validates :sha256, length: { is: 32 }
validates :base_sha256, length: { is: 32 }, if: :base_sha256
sig { params(sha256: String).returns(BlobEntry) }
# Looks up the entry with the given binary digest, failing loudly if absent.
#
# @param sha256 [String] binary SHA-256 digest
# @return [BlobEntry] the matching record
# @raise [RuntimeError] when no entry has this digest
def self.ensure(sha256)
  entry = find_by(sha256: sha256)
  return entry if entry

  raise("blob #{HexUtil.bin2hex(sha256)} does not exist")
end
sig { returns(T.nilable(String)) }
# Hexadecimal form of this entry's digest, or nil when sha256 is unset.
#
# @return [String, nil]
def sha256_hex
  # Read the attribute once into a local (not named `hash`, which would shadow
  # Object#hash) so the nil check also narrows the type.
  digest = sha256
  HexUtil.bin2hex(digest) if digest
end
sig { returns(T.nilable(String)) }
# Returns the full contents of this entry, memoised in @contents.
#
# When the entry has a base whose contents are present, the stored :contents
# attribute is applied as a patch on top of the base's contents via
# XDiff.patch; otherwise the stored attribute is returned as-is.
#
# @return [String, nil]
def contents
  @contents ||= T.let(
    begin
      contents_raw = read_attribute(:contents)
      # Resolve the base only once per computation.
      base_contents = base&.contents
      if base_contents
        XDiff.patch(base_contents, contents_raw)
      else
        contents_raw
      end
    end,
    T.nilable(String),
  )
end
sig { returns(T.nilable(Integer)) }
# Number of bytes held in the stored :contents attribute (the raw stored
# value, not the patched result of #contents), or nil when it is unset.
#
# bytesize is used so the result is a byte count regardless of the string's
# encoding.
#
# @return [Integer, nil]
def bytes_stored
  read_attribute(:contents)&.bytesize
end
sig { params(content_type: String, contents: String).returns(BlobEntry) }
def self.find_or_build(content_type:, contents:)
sha256 = Digest::SHA256.digest(contents)
BlobEntry.find_by(sha256: sha256) ||
@@ -68,9 +92,20 @@ class BlobEntry < ReduxApplicationRecord
end
end
# Content type patterns (HTML, plain text, JSON).
# NOTE(review): presumably the types eligible for diff-based storage against a
# base entry; the code that consults this list is not visible here.
DIFFABLE_CONTENT_TYPES =
  T.let(
    [%r{text/html}, %r{text/plain}, %r{application/json}],
    T::Array[Regexp],
  )
def self.build_record(content_type:, sha256: nil, contents:)
sig do
params(
content_type: String,
contents: String,
sha256: T.nilable(String),
).returns(BlobEntry)
end
def self.build_record(content_type:, contents:, sha256: nil)
sha256 ||= Digest::SHA256.digest(contents)
record =
self.new(

View File

@@ -1,13 +1,17 @@
# typed: false
# typed: strict
class BlobFile < ReduxApplicationRecord
self.table_name = "blob_files"
# Root directory for blob files, read from the "blob_file_location" Rails
# config; loading this class fails when that config is missing.
ROOT_DIR =
  T.let(
    Rails.application.config_for("blob_file_location") ||
      raise("no blob_file_location config"),
    String,
  )
# Scratch directory under ROOT_DIR; blob bytes are written here first and then
# moved to their final path.
TMP_DIR = T.let(File.join(ROOT_DIR, "tmp-files"), String)
# For each layout version, the lengths of the leading hex-digest segments used
# to build the path.
FILE_PATH_PATTERNS =
  T.let({ v1: [2, 2, 1] }, T::Hash[Symbol, T::Array[Integer]])
# Example: for sha256 `e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`,
# the [2, 2, 1] pattern yields three leading segments followed by the full digest:
# e3/b0/c/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
@@ -18,8 +22,11 @@ class BlobFile < ReduxApplicationRecord
self.primary_key = :sha256
# Binary SHA-256 digest of empty (zero-length) content.
EMPTY_FILE_SHA256 =
  T.let(
    HexUtil.hex2bin(
      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
    ),
    String,
  )
enum :version, { v1: 1 }
@@ -41,12 +48,13 @@ class BlobFile < ReduxApplicationRecord
FileUtils.mkdir_p(TMP_DIR)
tmp_file_path = File.join(TMP_DIR, "blob-file-#{SecureRandom.uuid}")
File.binwrite(tmp_file_path, self.content_bytes)
File.binwrite(tmp_file_path, T.must(self.content_bytes))
FileUtils.mv(tmp_file_path, self.absolute_file_path)
end
end
end
sig { params(blob_entry: BlobEntry).returns(BlobFile) }
def self.find_or_initialize_from_blob_entry(blob_entry)
BlobFile.find_or_initialize_by(sha256: blob_entry.sha256) do |blob_file|
blob_file.content_type = blob_entry.content_type
@@ -55,6 +63,7 @@ class BlobFile < ReduxApplicationRecord
end
end
sig { params(blob_entry: BlobEntry).returns(BlobFile) }
def self.initialize_from_blob_entry(blob_entry)
BlobFile.new(
sha256: blob_entry.sha256,
@@ -64,14 +73,22 @@ class BlobFile < ReduxApplicationRecord
)
end
sig { params(content_bytes: T.nilable(String)).void }
# Sets the in-memory content of a not-yet-persisted blob file and derives
# size_bytes and sha256 from it.
#
# Passing nil clears the content: size_bytes becomes 0 and sha256 becomes nil.
# Otherwise the bytes are kept as a binary (ASCII-8BIT) copy, so the caller's
# string is left untouched and frozen strings are accepted.
#
# @param content_bytes [String, nil] raw file content
# @raise [RuntimeError] if the record is already persisted
def content_bytes=(content_bytes)
  raise("cannot set content_bytes of an existing BlobFile") if persisted?

  binary = content_bytes&.b
  @content_bytes = T.let(binary, T.nilable(String))
  if binary.nil?
    self.size_bytes = 0
    self.sha256 = nil
  else
    self.size_bytes = binary.bytesize
    self.sha256 = Digest::SHA256.digest(binary)
  end
end
sig { returns(T.nilable(String)) }
def content_bytes
if self.persisted?
@content_bytes ||=
@@ -85,26 +102,39 @@ class BlobFile < ReduxApplicationRecord
end
end
sig { returns(String) }
# Absolute on-disk location of this blob: ROOT_DIR joined with
# #relative_file_path.
#
# The value is computed on every call rather than cached, so it always
# reflects the record's current version and sha256. When relative_file_path is
# nil, File.join raises TypeError.
#
# @return [String]
def absolute_file_path
  File.join(ROOT_DIR, relative_file_path)
end
sig { returns(T.nilable(String)) }
# Path of this blob relative to ROOT_DIR: the version, then the digest-derived
# segments from path_segments, joined with "/".
#
# Returns nil when the version or sha256 is unset, or when the version has no
# entry in FILE_PATH_PATTERNS.
#
# @return [String, nil]
def relative_file_path
  current_version = version
  digest = sha256
  return nil if current_version.nil? || digest.nil?

  pattern = FILE_PATH_PATTERNS[current_version.to_sym]
  return nil if pattern.nil?

  segments = self.class.path_segments(pattern, HexUtil.bin2hex(digest))
  [current_version, *segments].join("/")
end
sig do
params(pattern: T::Array[Integer], sha256_hex: String).returns(
T::Array[String],
)
end
def self.path_segments(pattern, sha256_hex)
parts = []
parts = T.let([], T::Array[String])
offset = 0
pattern.each do |part_len|
parts << sha256_hex[offset, part_len]
part = sha256_hex[offset, part_len]
raise("invalid sha256_hex") if part.nil?
parts << part
offset += part_len
end
parts << sha256_hex