more strict typing
This commit is contained in:
@@ -1,11 +1,21 @@
|
||||
# typed: true
|
||||
# typed: strict
|
||||
class HexUtil
|
||||
extend T::Sig
|
||||
|
||||
# Converts a hexadecimal string into the binary string it encodes.
#
# @param str [String] hex digits
# @return [String] `str` packed with the "H*" directive (hex string, high nibble first)
sig { params(str: String).returns(String) }
|
||||
def self.hex2bin(str)
|
||||
[str].pack("H*")
|
||||
end
|
||||
|
||||
# Converts a binary string into its hexadecimal representation.
#
# @param bin [String] raw bytes
# @return [String] first element of `bin.unpack("H*")`
# @raise [RuntimeError] when that element is nil or is not a String
sig { params(bin: String).returns(String) }
|
||||
def self.bin2hex(bin)
|
||||
# NOTE(review): this bare expression repeats the assignment below; in this diff
# view it appears to be the removed pre-change body — confirm before relying on it.
bin.unpack("H*").first
|
||||
unpacked = bin.unpack("H*").first
|
||||
# Explicit checks so the value handed back is a String, as the sig declares.
raise("invalid bin2hex: nil") if unpacked.nil?
|
||||
raise("invalid bin2hex: #{unpacked.class}") unless unpacked.is_a?(String)
|
||||
unpacked
|
||||
end
|
||||
|
||||
sig { params(size: Integer).returns(String) }
|
||||
def self.humansize(size)
|
||||
# Binary unit suffixes, smallest first; every multi-byte unit ends in "iB".
units = %w[B KiB MiB GiB TiB PiB EiB ZiB]
|
||||
return "0.0 B" if size == 0
|
||||
|
||||
@@ -1,19 +1,23 @@
|
||||
# typed: true
|
||||
# typed: strict
|
||||
class Scraper::E621HttpClientConfig < Scraper::HttpClientConfig
|
||||
DEFAULT_ALLOWED_DOMAINS = %w[e621.net *.e621.net]
|
||||
|
||||
# Cookie configuration for this client: an empty array, i.e. no cookies are configured.
sig { override.returns(T.nilable(T::Array[T.untyped])) }
|
||||
def cookies
|
||||
[]
|
||||
end
|
||||
|
||||
# Rate-limit table as [host pattern, delay] pairs.
# NOTE(review): "*" looks like a catch-all pattern and the delays look like
# seconds — confirm against the code that consumes this table.
sig { override.returns(T::Array[[String, Numeric]]) }
|
||||
def ratelimit
|
||||
[["static1.e621.net", 1], ["facdn.net", 2], ["*", 0.25]]
|
||||
end
|
||||
|
||||
# Domain patterns this client may contact: the class-level DEFAULT_ALLOWED_DOMAINS list.
sig { override.returns(T::Array[String]) }
|
||||
def allowed_domains
|
||||
DEFAULT_ALLOWED_DOMAINS
|
||||
end
|
||||
|
||||
# Redirect limit for this configuration (2).
# NOTE(review): how the limit is enforced is not visible here — presumably the
# maximum number of redirects followed per request; confirm in the HTTP client.
sig { override.returns(Integer) }
|
||||
def redirect_limit
|
||||
2
|
||||
end
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
# typed: true
|
||||
|
||||
# typed: strict
|
||||
class Scraper::HttpClient
|
||||
extend T::Sig
|
||||
|
||||
include HasColorLogger
|
||||
|
||||
sig { returns(Scraper::HttpClientConfig) }
|
||||
attr_reader :config
|
||||
|
||||
# Value returned by get/post: the response status code, the response body, and
# the log entry associated with the request.
Response = Struct.new(:status_code, :body, :log_entry)
|
||||
@@ -11,28 +12,40 @@ class Scraper::HttpClient
|
||||
class InvalidURLError < ArgumentError
|
||||
end
|
||||
|
||||
# Builds a client around a config and an HTTP performer, then populates the
# cookie jar (init_cookie_jar) and gives the config a chance to log in (do_login).
#
# @param config [Scraper::HttpClientConfig]
# @param http_performer [Object] untyped; close! is later called on it by #close!
sig do
|
||||
params(config: Scraper::HttpClientConfig, http_performer: T.untyped).void
|
||||
end
|
||||
def initialize(config, http_performer)
|
||||
# NOTE(review): the next three plain assignments are repeated below with T.let
# annotations; in this diff view they appear to be the removed pre-change lines.
@config = config
|
||||
@http_performer = http_performer
|
||||
@domain_last_requested_at = {}
|
||||
# T.let declares the instance-variable types required under `# typed: strict`.
@config = T.let(config, Scraper::HttpClientConfig)
|
||||
@http_performer = T.let(http_performer, T.untyped)
|
||||
@domain_last_requested_at = T.let({}, T::Hash[String, Time])
|
||||
@cookie_jar = T.let(HTTP::CookieJar.new, HTTP::CookieJar)
|
||||
init_cookie_jar
|
||||
@config.do_login(http_performer)
|
||||
end
|
||||
|
||||
# Closes the client by delegating to the HTTP performer's close!.
sig { void }
|
||||
def close!
|
||||
@http_performer.close!
|
||||
end
|
||||
|
||||
# Issues a GET for `url` by delegating to do_request.
#
# @param url [String]
# @param caused_by_entry [Object, nil] forwarded unchanged to do_request
# @return [Response]
sig { params(url: String, caused_by_entry: T.untyped).returns(Response) }
|
||||
def get(url, caused_by_entry: nil)
|
||||
do_request(:get, url, caused_by_entry)
|
||||
end
|
||||
|
||||
# Issues a POST for `url` by delegating to do_request.
#
# @param url [String]
# @param caused_by_entry [Object, nil] forwarded unchanged to do_request
# @return [Response]
sig { params(url: String, caused_by_entry: T.untyped).returns(Response) }
|
||||
def post(url, caused_by_entry: nil)
|
||||
do_request(:post, url, caused_by_entry)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
sig do
|
||||
params(method: Symbol, url: String, caused_by_entry: T.untyped).returns(
|
||||
Response,
|
||||
)
|
||||
end
|
||||
def do_request(method, url, caused_by_entry)
|
||||
uri = Addressable::URI.parse(url)
|
||||
uri.scheme = "https" if uri.scheme.blank?
|
||||
@@ -52,7 +65,7 @@ class Scraper::HttpClient
|
||||
ratelimit_delay = ratelimit_conf[1]
|
||||
if ratelimit_delay == :none
|
||||
logger.debug("no rate limit for #{uri.host}")
|
||||
else
|
||||
elsif ratelimit_delay.is_a?(Numeric)
|
||||
last_requested_at = @domain_last_requested_at[uri.host]
|
||||
|
||||
if last_requested_at
|
||||
@@ -62,6 +75,8 @@ class Scraper::HttpClient
|
||||
sleep sleep_for
|
||||
end
|
||||
end
|
||||
else
|
||||
raise "unknown ratelimit delay: #{ratelimit_delay}"
|
||||
end
|
||||
|
||||
request_headers = {
|
||||
@@ -120,7 +135,7 @@ class Scraper::HttpClient
|
||||
begin
|
||||
response_blob_file.save unless response_blob_file.persisted?
|
||||
rescue => e
|
||||
puts "error saving blob file #{HexUtil.bin2hex(response_blob_file.sha256)}: #{e}"
|
||||
puts "error saving blob file #{HexUtil.bin2hex(T.must(response_blob_file.sha256))}: #{e}"
|
||||
end
|
||||
rescue StandardError
|
||||
retries += 1
|
||||
@@ -142,7 +157,7 @@ class Scraper::HttpClient
|
||||
[
|
||||
"[entry #{log_entry.id.to_s.bold} /",
|
||||
"GET #{response_code_colorized} /",
|
||||
"#{HexUtil.humansize(response_blob_entry.bytes_stored).bold} / #{HexUtil.humansize(response_blob_entry.size).bold}]",
|
||||
"#{HexUtil.humansize(T.must(response_blob_entry.bytes_stored)).bold} / #{HexUtil.humansize(T.must(response_blob_entry.size)).bold}]",
|
||||
"[#{response_time_ms.to_s.bold} ms / #{total_time_ms.to_s.bold} ms]",
|
||||
scrubbed_uri.to_s.black,
|
||||
].reject(&:nil?).join(" "),
|
||||
@@ -164,14 +179,15 @@ class Scraper::HttpClient
|
||||
Response.new(response_code, response_body, log_entry)
|
||||
end
|
||||
|
||||
# Whether `domain` matches at least one of the config's allowed-domain patterns.
# Patterns are matched as globs via File.fnmatch?.
#
# @param domain [String]
# @return [Boolean]
sig { params(domain: String).returns(T::Boolean) }
|
||||
def allowed_domain?(domain)
|
||||
config.allowed_domains.any? do |domain_to_check|
|
||||
File.fnmatch? domain_to_check, domain
|
||||
end
|
||||
end
|
||||
|
||||
sig { void }
|
||||
def init_cookie_jar
|
||||
@cookie_jar = HTTP::CookieJar.new
|
||||
cookies = config.cookies || []
|
||||
cookies.each do |cookie_domain_conf|
|
||||
domain = cookie_domain_conf[:domain]
|
||||
@@ -188,7 +204,8 @@ class Scraper::HttpClient
|
||||
end
|
||||
end
|
||||
|
||||
# Returns the part of `ct` before the first ";", with surrounding whitespace stripped.
#
# @param ct [String] content-type value
# @return [String]
sig { params(ct: String).returns(String) }
|
||||
def clean_content_type(ct)
|
||||
# NOTE(review): this unguarded form appears to be the removed pre-change line;
# the T.must form below is its replacement.
ct.split(";").first.strip
|
||||
# split(";") on an empty string yields [], so `first` is nil there and T.must raises.
T.must(ct.split(";").first).strip
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,40 +1,39 @@
|
||||
# typed: true
|
||||
# typed: strict
|
||||
# Abstract base class for HTTP client configurations.
#
# Subclasses must implement cookies, ratelimit, allowed_domains and
# redirect_limit (declared abstract). do_login, map_uri and scrub_stored_uri
# are overridable hooks whose defaults do nothing / return the URI unchanged.
#
# NOTE(review): this is a diff view; the `not_implemented!` bodies and the private
# `not_implemented!` helper appear to be pre-change lines — confirm which rows
# survive in the committed file.
class Scraper::HttpClientConfig
|
||||
extend T::Sig
|
||||
extend T::Helpers
|
||||
abstract!
|
||||
|
||||
include HasColorLogger
|
||||
|
||||
# Cookie configuration; may be nil.
sig { abstract.returns(T.nilable(T::Array[T.untyped])) }
|
||||
def cookies
|
||||
not_implemented!
|
||||
end
|
||||
|
||||
# Hook for performing a login with the given performer; the default does nothing.
sig { overridable.params(performer: T.untyped).void }
|
||||
def do_login(performer)
|
||||
# nop
|
||||
end
|
||||
|
||||
# Hook for mapping a URI; the default returns `uri` unchanged.
sig { overridable.params(uri: Addressable::URI).returns(Addressable::URI) }
|
||||
def map_uri(uri)
|
||||
# nop
|
||||
uri
|
||||
end
|
||||
|
||||
# Hook for scrubbing a URI; the default returns `uri` unchanged.
# NOTE(review): presumably applied to the URI before it is stored or logged — confirm in the caller.
sig { overridable.params(uri: Addressable::URI).returns(Addressable::URI) }
|
||||
def scrub_stored_uri(uri)
|
||||
# nop
|
||||
uri
|
||||
end
|
||||
|
||||
# Rate-limit table as [String, Numeric] pairs.
sig { abstract.returns(T::Array[[String, Numeric]]) }
|
||||
def ratelimit
|
||||
not_implemented!
|
||||
end
|
||||
|
||||
# Domain patterns the client is allowed to contact.
sig { abstract.returns(T::Array[String]) }
|
||||
def allowed_domains
|
||||
not_implemented!
|
||||
end
|
||||
|
||||
# Redirect limit for the client.
sig { abstract.returns(Integer) }
|
||||
def redirect_limit
|
||||
not_implemented!
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
# Raises NotImplementedError naming the concrete class that lacks the method.
def not_implemented!
|
||||
raise NotImplementedError.new("implement in #{self.class.name}")
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# typed: true
|
||||
# typed: strict
|
||||
class BlobEntry < ReduxApplicationRecord
|
||||
self.table_name = "blob_entries_p"
|
||||
|
||||
@@ -7,8 +7,11 @@ class BlobEntry < ReduxApplicationRecord
|
||||
|
||||
self.primary_key = :sha256
|
||||
EMPTY_FILE_SHA256 =
|
||||
HexUtil.hex2bin(
|
||||
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
||||
T.let(
|
||||
HexUtil.hex2bin(
|
||||
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
||||
),
|
||||
String,
|
||||
)
|
||||
|
||||
belongs_to :base,
|
||||
@@ -21,8 +24,14 @@ class BlobEntry < ReduxApplicationRecord
|
||||
raise("digest mismatch for BlobEntry") if sha256 != actual_sha256
|
||||
end
|
||||
|
||||
# Memoized base blob for this entry: the `super` value when present, otherwise
# BlobEntry.ensure(base_sha256) when base_sha256 is set, otherwise nil.
#
# @return [BlobEntry, nil]
# @raise [RuntimeError] from BlobEntry.ensure when base_sha256 names a missing blob
sig { returns(T.nilable(BlobEntry)) }
|
||||
def base
|
||||
# NOTE(review): this one-liner appears to be the removed pre-change body. Its
# trailing `if base_sha256` guards the whole statement, whereas the form below
# calls `super` unconditionally.
@base_model ||= super || self.class.ensure(base_sha256) if base_sha256
|
||||
# Declares the ivar's type for strict mode without changing its current value.
@base_model = T.let(@base_model, T.nilable(BlobEntry))
|
||||
@base_model ||=
|
||||
super ||
|
||||
if (sha256 = base_sha256)
|
||||
self.class.ensure(sha256)
|
||||
end
|
||||
end
|
||||
|
||||
validates_presence_of(:sha256, :content_type, :size)
|
||||
@@ -35,27 +44,42 @@ class BlobEntry < ReduxApplicationRecord
|
||||
validates :sha256, length: { is: 32 }
|
||||
validates :base_sha256, length: { is: 32 }, if: :base_sha256
|
||||
|
||||
# Looks up the blob with the given sha256, raising if there is none.
#
# @param sha256 [String] binary digest (hex-encoded only for the error message)
# @return [BlobEntry]
# @raise [RuntimeError] "blob <hex> does not exist" when find_by returns nil
sig { params(sha256: String).returns(BlobEntry) }
|
||||
def self.ensure(sha256)
|
||||
find_by(sha256: sha256) ||
|
||||
raise("blob #{HexUtil.bin2hex(sha256)} does not exist")
|
||||
end
|
||||
|
||||
# Hex encoding of sha256, or nil when sha256 is not set.
sig { returns(T.nilable(String)) }
|
||||
def sha256_hex
|
||||
# NOTE(review): this modifier-if form appears to be the removed pre-change line.
HexUtil.bin2hex(sha256) if sha256
|
||||
# Binding to a local lets the type checker treat the value as non-nil inside the branch.
if (hash = self.sha256)
|
||||
HexUtil.bin2hex(hash)
|
||||
end
|
||||
end
|
||||
|
||||
# Memoized contents of this blob. When the base blob has contents, the stored
# :contents attribute is applied to them with XDiff.patch; otherwise the stored
# attribute is returned as-is.
#
# @return [String, nil]
sig { returns(T.nilable(String)) }
|
||||
def contents
|
||||
@contents ||=
|
||||
# NOTE(review): this first begin/end appears to be the removed pre-change body;
# the T.let-wrapped begin/end after it is the replacement.
begin
|
||||
contents_raw = self.read_attribute(:contents)
|
||||
self.base ? XDiff.patch(self.base.contents, contents_raw) : contents_raw
|
||||
end
|
||||
T.let(
|
||||
begin
|
||||
contents_raw = self.read_attribute(:contents)
|
||||
# Safe navigation: nil when there is no base, or when the base has no contents.
base_contents = self.base&.contents
|
||||
if base_contents
|
||||
XDiff.patch(base_contents, contents_raw)
|
||||
else
|
||||
contents_raw
|
||||
end
|
||||
end,
|
||||
T.nilable(String),
|
||||
)
|
||||
end
|
||||
|
||||
# Size of the raw :contents attribute as stored on this record (not of the
# reconstructed #contents), or nil when the attribute is nil.
sig { returns(T.nilable(Integer)) }
|
||||
def bytes_stored
|
||||
# NOTE(review): this read without `&.` appears to be the removed pre-change line.
self.read_attribute(:contents).size
|
||||
self.read_attribute(:contents)&.size
|
||||
end
|
||||
|
||||
sig { params(content_type: String, contents: String).returns(BlobEntry) }
|
||||
def self.find_or_build(content_type:, contents:)
|
||||
sha256 = Digest::SHA256.digest(contents)
|
||||
BlobEntry.find_by(sha256: sha256) ||
|
||||
@@ -68,9 +92,20 @@ class BlobEntry < ReduxApplicationRecord
|
||||
end
|
||||
end
|
||||
|
||||
DIFFABLE_CONTENT_TYPES = [%r{text/html}, %r{text/plain}, %r{application/json}]
|
||||
DIFFABLE_CONTENT_TYPES =
|
||||
T.let(
|
||||
[%r{text/html}, %r{text/plain}, %r{application/json}],
|
||||
T::Array[Regexp],
|
||||
)
|
||||
|
||||
def self.build_record(content_type:, sha256: nil, contents:)
|
||||
sig do
|
||||
params(
|
||||
content_type: String,
|
||||
contents: String,
|
||||
sha256: T.nilable(String),
|
||||
).returns(BlobEntry)
|
||||
end
|
||||
def self.build_record(content_type:, contents:, sha256: nil)
|
||||
sha256 ||= Digest::SHA256.digest(contents)
|
||||
record =
|
||||
self.new(
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
# typed: false
|
||||
# typed: strict
|
||||
class BlobFile < ReduxApplicationRecord
|
||||
self.table_name = "blob_files"
|
||||
|
||||
ROOT_DIR =
|
||||
Rails.application.config_for("blob_file_location") ||
|
||||
raise("no blob_file_location config")
|
||||
TMP_DIR = File.join(ROOT_DIR, "tmp-files")
|
||||
T.let(
|
||||
Rails.application.config_for("blob_file_location") ||
|
||||
raise("no blob_file_location config"),
|
||||
String,
|
||||
)
|
||||
TMP_DIR = T.let(File.join(ROOT_DIR, "tmp-files"), String)
|
||||
|
||||
FILE_PATH_PATTERNS = { v1: [2, 2, 1] }
|
||||
FILE_PATH_PATTERNS =
|
||||
T.let({ v1: [2, 2, 1] }, T::Hash[Symbol, T::Array[Integer]])
|
||||
# consider sha256 `e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
|
||||
# breaking it into 3 parts, becomes:
|
||||
# e3/b0/c/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||
@@ -18,8 +22,11 @@ class BlobFile < ReduxApplicationRecord
|
||||
|
||||
self.primary_key = :sha256
|
||||
EMPTY_FILE_SHA256 =
|
||||
HexUtil.hex2bin(
|
||||
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
||||
T.let(
|
||||
HexUtil.hex2bin(
|
||||
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
||||
),
|
||||
String,
|
||||
)
|
||||
|
||||
enum :version, { v1: 1 }
|
||||
@@ -41,12 +48,13 @@ class BlobFile < ReduxApplicationRecord
|
||||
FileUtils.mkdir_p(TMP_DIR)
|
||||
|
||||
tmp_file_path = File.join(TMP_DIR, "blob-file-#{SecureRandom.uuid}")
|
||||
File.binwrite(tmp_file_path, self.content_bytes)
|
||||
File.binwrite(tmp_file_path, T.must(self.content_bytes))
|
||||
FileUtils.mv(tmp_file_path, self.absolute_file_path)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
sig { params(blob_entry: BlobEntry).returns(BlobFile) }
|
||||
def self.find_or_initialize_from_blob_entry(blob_entry)
|
||||
BlobFile.find_or_initialize_by(sha256: blob_entry.sha256) do |blob_file|
|
||||
blob_file.content_type = blob_entry.content_type
|
||||
@@ -55,6 +63,7 @@ class BlobFile < ReduxApplicationRecord
|
||||
end
|
||||
end
|
||||
|
||||
sig { params(blob_entry: BlobEntry).returns(BlobFile) }
|
||||
def self.initialize_from_blob_entry(blob_entry)
|
||||
BlobFile.new(
|
||||
sha256: blob_entry.sha256,
|
||||
@@ -64,14 +73,22 @@ class BlobFile < ReduxApplicationRecord
|
||||
)
|
||||
end
|
||||
|
||||
# Sets the in-memory content of a not-yet-persisted BlobFile and derives
# size_bytes and sha256 from it. Passing nil clears the content (size_bytes = 0,
# sha256 = nil).
#
# @param content_bytes [String, nil]
# @raise [RuntimeError] when the record is already persisted
sig { params(content_bytes: T.nilable(String)).void }
|
||||
def content_bytes=(content_bytes)
|
||||
raise("cannot set content_bytes of an existing BlobFile") if self.persisted?
|
||||
# NOTE(review): the next four lines have no nil handling and appear to be the
# removed pre-change body; the if/else after them is the replacement.
@content_bytes = content_bytes
|
||||
@content_bytes.force_encoding("ASCII-8BIT")
|
||||
self.size_bytes = content_bytes.bytesize
|
||||
self.sha256 = Digest::SHA256.digest(content_bytes)
|
||||
if content_bytes.nil?
|
||||
@content_bytes = nil
|
||||
self.size_bytes = 0
|
||||
self.sha256 = nil
|
||||
else
|
||||
# force_encoding retags the caller's String object in place (no copy is made).
content_bytes.force_encoding("ASCII-8BIT")
|
||||
@content_bytes = T.let(content_bytes, T.nilable(String))
|
||||
self.size_bytes = content_bytes.bytesize
|
||||
self.sha256 = Digest::SHA256.digest(content_bytes)
|
||||
end
|
||||
end
|
||||
|
||||
sig { returns(T.nilable(String)) }
|
||||
def content_bytes
|
||||
if self.persisted?
|
||||
@content_bytes ||=
|
||||
@@ -85,26 +102,39 @@ class BlobFile < ReduxApplicationRecord
|
||||
end
|
||||
end
|
||||
|
||||
# Absolute path of this blob's file: ROOT_DIR joined with relative_file_path.
#
# NOTE(review): relative_file_path is declared nilable; File.join raises
# TypeError when given nil, so this method assumes version and sha256 are set.
sig { returns(String) }
|
||||
def absolute_file_path
|
||||
# NOTE(review): the memoized form appears to be the removed pre-change line; the
# unmemoized call below is its replacement.
@absolute_file_path ||= File.join(ROOT_DIR, self.relative_file_path)
|
||||
File.join(ROOT_DIR, self.relative_file_path)
|
||||
end
|
||||
|
||||
# Path of this blob's file relative to the storage root: the version followed by
# the segments from path_segments, joined with "/".
#
# @return [String, nil] nil when version or sha256 is unset, or when no
#   FILE_PATH_PATTERNS entry exists for the version
sig { returns(T.nilable(String)) }
|
||||
def relative_file_path
|
||||
# Copy attributes into locals and bail out early on nil, so the values used
# below are known to be non-nil.
version = self.version
|
||||
return nil if version.nil?
|
||||
sha256 = self.sha256
|
||||
return nil if sha256.nil?
|
||||
pattern = FILE_PATH_PATTERNS[version.to_sym]
|
||||
return nil if pattern.nil?
|
||||
|
||||
file_path_parts = [
|
||||
# NOTE(review): the `self.version` element and the multi-line path_segments call
# appear to be removed pre-change lines; the two elements after them, built from
# the locals, are the replacement.
self.version,
|
||||
*self.class.path_segments(
|
||||
FILE_PATH_PATTERNS[self.version.to_sym],
|
||||
HexUtil.bin2hex(self.sha256),
|
||||
),
|
||||
version,
|
||||
*self.class.path_segments(pattern, HexUtil.bin2hex(sha256)),
|
||||
]
|
||||
file_path_parts.join("/")
|
||||
end
|
||||
|
||||
sig do
|
||||
params(pattern: T::Array[Integer], sha256_hex: String).returns(
|
||||
T::Array[String],
|
||||
)
|
||||
end
|
||||
def self.path_segments(pattern, sha256_hex)
|
||||
parts = []
|
||||
parts = T.let([], T::Array[String])
|
||||
offset = 0
|
||||
pattern.each do |part_len|
|
||||
parts << sha256_hex[offset, part_len]
|
||||
part = sha256_hex[offset, part_len]
|
||||
raise("invalid sha256_hex") if part.nil?
|
||||
parts << part
|
||||
offset += part_len
|
||||
end
|
||||
parts << sha256_hex
|
||||
|
||||
Reference in New Issue
Block a user