Files
redux-scraper/app/lib/scraper/gallery_dl_client.rb
2025-01-01 03:29:53 +00:00

161 lines
4.4 KiB
Ruby

# typed: false
# Client for a gallery-dl RPC service reached through Ripcord::Client.
#
# A session is opened with #start_twitter_user, which generates a fresh token
# and sends it with the "start_user" call. Every later #next_message call sends
# the same token. Each RPC result is decoded into one of the event structs
# defined below; "http_request" events are additionally persisted as an
# HttpLogEntry, which is attached to the returned event.
class Scraper::GalleryDlClient
  include HasColorLogger

  # Decoded from a "start" message.
  StartEvent = Struct.new(:url, :extractor)

  # Decoded from a "finish" message; always built with a nil +ignore+ member.
  FinishEvent = Struct.new(:ignore)

  # Decoded from an "http_request" message. +log_entry+ is nil when the struct
  # is built and is filled in once the HttpLogEntry has been saved.
  HttpRequestEvent =
    Struct.new(
      :method,
      :url,
      :kwargs,
      :requested_at,
      :request_headers,
      :response_headers,
      :response_code,
      :response_time_ms,
      :body,
      :log_entry,
      keyword_init: true,
    )

  # Decoded from a "tweet" message.
  TweetEvent = Struct.new(:tweet, :author)

  # Decoded from a "tweet_media" message; members are read from the message's
  # :media hash under the same names.
  TweetMediaEvent =
    Struct.new(
      :tweet_id,
      :file_url,
      :filename,
      :media_num,
      :extension,
      :height,
      :width,
    )

  # Total number of times the HTTP log entry insert is attempted before the
  # error is re-raised.
  MAX_LOG_SAVE_ATTEMPTS = 2
  private_constant :MAX_LOG_SAVE_ATTEMPTS

  # @param name [#to_s] recorded as +performed_by+ on every HttpLogEntry
  # @param host [String] passed to Ripcord::Client.new
  # @raise [RuntimeError] if +name+ or +host+ is nil/false
  def initialize(name, host)
    raise("no name provided") unless name
    raise("no host provided") unless host

    logger.info("build #{name.to_s.green.bold} - #{host.green}")
    @performed_by = name
    @client = Ripcord::Client.new(host)
  end

  # Starts a new session for +username+ and returns the first decoded event.
  # A new token is generated on every call, replacing any previous one.
  #
  # @param username [String] interpolated into the twitter.com tweets URL
  # @param caused_by_entry [Object, nil] stored as +caused_by_entry+ on any
  #   HttpLogEntry created while decoding the reply
  # @return [Struct, nil] see #decode_message
  # @raise [RuntimeError] if the RPC call fails or the reply token differs
  def start_twitter_user(username, caused_by_entry: nil)
    @token = SecureRandom.uuid
    call_and_decode(
      "start_user",
      [@token, "https://twitter.com/#{username}/tweets"],
      caused_by_entry,
    )
  end

  # Fetches and decodes the next message for the current session token.
  #
  # @param caused_by_entry [Object, nil] stored as +caused_by_entry+ on any
  #   HttpLogEntry created while decoding the reply
  # @return [Struct, nil] see #decode_message
  # @raise [RuntimeError] if the RPC call fails or the reply token differs
  def next_message(caused_by_entry: nil)
    call_and_decode("next_message", [@token], caused_by_entry)
  end

  private

  # Performs one RPC call, raising on failure and decoding the result.
  def call_and_decode(rpc_method, params, caused_by_entry)
    rpc = @client.call(rpc_method, params)
    raise rpc_error_str(rpc) unless rpc.successful?

    decode_message(rpc.result, caused_by_entry)
  end

  # Formats a failed RPC response as "<message>: <data>".
  def rpc_error_str(rpc)
    "#{rpc.error.message}: #{rpc.error.data}"
  end

  # Converts an RPC result hash into the matching event struct.
  #
  # @param response [Hash] result hash, read with symbol keys
  # @param caused_by_entry [Object, nil] forwarded to the HTTP log entry
  # @return [StartEvent, FinishEvent, HttpRequestEvent, TweetEvent,
  #   TweetMediaEvent, nil] nil when +response[:event]+ is not one of the
  #   recognized event names
  # @raise [RuntimeError] if the response token is not the session token
  def decode_message(response, caused_by_entry)
    token = response[:token]
    raise("token mismatch: #{token} != #{@token}") if token != @token

    case response[:event]
    when "start"
      StartEvent.new(response[:url], response[:extractor])
    when "finish"
      FinishEvent.new(nil)
    when "http_request"
      event = build_http_request_event(response[:http_request])
      log_and_set_http_request_event(event, caused_by_entry)
      event
    when "tweet"
      TweetEvent.new(response[:tweet], response[:author])
    when "tweet_media"
      # Struct members are declared in the same order as the positional
      # arguments, so the media hash can be read by member name.
      TweetMediaEvent.new(
        *response[:media].values_at(*TweetMediaEvent.members),
      )
    end
  end

  # Builds an HttpRequestEvent (without a log entry) from the :http_request
  # payload of a message.
  def build_http_request_event(http_request)
    HttpRequestEvent.new(
      method: http_request[:method],
      url: http_request[:url],
      kwargs: http_request[:kwargs],
      requested_at: Time.at(http_request[:requested_at]),
      # kwargs may be absent (the URL rewriting below already allows for
      # that), so read the headers without assuming it is a hash.
      request_headers: http_request.dig(:kwargs, :headers),
      response_headers: http_request[:response_headers],
      response_code: http_request[:status_code],
      # presumably :duration is in seconds — TODO confirm against the server
      response_time_ms: (http_request[:duration] * 1000).to_i,
      body: Base64.decode64(http_request[:content_base64]),
      log_entry: nil,
    )
  end

  # Persists +http_event+ as an HttpLogEntry and stores the saved record in
  # +http_event.log_entry+.
  #
  # @param http_event [HttpRequestEvent]
  # @param caused_by_entry [Object, nil] stored on the new log entry
  # @raise [RuntimeError] if the response headers carry no content type
  # @raise [StandardError] whatever the final insert attempt raised
  def log_and_set_http_request_event(http_event, caused_by_entry)
    request_headers = http_event.request_headers
    response_headers = http_event.response_headers
    content_type = content_type_from(response_headers)
    url = logged_url(http_event)

    attempts = 0
    begin
      attempts += 1
      response_blob_entry =
        BlobEntry.find_or_build(
          content_type: content_type,
          contents: http_event.body,
        )
      log_entry =
        HttpLogEntry.new(
          {
            uri: url,
            verb: http_event.method.downcase,
            content_type: content_type,
            status_code: http_event.response_code,
            request_headers:
              HttpLogEntryHeader.find_or_build(headers: request_headers),
            response_headers:
              HttpLogEntryHeader.find_or_build(headers: response_headers),
            response: response_blob_entry,
            response_time_ms: http_event.response_time_ms,
            requested_at: http_event.requested_at,
            caused_by_entry: caused_by_entry,
            performed_by: @performed_by,
          },
        )
      log_entry.save!
    rescue StandardError
      # NOTE(review): the whole build-and-save is retried once, presumably to
      # ride out a race between find_or_build and a concurrent insert — confirm.
      retry if attempts < MAX_LOG_SAVE_ATTEMPTS
      raise
    end

    logger.debug "insert http log entry #{log_entry.id.to_s.bold}"
    http_event.log_entry = log_entry
  end

  # Returns the Content-Type value from +response_headers+.
  #
  # The two spellings the service is known to send are tried first; any other
  # capitalization (or string keys) is then matched case-insensitively, since
  # HTTP field names are not case-sensitive.
  #
  # @raise [RuntimeError] if no content type is present (including when
  #   +response_headers+ is nil)
  def content_type_from(response_headers)
    headers = response_headers || {}
    headers[:"Content-Type"] || headers[:"content-type"] ||
      headers.find { |name, _| name.to_s.casecmp?("content-type") }&.last ||
      raise("no content type provided: #{response_headers}")
  end

  # Returns the URL to record for +http_event+. When the request kwargs carry
  # a JSON-encoded params[:variables] value, the URL's query string is
  # replaced with that value parsed and re-encoded via #to_query.
  def logged_url(http_event)
    uri = Addressable::URI.parse(http_event.url)
    variables = http_event.kwargs&.dig(:params, :variables)
    uri.query = JSON.parse(variables).to_query if variables
    uri.to_s
  end
end