move proxy config into yaml

This commit is contained in:
2023-03-08 19:26:33 -08:00
parent 5a4595cd57
commit f2c41d2330
12 changed files with 95 additions and 41 deletions

View File

@@ -3,9 +3,7 @@ class Domain::Fa::Job::FaJobBase < Scraper::JobBase
# Returns the FA HTTP client, building it on first use and caching it in the
# class variable @@fa_base_http_client.
# NOTE(review): both an ENV["proxy"]-based construction and an argument-less
# `Scraper::FaHttpClient.new` appear below; only the last expression of the
# `begin` block is cached. Presumably the earlier lines are superseded - confirm.
def self.build_http_client
@@fa_base_http_client ||= begin
proxy = ENV["proxy"]
puts "Building FA HTTP client (proxy: #{proxy})"
Scraper::FaHttpClient.new(proxy || Scraper::CurlHttpPerformer.new)
Scraper::FaHttpClient.new
end
end

View File

@@ -24,10 +24,19 @@ class Domain::Twitter::Job::MediaJob < Domain::Twitter::Job::TwitterJobBase
"#{response.log_entry.response_time_ms} ms"
if response.status_code != 200
if response.status_code == 404
@media.count_404 += 1
if @media.count_404 > 2
@media.state = "error"
@media.save!
return
end
end
fatal_error("#{response.status_code} getting media")
end
@media.file = response.log_entry
@media.save
@media.save!
end
end

View File

@@ -3,17 +3,17 @@ class Domain::Twitter::Job::TwitterJobBase < Scraper::JobBase
# Returns the Twitter HTTP client, building it on first use and caching it in
# the class variable @@twitter_http_client.
# NOTE(review): both an ENV["proxy"]-based construction and an argument-less
# `Scraper::TwitterHttpClient.new` appear below; only the last expression of
# the `begin` block is cached. Presumably the earlier lines are superseded - confirm.
def self.build_http_client
@@twitter_http_client ||= begin
proxy = ENV["proxy"]
puts "Building Twitter HTTP client (proxy: #{proxy})"
Scraper::TwitterHttpClient.new(proxy || Scraper::CurlHttpPerformer.new)
Scraper::TwitterHttpClient.new
end
end
# Returns the GalleryDlClient, building it on first use and caching it in the
# class variable @@gallery_dl_client.
# Raises a RuntimeError when the active proxy config has no gallery_dl host.
# NOTE(review): the hard-coded `GalleryDlClient.new("http://localhost:5000/")`
# call passes one argument, while the client is also shown with an
# `initialize(name, host)` form; presumably the ENV["proxy"] lines are
# superseded by the proxy_config lines - confirm.
def self.build_gallery_dl_client
@@gallery_dl_client ||= begin
proxy = ENV["proxy"]
puts "Building GalleryDlClient (TODO: proxy: #{proxy})"
Scraper::GalleryDlClient.new("http://localhost:5000/")
# Proxy entry selected at boot (config.x.proxy), including its :name.
proxy_config = Rails.application.config.x.proxy
# Not every proxy entry defines a gallery_dl host; fail loudly rather than
# constructing a client without one.
if proxy_config[:gallery_dl].blank?
raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}")
end
Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl])
end
end

View File

@@ -9,7 +9,7 @@ class Scraper::BaseHttpClient
class InvalidURLError < ArgumentError; end
def initialize(http_performer_or_proxy)
def initialize(http_performer = nil)
cookies || raise("Subclass must define non-nil #cookies")
allowed_domains || raise("Subclass must define non-nil #allowed_domains")
@@ -21,11 +21,17 @@ class Scraper::BaseHttpClient
hash[key] = LegacyImport::AdaptiveCache.new(@max_cache_size, 1.0, 0.1)
end
if http_performer_or_proxy.is_a?(String)
@http_performer = Scraper::ProxyHttpPerformer.new(http_performer_or_proxy)
else
@http_performer = http_performer_or_proxy
@http_performer = http_performer || begin
proxy_config = Rails.application.config.x.proxy
if proxy_config[:http]
logger.info("build #{proxy_config[:name]} - #{"Proxy".green} - #{proxy_config[:http]}")
Scraper::ProxyHttpPerformer.new(proxy_config[:name], proxy_config[:http])
else
logger.info("build #{proxy_config[:name]} - #{"Direct".green} - (localhost implicit)")
Scraper::CurlHttpPerformer.new
end
end
@domain_last_requested_at = {}
logger.level = :info
end
@@ -103,8 +109,16 @@ class Scraper::BaseHttpClient
@domain_last_requested_at[uri.host] = requested_at + (response_time_ms.to_f / 1000)
content_type = response_headers["Content-Type"] ||
response_headers["content-type"] ||
raise("no content type provided: #{response_headers}")
response_headers["content-type"]
if content_type.nil?
if response_code == 404
# fudge it
content_type = "plain/fourohfour"
else
raise("no content type provided: #{response_code} / #{response_headers}")
end
end
cache_key = "#{uri.host}|#{content_type}"
blob_entry_cache = @blob_entry_cache[cache_key]

View File

@@ -5,7 +5,7 @@ class Scraper::FaHttpClient < Scraper::BaseHttpClient
"ipinfo.io",
]
def initialize(http_performer_or_proxy)
def initialize(http_performer_or_proxy = nil)
if http_performer_or_proxy.is_a?(String)
@cookies = Rails.application.config.x.cookies.fa[http_performer_or_proxy]
else

View File

@@ -29,7 +29,11 @@ class Scraper::GalleryDlClient
:width,
)
def initialize(host)
def initialize(name, host)
name || raise("no name provided")
host || raise("no host provided")
logger.info("build #{name.green.bold} - #{host.green}")
@performed_by = name
@client = Ripcord::Client.new(host)
@max_cache_size = 8
@blob_entry_cache = Hash.new do |hash, key|
@@ -38,13 +42,16 @@ class Scraper::GalleryDlClient
end
# Issues the "start_user" RPC for https://twitter.com/<username>/tweets and
# returns whatever decode_message produces for the RPC result.
# A fresh UUID is stored in @token and sent with the request; decode_message
# compares the token in each response against @token.
#
# FIXME: the bare `rescue` hands control to `binding.pry`, so any StandardError
# raised in this method (including the bare `raise` on an unsuccessful RPC) is
# not propagated to the caller. This looks like a debugging leftover.
# NOTE(review): two "start_user" calls appear below (without and with the
# token); `rpc` ends up holding only the second result. Presumably the
# token-less call is superseded - confirm.
def start_twitter_user(username, caused_by_entry: nil)
rpc = @client.call("start_user", ["https://twitter.com/#{username}/tweets"])
@token = SecureRandom.uuid
rpc = @client.call("start_user", [@token, "https://twitter.com/#{username}/tweets"])
raise unless rpc.successful?
decode_message(rpc.result, caused_by_entry)
rescue
binding.pry
end
# Issues the "next_message" RPC and returns whatever decode_message produces
# for the RPC result. Raises (bare `raise`) when the RPC is not successful.
# NOTE(review): two "next_message" calls appear below (without and with
# @token); `rpc` ends up holding only the second result. Presumably the
# token-less call is superseded - confirm.
def next_message(caused_by_entry: nil)
rpc = @client.call("next_message", [])
rpc = @client.call("next_message", [@token])
raise unless rpc.successful?
decode_message(rpc.result, caused_by_entry)
end
@@ -52,6 +59,9 @@ class Scraper::GalleryDlClient
private
def decode_message(response, caused_by_entry)
token = response[:token]
raise("token mismatch: #{token} != #{@token}") if token != @token
case response[:event]
when "start" then StartEvent.new(response[:url], response[:extractor])
when "finish" then FinishEvent.new(nil)
@@ -131,7 +141,7 @@ class Scraper::GalleryDlClient
response_time_ms: http_event.response_time_ms,
requested_at: http_event.requested_at,
caused_by_entry: caused_by_entry,
performed_by: "direct-gdl",
performed_by: @performed_by,
})
log_entry.save!

View File

@@ -1,10 +1,11 @@
require "base64"
class Scraper::ProxyHttpPerformer < Scraper::HttpPerformer
def initialize(proxy_name)
@name = proxy_name
proxy_url = Rails.application.config.x.proxies[proxy_name]
@client = Ripcord::Client.new(proxy_url)
def initialize(name, host)
name || raise("no name provided")
host || raise("no host provided")
@name = name
@client = Ripcord::Client.new(host)
end
def name

View File

@@ -4,15 +4,6 @@ class Scraper::TwitterHttpClient < Scraper::BaseHttpClient
"ipinfo.io",
]
# Forwards the performer-or-proxy argument to the base client unchanged.
# A per-proxy cookie lookup (as done in the FA client) is disabled here.
def initialize(http_performer_or_proxy)
  super
end
# Cookies sent by this client: none. The base client requires #cookies to be
# non-nil, so an empty list is returned.
def cookies
  []
end

View File

@@ -10,4 +10,12 @@ class Domain::Twitter::Media < ReduxApplicationRecord
belongs_to :tweet, class_name: "Domain::Twitter::Tweet"
belongs_to :file, class_name: "HttpLogEntry", optional: true
# Number of 404 responses recorded for this media, kept under the
# "404_count" key of state_detail. Seeds the key with 0 when it is unset.
def count_404
  detail = state_detail
  detail["404_count"] ||= 0
end
# Stores the 404 counter under the "404_count" key of state_detail.
def count_404=(num)
  detail = state_detail
  detail["404_count"] = num
end
end

View File

@@ -23,11 +23,11 @@ module ReduxScraper
# config.time_zone = "Central Time (US & Canada)"
# config.eager_load_paths << Rails.root.join("extras")
config.x.cookies.fa = ReduxScraper::Application.config_for("cookies/fa")
config.x.proxies = {
"proxy-1" => "http://proxy-1.local:9292",
"dedipath-1" => "http://10.200.0.6:9292",
}
config.x.proxies = ReduxScraper::Application.config_for("proxies")
config.x.influxdb = ReduxScraper::Application.config_for("influxdb")
proxy_name = ENV["proxy"] || "direct"
config.x.proxy = Rails.application.config.x.proxies[proxy_name] || raise("no proxy config for '#{proxy_name}'")
config.x.proxy[:name] = proxy_name
end
end

23
config/proxies.yml Normal file
View File

@@ -0,0 +1,23 @@
# Proxy endpoints keyed by proxy name. The entry to use is picked at boot from
# ENV["proxy"], falling back to "direct".
#   http:       URL handed to the proxy HTTP performer; when unset, requests
#               are made with the direct curl performer instead.
#   gallery_dl: host handed to the gallery-dl client; building that client
#               raises when the selected entry has no gallery_dl value.
default: &default
direct:
http: null # use direct curl instead of proxy-app
gallery_dl: http://localhost:5001
proxy-1:
http: http://proxy-1.local:9292
# gallery_dl: http://proxy-1.local:5001
dedipath-1:
http: http://10.200.0.6:9292
# gallery_dl: http://10.200.0.6:5001
production:
<<: *default
development:
<<: *default
# NOTE(review): the test entries below carry no values. If they load as null,
# the boot-time lookup (`proxies[proxy_name] || raise(...)`) would raise
# "no proxy config" in the test environment - confirm this is intended.
test:
direct:
proxy-1:
dedipath-1:

View File

@@ -38,7 +38,7 @@ queues = {
}
if ENV["proxy"] == "direct" || ENV["proxy"].nil?
queues[:twitter_timeline_tweets] = 1
queues[:twitter_timeline_tweets] = 4
end
queues.each do |queue, workers|