add basic test for twitter timeline job

This commit is contained in:
Dylan Knutson
2023-03-28 15:54:31 +09:00
parent b1a5496f09
commit 31d78ad0b9
13 changed files with 213 additions and 82 deletions

View File

@@ -2,7 +2,7 @@ class Domain::Fa::Job::FaJobBase < Scraper::JobBase
discard_on ActiveJob::DeserializationError
def self.http_factory_method
:new_fa_http_client
:get_fa_http_client
end
protected

View File

@@ -1,20 +1,8 @@
class Domain::Twitter::Job::TwitterJobBase < Scraper::JobBase
discard_on ActiveJob::DeserializationError
def self.build_http_client
@@twitter_http_client ||= begin
Scraper::TwitterHttpClient.new
end
end
def self.build_gallery_dl_client
@@gallery_dl_client ||= begin
proxy_config = Rails.application.config.x.proxy
if proxy_config[:gallery_dl].blank?
raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}")
end
Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl])
end
def self.http_factory_method
:get_twitter_http_client
end
def find_or_intitialize_user_from_args(args)
@@ -28,14 +16,4 @@ class Domain::Twitter::Job::TwitterJobBase < Scraper::JobBase
fatal_error("arg 'name' or 'tw_id' is required if arg 'user' is nil")
end
end
protected
def http_client
@http_client ||= self.class.build_http_client
end
def gallery_dl_client
@gallery_dl_client ||= self.class.build_gallery_dl_client
end
end

View File

@@ -1,5 +1,5 @@
class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::TwitterJobBase
GDL = Scraper::GalleryDlClient
GDLClient = Scraper::GalleryDlClient
queue_as :twitter_timeline_tweets
def perform(args)
@@ -35,31 +35,32 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
event = gallery_dl_client.next_message(
caused_by_entry: @first_twitter_caused_by || @caused_by_entry,
)
fatal_error("nil event from gallery_dl_client") if event.nil?
case event
when GDL::StartEvent
logger.info("start tweets(#{event.extractor})")
when GDL::FinishEvent
logger.info("end tweets - #{@num_created_tweets} total")
when GDLClient::StartEvent
logger.info("start tweets (#{event.extractor.to_s.bold})")
when GDLClient::FinishEvent
logger.info("finish - #{@num_created_tweets.to_s.bold} saw total")
break
when GDL::HttpRequestEvent
when GDLClient::HttpRequestEvent
@first_twitter_caused_by ||= event.log_entry
maybe_extract_user_info(event)
when GDL::TweetEvent
when GDLClient::TweetEvent
update_user_from_tweet(event)
break if handle_tweet_event(event) == :break
when GDL::TweetMediaEvent
when GDLClient::TweetMediaEvent
handle_media_event(event)
else
raise("unhandled event #{event}")
fatal_error("unhandled event: `#{event.inspect}`")
end
end
logger.info(
"created #{@num_created_tweets.to_s.bold} / " +
"scanned #{@num_scanned_tweets.to_s.bold} tweets, " +
"created #{@num_created_medias} medias"
)
logger.info([
"created #{@num_created_tweets.to_s.bold} tweets",
"scanned #{@num_scanned_tweets.to_s.bold} tweets",
"created #{@num_created_medias.to_s.bold} medias",
].join(", "))
@user.scanned_timeline_at = Time.now
@user.state = "ok"
@@ -87,28 +88,31 @@ class Domain::Twitter::Job::UserTimelineTweetsJob < Domain::Twitter::Job::Twitte
private
def maybe_extract_user_info(http_event)
return if @user&.id
return unless http_event.response_code == 200
return unless http_event.response_headers[:'content-type'].starts_with?("application/json")
json = JSON.parse(http_event.body)
return unless json
typename = json.dig("data", "user", "result", "__typename")
return unless typename == "User"
user_id = json.dig("data", "user", "result", "rest_id")
return unless user_id
user_id = user_id.to_i
tw_id = json.dig("data", "user", "result", "rest_id")
return unless tw_id
tw_id = tw_id.to_i
existing = Domain::Twitter::User.find_by(tw_id: user_id)
# tw_id is already up to date, do nothing
return if @user.tw_id == tw_id
existing = Domain::Twitter::User.find_by(tw_id: tw_id)
if existing
logger.info "found existing user model"
logger.info "found existing user with tw_id #{tw_id.to_s.bold}"
@user = existing
@updated_user_from_tweet = false
else
if @user.tw_id
raise "tw_id mismatch" unless @user.tw_id == user_id
raise "tw_id mismatch" unless @user.tw_id == tw_id
else
logger.info "extracted tw_id for user"
@user.tw_id = user_id
logger.info "extracted tw_id #{tw_id.to_s.bold}"
@user.tw_id = tw_id
@user.save!
end
end
end

View File

@@ -11,7 +11,11 @@ class Scraper::JobBase < ApplicationJob
end
def http_client
@http_client ||= Scraper::HttpFactory.send(self.class.http_factory_method)
@http_client ||= Scraper::ClientFactory.send(self.class.http_factory_method)
end
def gallery_dl_client
@gallery_dl_client ||= Scraper::ClientFactory.get_gallery_dl_client
end
good_job_control_concurrency_with(

View File

@@ -0,0 +1,58 @@
# Central place where scraper jobs obtain their HTTP and gallery-dl clients.
#
# In the test environment every getter returns the mock that was installed
# through the `*_mock=` setters (and raises if none was installed). In any
# other environment the real client is built from
# `Rails.application.config.x.proxy` and cached in a
# `Concurrent::ThreadLocalVar`.
class Scraper::ClientFactory
  # HTTP clients, cached by key (:twitter, :fa) in a thread-local hash.
  @http_clients = Concurrent::ThreadLocalVar.new() { {} }
  # Cached gallery-dl client, thread-local; starts out as nil.
  @gallery_dl_clients = Concurrent::ThreadLocalVar.new(nil)

  # for testing only

  # Installs the object returned by every `get_*_http_client` call.
  # Raises RuntimeError when called outside the test environment.
  def self.http_client_mock=(mock)
    # A bare `raise` would surface only as "unhandled exception"; say what went wrong.
    raise("http_client_mock= can only be used in the test environment") unless Rails.env.test?
    @http_client_mock = mock
  end

  # Installs the object returned by `get_gallery_dl_client`.
  # Raises RuntimeError when called outside the test environment.
  def self.gallery_dl_client_mock=(mock)
    raise("gallery_dl_client_mock= can only be used in the test environment") unless Rails.env.test?
    @gallery_dl_client_mock = mock
  end

  # public API

  # Returns the gallery-dl client: the installed mock under test (raising if
  # none is set), otherwise the cached real client.
  def self.get_gallery_dl_client
    if Rails.env.test?
      @gallery_dl_client_mock || raise("no gallery dl mock set")
    else
      _gallery_dl_client_impl
    end
  end

  # Returns the HTTP client configured with Scraper::TwitterHttpClientConfig:
  # the installed mock under test (raising if none is set), otherwise the
  # cached real client.
  def self.get_twitter_http_client
    if Rails.env.test?
      @http_client_mock || raise("no http client mock set")
    else
      _http_client_impl(:twitter, Scraper::TwitterHttpClientConfig)
    end
  end

  # Returns the HTTP client configured with Scraper::FaHttpClientConfig:
  # the installed mock under test (raising if none is set), otherwise the
  # cached real client.
  def self.get_fa_http_client
    if Rails.env.test?
      @http_client_mock || raise("no http client mock set")
    else
      _http_client_impl(:fa, Scraper::FaHttpClientConfig)
    end
  end

  # Builds the real gallery-dl client on first use and caches it.
  # Raises when there is no proxy config, or when the proxy config has no
  # :gallery_dl entry.
  def self._gallery_dl_client_impl
    @gallery_dl_clients.value ||= begin
      proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
      if proxy_config[:gallery_dl].blank?
        raise("no gallery_dl host defined for proxy config #{proxy_config[:name]}")
      end
      Scraper::GalleryDlClient.new(proxy_config[:name], proxy_config[:gallery_dl])
    end
  end

  # Builds the real HTTP client for `key` on first use and caches it under
  # that key. `config_klass` is instantiated with no arguments to configure
  # the client. Raises when there is no proxy config.
  def self._http_client_impl(key, config_klass)
    @http_clients.value[key] ||= begin
      proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
      performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
      Scraper::HttpClient.new(config_klass.new, performer)
    end
  end
end

View File

@@ -14,6 +14,7 @@ class Scraper::GalleryDlClient
:response_time_ms,
:body,
:log_entry,
keyword_init: true,
)
TweetEvent = Struct.new(
:tweet,

View File

@@ -1,27 +0,0 @@
# Factory for scraper HTTP clients. Real clients are cached by key in a
# Concurrent::ThreadLocalVar; under the test environment the installed mock
# is handed out instead.
class Scraper::HttpFactory
  @http_clients = Concurrent::ThreadLocalVar.new() { {} }

  class << self
    # Installs the object returned by `new_fa_http_client` under test.
    attr_writer :http_client_mock

    # Returns the FA HTTP client: the mock under test (raising if none has
    # been set), otherwise the cached real client.
    def new_fa_http_client
      return @http_client_mock || raise("no mock set yet") if Rails.env.test?

      get_or_create_client(:fa, Scraper::FaHttpClientConfig.new)
    end

    # Placeholder; always raises.
    def new_gallery_dl_client
      raise("not implemented yet")
    end

    # Returns the client cached under `key`, building it from the proxy
    # config and `config` on first use. Raises when no proxy config exists.
    def get_or_create_client(key, config)
      cache = @http_clients.value
      cached = cache[key]
      return cached if cached

      proxy_config = Rails.application.config.x.proxy || raise("no proxy config")
      performer = Scraper::HttpPerformer.new(proxy_config[:name], proxy_config[:http])
      cache[key] = Scraper::HttpClient.new(config, performer)
    end
  end
end

View File

@@ -1,4 +1,4 @@
class Scraper::TwitterHttpClient < Scraper::BaseHttpClient
class Scraper::TwitterHttpClientConfig < Scraper::HttpClientConfig
DEFAULT_ALLOWED_DOMAINS = [
"*.twimg.com",
"ipinfo.io",
@@ -18,4 +18,8 @@ class Scraper::TwitterHttpClient < Scraper::BaseHttpClient
def allowed_domains
DEFAULT_ALLOWED_DOMAINS
end
def redirect_limit
4
end
end

2
db/schema.rb generated
View File

@@ -195,6 +195,7 @@ ActiveRecord::Schema[7.0].define(version: 2023_03_25_124842) do
end
create_table "domain_twitter_users", force: :cascade do |t|
t.integer "tw_id"
t.integer "state"
t.json "state_detail"
t.json "raw_data"
@@ -209,6 +210,7 @@ ActiveRecord::Schema[7.0].define(version: 2023_03_25_124842) do
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["name"], name: "index_domain_twitter_users_on_name", unique: true
t.index ["tw_id"], name: "index_domain_twitter_users_on_tw_id", unique: true
end
create_table "flat_sst_entries", id: false, force: :cascade do |t|

View File

@@ -0,0 +1,56 @@
# Spec helpers for scripting a gallery-dl client mock.
module TwitterHelpers
  # Registers one message expectation on `mock` per step in `sequence`.
  #
  # Each step is a Hash with:
  #   :receive - the message the mock is expected to receive (required)
  #   :with    - optional; either an Array of expected arguments, or a
  #              callable that is given the whole sequence and returns that
  #              Array (lets a step refer to values of other steps)
  #   :return  - optional; the value the mock returns for that message
  #
  # After the scripted steps, a `next_message` stub that raises RuntimeError
  # is added via `allow`.
  def set_up_gallery_dl_mock(mock, sequence)
    sequence.each do |step|
      expected = receive(step[:receive])

      if step[:with]
        with_args = step[:with]
        with_args = with_args.call(sequence) if with_args.respond_to?(:call)
        expected = expected.with(*with_args)
      end

      expected = expected.and_return(step[:return]) if step[:return]
      expect(mock).to(expected)
    end

    allow(mock).to(receive(:next_message).and_raise(RuntimeError.new))
  end

  # Scripts `mock` for a timeline scan of user "curtus" that yields no
  # tweets: `start_twitter_user`, then one HTTP request event whose JSON body
  # carries the user's rest_id ("1234567"), then a finish event.
  #
  # The event classes are referenced through Scraper::GalleryDlClient
  # directly so this helper does not depend on a `GDLClient` alias being
  # defined by whichever spec file includes it.
  def gallery_dl_user_with_no_tweets_sequence(mock)
    user_lookup_body = {
      "data" => {
        "user" => {
          "result" => {
            "__typename" => "User",
            "rest_id" => "1234567",
          },
        },
      },
    }.to_json

    set_up_gallery_dl_mock(mock, [
      {
        receive: :start_twitter_user,
        with: ["curtus", { caused_by_entry: nil }],
      },
      {
        receive: :next_message,
        with: [{ caused_by_entry: nil }],
        return: Scraper::GalleryDlClient::HttpRequestEvent.new(
          log_entry: instance_double("::HttpLogEntry"),
          response_code: 200,
          response_headers: {
            :'content-type' => "application/json",
          },
          body: user_lookup_body,
        ),
      },
      {
        receive: :next_message,
        # the second poll is expected to be caused by the log entry of the
        # HTTP request event returned by the previous step
        with: ->(steps) { [{ caused_by_entry: steps[1][:return].log_entry }] },
        return: Scraper::GalleryDlClient::FinishEvent.new,
      },
    ])
  end
end

View File

@@ -3,11 +3,7 @@ require "rails_helper"
describe Domain::Fa::Job::BrowsePageJob do
let(:http_client_mock) { instance_double("::Scraper::HttpClient") }
before do
Scraper::HttpFactory.http_client_mock = http_client_mock
end
around do |block|
ColorLogger.quiet(&block)
Scraper::ClientFactory.http_client_mock = http_client_mock
end
shared_context "user and post getters" do

View File

@@ -0,0 +1,42 @@
# Exercises UserTimelineTweetsJob against a scripted gallery-dl client mock.
describe Domain::Twitter::Job::UserTimelineTweetsJob do
  # Alias for the gallery-dl client class; the scripted mock sequence refers
  # to its event classes through this name.
  GDLClient = Scraper::GalleryDlClient

  let(:http_client_mock) { instance_double("::Scraper::HttpClient") }
  let(:gallery_dl_client_mock) { instance_double("::Scraper::GalleryDlClient") }

  # Route every client lookup made by the job to the doubles above.
  before do
    Scraper::ClientFactory.http_client_mock = http_client_mock
    Scraper::ClientFactory.gallery_dl_client_mock = gallery_dl_client_mock
  end

  context "scanning an unseen user" do
    context "and the user has no tweets" do
      it "creates the user" do
        gallery_dl_user_with_no_tweets_sequence(gallery_dl_client_mock)
        job_args = { name: "curtus" }

        # the job must not return an exception, and must add exactly one user row
        expect {
          result = described_class.perform_now(job_args)
          expect(result).to_not be_a(Exception)
        }.to change { Domain::Twitter::User.count }.by(1)

        created = Domain::Twitter::User.find_by(name: "curtus")
        expect(created).to_not be_nil
        expect(created.tw_id).to eq(1_234_567)
      end
    end
  end

  context "scanning an exisitng user with no tw_id" do
    it "updates the user", quiet: false do
      gallery_dl_user_with_no_tweets_sequence(gallery_dl_client_mock)
      existing = Domain::Twitter::User.create!(name: "curtus")
      job_args = { name: "curtus" }

      # the pre-existing row is reused rather than duplicated
      expect {
        result = described_class.perform_now(job_args)
        expect(result).to_not be_a(Exception)
      }.not_to change { Domain::Twitter::User.count }

      existing.reload
      expect(existing.tw_id).to eq(1_234_567)
    end
  end
end

View File

@@ -13,7 +13,20 @@
# it.
#
# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
require "./spec/helpers/twitter_helpers"
RSpec.configure do |config|
config.include TwitterHelpers
# ColorLogger is quieted around every example by default; examples whose
# metadata carries exactly `quiet: false` run with it loud.
config.around(:each) do |example|
  wants_loud = example.example.metadata[:quiet].equal?(false)
  next example.call if wants_loud

  ColorLogger.quiet(&example)
end
# rspec-expectations config goes here. You can use an alternate
# assertion/expectation library such as wrong or the stdlib/minitest
# assertions if you prefer.