refactor file structure
@@ -17,21 +17,21 @@ namespace :stats do
     max_points = args[:max_points]&.to_i

     # Query and sample data
-    records_array = StatsHelpers.sample_records(max_points)
+    records_array = Stats::Helpers.sample_records(max_points)

     # Create base normalizer for display ranges
-    base_normalizer = LinearNormalizer.new(records_array)
+    base_normalizer = Stats::LinearNormalizer.new(records_array)

     puts "📈 X-axis range (fav_fa_id): #{base_normalizer.x_range}"
     puts "📈 Y-axis range (date): #{base_normalizer.y_range}"

     # Split data for plotting
-    split = StatsHelpers.split_train_test(records_array)
-    train_normalizer = LinearNormalizer.new(split.training_records)
-    eval_normalizer = LinearNormalizer.new(split.evaluation_records)
+    split = Stats::Helpers.split_train_test(records_array)
+    train_normalizer = Stats::LinearNormalizer.new(split.training_records)
+    eval_normalizer = Stats::LinearNormalizer.new(split.evaluation_records)

     # Run regressions using specialized normalizers
-    regressions = RegressionAnalyzer.new(records_array).analyze
+    regressions = Stats::RegressionAnalyzer.new(records_array).analyze

     # Display results (automatically denormalized)
     regressions.each do |name, result|
@@ -42,7 +42,7 @@ namespace :stats do

     # Generate visualizations
     puts "\n🎨 Generating visualizations with UnicodePlot..."
-    plotter = StatsPlotter.new
+    plotter = Stats::Plotter.new

     plotter.plot_train_eval_scatter(
       "Original Data (Train/Eval)",
@@ -63,857 +63,36 @@ namespace :stats do
     )

     puts "\n✅ Graph generation completed!"
   end
 end
-
-# Helper methods extracted to avoid private method issues in Rake context
-module StatsHelpers
-  extend T::Sig
-
-  sig do
-    params(max_points: T.nilable(Integer)).returns(
-      T::Array[Domain::FaFavIdAndDate],
-    )
-  end
-  def self.sample_records(max_points)
-    records = Domain::FaFavIdAndDate.complete
-
-    if records.empty?
-      puts "❌ No complete FaFavIdAndDate records found"
-      exit 1
-    end
-
-    total_records = records.count
-    puts "📊 Found #{total_records} complete records"
-    records = records.select(:id, :fav_fa_id, :date)
-
-    records_array = records.to_a
-    if max_points && total_records > max_points
-      puts "🎲 Randomly sampling #{max_points} points from #{total_records} total records"
-      srand(42) # Fixed seed for reproducibility
-      records_array =
-        T.cast(
-          records_array.sample(max_points),
-          T::Array[Domain::FaFavIdAndDate],
-        )
-      puts "📊 Using #{records_array.length} sampled records for analysis"
-    else
-      message =
-        (
-          if max_points
-            "within max_points limit of #{max_points}"
-          else
-            "no sampling limit specified"
-          end
-        )
-      puts "📊 Using all #{records_array.length} records (#{message})"
-    end
-
-    records_array
-  end
-
-  sig do
-    params(
-      records: T::Array[Domain::FaFavIdAndDate],
-      eval_ratio: Float,
-    ).returns(TrainTestSplit)
-  end
-  def self.split_train_test(records, eval_ratio = 0.2)
-    # Set random seed for reproducibility
-    srand(42)
-
-    # Shuffle the records
-    shuffled_records = records.shuffle
-
-    # Calculate split point
-    split_index = (records.length * (1.0 - eval_ratio)).round
-
-    training_records =
-      T.cast(
-        shuffled_records[0...split_index],
-        T::Array[Domain::FaFavIdAndDate],
-      )
-    evaluation_records =
-      T.cast(
-        shuffled_records[split_index..-1],
-        T::Array[Domain::FaFavIdAndDate],
-      )
-
-    split =
-      TrainTestSplit.new(
-        training_records: training_records,
-        evaluation_records: evaluation_records,
-      )
-
-    split
-  end
-
-  sig { params(value: Float).returns(Float) }
-  def self.format_r_squared(value)
-    value.round(3).to_f
-  end
-end
-
-# Immutable struct representing training and evaluation data split
-class TrainTestSplit < T::ImmutableStruct
-  extend T::Sig
-
-  const :training_records, T::Array[Domain::FaFavIdAndDate]
-  const :evaluation_records, T::Array[Domain::FaFavIdAndDate]
-
-  sig { returns(Integer) }
-  def training_count
-    training_records.length
-  end
-
-  sig { returns(Integer) }
-  def evaluation_count
-    evaluation_records.length
-  end
-
-  sig { returns(Integer) }
-  def total_count
-    training_count + evaluation_count
-  end
-
-  sig { returns(String) }
-  def summary
-    "📊 Split data: #{training_count} training, #{evaluation_count} evaluation records"
-  end
-end
-
-class AxisRange < T::ImmutableStruct
-  extend T::Sig
-
-  const :min, Float
-  const :max, Float
-
-  sig { returns(Float) }
-  def scale
-    max - min
-  end
-
-  sig { returns(T::Range[Float]) }
-  def range
-    min..max
-  end
-
-  sig { params(value: Float).returns(Float) }
-  def normalize(value)
-    (value - min) / scale
-  end
-
-  sig { params(value: Float).returns(Float) }
-  def denormalize(value)
-    value * scale + min
-  end
-
-  sig do
-    params(
-      mapper: T.nilable(T.proc.params(arg: Float).returns(String)),
-    ).returns(String)
-  end
-  def as_string(&mapper)
-    mapper ||= ->(x) { x }
-    "#{mapper.call(min)} to #{mapper.call(max)}"
-  end
-end
-
-# Base class for data normalization and denormalization
-class DataNormalizer
-  extend T::Sig
-  extend T::Helpers
-  abstract!
-
-  sig(:final) { returns(T::Array[Float]) }
-  attr_reader :x_values
-
-  sig(:final) { returns(T::Array[Float]) }
-  attr_reader :y_values
-
-  sig(:final) { params(records: T::Array[Domain::FaFavIdAndDate]).void }
-  def initialize(records)
-    data_points =
-      records.map do |record|
-        {
-          x: record.fav_fa_id.to_f,
-          y: T.cast(record.date&.to_time&.to_i&.to_f, Float),
-        }
-      end
-
-    data_points.sort_by! { |point| point[:x] }
-    @x_values = T.let(data_points.map { |p| p[:x] }, T::Array[Float])
-    @y_values = T.let(data_points.map { |p| p[:y] }, T::Array[Float])
-
-    # Calculate min/max for normalization
-    x_minmax = T.cast(@x_values.minmax, [Float, Float])
-    y_minmax = T.cast(@y_values.minmax, [Float, Float])
-    @x = T.let(AxisRange.new(min: x_minmax[0], max: x_minmax[1]), AxisRange)
-    @y = T.let(AxisRange.new(min: y_minmax[0], max: y_minmax[1]), AxisRange)
-  end
-
-  sig(:final) { returns(String) }
-  def x_range
-    @x.as_string
-  end
-
-  sig(:final) { returns(String) }
-  def y_range
-    @y.as_string { |x| Time.at(x) }
-  end
-
-  # Accessors for equation classes
-  sig(:final) { returns(Float) }
-  def x_scale
-    @x.scale
-  end
-
-  sig(:final) { returns(Float) }
-  def y_scale
-    @y.scale
-  end
-
-  sig(:final) { returns(Float) }
-  def x_min
-    @x.min
-  end
-
-  sig(:final) { returns(Float) }
-  def y_min
-    @y.min
-  end
-
-  # Convert raw data to normalized [0,1] scale for Rumale
-  sig(:final) { returns(T::Array[T::Array[Float]]) }
-  def normalized_x_matrix
-    @x_values.map { |x| [@x.normalize(x)] }
-  end
-
-  sig(:final) { returns(T::Array[Float]) }
-  def normalized_y_vector
-    @y_values.map { |y| @y.normalize(y) }
-  end
-
-  # Generate regression line points in original scale
-  sig(:final) { returns(T::Array[Float]) }
-  def regression_x_range
-    step_size = @x.scale / 50.0
-    @x.range.step(step_size).to_a
-  end
-
-  # Default transformation matrix (identity for linear/quadratic)
-  sig { returns(T::Array[T::Array[Float]]) }
-  def transformed_x_matrix
-    normalized_x_matrix
-  end
-
-  # Abstract method for denormalizing regression results
-  sig do
-    abstract
-      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
-      .returns(T::Array[Float])
-  end
-  def denormalize_regression(regression_x, weight_vec)
-  end
-end
-
-class LinearNormalizer < DataNormalizer
-  # Denormalize linear regression results back to original scale
-  sig do
-    override
-      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
-      .returns(T::Array[Float])
-  end
-  def denormalize_regression(regression_x, weight_vec)
-    norm_slope = T.cast(weight_vec[1], Float)
-    norm_intercept = T.cast(weight_vec[0], Float)
-    regression_x.map do |x|
-      x_norm = @x.normalize(x)
-      y_norm = norm_slope * x_norm + norm_intercept
-      @y.denormalize(y_norm)
-    end
-  end
-
-  # Denormalize linear regression coefficients back to original scale
-  sig do
-    params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
-  end
-  def denormalize_coefficients(norm_intercept, norm_slope)
-    slope_orig = norm_slope * @y.scale / @x.scale
-    intercept_orig = (norm_intercept * @y.scale + @y.min) - slope_orig * @x.min
-
-    [intercept_orig, slope_orig]
-  end
-end
-
-class QuadraticNormalizer < DataNormalizer
-  # Denormalize quadratic regression results back to original scale
-  sig do
-    override
-      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
-      .returns(T::Array[Float])
-  end
-  def denormalize_regression(regression_x, weight_vec)
-    norm_a = T.cast(weight_vec[2], Float)
-    norm_b = T.cast(weight_vec[1], Float)
-    norm_c = T.cast(weight_vec[0], Float)
-    regression_x.map do |x|
-      x_norm = @x.normalize(x)
-      y_norm = norm_a * x_norm * x_norm + norm_b * x_norm + norm_c
-      @y.denormalize(y_norm)
-    end
-  end
-
-  # Denormalize quadratic regression coefficients back to original scale
-  sig do
-    params(norm_c: Float, norm_b: Float, norm_a: Float).returns(T::Array[Float])
-  end
-  def denormalize_coefficients(norm_c, norm_b, norm_a)
-    a_orig = norm_a * @y.scale / (@x.scale * @x.scale)
-    b_orig = norm_b * @y.scale / @x.scale - 2 * a_orig * @x.min
-    c_orig =
-      (norm_c * @y.scale + @y.min) - b_orig * @x.min - a_orig * @x.min * @x.min
-
-    [c_orig, b_orig, a_orig]
-  end
-end
-
-# Base class for transformations that follow y = a * f(x) + b pattern
-# where f(x) is a transformation function and denormalization only requires y-scaling
-class TransformedNormalizer < DataNormalizer
-  extend T::Sig
-  extend T::Helpers
-  abstract!
-
-  # Denormalize coefficients for simple transformations (only y-scaling needed)
-  sig do
-    params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
-  end
-  def denormalize_coefficients(norm_intercept, norm_slope)
-    slope_orig = norm_slope * @y.scale
-    intercept_orig = norm_intercept * @y.scale + @y.min
-
-    [intercept_orig, slope_orig]
-  end
-
-  # Common denormalization logic using the transformation function
-  sig do
-    override
-      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
-      .returns(T::Array[Float])
-  end
-  def denormalize_regression(regression_x, weight_vec)
-    norm_slope = T.cast(weight_vec[1], Float)
-    norm_intercept = T.cast(weight_vec[0], Float)
-    regression_x.map do |x|
-      # y = a * f(x) + b, where coefficients are in normalized space
-      y_norm = norm_slope * transform_x(x) + norm_intercept
-      @y.denormalize(y_norm)
-    end
-  end
-
-  protected
-
-  # Abstract method for applying the transformation function
-  sig { abstract.params(x: Float).returns(Float) }
-  def transform_x(x)
-  end
-end
-
-# Logarithmic regression specific normalizer
-class LogarithmicNormalizer < TransformedNormalizer
-  extend T::Sig
-
-  # Convert x values to log-transformed matrix for logarithmic regression
-  sig { returns(T::Array[T::Array[Float]]) }
-  def transformed_x_matrix
-    @x_values.map { |x| [Math.log(x)] }
-  end
-
-  protected
-
-  # Apply logarithmic transformation
-  sig { override.params(x: Float).returns(Float) }
-  def transform_x(x)
-    Math.log(x)
-  end
-end
-
-# Square root regression specific normalizer
-class SquareRootNormalizer < TransformedNormalizer
-  extend T::Sig
-
-  # Convert x values to square root transformed matrix for square root regression
-  sig { returns(T::Array[T::Array[Float]]) }
-  def transformed_x_matrix
-    @x_values.map { |x| [Math.sqrt(x)] }
-  end
-
-  protected
-
-  # Apply square root transformation
-  sig { override.params(x: Float).returns(Float) }
-  def transform_x(x)
-    Math.sqrt(x)
-  end
-end
-
-# Base class for regression equations with common formatting logic
-class Equation
-  extend T::Sig
-
-  sig { params(normalizer: DataNormalizer).void }
-  def initialize(normalizer)
-    @normalizer = normalizer
-  end
-
-  # Format a number with significant figures and scientific notation when needed
-  sig { params(num: Float, sig_figs: Integer).returns(String) }
-  def format_number(num, sig_figs = 3)
-    # Handle zero case
-    return "0.0" if num.zero?
-
-    # Get order of scale
-    order = Math.log10(num.abs).floor
-
-    # Use scientific notation for very large or small numbers
-    if order >= 6 || order <= -3
-      # Scale number between 1 and 10
-      scaled = num / (10.0**order)
-      # Round to sig figs
-      rounded = scaled.round(sig_figs - 1)
-      "#{rounded}e#{order}"
-    else
-      # For normal range numbers, just round to appropriate decimal places
-      decimal_places = sig_figs - (order + 1)
-      decimal_places = 0 if decimal_places < 0
-      num.round(decimal_places).to_s
-    end
-  end
-
-  sig { returns(String) }
-  def to_s
-    format_equation
-  end
-
-  protected
-
-  sig { returns(String) }
-  def format_equation
-    raise NotImplementedError, "Subclasses must implement format_equation"
-  end
-
-  sig { returns(DataNormalizer) }
-  attr_reader :normalizer
-end
-
-class PolynomialEquation < Equation
-  extend T::Sig
-
-  sig { params(normalizer: DataNormalizer, coefficients: T::Array[Float]).void }
-  def initialize(normalizer, coefficients)
-    super(normalizer)
-    @coefficients = coefficients
-  end
-
-  protected
-
-  sig { returns(String) }
-  def format_equation
-    "y = #{polynomial_equation(@coefficients)}"
-  end
-
-  private
-
-  # Convert array of coefficients into polynomial equation string
-  sig { params(coefficients: T::Array[Float]).returns(String) }
-  def polynomial_equation(coefficients)
-    terms =
-      coefficients.each_with_index.map do |coeff, power|
-        next if coeff.zero?
-
-        term = format_number(coeff)
-        case power
-        when 0
-          term
-        when 1
-          "#{term}x"
-        else
-          "#{term}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
-        end
-      end
-
-    terms.compact.reverse.join(" + ").gsub("+ -", "- ")
-  end
-end
-
-# Base class for transformed equations that follow y = a * f(x) + b pattern
-class TransformedEquation < Equation
-  extend T::Sig
-  extend T::Helpers
-  abstract!
-
-  sig do
-    params(
-      normalizer: DataNormalizer,
-      norm_slope: Float,
-      norm_intercept: Float,
-    ).void
-  end
-  def initialize(normalizer, norm_slope, norm_intercept)
-    super(normalizer)
-    @norm_slope = norm_slope
-    @norm_intercept = norm_intercept
-  end
-
-  protected
-
-  sig { returns(String) }
-  def format_equation
-    slope_orig = @norm_slope * @normalizer.y_scale
-    intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min
-
-    "y = #{format_number(slope_orig)} * #{transform_symbol("x")} + #{format_number(intercept_orig)}"
-  end
-
-  # Abstract method for the transformation symbol
-  sig { abstract.params(x: String).returns(String) }
-  def transform_symbol(x)
-  end
-end
-
-class LogarithmicEquation < TransformedEquation
-  extend T::Sig
-
-  protected
-
-  sig { override.params(x: String).returns(String) }
-  def transform_symbol(x)
-    "ln(#{x})"
-  end
-end
-
-class SquareRootEquation < TransformedEquation
-  extend T::Sig
-
-  protected
-
-  sig { override.params(x: String).returns(String) }
-  def transform_symbol(x)
-    "√#{x}"
-  end
-end
-
-# Immutable struct representing a single regression analysis result
-class RegressionResult < T::ImmutableStruct
-  extend T::Sig
-
-  const :equation, Equation
-  const :training_r_squared, Float
-  const :evaluation_r_squared, Float
-  const :x_values, T::Array[Float]
-  const :y_values, T::Array[Float]
-
-  sig { returns(String) }
-  def equation_string
-    equation.to_s
-  end
-
-  sig { returns(String) }
-  def score_summary
-    "Training R² = #{StatsHelpers.format_r_squared(training_r_squared)}, Evaluation R² = #{StatsHelpers.format_r_squared(evaluation_r_squared)}"
-  end
-end
-
-# Immutable struct representing the complete analysis results
-class AnalysisResults < T::ImmutableStruct
-  extend T::Sig
-
-  const :linear, RegressionResult
-  const :quadratic, RegressionResult
-  const :logarithmic, RegressionResult
-  const :square_root, RegressionResult
-end
-
-# Handles regression analysis using Rumale with normalized data
-class RegressionAnalyzer
-  extend T::Sig
-
-  sig { params(records: T::Array[Domain::FaFavIdAndDate]).void }
-  def initialize(records)
-    @records = records
-  end
-
-  # Generic regression analysis method to eliminate duplication
-  sig do
-    params(
-      normalizer_class: T.class_of(DataNormalizer),
-      equation_class: T.class_of(Equation),
-      split: TrainTestSplit,
-      degree: Integer,
-    ).returns(RegressionResult)
-  end
-  def analyze_regression(normalizer_class, equation_class, split, degree: 1)
-    # Create normalizers for training and evaluation data
-    training_normalizer = normalizer_class.new(split.training_records)
-    evaluation_normalizer = normalizer_class.new(split.evaluation_records)
-
-    regression_x = training_normalizer.regression_x_range
-    poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree:)
-    regressor = Rumale::LinearModel::LinearRegression.new(fit_bias: true)
-    pipeline =
-      Rumale::Pipeline::Pipeline.new(
-        steps: {
-          transformer: poly_features,
-          estimator: regressor,
-        },
-      )
+    # Save each regression model to the database
+    regressions.each do |name, result|
+      normalizer = result.equation.normalizer
+      model_name = "fa_fav_id_and_date_#{name.downcase}"
+      TrainedRegressionModel.find_by(name: model_name)&.destroy
+      TrainedRegressionModel.create!(
+        name: model_name,
+        model_type: name.downcase.tr(" ", "_"),
+        description: "Trained on FaFavIdAndDate with #{name} regression.",
+        total_records_count: records_array.size,
+        training_records_count: split.training_records.size,
+        evaluation_records_count: split.evaluation_records.size,
+        train_test_split_ratio: 0.8, # hardcoded, see split_train_test default
+        random_seed: 42, # hardcoded, see split_train_test
+        max_points_limit: max_points,
+        x_min: normalizer.x.min,
+        x_max: normalizer.x.max,
+        y_min: normalizer.y.min,
+        y_max: normalizer.y.max,
+        coefficients: result.equation.coefficients,
+        training_r_squared: result.training_r_squared,
+        evaluation_r_squared: result.evaluation_r_squared,
+        equation_string: result.equation_string,
+        metadata: {
+          x_range: normalizer.x_range,
+          y_range: normalizer.y_range,
+        },
+      )
+      puts "💾 Saved #{name} regression model to DB."
+    end
-
-    # Fit the pipeline on training data
-    training_x_matrix = training_normalizer.transformed_x_matrix
-    training_y_vector = training_normalizer.normalized_y_vector
-    pipeline.fit(training_x_matrix, training_y_vector)
-
-    # Score on training data
-    training_r_squared = pipeline.score(training_x_matrix, training_y_vector)
-
-    # Score on evaluation data
-    evaluation_x_matrix = evaluation_normalizer.transformed_x_matrix
-    evaluation_y_vector = evaluation_normalizer.normalized_y_vector
-    evaluation_r_squared =
-      pipeline.score(evaluation_x_matrix, evaluation_y_vector)
-
-    weight_vec = pipeline.steps[:estimator].weight_vec.to_a
-
-    # Generate regression line data in original scale
-    regression_y =
-      generate_regression_line(training_normalizer, regression_x, weight_vec)
-
-    # Create equation object
-    equation = create_equation(equation_class, training_normalizer, weight_vec)
-
-    RegressionResult.new(
-      equation: equation,
-      training_r_squared: training_r_squared,
-      evaluation_r_squared: evaluation_r_squared,
-      x_values: regression_x,
-      y_values: regression_y,
-    )
-  end
-
-  sig { returns(T::Array[[String, RegressionResult]]) }
-  def analyze
-    # Split data into training and evaluation sets
-    split = StatsHelpers.split_train_test(@records)
-
-    [
-      [
-        "Linear",
-        analyze_regression(
-          LinearNormalizer,
-          PolynomialEquation,
-          split,
-          degree: 1,
-        ),
-      ],
-      [
-        "Quadratic",
-        analyze_regression(
-          QuadraticNormalizer,
-          PolynomialEquation,
-          split,
-          degree: 2,
-        ),
-      ],
-      [
-        "Logarithmic",
-        analyze_regression(LogarithmicNormalizer, LogarithmicEquation, split),
-      ],
-      [
-        "Square Root",
-        analyze_regression(SquareRootNormalizer, SquareRootEquation, split),
-      ],
-    ]
-  end
-
-  # Generate regression line using appropriate denormalization method
-  sig do
-    params(
-      normalizer: DataNormalizer,
-      regression_x: T::Array[Float],
-      weight_vec: T::Array[Float],
-    ).returns(T::Array[Float])
-  end
-  def generate_regression_line(normalizer, regression_x, weight_vec)
-    normalizer.denormalize_regression(regression_x, weight_vec)
-  end
-
-  # Create appropriate equation object based on type
-  sig do
-    params(
-      equation_class: T.class_of(Equation),
-      normalizer: DataNormalizer,
-      weight_vec: T::Array[Float],
-    ).returns(Equation)
-  end
-  def create_equation(equation_class, normalizer, weight_vec)
-    if equation_class == PolynomialEquation
-      case normalizer
-      when LinearNormalizer
-        coefficients =
-          normalizer.denormalize_coefficients(
-            T.cast(weight_vec[0], Float),
-            T.cast(weight_vec[1], Float),
-          )
-      when QuadraticNormalizer
-        coefficients =
-          normalizer.denormalize_coefficients(
-            T.cast(weight_vec[0], Float),
-            T.cast(weight_vec[1], Float),
-            T.cast(weight_vec[2], Float),
-          )
-      else
-        raise "Unsupported normalizer for PolynomialEquation: #{normalizer.class}"
-      end
-      PolynomialEquation.new(normalizer, coefficients)
-    elsif equation_class == LogarithmicEquation ||
-          equation_class == SquareRootEquation
-      equation_class.new(
-        normalizer,
-        T.cast(weight_vec[1], Float),
-        T.cast(weight_vec[0], Float),
-      )
-    else
-      raise "Unsupported equation class: #{equation_class}"
-    end
-  end
-end
-
-# Simplified plotting class with extracted common functionality
-class StatsPlotter
-  extend T::Sig
-
-  sig do
-    params(
-      title: String,
-      train_x: T::Array[Float],
-      train_y: T::Array[Float],
-      eval_x: T::Array[Float],
-      eval_y: T::Array[Float],
-    ).void
-  end
-  def plot_train_eval_scatter(title, train_x, train_y, eval_x, eval_y)
-    plot_with_error_handling(title) do
-      plot =
-        UnicodePlot.scatterplot(
-          train_x,
-          train_y,
-          title: title,
-          name: "Training Data",
-          width: 80,
-          height: 20,
-          xlabel: "fav_fa_id",
-          ylabel: date_axis_label(train_y + eval_y),
-        )
-      UnicodePlot.scatterplot!(plot, eval_x, eval_y, name: "Evaluation Data")
-      plot
-    end
-  end
-
-  sig do
-    params(
-      title: String,
-      x_values: T::Array[Float],
-      y_values: T::Array[Float],
-    ).void
-  end
-  def plot_scatter(title, x_values, y_values)
-    plot_with_error_handling(title) do
-      UnicodePlot.scatterplot(
-        x_values,
-        y_values,
-        title: title,
-        width: 80,
-        height: 20,
-        xlabel: "fav_fa_id",
-        ylabel: date_axis_label(y_values),
-      )
-    end
-  end
-
-  sig { params(title: String, result: RegressionResult).void }
-  def plot_regression(title, result)
-    subtitle =
-      "#{title.split.first} fit (Training R² = #{result.training_r_squared.round(3)}, Evaluation R² = #{result.evaluation_r_squared.round(3)})"
-    plot_with_error_handling("#{title} - #{subtitle}") do
-      UnicodePlot.lineplot(
-        result.x_values,
-        result.y_values,
-        title: title,
-        width: 80,
-        height: 20,
-        xlabel: "fav_fa_id",
-        ylabel: date_axis_label(result.y_values),
-      )
-    end
-  end
-
-  sig do
-    params(
-      x_values: T::Array[Float],
-      y_values: T::Array[Float],
-      regressions: T::Array[[String, RegressionResult]],
-    ).void
-  end
-  def plot_combined(x_values, y_values, regressions)
-    plot_with_error_handling("📈 Combined Visualization:") do
-      # Base scatter plot
-      plot =
-        UnicodePlot.scatterplot(
-          x_values,
-          y_values,
-          title: "FaFavIdAndDate Analysis: Original Data vs Regression Models",
-          name: "Original Data",
-          width: 100,
-          height: 25,
-          xlabel: "fav_fa_id",
-          ylabel: date_axis_label(y_values),
-        )
-
-      # Add regression lines
-      regressions.each do |name, result|
-        UnicodePlot.lineplot!(
-          plot,
-          result.x_values,
-          result.y_values,
-          name:
-            "#{name} (Train R²=#{result.training_r_squared.round(3)}, Eval R²=#{result.evaluation_r_squared.round(3)})",
-        )
-      end
-      plot
-    end
-  end
-
-  private
-
-  sig { params(y_values: T::Array[Float]).returns(String) }
-  def date_axis_label(y_values)
-    y_min, y_max = y_values.minmax
-    start_date = Time.at(y_min).strftime("%Y-%m-%d")
-    end_date = Time.at(y_max).strftime("%Y-%m-%d")
-    "#{start_date} to #{end_date}"
-  end
-
-  sig { params(title: String, block: T.proc.returns(T.untyped)).void }
-  def plot_with_error_handling(title, &block)
-    puts "\n#{title}"
-    begin
-      plot = block.call
-      puts plot.render
-    rescue LoadError
-      puts "⚠️ UnicodePlot gem not available. Install with: gem install unicode_plot"
-    rescue => e
-      puts "⚠️ Error generating plot: #{e.message}"
-    end
-  end
-end
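
The coefficient denormalization removed above is easy to sanity-check in isolation. Below is a minimal standalone Ruby sketch (plain Ruby, no Rails/Sorbet/Rumale; the numeric values are invented) mirroring the algebra in LinearNormalizer#denormalize_coefficients: a line fitted in normalized [0,1] space must predict the same y whether evaluated through the normalized fit or through the denormalized coefficients.

# Sanity check for the LinearNormalizer#denormalize_coefficients algebra.
# Values are illustrative; only the formulas come from the code above.
x_min, x_max = 10.0, 110.0        # x AxisRange (fav_fa_id in the real task)
y_min, y_max = 1_000.0, 5_000.0   # y AxisRange (epoch seconds in the real task)
x_scale = x_max - x_min
y_scale = y_max - y_min

m, b = 0.8, 0.1 # pretend the regressor fit y_n = m * x_n + b in normalized space

# Substituting x_n = (x - x_min) / x_scale and y = y_n * y_scale + y_min gives:
slope = m * y_scale / x_scale
intercept = (b * y_scale + y_min) - slope * x_min

# Both prediction paths must agree at any x.
x = 42.0
x_n = (x - x_min) / x_scale
via_normalized = (m * x_n + b) * y_scale + y_min
via_coefficients = slope * x + intercept
raise "mismatch" unless (via_normalized - via_coefficients).abs < 1e-9
puts "y(#{x}) = #{via_coefficients}" # => y(42.0) = 2424.0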
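
Once saved, a TrainedRegressionModel row carries enough to predict without refitting. A hypothetical consumer could look like the sketch below; it assumes only the columns visible in the create! call added above, plus one assumption this commit does not confirm: that coefficients stores the original-scale [intercept, slope] pair for the linear model.

# Hypothetical usage sketch, not part of this commit. Column names come from
# the TrainedRegressionModel.create! call above; the [intercept, slope]
# layout of `coefficients` is an assumption.
model = TrainedRegressionModel.find_by!(name: "fa_fav_id_and_date_linear")
intercept, slope = model.coefficients
fav_fa_id = 1_234_567.0
predicted_epoch = slope * fav_fa_id + intercept
puts "fav_fa_id #{fav_fa_id.to_i} ≈ #{Time.at(predicted_epoch)}"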