refactor file structure

This commit is contained in:
Dylan Knutson
2025-07-10 21:17:47 +00:00
parent aad0cb045d
commit 163418c8cc
23 changed files with 3896 additions and 855 deletions

View File

@@ -17,21 +17,21 @@ namespace :stats do
max_points = args[:max_points]&.to_i
# Query and sample data
records_array = StatsHelpers.sample_records(max_points)
records_array = Stats::Helpers.sample_records(max_points)
# Create base normalizer for display ranges
base_normalizer = LinearNormalizer.new(records_array)
base_normalizer = Stats::LinearNormalizer.new(records_array)
puts "📈 X-axis range (fav_fa_id): #{base_normalizer.x_range}"
puts "📈 Y-axis range (date): #{base_normalizer.y_range}"
# Split data for plotting
split = StatsHelpers.split_train_test(records_array)
train_normalizer = LinearNormalizer.new(split.training_records)
eval_normalizer = LinearNormalizer.new(split.evaluation_records)
split = Stats::Helpers.split_train_test(records_array)
train_normalizer = Stats::LinearNormalizer.new(split.training_records)
eval_normalizer = Stats::LinearNormalizer.new(split.evaluation_records)
# Run regressions using specialized normalizers
regressions = RegressionAnalyzer.new(records_array).analyze
regressions = Stats::RegressionAnalyzer.new(records_array).analyze
# Display results (automatically denormalized)
regressions.each do |name, result|
@@ -42,7 +42,7 @@ namespace :stats do
# Generate visualizations
puts "\n🎨 Generating visualizations with UnicodePlot..."
plotter = StatsPlotter.new
plotter = Stats::Plotter.new
plotter.plot_train_eval_scatter(
"Original Data (Train/Eval)",
@@ -63,857 +63,36 @@ namespace :stats do
)
puts "\n✅ Graph generation completed!"
end
end
# Helper methods extracted to avoid private method issues in Rake context
module StatsHelpers
  extend T::Sig

  # Seed used for every random draw in this module, so that sampling and the
  # train/evaluation split pick the same records on every call.
  RANDOM_SEED = 42

  # Loads the records returned by Domain::FaFavIdAndDate.complete (only the
  # id, fav_fa_id and date columns) and optionally down-samples them.
  #
  # Progress is printed to stdout. When no record is found an error is printed
  # and the process exits with status 1.
  #
  # max_points - maximum number of records to return, or nil for no limit.
  #
  # Returns the (possibly sampled) records as an Array.
  sig do
    params(max_points: T.nilable(Integer)).returns(
      T::Array[Domain::FaFavIdAndDate],
    )
  end
  def self.sample_records(max_points)
    records = Domain::FaFavIdAndDate.complete
    if records.empty?
      puts "❌ No complete FaFavIdAndDate records found"
      exit 1
    end

    total_records = records.count
    puts "📊 Found #{total_records} complete records"

    records_array = records.select(:id, :fav_fa_id, :date).to_a

    if max_points && total_records > max_points
      puts "🎲 Randomly sampling #{max_points} points from #{total_records} total records"
      # A private generator keeps the draw reproducible without reseeding the
      # process-wide random state that unrelated code may rely on.
      records_array =
        T.cast(
          records_array.sample(max_points, random: Random.new(RANDOM_SEED)),
          T::Array[Domain::FaFavIdAndDate],
        )
      puts "📊 Using #{records_array.length} sampled records for analysis"
    else
      message =
        if max_points
          "within max_points limit of #{max_points}"
        else
          "no sampling limit specified"
        end
      puts "📊 Using all #{records_array.length} records (#{message})"
    end

    records_array
  end

  # Shuffles the records with a fixed seed and cuts them into a training part
  # and an evaluation part.
  #
  # records    - records to split.
  # eval_ratio - fraction of the records assigned to the evaluation part.
  #
  # Returns a TrainTestSplit. Calling this twice with the same records yields
  # the same split, because a fresh generator with the same seed is used.
  sig do
    params(
      records: T::Array[Domain::FaFavIdAndDate],
      eval_ratio: Float,
    ).returns(TrainTestSplit)
  end
  def self.split_train_test(records, eval_ratio = 0.2)
    # Private generator: reproducible, and leaves the global RNG untouched.
    shuffled_records = records.shuffle(random: Random.new(RANDOM_SEED))

    # Index of the first evaluation record.
    split_index = (records.length * (1.0 - eval_ratio)).round

    TrainTestSplit.new(
      training_records:
        T.cast(
          shuffled_records[0...split_index],
          T::Array[Domain::FaFavIdAndDate],
        ),
      evaluation_records:
        T.cast(
          shuffled_records[split_index..-1],
          T::Array[Domain::FaFavIdAndDate],
        ),
    )
  end

  # Rounds an R² score to three decimal places for display.
  sig { params(value: Float).returns(Float) }
  def self.format_r_squared(value)
    value.round(3).to_f
  end
end
# Immutable pair of record lists: one used for fitting, one held back for
# evaluation.
class TrainTestSplit < T::ImmutableStruct
  extend T::Sig

  const :training_records, T::Array[Domain::FaFavIdAndDate]
  const :evaluation_records, T::Array[Domain::FaFavIdAndDate]

  # Number of records in the training part.
  sig { returns(Integer) }
  def training_count = training_records.size

  # Number of records in the evaluation part.
  sig { returns(Integer) }
  def evaluation_count = evaluation_records.size

  # Number of records across both parts.
  sig { returns(Integer) }
  def total_count = training_records.size + evaluation_records.size

  # One-line, human-readable description of the split sizes.
  sig { returns(String) }
  def summary
    train = training_count
    evaluation = evaluation_count
    "📊 Split data: #{train} training, #{evaluation} evaluation records"
  end
end
# Closed interval [min, max] for one axis, with helpers that map values to and
# from the unit interval.
class AxisRange < T::ImmutableStruct
  extend T::Sig

  const :min, Float
  const :max, Float

  # Width of the interval (max - min). Zero when min and max are equal.
  sig { returns(Float) }
  def scale
    max - min
  end

  # The interval as a Ruby Range, min..max inclusive.
  sig { returns(T::Range[Float]) }
  def range
    min..max
  end

  # Maps value linearly so that min becomes 0.0 and max becomes 1.0.
  #
  # A zero-width interval has nothing to scale by; every value maps to 0.0
  # instead of the NaN/Infinity a Float division by zero would yield.
  # denormalize(0.0) then gives back min, so the round trip still holds.
  sig { params(value: Float).returns(Float) }
  def normalize(value)
    width = scale
    return 0.0 if width.zero?

    (value - min) / width
  end

  # Inverse of normalize: maps 0.0 back to min and 1.0 back to max.
  sig { params(value: Float).returns(Float) }
  def denormalize(value)
    value * scale + min
  end

  # Renders the interval as "<min> to <max>".
  #
  # mapper - optional block applied to each bound before interpolation; when
  #          omitted the bounds are interpolated as they are.
  sig do
    params(
      mapper: T.nilable(T.proc.params(arg: Float).returns(String)),
    ).returns(String)
  end
  def as_string(&mapper)
    mapper ||= ->(x) { x }
    "#{mapper.call(min)} to #{mapper.call(max)}"
  end
end
# Base class for data normalization and denormalization.
#
# Holds the x values (fav_fa_id) and y values (date as epoch seconds) of a set
# of records, sorted by x, together with the AxisRange of each axis.
# Subclasses supply the model-specific denormalization of fitted weights.
class DataNormalizer
  extend T::Sig
  extend T::Helpers
  abstract!

  sig(:final) { returns(T::Array[Float]) }
  attr_reader :x_values

  sig(:final) { returns(T::Array[Float]) }
  attr_reader :y_values

  # records - records to extract points from. Each record's date is converted
  #           via to_time.to_i.to_f; a record without a date makes the T.cast
  #           below raise.
  sig(:final) { params(records: T::Array[Domain::FaFavIdAndDate]).void }
  def initialize(records)
    data_points =
      records.map do |record|
        {
          x: record.fav_fa_id.to_f,
          y: T.cast(record.date&.to_time&.to_i&.to_f, Float),
        }
      end

    # Keep x and y aligned while ordering the points by x.
    data_points.sort_by! { |point| point[:x] }
    @x_values = T.let(data_points.map { |p| p[:x] }, T::Array[Float])
    @y_values = T.let(data_points.map { |p| p[:y] }, T::Array[Float])

    # Calculate min/max for normalization
    x_minmax = T.cast(@x_values.minmax, [Float, Float])
    y_minmax = T.cast(@y_values.minmax, [Float, Float])
    @x = T.let(AxisRange.new(min: x_minmax[0], max: x_minmax[1]), AxisRange)
    @y = T.let(AxisRange.new(min: y_minmax[0], max: y_minmax[1]), AxisRange)
  end

  # Display string for the x range, "<min> to <max>".
  sig(:final) { returns(String) }
  def x_range
    @x.as_string
  end

  # Display string for the y range, with both bounds shown as Time values.
  sig(:final) { returns(String) }
  def y_range
    @y.as_string { |x| Time.at(x) }
  end

  # Accessors for equation classes
  sig(:final) { returns(Float) }
  def x_scale
    @x.scale
  end

  sig(:final) { returns(Float) }
  def y_scale
    @y.scale
  end

  sig(:final) { returns(Float) }
  def x_min
    @x.min
  end

  sig(:final) { returns(Float) }
  def y_min
    @y.min
  end

  # Convert raw data to normalized [0,1] scale for Rumale
  # (one single-element row per x value).
  sig(:final) { returns(T::Array[T::Array[Float]]) }
  def normalized_x_matrix
    @x_values.map { |x| [@x.normalize(x)] }
  end

  sig(:final) { returns(T::Array[Float]) }
  def normalized_y_vector
    @y_values.map { |y| @y.normalize(y) }
  end

  # Generate regression line points in original scale: x positions from the
  # minimum x upward in steps of 1/50th of the x range.
  sig(:final) { returns(T::Array[Float]) }
  def regression_x_range
    # Range#step rejects a zero step, which is what a zero-width x range
    # (every record sharing one x) would produce; a single point is all
    # there is to plot in that case.
    return [@x.min] if @x.scale.zero?

    step_size = @x.scale / 50.0
    @x.range.step(step_size).to_a
  end

  # Default transformation matrix (identity for linear/quadratic)
  sig { returns(T::Array[T::Array[Float]]) }
  def transformed_x_matrix
    normalized_x_matrix
  end

  # Abstract method for denormalizing regression results.
  #
  # regression_x - x positions in original scale.
  # weight_vec   - fitted weights in normalized space.
  #
  # Returns the predicted y values in original scale, one per x position.
  sig do
    abstract
      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
      .returns(T::Array[Float])
  end
  def denormalize_regression(regression_x, weight_vec)
  end
end
# Normalizer for straight-line fits, y_norm = slope * x_norm + intercept.
class LinearNormalizer < DataNormalizer
  # Evaluates the fitted line at each x (original scale) and returns the
  # predictions in original y scale. weight_vec is [intercept, slope] in
  # normalized space.
  sig do
    override
      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
      .returns(T::Array[Float])
  end
  def denormalize_regression(regression_x, weight_vec)
    slope = T.cast(weight_vec[1], Float)
    intercept = T.cast(weight_vec[0], Float)

    regression_x.map do |raw_x|
      unit_x = @x.normalize(raw_x)
      @y.denormalize(slope * unit_x + intercept)
    end
  end

  # Converts normalized-space coefficients into original-scale ones.
  # Returns [intercept, slope].
  sig do
    params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
  end
  def denormalize_coefficients(norm_intercept, norm_slope)
    y_span = @y.scale
    slope = norm_slope * y_span / @x.scale
    # Undo the y scaling first, then shift for the x offset.
    intercept = (norm_intercept * y_span + @y.min) - slope * @x.min
    [intercept, slope]
  end
end
# Normalizer for parabola fits, y_norm = a * x_norm² + b * x_norm + c.
class QuadraticNormalizer < DataNormalizer
  # Evaluates the fitted parabola at each x (original scale) and returns the
  # predictions in original y scale. weight_vec is [c, b, a] in normalized
  # space.
  sig do
    override
      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
      .returns(T::Array[Float])
  end
  def denormalize_regression(regression_x, weight_vec)
    quad = T.cast(weight_vec[2], Float)
    lin = T.cast(weight_vec[1], Float)
    const = T.cast(weight_vec[0], Float)

    regression_x.map do |raw_x|
      unit_x = @x.normalize(raw_x)
      @y.denormalize(quad * unit_x * unit_x + lin * unit_x + const)
    end
  end

  # Converts normalized-space coefficients into original-scale ones by
  # expanding a * ((x - x_min) / x_scale)² + b * ((x - x_min) / x_scale) + c.
  # Returns [c, b, a].
  sig do
    params(norm_c: Float, norm_b: Float, norm_a: Float).returns(T::Array[Float])
  end
  def denormalize_coefficients(norm_c, norm_b, norm_a)
    x_span = @x.scale
    y_span = @y.scale
    x_lo = @x.min

    quad = norm_a * y_span / (x_span * x_span)
    lin = norm_b * y_span / x_span - 2 * quad * x_lo
    const = (norm_c * y_span + @y.min) - lin * x_lo - quad * x_lo * x_lo
    [const, lin, quad]
  end
end
# Shared base for models of the form y = a * f(x) + b, where f is applied to
# the raw x value. Because x is not rescaled, only the y axis has to be
# denormalized.
class TransformedNormalizer < DataNormalizer
  extend T::Sig
  extend T::Helpers
  abstract!

  # Converts normalized-space coefficients to original scale (y-scaling only).
  # Returns [intercept, slope].
  sig do
    params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
  end
  def denormalize_coefficients(norm_intercept, norm_slope)
    y_span = @y.scale
    [norm_intercept * y_span + @y.min, norm_slope * y_span]
  end

  # Evaluates a * f(x) + b at each x and maps the result back to original y
  # scale. weight_vec is [intercept, slope] in normalized space.
  sig do
    override
      .params(regression_x: T::Array[Float], weight_vec: T::Array[Float])
      .returns(T::Array[Float])
  end
  def denormalize_regression(regression_x, weight_vec)
    slope = T.cast(weight_vec[1], Float)
    intercept = T.cast(weight_vec[0], Float)

    regression_x.map do |raw_x|
      @y.denormalize(slope * transform_x(raw_x) + intercept)
    end
  end

  protected

  # The transformation f(x); implemented by each concrete subclass.
  sig { abstract.params(x: Float).returns(Float) }
  def transform_x(x)
  end
end
# Normalizer for logarithmic fits, y = a * ln(x) + b.
class LogarithmicNormalizer < TransformedNormalizer
  extend T::Sig

  # Feature matrix for the regression: one row [ln(x)] per raw x value.
  sig { returns(T::Array[T::Array[Float]]) }
  def transformed_x_matrix
    @x_values.map { |raw_x| [transform_x(raw_x)] }
  end

  protected

  # Natural logarithm of x.
  sig { override.params(x: Float).returns(Float) }
  def transform_x(x)
    Math.log(x)
  end
end
# Normalizer for square-root fits, y = a * sqrt(x) + b.
class SquareRootNormalizer < TransformedNormalizer
  extend T::Sig

  # Feature matrix for the regression: one row [sqrt(x)] per raw x value.
  sig { returns(T::Array[T::Array[Float]]) }
  def transformed_x_matrix
    @x_values.map { |raw_x| [transform_x(raw_x)] }
  end

  protected

  # Square root of x.
  sig { override.params(x: Float).returns(Float) }
  def transform_x(x)
    Math.sqrt(x)
  end
end
# Base class for regression equations with common formatting logic.
# Subclasses implement format_equation; to_s delegates to it.
class Equation
  extend T::Sig

  # normalizer - the normalizer whose axis ranges the equation refers to.
  sig { params(normalizer: DataNormalizer).void }
  def initialize(normalizer)
    @normalizer = normalizer
  end

  # Format a number with significant figures and scientific notation when
  # needed.
  #
  # num      - value to format.
  # sig_figs - number of significant figures to keep.
  #
  # Magnitudes of 1e6 and above, or below 1e-2, are written as
  # "<mantissa>e<exponent>"; everything else is rounded to the decimal places
  # that sig_figs implies. Zero is always "0.0"; NaN and infinities are
  # returned via Float#to_s.
  sig { params(num: Float, sig_figs: Integer).returns(String) }
  def format_number(num, sig_figs = 3)
    # Handle zero case
    return "0.0" if num.zero?
    # log10 of NaN/Infinity cannot be floored; there is nothing to round.
    return num.to_s unless num.finite?

    # Get order of magnitude
    order = Math.log10(num.abs).floor

    # Use scientific notation for very large or small numbers
    if order >= 6 || order <= -3
      # Scale number between 1 and 10, then round to sig figs
      mantissa = (num / (10.0**order)).round(sig_figs - 1)
      # Rounding can carry the mantissa up to 10 (9.999 -> 10.0); renormalize
      # so the result reads 1.0e7 rather than 10.0e6.
      if mantissa.abs >= 10
        mantissa /= 10.0
        order += 1
      end
      "#{mantissa}e#{order}"
    else
      # For normal range numbers, just round to appropriate decimal places
      decimal_places = sig_figs - (order + 1)
      decimal_places = 0 if decimal_places < 0
      num.round(decimal_places).to_s
    end
  end

  # The equation as display text.
  sig { returns(String) }
  def to_s
    format_equation
  end

  protected

  # Builds the display text; must be overridden.
  sig { returns(String) }
  def format_equation
    raise NotImplementedError, "Subclasses must implement format_equation"
  end

  sig { returns(DataNormalizer) }
  attr_reader :normalizer
end
# Equation of the form y = c0 + c1*x + c2*x² + ..., printed highest power
# first.
class PolynomialEquation < Equation
  extend T::Sig

  # coefficients - coefficients indexed by power (index 0 is the constant).
  sig { params(normalizer: DataNormalizer, coefficients: T::Array[Float]).void }
  def initialize(normalizer, coefficients)
    super(normalizer)
    @coefficients = coefficients
  end

  protected

  sig { returns(String) }
  def format_equation
    "y = #{polynomial_equation(@coefficients)}"
  end

  private

  # Convert array of coefficients into polynomial equation string.
  # Zero coefficients are omitted; if every coefficient is zero (or the array
  # is empty) the polynomial is rendered as "0.0" so the right-hand side is
  # never blank.
  sig { params(coefficients: T::Array[Float]).returns(String) }
  def polynomial_equation(coefficients)
    terms =
      coefficients.each_with_index.filter_map do |coeff, power|
        next if coeff.zero?

        term = format_number(coeff)
        case power
        when 0
          term
        when 1
          "#{term}x"
        else
          # Exponent written with Unicode superscript digits.
          "#{term}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
        end
      end
    return format_number(0.0) if terms.empty?

    # Highest power first; "a + -b" is tidied to "a - b".
    terms.reverse.join(" + ").gsub("+ -", "- ")
  end
end
# Base class for transformed equations that follow y = a * f(x) + b pattern.
# Subclasses provide the textual form of f(x).
class TransformedEquation < Equation
  extend T::Sig
  extend T::Helpers
  abstract!

  # norm_slope     - fitted slope in normalized y space.
  # norm_intercept - fitted intercept in normalized y space.
  sig do
    params(
      normalizer: DataNormalizer,
      norm_slope: Float,
      norm_intercept: Float,
    ).void
  end
  def initialize(normalizer, norm_slope, norm_intercept)
    super(normalizer)
    @norm_slope = norm_slope
    @norm_intercept = norm_intercept
  end

  protected

  # Renders "y = <slope> * f(x) + <intercept>" with both coefficients scaled
  # back to original y units. A negative intercept is written as
  # "- <magnitude>" rather than "+ -<magnitude>", matching how
  # PolynomialEquation prints negative terms.
  sig { returns(String) }
  def format_equation
    slope_orig = @norm_slope * @normalizer.y_scale
    intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min
    sign = intercept_orig.negative? ? "-" : "+"
    "y = #{format_number(slope_orig)} * #{transform_symbol("x")} #{sign} #{format_number(intercept_orig.abs)}"
  end

  # Abstract method for the transformation symbol: the text for f applied to
  # the variable name x.
  sig { abstract.params(x: String).returns(String) }
  def transform_symbol(x)
  end
end
# Equation text for logarithmic fits: y = a * ln(x) + b.
class LogarithmicEquation < TransformedEquation
  extend T::Sig

  protected

  # Wraps the variable name in natural-log notation.
  sig { override.params(x: String).returns(String) }
  def transform_symbol(x)
    format("ln(%s)", x)
  end
end
# Equation text for square-root fits: y = a * √(x) + b.
class SquareRootEquation < TransformedEquation
  extend T::Sig

  protected

  # Wraps the variable name in square-root notation, so the printed equation
  # is distinguishable from a plain linear one.
  sig { override.params(x: String).returns(String) }
  def transform_symbol(x)
    "√(#{x})"
  end
end
# Immutable outcome of fitting one regression model: the fitted equation, its
# R² on the training and evaluation sets, and the points of the fitted curve.
class RegressionResult < T::ImmutableStruct
  extend T::Sig

  const :equation, Equation
  const :training_r_squared, Float
  const :evaluation_r_squared, Float
  const :x_values, T::Array[Float]
  const :y_values, T::Array[Float]

  # The fitted equation as display text.
  sig { returns(String) }
  def equation_string = equation.to_s

  # Both R² scores, formatted by StatsHelpers.format_r_squared, on one line.
  sig { returns(String) }
  def score_summary
    train_score = StatsHelpers.format_r_squared(training_r_squared)
    eval_score = StatsHelpers.format_r_squared(evaluation_r_squared)
    "Training R² = #{train_score}, Evaluation R² = #{eval_score}"
  end
end
# Immutable struct representing the complete analysis results:
# one RegressionResult per model family.
class AnalysisResults < T::ImmutableStruct
extend T::Sig
# Result of the linear fit.
const :linear, RegressionResult
# Result of the quadratic fit.
const :quadratic, RegressionResult
# Result of the logarithmic fit.
const :logarithmic, RegressionResult
# Result of the square-root fit.
const :square_root, RegressionResult
end
# Handles regression analysis using Rumale with normalized data
#
# Given a list of records, fits four models (linear, quadratic, logarithmic,
# square root) on a training split and scores each on both the training and
# the evaluation split.
class RegressionAnalyzer
extend T::Sig
# records - the full record set; it is split into training/evaluation parts
#           inside #analyze.
sig { params(records: T::Array[Domain::FaFavIdAndDate]).void }
def initialize(records)
@records = records
end
# Generic regression analysis method to eliminate duplication
#
# normalizer_class - DataNormalizer subclass used for both data splits.
# equation_class   - Equation subclass used to describe the fitted model.
# split            - training/evaluation records.
# degree           - degree passed to Rumale's PolynomialFeatures.
#
# Returns a RegressionResult holding the equation, both R² scores and the
# fitted curve in original scale.
sig do
params(
normalizer_class: T.class_of(DataNormalizer),
equation_class: T.class_of(Equation),
split: TrainTestSplit,
degree: Integer,
).returns(RegressionResult)
end
def analyze_regression(normalizer_class, equation_class, split, degree: 1)
# Create normalizers for training and evaluation data
# NOTE(review): each normalizer derives min/max from its own records, so the
# evaluation data is scaled with a different range than the training data the
# model was fitted on — confirm this is intended.
training_normalizer = normalizer_class.new(split.training_records)
evaluation_normalizer = normalizer_class.new(split.evaluation_records)
regression_x = training_normalizer.regression_x_range
poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree:)
regressor = Rumale::LinearModel::LinearRegression.new(fit_bias: true)
pipeline =
Rumale::Pipeline::Pipeline.new(
steps: {
transformer: poly_features,
estimator: regressor,
# NOTE(review): the `steps:` hash and the Pipeline.new call above are never
# closed in this view, and the statements from here down to the
# TrainedRegressionModel.create!(...) call use `regressions`, `records_array`
# and `max_points`, none of which are defined in this method. They look like
# lines from a different part of the change interleaved here — verify against
# the real file before relying on this span.
# Save each regression model to the database
regressions.each do |name, result|
normalizer = result.equation.normalizer
model_name = "fa_fav_id_and_date_#{name.downcase}"
TrainedRegressionModel.find_by(name: model_name)&.destroy
TrainedRegressionModel.create!(
name: model_name,
model_type: name.downcase.tr(" ", "_"),
description: "Trained on FaFavIdAndDate with #{name} regression.",
total_records_count: records_array.size,
training_records_count: split.training_records.size,
evaluation_records_count: split.evaluation_records.size,
train_test_split_ratio: 0.8, # hardcoded, see split_train_test default
random_seed: 42, # hardcoded, see split_train_test
max_points_limit: max_points,
x_min: normalizer.x.min,
x_max: normalizer.x.max,
y_min: normalizer.y.min,
y_max: normalizer.y.max,
coefficients: result.equation.coefficients,
training_r_squared: result.training_r_squared,
evaluation_r_squared: result.evaluation_r_squared,
equation_string: result.equation_string,
metadata: {
x_range: normalizer.x_range,
y_range: normalizer.y_range,
},
)
# Fit the pipeline on training data
training_x_matrix = training_normalizer.transformed_x_matrix
training_y_vector = training_normalizer.normalized_y_vector
pipeline.fit(training_x_matrix, training_y_vector)
# Score on training data
training_r_squared = pipeline.score(training_x_matrix, training_y_vector)
# Score on evaluation data
evaluation_x_matrix = evaluation_normalizer.transformed_x_matrix
evaluation_y_vector = evaluation_normalizer.normalized_y_vector
evaluation_r_squared =
pipeline.score(evaluation_x_matrix, evaluation_y_vector)
# NOTE(review): only weight_vec is read from the estimator although it was
# built with fit_bias: true; confirm that no part of the intercept is held in
# a separate bias term that is being dropped here.
weight_vec = pipeline.steps[:estimator].weight_vec.to_a
# Generate regression line data in original scale
regression_y =
generate_regression_line(training_normalizer, regression_x, weight_vec)
# Create equation object
equation = create_equation(equation_class, training_normalizer, weight_vec)
RegressionResult.new(
equation: equation,
training_r_squared: training_r_squared,
evaluation_r_squared: evaluation_r_squared,
x_values: regression_x,
y_values: regression_y,
)
end
# Runs all four regressions on one shared train/evaluation split.
# Returns [display name, result] pairs in the order Linear, Quadratic,
# Logarithmic, Square Root.
sig { returns(T::Array[[String, RegressionResult]]) }
def analyze
# Split data into training and evaluation sets
split = StatsHelpers.split_train_test(@records)
[
[
"Linear",
analyze_regression(
LinearNormalizer,
PolynomialEquation,
split,
degree: 1,
),
],
[
"Quadratic",
analyze_regression(
QuadraticNormalizer,
PolynomialEquation,
split,
degree: 2,
),
],
[
"Logarithmic",
analyze_regression(LogarithmicNormalizer, LogarithmicEquation, split),
],
[
"Square Root",
analyze_regression(SquareRootNormalizer, SquareRootEquation, split),
],
]
end
# Generate regression line using appropriate denormalization method
# (delegates to the normalizer's denormalize_regression).
sig do
params(
normalizer: DataNormalizer,
regression_x: T::Array[Float],
weight_vec: T::Array[Float],
).returns(T::Array[Float])
end
def generate_regression_line(normalizer, regression_x, weight_vec)
normalizer.denormalize_regression(regression_x, weight_vec)
end
# Create appropriate equation object based on type
#
# For PolynomialEquation the weights are first converted to original scale
# by the normalizer (weights are passed lowest power first). For the
# logarithmic and square-root equations the normalized slope (weight_vec[1])
# and intercept (weight_vec[0]) are passed through unchanged.
# Raises RuntimeError for any other equation class, or for a polynomial
# equation paired with a normalizer that is neither linear nor quadratic.
sig do
params(
equation_class: T.class_of(Equation),
normalizer: DataNormalizer,
weight_vec: T::Array[Float],
).returns(Equation)
end
def create_equation(equation_class, normalizer, weight_vec)
if equation_class == PolynomialEquation
case normalizer
when LinearNormalizer
coefficients =
normalizer.denormalize_coefficients(
T.cast(weight_vec[0], Float),
T.cast(weight_vec[1], Float),
)
when QuadraticNormalizer
coefficients =
normalizer.denormalize_coefficients(
T.cast(weight_vec[0], Float),
T.cast(weight_vec[1], Float),
T.cast(weight_vec[2], Float),
)
else
raise "Unsupported normalizer for PolynomialEquation: #{normalizer.class}"
end
PolynomialEquation.new(normalizer, coefficients)
elsif equation_class == LogarithmicEquation ||
equation_class == SquareRootEquation
equation_class.new(
normalizer,
T.cast(weight_vec[1], Float),
T.cast(weight_vec[0], Float),
)
else
raise "Unsupported equation class: #{equation_class}"
end
end
end
# Simplified plotting class with extracted common functionality
#
# Every public method prints a title followed by a UnicodePlot rendering to
# stdout; plotting errors are reported on stdout instead of being raised.
class StatsPlotter
extend T::Sig
# Scatter plot of the training points with the evaluation points overlaid as
# a second series. The y-axis label covers the dates of both series.
sig do
params(
title: String,
train_x: T::Array[Float],
train_y: T::Array[Float],
eval_x: T::Array[Float],
eval_y: T::Array[Float],
).void
end
def plot_train_eval_scatter(title, train_x, train_y, eval_x, eval_y)
plot_with_error_handling(title) do
plot =
UnicodePlot.scatterplot(
train_x,
train_y,
title: title,
name: "Training Data",
width: 80,
height: 20,
xlabel: "fav_fa_id",
ylabel: date_axis_label(train_y + eval_y),
)
UnicodePlot.scatterplot!(plot, eval_x, eval_y, name: "Evaluation Data")
plot
end
end
# Scatter plot of a single series.
sig do
params(
title: String,
x_values: T::Array[Float],
y_values: T::Array[Float],
).void
end
def plot_scatter(title, x_values, y_values)
plot_with_error_handling(title) do
UnicodePlot.scatterplot(
x_values,
y_values,
title: title,
width: 80,
height: 20,
xlabel: "fav_fa_id",
ylabel: date_axis_label(y_values),
)
end
end
# Line plot of one fitted regression curve. The printed heading is the title
# plus a subtitle made of the title's first word and both R² scores rounded
# to three decimals.
sig { params(title: String, result: RegressionResult).void }
def plot_regression(title, result)
subtitle =
"#{title.split.first} fit (Training R² = #{result.training_r_squared.round(3)}, Evaluation R² = #{result.evaluation_r_squared.round(3)})"
plot_with_error_handling("#{title} - #{subtitle}") do
UnicodePlot.lineplot(
result.x_values,
result.y_values,
title: title,
width: 80,
height: 20,
xlabel: "fav_fa_id",
ylabel: date_axis_label(result.y_values),
)
end
end
# Scatter plot of the original data with every regression curve drawn on top;
# each curve is labelled with its name and R² scores.
sig do
params(
x_values: T::Array[Float],
y_values: T::Array[Float],
regressions: T::Array[[String, RegressionResult]],
).void
end
def plot_combined(x_values, y_values, regressions)
plot_with_error_handling("📈 Combined Visualization:") do
# Base scatter plot
plot =
UnicodePlot.scatterplot(
x_values,
y_values,
title: "FaFavIdAndDate Analysis: Original Data vs Regression Models",
name: "Original Data",
width: 100,
height: 25,
xlabel: "fav_fa_id",
ylabel: date_axis_label(y_values),
)
# Add regression lines
regressions.each do |name, result|
UnicodePlot.lineplot!(
plot,
result.x_values,
result.y_values,
name:
"#{name} (Train R²=#{result.training_r_squared.round(3)}, Eval R²=#{result.evaluation_r_squared.round(3)})",
)
end
plot
end
end
private
# Builds a "YYYY-MM-DD to YYYY-MM-DD" label from the smallest and largest y
# value, each interpreted as epoch seconds via Time.at.
# NOTE(review): Time.at yields local time, so the dates depend on the
# process's time zone — confirm that is acceptable.
sig { params(y_values: T::Array[Float]).returns(String) }
def date_axis_label(y_values)
y_min, y_max = y_values.minmax
start_date = Time.at(y_min).strftime("%Y-%m-%d")
end_date = Time.at(y_max).strftime("%Y-%m-%d")
"#{start_date} to #{end_date}"
end
# Prints the title, runs the block to build a plot and prints its rendering.
# LoadError and StandardError raised while building or rendering are caught
# and reported as a warning line on stdout.
sig { params(title: String, block: T.proc.returns(T.untyped)).void }
def plot_with_error_handling(title, &block)
puts "\n#{title}"
begin
plot = block.call
puts plot.render
rescue LoadError
puts "⚠️ UnicodePlot gem not available. Install with: gem install unicode_plot"
rescue => e
puts "⚠️ Error generating plot: #{e.message}"
# NOTE(review): `name` is neither a parameter nor a local of this method, and
# the message below is about saving a model rather than plotting; this line
# looks like it was interleaved from another part of the change — verify
# against the real file.
puts "💾 Saved #{name} regression model to DB."
end
end
end