Add logarithmic and square root regression lines to stats:fa_fav_graph

This commit is contained in:
Dylan Knutson
2025-07-10 19:58:01 +00:00
parent 6c086ac9cc
commit 91a16e12a1

View File

@@ -9,7 +9,7 @@ require "rumale/preprocessing/polynomial_features"
require "rumale/pipeline/pipeline"
namespace :stats do
desc "Generate graphs of FaFavIdAndDate models with linear and quadratic regression lines. Usage: rake stats:fa_fav_graph[max_points]"
desc "Generate graphs of FaFavIdAndDate models with linear, quadratic, logarithmic, and square root regression lines. Usage: rake stats:fa_fav_graph[max_points]"
task :fa_fav_graph, [:max_points] => :environment do |task, args|
puts "🔍 Analyzing FaFavIdAndDate data..."
@@ -19,23 +19,29 @@ namespace :stats do
# Query and sample data
records_array = StatsHelpers.sample_records(max_points)
# Create normalizer with raw data
normalizer = DataNormalizer.new(records_array)
# Create base normalizer for display ranges
base_normalizer = DataNormalizer.new(records_array)
puts "📈 X-axis range (fav_fa_id): #{normalizer.x_range}"
puts "📈 Y-axis range (date): #{normalizer.y_range}"
puts "📈 X-axis range (fav_fa_id): #{base_normalizer.x_range}"
puts "📈 Y-axis range (date): #{base_normalizer.y_range}"
# Run regressions using normalized data
results = RegressionAnalyzer.new(normalizer).analyze
# Run regressions using specialized normalizers
results = RegressionAnalyzer.new(records_array).analyze
# Define regression types for reuse across display and plotting
regressions = [
["Linear", results.linear],
["Quadratic", results.quadratic],
["Logarithmic", results.logarithmic],
["Square Root", results.square_root],
]
# Display results (automatically denormalized)
puts "\n📊 Linear Regression Results:"
puts " #{results.linear.equation}"
puts " R² = #{StatsHelpers.format_r_squared(results.linear.r_squared)}"
puts "\n📊 Quadratic Regression Results:"
puts " #{results.quadratic.equation}"
puts " R² = #{StatsHelpers.format_r_squared(results.quadratic.r_squared)}"
regressions.each do |name, result|
puts "\n📊 #{name} Regression Results:"
puts " #{result.equation_string}"
puts " R² = #{StatsHelpers.format_r_squared(result.r_squared)}"
end
# Generate visualizations
puts "\n🎨 Generating visualizations with UnicodePlot..."
@@ -43,12 +49,19 @@ namespace :stats do
plotter.plot_scatter(
"Original Data",
normalizer.x_values,
normalizer.y_values,
base_normalizer.x_values,
base_normalizer.y_values,
)
# Plot individual regression results
regressions.each do |name, result|
plotter.plot_regression("#{name} Regression", result)
end
plotter.plot_combined(
base_normalizer.x_values,
base_normalizer.y_values,
regressions,
)
plotter.plot_regression("Linear Regression", results.linear)
plotter.plot_regression("Quadratic Regression", results.quadratic)
plotter.plot_combined(normalizer.x_values, normalizer.y_values, results)
puts "\n✅ Graph generation completed!"
end
@@ -106,7 +119,7 @@ module StatsHelpers
end
end
# Handles data normalization and denormalization to prevent numerical instability
# Base class for data normalization and denormalization
class DataNormalizer
extend T::Sig
@@ -184,6 +197,27 @@ class DataNormalizer
@y.as_string { |x| Time.at(x) }
end
# Accessors for equation classes
sig { returns(Float) }
def x_scale
@x.scale
end
sig { returns(Float) }
def y_scale
@y.scale
end
sig { returns(Float) }
def x_min
@x.min
end
sig { returns(Float) }
def y_min
@y.min
end
# Convert raw data to normalized [0,1] scale for Rumale
sig { returns(T::Array[T::Array[Float]]) }
def normalized_x_matrix
@@ -202,6 +236,22 @@ class DataNormalizer
@x.range.step(step_size).to_a
end
# Default transformation matrix (identity for linear/quadratic)
sig { returns(T::Array[T::Array[Float]]) }
def transformed_x_matrix
normalized_x_matrix
end
protected
sig { returns(Range) }
attr_reader :x, :y
end
# Linear regression specific normalizer
class LinearNormalizer < DataNormalizer
extend T::Sig
# Denormalize linear regression results back to original scale
sig do
params(
@@ -210,7 +260,7 @@ class DataNormalizer
norm_intercept: Float,
).returns(T::Array[Float])
end
def denormalize_linear(regression_x, norm_slope, norm_intercept)
def denormalize_regression(regression_x, norm_slope, norm_intercept)
regression_x.map do |x|
x_norm = @x.normalize(x)
y_norm = norm_slope * x_norm + norm_intercept
@@ -218,6 +268,22 @@ class DataNormalizer
end
end
# Denormalize linear regression coefficients back to original scale
sig do
params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
end
def denormalize_coefficients(norm_intercept, norm_slope)
slope_orig = norm_slope * @y.scale / @x.scale
intercept_orig = (norm_intercept * @y.scale + @y.min) - slope_orig * @x.min
[intercept_orig, slope_orig]
end
end
# Quadratic regression specific normalizer
class QuadraticNormalizer < DataNormalizer
extend T::Sig
# Denormalize quadratic regression results back to original scale
sig do
params(
@@ -227,7 +293,7 @@ class DataNormalizer
norm_c: Float,
).returns(T::Array[Float])
end
def denormalize_quadratic(regression_x, norm_a, norm_b, norm_c)
def denormalize_regression(regression_x, norm_a, norm_b, norm_c)
regression_x.map do |x|
x_norm = @x.normalize(x)
y_norm = norm_a * x_norm * x_norm + norm_b * x_norm + norm_c
@@ -235,44 +301,106 @@ class DataNormalizer
end
end
# Generate equation strings with coefficients in original scale
sig { params(norm_slope: Float, norm_intercept: Float).returns(String) }
def linear_equation(norm_slope, norm_intercept)
slope_orig = norm_slope * @y.scale / @x.scale
intercept_orig = (norm_intercept * @y.scale + @y.min) - slope_orig * @x.min
"y = #{polynomial_equation([slope_orig, intercept_orig])}"
# Denormalize quadratic regression coefficients back to original scale
sig do
params(norm_c: Float, norm_b: Float, norm_a: Float).returns(T::Array[Float])
end
sig { params(norm_a: Float, norm_b: Float, norm_c: Float).returns(String) }
def quadratic_equation(norm_a, norm_b, norm_c)
def denormalize_coefficients(norm_c, norm_b, norm_a)
a_orig = norm_a * @y.scale / (@x.scale * @x.scale)
b_orig = norm_b * @y.scale / @x.scale - 2 * a_orig * @x.min
c_orig =
(norm_c * @y.scale + @y.min) - b_orig * @x.min - a_orig * @x.min * @x.min
"y = #{polynomial_equation([a_orig, b_orig, c_orig])}"
[c_orig, b_orig, a_orig]
end
end
# Base class for transformations that follow y = a * f(x) + b pattern
# where f(x) is a transformation function and denormalization only requires y-scaling
class TransformedNormalizer < DataNormalizer
extend T::Sig
# Denormalize coefficients for simple transformations (only y-scaling needed)
sig do
params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
end
def denormalize_coefficients(norm_intercept, norm_slope)
slope_orig = norm_slope * @y.scale
intercept_orig = norm_intercept * @y.scale + @y.min
[intercept_orig, slope_orig]
end
# Convert array of coefficients into polynomial equation string
sig { params(coefficients: T::Array[Float]).returns(String) }
def polynomial_equation(coefficients)
terms =
coefficients.each_with_index.map do |coeff, power|
next if coeff.zero?
# Common denormalization logic using the transformation function
sig do
params(
regression_x: T::Array[Float],
norm_slope: Float,
norm_intercept: Float,
).returns(T::Array[Float])
end
def denormalize_regression(regression_x, norm_slope, norm_intercept)
regression_x.map do |x|
# y = a * f(x) + b, where coefficients are in normalized space
y_norm = norm_slope * transform_x(x) + norm_intercept
@y.denormalize(y_norm)
end
end
term = format_number(coeff)
case power
when 0
term
when 1
"#{term}x"
else
"#{term}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
end
end
protected
terms.compact.reverse.join(" + ").gsub("+ -", "- ")
# Abstract method for applying the transformation function
sig { params(x: Float).returns(Float) }
def transform_x(x)
raise NotImplementedError, "Subclasses must implement transform_x"
end
end
# Normalizer for logarithmic regression: models y = a * ln(x) + b.
class LogarithmicNormalizer < TransformedNormalizer
  extend T::Sig

  # Build the regression feature matrix by log-transforming each raw x value.
  # NOTE(review): assumes every x value is positive — Math.log raises
  # Math::DomainError for x < 0 and returns -Infinity for x == 0; confirm
  # fav IDs can never be zero.
  sig { returns(T::Array[T::Array[Float]]) }
  def transformed_x_matrix
    @x_values.map { |value| [Math.log(value)] }
  end

  protected

  # The transformation function f(x) = ln(x) applied during denormalization.
  sig { params(x: Float).returns(Float) }
  def transform_x(x)
    Math.log(x)
  end
end
# Normalizer for square root regression: models y = a * √x + b.
class SquareRootNormalizer < TransformedNormalizer
  extend T::Sig

  # Build the regression feature matrix by square-root-transforming each
  # raw x value.
  # NOTE(review): assumes x values are non-negative — Math.sqrt raises
  # Math::DomainError for x < 0.
  sig { returns(T::Array[T::Array[Float]]) }
  def transformed_x_matrix
    @x_values.map { |value| [Math.sqrt(value)] }
  end

  protected

  # The transformation function f(x) = √x applied during denormalization.
  sig { params(x: Float).returns(Float) }
  def transform_x(x)
    Math.sqrt(x)
  end
end
# Base class for regression equations with common formatting logic
class Equation
extend T::Sig
sig { params(normalizer: DataNormalizer).void }
def initialize(normalizer)
@normalizer = normalizer
end
# Format a number with significant figures and scientific notation when needed
@@ -298,16 +426,130 @@ class DataNormalizer
num.round(decimal_places).to_s
end
end
sig { returns(String) }
def to_s
format_equation
end
protected
sig { returns(String) }
def format_equation
raise NotImplementedError, "Subclasses must implement format_equation"
end
sig { returns(DataNormalizer) }
attr_reader :normalizer
end
# Renders a polynomial equation (linear, quadratic, ...) whose coefficients
# are ordered lowest power first: coefficients[i] is the coefficient of x^i.
class PolynomialEquation < Equation
  extend T::Sig

  sig { params(normalizer: DataNormalizer, coefficients: T::Array[Float]).void }
  def initialize(normalizer, coefficients)
    super(normalizer)
    @coefficients = coefficients
  end

  protected

  sig { returns(String) }
  def format_equation
    "y = #{polynomial_equation(@coefficients)}"
  end

  private

  # Convert the coefficient array into a human-readable polynomial string,
  # highest-order term first, skipping zero coefficients.
  sig { params(coefficients: T::Array[Float]).returns(String) }
  def polynomial_equation(coefficients)
    terms =
      coefficients.each_with_index.filter_map do |coeff, power|
        next if coeff.zero?

        base = format_number(coeff)
        case power
        when 0
          base
        when 1
          "#{base}x"
        else
          # Render the exponent with Unicode superscript digits (e.g. x²).
          "#{base}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
        end
      end
    # Fold "+ -n" into "- n" so negative terms read naturally.
    terms.reverse.join(" + ").gsub("+ -", "- ")
  end
end
# Renders a logarithmic equation y = a * ln(x) + b, reconstructing original-
# scale coefficients from values fitted in normalized y-space.
class LogarithmicEquation < Equation
  extend T::Sig

  sig do
    params(
      normalizer: DataNormalizer,
      norm_slope: Float,
      norm_intercept: Float,
    ).void
  end
  def initialize(normalizer, norm_slope, norm_intercept)
    super(normalizer)
    @norm_slope = norm_slope
    @norm_intercept = norm_intercept
  end

  protected

  sig { returns(String) }
  def format_equation
    # Only y was normalized (x enters the model as raw ln(x)), so mapping
    # back to the original scale only needs the y scale and offset.
    slope_orig = @norm_slope * @normalizer.y_scale
    intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min
    # gsub mirrors PolynomialEquation's sign handling so a negative
    # intercept renders as "… - 1.2" rather than "… + -1.2".
    "y = #{format_number(slope_orig)} * ln(x) + #{format_number(intercept_orig)}"
      .gsub("+ -", "- ")
  end
end
# Renders a square root equation y = a * √x + b, reconstructing original-
# scale coefficients from values fitted in normalized y-space.
class SquareRootEquation < Equation
  extend T::Sig

  sig do
    params(
      normalizer: DataNormalizer,
      norm_slope: Float,
      norm_intercept: Float,
    ).void
  end
  def initialize(normalizer, norm_slope, norm_intercept)
    super(normalizer)
    @norm_slope = norm_slope
    @norm_intercept = norm_intercept
  end

  protected

  sig { returns(String) }
  def format_equation
    # Only y was normalized (x enters the model as raw √x), so mapping
    # back to the original scale only needs the y scale and offset.
    slope_orig = @norm_slope * @normalizer.y_scale
    intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min
    # gsub mirrors PolynomialEquation's sign handling so a negative
    # intercept renders as "… - 1.2" rather than "… + -1.2".
    "y = #{format_number(slope_orig)} * √x + #{format_number(intercept_orig)}"
      .gsub("+ -", "- ")
  end
end
# Immutable struct representing a single regression analysis result
class RegressionResult < T::ImmutableStruct
extend T::Sig
const :equation, String
const :equation, Equation
const :r_squared, Float
const :x_values, T::Array[Float]
const :y_values, T::Array[Float]
sig { returns(String) }
def equation_string
equation.to_s
end
end
# Immutable struct representing the complete analysis results
@@ -316,40 +558,38 @@ class AnalysisResults < T::ImmutableStruct
const :linear, RegressionResult
const :quadratic, RegressionResult
const :logarithmic, RegressionResult
const :square_root, RegressionResult
end
# Handles regression analysis using Rumale with normalized data
class RegressionAnalyzer
extend T::Sig
sig { params(normalizer: DataNormalizer).void }
def initialize(normalizer)
@normalizer = normalizer
sig { params(records: T::Array[Domain::FaFavIdAndDate]).void }
def initialize(records)
@records = records
end
sig { returns(AnalysisResults) }
def analyze
# Use normalized data for Rumale calculations to prevent numerical instability
x_matrix = @normalizer.normalized_x_matrix
y_vector = @normalizer.normalized_y_vector
regression_x = @normalizer.regression_x_range
AnalysisResults.new(
linear: analyze_linear(x_matrix, y_vector, regression_x),
quadratic: analyze_quadratic(x_matrix, y_vector, regression_x),
linear: analyze_linear,
quadratic: analyze_quadratic,
logarithmic: analyze_logarithmic,
square_root: analyze_square_root,
)
end
private
sig do
params(
x_matrix: T::Array[T::Array[Float]],
y_vector: T::Array[Float],
regression_x: T::Array[Float],
).returns(RegressionResult)
end
def analyze_linear(x_matrix, y_vector, regression_x)
sig { returns(RegressionResult) }
def analyze_linear
normalizer = LinearNormalizer.new(@records)
x_matrix = normalizer.normalized_x_matrix
y_vector = normalizer.normalized_y_vector
regression_x = normalizer.regression_x_range
poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
regressor = Rumale::LinearModel::LinearRegression.new
pipeline =
@@ -370,24 +610,31 @@ class RegressionAnalyzer
# Generate regression line data in original scale
linear_y =
@normalizer.denormalize_linear(regression_x, norm_slope, norm_intercept)
normalizer.denormalize_regression(
regression_x,
norm_slope,
norm_intercept,
)
# Denormalize coefficients for equation display
coefficients =
normalizer.denormalize_coefficients(norm_intercept, norm_slope)
RegressionResult.new(
equation: @normalizer.linear_equation(norm_slope, norm_intercept),
equation: PolynomialEquation.new(normalizer, coefficients),
r_squared: r_squared,
x_values: regression_x,
y_values: linear_y,
)
end
sig do
params(
x_matrix: T::Array[T::Array[Float]],
y_vector: T::Array[Float],
regression_x: T::Array[Float],
).returns(RegressionResult)
end
def analyze_quadratic(x_matrix, y_vector, regression_x)
sig { returns(RegressionResult) }
def analyze_quadratic
normalizer = QuadraticNormalizer.new(@records)
x_matrix = normalizer.normalized_x_matrix
y_vector = normalizer.normalized_y_vector
regression_x = normalizer.regression_x_range
# Use pipeline approach as recommended in documentation
poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
regressor = Rumale::LinearModel::LinearRegression.new(fit_bias: true)
@@ -410,15 +657,108 @@ class RegressionAnalyzer
# Generate regression line data in original scale
quadratic_y =
@normalizer.denormalize_quadratic(regression_x, norm_a, norm_b, norm_c)
normalizer.denormalize_regression(regression_x, norm_a, norm_b, norm_c)
# Denormalize coefficients for equation display
coefficients = normalizer.denormalize_coefficients(norm_c, norm_b, norm_a)
RegressionResult.new(
equation: @normalizer.quadratic_equation(norm_a, norm_b, norm_c),
equation: PolynomialEquation.new(normalizer, coefficients),
r_squared: r_squared,
x_values: regression_x,
y_values: quadratic_y,
)
end
# Fits a logarithmic model y = a * ln(x) + b by running an ordinary linear
# regression over log-transformed x values. y is normalized to [0,1] by the
# normalizer; x enters the model as raw ln(x).
sig { returns(RegressionResult) }
def analyze_logarithmic
normalizer = LogarithmicNormalizer.new(@records)
y_vector = normalizer.normalized_y_vector
regression_x = normalizer.regression_x_range
# Transform x values using natural log for logarithmic regression
# y = a * ln(x) + b
log_x_matrix = normalizer.transformed_x_matrix
# Degree-1 polynomial features supply the bias column for the regressor.
poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
regressor = Rumale::LinearModel::LinearRegression.new
pipeline =
Rumale::Pipeline::Pipeline.new(
steps: {
transformer: poly_features,
estimator: regressor,
},
)
# Fit the regression on log-transformed x values
pipeline.fit(log_x_matrix, y_vector)
# R² is computed in normalized y-space.
r_squared = pipeline.score(log_x_matrix, y_vector)
# Extract coefficients (same pattern as linear regression)
# NOTE(review): assumes weight_vec[0] is the bias term emitted by
# PolynomialFeatures(degree: 1) and weight_vec[1] the slope — confirm
# against the Rumale documentation.
weight_vec = pipeline.steps[:estimator].weight_vec
norm_intercept = weight_vec[0]
norm_slope = weight_vec[1]
# Generate regression line data in original scale
logarithmic_y =
normalizer.denormalize_regression(
regression_x,
norm_slope,
norm_intercept,
)
RegressionResult.new(
equation: LogarithmicEquation.new(normalizer, norm_slope, norm_intercept),
r_squared: r_squared,
x_values: regression_x,
y_values: logarithmic_y,
)
end
# Fits a square root model y = a * √x + b by running an ordinary linear
# regression over square-root-transformed x values. y is normalized to
# [0,1] by the normalizer; x enters the model as raw √x.
sig { returns(RegressionResult) }
def analyze_square_root
normalizer = SquareRootNormalizer.new(@records)
y_vector = normalizer.normalized_y_vector
regression_x = normalizer.regression_x_range
# Transform x values using square root for square root regression
# y = a * √x + b
sqrt_x_matrix = normalizer.transformed_x_matrix
# Degree-1 polynomial features supply the bias column for the regressor.
poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
regressor = Rumale::LinearModel::LinearRegression.new
pipeline =
Rumale::Pipeline::Pipeline.new(
steps: {
transformer: poly_features,
estimator: regressor,
},
)
# Fit the regression on square root transformed x values
pipeline.fit(sqrt_x_matrix, y_vector)
# R² is computed in normalized y-space.
r_squared = pipeline.score(sqrt_x_matrix, y_vector)
# Extract coefficients (same pattern as other regressions)
# NOTE(review): assumes weight_vec[0] is the bias term emitted by
# PolynomialFeatures(degree: 1) and weight_vec[1] the slope — confirm
# against the Rumale documentation.
weight_vec = pipeline.steps[:estimator].weight_vec
norm_intercept = weight_vec[0]
norm_slope = weight_vec[1]
# Generate regression line data in original scale
square_root_y =
normalizer.denormalize_regression(
regression_x,
norm_slope,
norm_intercept,
)
RegressionResult.new(
equation: SquareRootEquation.new(normalizer, norm_slope, norm_intercept),
r_squared: r_squared,
x_values: regression_x,
y_values: square_root_y,
)
end
end
# Simplified plotting class with extracted common functionality
@@ -466,10 +806,10 @@ class StatsPlotter
params(
x_values: T::Array[Float],
y_values: T::Array[Float],
results: AnalysisResults,
regressions: T::Array[[String, RegressionResult]],
).void
end
def plot_combined(x_values, y_values, results)
def plot_combined(x_values, y_values, regressions)
plot_with_error_handling("📈 Combined Visualization:") do
# Base scatter plot
plot =
@@ -485,18 +825,14 @@ class StatsPlotter
)
# Add regression lines
UnicodePlot.lineplot!(
plot,
results.linear.x_values,
results.linear.y_values,
name: "Linear (R²=#{results.linear.r_squared.round(3)})",
)
UnicodePlot.lineplot!(
plot,
results.quadratic.x_values,
results.quadratic.y_values,
name: "Quadratic (R²=#{results.quadratic.r_squared.round(3)})",
)
regressions.each do |name, result|
UnicodePlot.lineplot!(
plot,
result.x_values,
result.y_values,
name: "#{name} (R²=#{result.r_squared.round(3)})",
)
end
plot
end
end