sqrt regression
@@ -9,7 +9,7 @@ require "rumale/preprocessing/polynomial_features"
 require "rumale/pipeline/pipeline"

 namespace :stats do
-  desc "Generate graphs of FaFavIdAndDate models with linear and quadratic regression lines. Usage: rake stats:fa_fav_graph[max_points]"
+  desc "Generate graphs of FaFavIdAndDate models with linear, quadratic, logarithmic, and square root regression lines. Usage: rake stats:fa_fav_graph[max_points]"
   task :fa_fav_graph, [:max_points] => :environment do |task, args|
     puts "🔍 Analyzing FaFavIdAndDate data..."

@@ -19,23 +19,29 @@ namespace :stats do
     # Query and sample data
     records_array = StatsHelpers.sample_records(max_points)

-    # Create normalizer with raw data
-    normalizer = DataNormalizer.new(records_array)
+    # Create base normalizer for display ranges
+    base_normalizer = DataNormalizer.new(records_array)

-    puts "📈 X-axis range (fav_fa_id): #{normalizer.x_range}"
-    puts "📈 Y-axis range (date): #{normalizer.y_range}"
+    puts "📈 X-axis range (fav_fa_id): #{base_normalizer.x_range}"
+    puts "📈 Y-axis range (date): #{base_normalizer.y_range}"

-    # Run regressions using normalized data
-    results = RegressionAnalyzer.new(normalizer).analyze
+    # Run regressions using specialized normalizers
+    results = RegressionAnalyzer.new(records_array).analyze

+    # Define regression types for reuse across display and plotting
+    regressions = [
+      ["Linear", results.linear],
+      ["Quadratic", results.quadratic],
+      ["Logarithmic", results.logarithmic],
+      ["Square Root", results.square_root],
+    ]
+
     # Display results (automatically denormalized)
-    puts "\n📊 Linear Regression Results:"
-    puts " #{results.linear.equation}"
-    puts " R² = #{StatsHelpers.format_r_squared(results.linear.r_squared)}"
-
-    puts "\n📊 Quadratic Regression Results:"
-    puts " #{results.quadratic.equation}"
-    puts " R² = #{StatsHelpers.format_r_squared(results.quadratic.r_squared)}"
+    regressions.each do |name, result|
+      puts "\n📊 #{name} Regression Results:"
+      puts " #{result.equation_string}"
+      puts " R² = #{StatsHelpers.format_r_squared(result.r_squared)}"
+    end

     # Generate visualizations
     puts "\n🎨 Generating visualizations with UnicodePlot..."
@@ -43,12 +49,19 @@ namespace :stats do

     plotter.plot_scatter(
       "Original Data",
-      normalizer.x_values,
-      normalizer.y_values,
+      base_normalizer.x_values,
+      base_normalizer.y_values,
     )

+    # Plot individual regression results
+    regressions.each do |name, result|
+      plotter.plot_regression("#{name} Regression", result)
+    end
+    plotter.plot_combined(
+      base_normalizer.x_values,
+      base_normalizer.y_values,
+      regressions,
+    )
-    plotter.plot_regression("Linear Regression", results.linear)
-    plotter.plot_regression("Quadratic Regression", results.quadratic)
-    plotter.plot_combined(normalizer.x_values, normalizer.y_values, results)

     puts "\n✅ Graph generation completed!"
   end
@@ -106,7 +119,7 @@ module StatsHelpers
   end
 end

-# Handles data normalization and denormalization to prevent numerical instability
+# Base class for data normalization and denormalization
 class DataNormalizer
   extend T::Sig

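The @x / @y axis objects are not shown in this diff, but the coefficient denormalization further down implies the usual min/scale mapping. A minimal sketch of that assumed interface, with illustrative names and numbers only:

    # norm = (value - min) / scale, and back again
    def normalize(value, min, scale)
      (value - min) / scale
    end

    def denormalize(norm, min, scale)
      norm * scale + min
    end

    v = normalize(1_500_000.0, 1_000_000.0, 2_000_000.0) # => 0.25
    denormalize(v, 1_000_000.0, 2_000_000.0)             # => 1500000.0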
@@ -184,6 +197,27 @@ class DataNormalizer
     @y.as_string { |x| Time.at(x) }
   end

+  # Accessors for equation classes
+  sig { returns(Float) }
+  def x_scale
+    @x.scale
+  end
+
+  sig { returns(Float) }
+  def y_scale
+    @y.scale
+  end
+
+  sig { returns(Float) }
+  def x_min
+    @x.min
+  end
+
+  sig { returns(Float) }
+  def y_min
+    @y.min
+  end
+
   # Convert raw data to normalized [0,1] scale for Rumale
   sig { returns(T::Array[T::Array[Float]]) }
   def normalized_x_matrix
@@ -202,6 +236,22 @@ class DataNormalizer
     @x.range.step(step_size).to_a
   end

+  # Default transformation matrix (identity for linear/quadratic)
+  sig { returns(T::Array[T::Array[Float]]) }
+  def transformed_x_matrix
+    normalized_x_matrix
+  end
+
+  protected
+
+  sig { returns(Range) }
+  attr_reader :x, :y
+end
+
+# Linear regression specific normalizer
+class LinearNormalizer < DataNormalizer
+  extend T::Sig
+
   # Denormalize linear regression results back to original scale
   sig do
     params(
@@ -210,7 +260,7 @@ class DataNormalizer
       norm_intercept: Float,
     ).returns(T::Array[Float])
   end
-  def denormalize_linear(regression_x, norm_slope, norm_intercept)
+  def denormalize_regression(regression_x, norm_slope, norm_intercept)
     regression_x.map do |x|
       x_norm = @x.normalize(x)
       y_norm = norm_slope * x_norm + norm_intercept
@@ -218,6 +268,22 @@ class DataNormalizer
     end
   end

+  # Denormalize linear regression coefficients back to original scale
+  sig do
+    params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
+  end
+  def denormalize_coefficients(norm_intercept, norm_slope)
+    slope_orig = norm_slope * @y.scale / @x.scale
+    intercept_orig = (norm_intercept * @y.scale + @y.min) - slope_orig * @x.min
+
+    [intercept_orig, slope_orig]
+  end
+end
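The coefficient mapping above is just the substitution x_norm = (x - x_min) / x_scale and y = y_min + y_scale * y_norm applied to y_norm = a * x_norm + b. A quick standalone check with made-up numbers (none of these values come from the task's data):

    x_min, x_scale, y_min, y_scale = 100.0, 400.0, 10.0, 20.0
    a_n, b_n = 0.5, 0.1 # slope/intercept fitted in normalized space

    slope_orig = a_n * y_scale / x_scale
    intercept_orig = (b_n * y_scale + y_min) - slope_orig * x_min

    x = 250.0
    via_normalized = y_min + y_scale * (a_n * (x - x_min) / x_scale + b_n)
    via_original = slope_orig * x + intercept_orig
    raise "mismatch" unless (via_normalized - via_original).abs < 1e-9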
+
+# Quadratic regression specific normalizer
+class QuadraticNormalizer < DataNormalizer
+  extend T::Sig
+
   # Denormalize quadratic regression results back to original scale
   sig do
     params(
@@ -227,7 +293,7 @@ class DataNormalizer
       norm_c: Float,
     ).returns(T::Array[Float])
   end
-  def denormalize_quadratic(regression_x, norm_a, norm_b, norm_c)
+  def denormalize_regression(regression_x, norm_a, norm_b, norm_c)
     regression_x.map do |x|
       x_norm = @x.normalize(x)
       y_norm = norm_a * x_norm * x_norm + norm_b * x_norm + norm_c
@@ -235,44 +301,106 @@ class DataNormalizer
     end
   end

-  # Generate equation strings with coefficients in original scale
-  sig { params(norm_slope: Float, norm_intercept: Float).returns(String) }
-  def linear_equation(norm_slope, norm_intercept)
-    slope_orig = norm_slope * @y.scale / @x.scale
-    intercept_orig = (norm_intercept * @y.scale + @y.min) - slope_orig * @x.min
-
-    "y = #{polynomial_equation([slope_orig, intercept_orig])}"
+  # Denormalize quadratic regression coefficients back to original scale
+  sig do
+    params(norm_c: Float, norm_b: Float, norm_a: Float).returns(T::Array[Float])
   end
-
-  sig { params(norm_a: Float, norm_b: Float, norm_c: Float).returns(String) }
-  def quadratic_equation(norm_a, norm_b, norm_c)
+  def denormalize_coefficients(norm_c, norm_b, norm_a)
     a_orig = norm_a * @y.scale / (@x.scale * @x.scale)
     b_orig = norm_b * @y.scale / @x.scale - 2 * a_orig * @x.min
     c_orig =
       (norm_c * @y.scale + @y.min) - b_orig * @x.min - a_orig * @x.min * @x.min

-    "y = #{polynomial_equation([a_orig, b_orig, c_orig])}"
+    [c_orig, b_orig, a_orig]
   end
+end
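As with the linear case, these formulas come from expanding y = y_min + y_scale * (a_n * u**2 + b_n * u + c_n) with u = (x - x_min) / x_scale and collecting powers of x. A standalone check with made-up numbers:

    x_min, x_scale, y_min, y_scale = 100.0, 400.0, 10.0, 20.0
    a_n, b_n, c_n = 0.2, -0.3, 0.4

    a = a_n * y_scale / (x_scale * x_scale)
    b = b_n * y_scale / x_scale - 2 * a * x_min
    c = (c_n * y_scale + y_min) - b * x_min - a * x_min * x_min

    x = 250.0
    u = (x - x_min) / x_scale
    expected = y_min + y_scale * (a_n * u * u + b_n * u + c_n)
    raise "mismatch" unless (expected - (a * x * x + b * x + c)).abs < 1e-9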

+# Base class for transformations that follow y = a * f(x) + b pattern
+# where f(x) is a transformation function and denormalization only requires y-scaling
+class TransformedNormalizer < DataNormalizer
+  extend T::Sig
+
+  # Denormalize coefficients for simple transformations (only y-scaling needed)
+  sig do
+    params(norm_intercept: Float, norm_slope: Float).returns(T::Array[Float])
+  end
+  def denormalize_coefficients(norm_intercept, norm_slope)
+    slope_orig = norm_slope * @y.scale
+    intercept_orig = norm_intercept * @y.scale + @y.min
+
+    [intercept_orig, slope_orig]
+  end
+
-  # Convert array of coefficients into polynomial equation string
-  sig { params(coefficients: T::Array[Float]).returns(String) }
-  def polynomial_equation(coefficients)
-    terms =
-      coefficients.each_with_index.map do |coeff, power|
-        next if coeff.zero?
+  # Common denormalization logic using the transformation function
+  sig do
+    params(
+      regression_x: T::Array[Float],
+      norm_slope: Float,
+      norm_intercept: Float,
+    ).returns(T::Array[Float])
+  end
+  def denormalize_regression(regression_x, norm_slope, norm_intercept)
+    regression_x.map do |x|
+      # y = a * f(x) + b, where coefficients are in normalized space
+      y_norm = norm_slope * transform_x(x) + norm_intercept
+      @y.denormalize(y_norm)
+    end
+  end

-        term = format_number(coeff)
-        case power
-        when 0
-          term
-        when 1
-          "#{term}x"
-        else
-          "#{term}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
-        end
-      end
+  protected

-    terms.compact.reverse.join(" + ").gsub("+ -", "- ")
+  # Abstract method for applying the transformation function
+  sig { params(x: Float).returns(Float) }
+  def transform_x(x)
+    raise NotImplementedError, "Subclasses must implement transform_x"
   end
+end
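Only the y axis needs rescaling in this class because the transformed matrices below feed raw x values through f(x); x itself is never normalized for these fits. So y = y_min + y_scale * (a_n * f(x) + b_n) collapses to (a_n * y_scale) * f(x) + (b_n * y_scale + y_min). A small check with made-up numbers:

    f = ->(x) { Math.sqrt(x) } # stand-in for transform_x
    y_min, y_scale = 10.0, 20.0
    a_n, b_n = 0.5, 0.1

    a = a_n * y_scale           # slope_orig
    b = b_n * y_scale + y_min   # intercept_orig

    x = 81.0
    expected = y_min + y_scale * (a_n * f.call(x) + b_n)
    raise "mismatch" unless (expected - (a * f.call(x) + b)).abs < 1e-9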
+
+# Logarithmic regression specific normalizer
+class LogarithmicNormalizer < TransformedNormalizer
+  extend T::Sig
+
+  # Convert x values to log-transformed matrix for logarithmic regression
+  sig { returns(T::Array[T::Array[Float]]) }
+  def transformed_x_matrix
+    @x_values.map { |x| [Math.log(x)] }
+  end
+
+  protected
+
+  # Apply logarithmic transformation
+  sig { params(x: Float).returns(Float) }
+  def transform_x(x)
+    Math.log(x)
+  end
+end
+
+# Square root regression specific normalizer
+class SquareRootNormalizer < TransformedNormalizer
+  extend T::Sig
+
+  # Convert x values to square root transformed matrix for square root regression
+  sig { returns(T::Array[T::Array[Float]]) }
+  def transformed_x_matrix
+    @x_values.map { |x| [Math.sqrt(x)] }
+  end
+
+  protected
+
+  # Apply square root transformation
+  sig { params(x: Float).returns(Float) }
+  def transform_x(x)
+    Math.sqrt(x)
+  end
+end
+
+# Base class for regression equations with common formatting logic
+class Equation
+  extend T::Sig
+
+  sig { params(normalizer: DataNormalizer).void }
+  def initialize(normalizer)
+    @normalizer = normalizer
+  end

   # Format a number with significant figures and scientific notation when needed
@@ -298,16 +426,130 @@ class DataNormalizer
       num.round(decimal_places).to_s
     end
   end
+
+  sig { returns(String) }
+  def to_s
+    format_equation
+  end
+
+  protected
+
+  sig { returns(String) }
+  def format_equation
+    raise NotImplementedError, "Subclasses must implement format_equation"
+  end
+
+  sig { returns(DataNormalizer) }
+  attr_reader :normalizer
 end

+class PolynomialEquation < Equation
+  extend T::Sig
+
+  sig { params(normalizer: DataNormalizer, coefficients: T::Array[Float]).void }
+  def initialize(normalizer, coefficients)
+    super(normalizer)
+    @coefficients = coefficients
+  end
+
+  protected
+
+  sig { returns(String) }
+  def format_equation
+    "y = #{polynomial_equation(@coefficients)}"
+  end
+
+  private
+
+  # Convert array of coefficients into polynomial equation string
+  sig { params(coefficients: T::Array[Float]).returns(String) }
+  def polynomial_equation(coefficients)
+    terms =
+      coefficients.each_with_index.map do |coeff, power|
+        next if coeff.zero?
+
+        term = format_number(coeff)
+        case power
+        when 0
+          term
+        when 1
+          "#{term}x"
+        else
+          "#{term}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
+        end
+      end
+
+    terms.compact.reverse.join(" + ").gsub("+ -", "- ")
+  end
+end
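The superscript exponents in polynomial_equation come from String#tr mapping ASCII digits onto Unicode superscript characters, for example:

    12.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")          # => "¹²"
    "0.5x" + 2.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")  # => "0.5x²"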
+
+class LogarithmicEquation < Equation
+  extend T::Sig
+
+  sig do
+    params(
+      normalizer: DataNormalizer,
+      norm_slope: Float,
+      norm_intercept: Float,
+    ).void
+  end
+  def initialize(normalizer, norm_slope, norm_intercept)
+    super(normalizer)
+    @norm_slope = norm_slope
+    @norm_intercept = norm_intercept
+  end
+
+  protected
+
+  sig { returns(String) }
+  def format_equation
+    slope_orig = @norm_slope * @normalizer.y_scale
+    intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min
+
+    "y = #{format_number(slope_orig)} * ln(x) + #{format_number(intercept_orig)}"
+  end
+end
+
+class SquareRootEquation < Equation
+  extend T::Sig
+
+  sig do
+    params(
+      normalizer: DataNormalizer,
+      norm_slope: Float,
+      norm_intercept: Float,
+    ).void
+  end
+  def initialize(normalizer, norm_slope, norm_intercept)
+    super(normalizer)
+    @norm_slope = norm_slope
+    @norm_intercept = norm_intercept
+  end
+
+  protected
+
+  sig { returns(String) }
+  def format_equation
+    slope_orig = @norm_slope * @normalizer.y_scale
+    intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min
+
+    "y = #{format_number(slope_orig)} * √x + #{format_number(intercept_orig)}"
+  end
+end
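The two concrete subclasses above differ only in the function label they print; a new fit type would add another subclass overriding format_equation (plus a matching TransformedNormalizer whose transform_x applies the new function). A hypothetical exponential variant, not part of this commit, just to show the shape:

    class ExponentialEquation < Equation
      extend T::Sig

      sig { params(normalizer: DataNormalizer, norm_slope: Float, norm_intercept: Float).void }
      def initialize(normalizer, norm_slope, norm_intercept)
        super(normalizer)
        @norm_slope = norm_slope
        @norm_intercept = norm_intercept
      end

      protected

      sig { returns(String) }
      def format_equation
        slope_orig = @norm_slope * @normalizer.y_scale
        intercept_orig = @norm_intercept * @normalizer.y_scale + @normalizer.y_min

        "y = #{format_number(slope_orig)} * e^x + #{format_number(intercept_orig)}"
      end
    end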

 # Immutable struct representing a single regression analysis result
 class RegressionResult < T::ImmutableStruct
   extend T::Sig

-  const :equation, String
+  const :equation, Equation
   const :r_squared, Float
   const :x_values, T::Array[Float]
   const :y_values, T::Array[Float]
+
+  sig { returns(String) }
+  def equation_string
+    equation.to_s
+  end
 end

 # Immutable struct representing the complete analysis results
@@ -316,40 +558,38 @@ class AnalysisResults < T::ImmutableStruct

   const :linear, RegressionResult
   const :quadratic, RegressionResult
+  const :logarithmic, RegressionResult
+  const :square_root, RegressionResult
 end

 # Handles regression analysis using Rumale with normalized data
 class RegressionAnalyzer
   extend T::Sig

-  sig { params(normalizer: DataNormalizer).void }
-  def initialize(normalizer)
-    @normalizer = normalizer
+  sig { params(records: T::Array[Domain::FaFavIdAndDate]).void }
+  def initialize(records)
+    @records = records
   end

   sig { returns(AnalysisResults) }
   def analyze
-    # Use normalized data for Rumale calculations to prevent numerical instability
-    x_matrix = @normalizer.normalized_x_matrix
-    y_vector = @normalizer.normalized_y_vector
-    regression_x = @normalizer.regression_x_range
-
     AnalysisResults.new(
-      linear: analyze_linear(x_matrix, y_vector, regression_x),
-      quadratic: analyze_quadratic(x_matrix, y_vector, regression_x),
+      linear: analyze_linear,
+      quadratic: analyze_quadratic,
+      logarithmic: analyze_logarithmic,
+      square_root: analyze_square_root,
     )
   end

   private

-  sig do
-    params(
-      x_matrix: T::Array[T::Array[Float]],
-      y_vector: T::Array[Float],
-      regression_x: T::Array[Float],
-    ).returns(RegressionResult)
-  end
-  def analyze_linear(x_matrix, y_vector, regression_x)
+  sig { returns(RegressionResult) }
+  def analyze_linear
+    normalizer = LinearNormalizer.new(@records)
+    x_matrix = normalizer.normalized_x_matrix
+    y_vector = normalizer.normalized_y_vector
+    regression_x = normalizer.regression_x_range
+
     poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
     regressor = Rumale::LinearModel::LinearRegression.new
     pipeline =
@@ -370,24 +610,31 @@ class RegressionAnalyzer

     # Generate regression line data in original scale
     linear_y =
-      @normalizer.denormalize_linear(regression_x, norm_slope, norm_intercept)
+      normalizer.denormalize_regression(
+        regression_x,
+        norm_slope,
+        norm_intercept,
+      )
+
+    # Denormalize coefficients for equation display
+    coefficients =
+      normalizer.denormalize_coefficients(norm_intercept, norm_slope)

     RegressionResult.new(
-      equation: @normalizer.linear_equation(norm_slope, norm_intercept),
+      equation: PolynomialEquation.new(normalizer, coefficients),
       r_squared: r_squared,
       x_values: regression_x,
       y_values: linear_y,
     )
   end

-  sig do
-    params(
-      x_matrix: T::Array[T::Array[Float]],
-      y_vector: T::Array[Float],
-      regression_x: T::Array[Float],
-    ).returns(RegressionResult)
-  end
-  def analyze_quadratic(x_matrix, y_vector, regression_x)
+  sig { returns(RegressionResult) }
+  def analyze_quadratic
+    normalizer = QuadraticNormalizer.new(@records)
+    x_matrix = normalizer.normalized_x_matrix
+    y_vector = normalizer.normalized_y_vector
+    regression_x = normalizer.regression_x_range
+
     # Use pipeline approach as recommended in documentation
     poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
     regressor = Rumale::LinearModel::LinearRegression.new(fit_bias: true)
@@ -410,15 +657,108 @@ class RegressionAnalyzer

     # Generate regression line data in original scale
     quadratic_y =
-      @normalizer.denormalize_quadratic(regression_x, norm_a, norm_b, norm_c)
+      normalizer.denormalize_regression(regression_x, norm_a, norm_b, norm_c)

+    # Denormalize coefficients for equation display
+    coefficients = normalizer.denormalize_coefficients(norm_c, norm_b, norm_a)
+
     RegressionResult.new(
-      equation: @normalizer.quadratic_equation(norm_a, norm_b, norm_c),
+      equation: PolynomialEquation.new(normalizer, coefficients),
       r_squared: r_squared,
       x_values: regression_x,
       y_values: quadratic_y,
     )
   end
+
+  sig { returns(RegressionResult) }
+  def analyze_logarithmic
+    normalizer = LogarithmicNormalizer.new(@records)
+    y_vector = normalizer.normalized_y_vector
+    regression_x = normalizer.regression_x_range
+
+    # Transform x values using natural log for logarithmic regression
+    # y = a * ln(x) + b
+    log_x_matrix = normalizer.transformed_x_matrix
+    poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
+    regressor = Rumale::LinearModel::LinearRegression.new
+
+    pipeline =
+      Rumale::Pipeline::Pipeline.new(
+        steps: {
+          transformer: poly_features,
+          estimator: regressor,
+        },
+      )
+
+    # Fit the regression on log-transformed x values
+    pipeline.fit(log_x_matrix, y_vector)
+    r_squared = pipeline.score(log_x_matrix, y_vector)
+
+    # Extract coefficients (same pattern as linear regression)
+    weight_vec = pipeline.steps[:estimator].weight_vec
+    norm_intercept = weight_vec[0]
+    norm_slope = weight_vec[1]
+
+    # Generate regression line data in original scale
+    logarithmic_y =
+      normalizer.denormalize_regression(
+        regression_x,
+        norm_slope,
+        norm_intercept,
+      )
+
+    RegressionResult.new(
+      equation: LogarithmicEquation.new(normalizer, norm_slope, norm_intercept),
+      r_squared: r_squared,
+      x_values: regression_x,
+      y_values: logarithmic_y,
+    )
+  end
+
+  sig { returns(RegressionResult) }
+  def analyze_square_root
+    normalizer = SquareRootNormalizer.new(@records)
+    y_vector = normalizer.normalized_y_vector
+    regression_x = normalizer.regression_x_range
+
+    # Transform x values using square root for square root regression
+    # y = a * √x + b
+    sqrt_x_matrix = normalizer.transformed_x_matrix
+    poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
+    regressor = Rumale::LinearModel::LinearRegression.new
+
+    pipeline =
+      Rumale::Pipeline::Pipeline.new(
+        steps: {
+          transformer: poly_features,
+          estimator: regressor,
+        },
+      )
+
+    # Fit the regression on square root transformed x values
+    pipeline.fit(sqrt_x_matrix, y_vector)
+    r_squared = pipeline.score(sqrt_x_matrix, y_vector)
+
+    # Extract coefficients (same pattern as other regressions)
+    weight_vec = pipeline.steps[:estimator].weight_vec
+    norm_intercept = weight_vec[0]
+    norm_slope = weight_vec[1]
+
+    # Generate regression line data in original scale
+    square_root_y =
+      normalizer.denormalize_regression(
+        regression_x,
+        norm_slope,
+        norm_intercept,
+      )
+
+    RegressionResult.new(
+      equation: SquareRootEquation.new(normalizer, norm_slope, norm_intercept),
+      r_squared: r_squared,
+      x_values: regression_x,
+      y_values: square_root_y,
+    )
+  end
 end
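Both analyze_logarithmic and analyze_square_root are ordinary least squares on a single transformed feature (ln(x) or √x); Rumale only supplies the fitting machinery. A minimal pure-Ruby illustration of the same idea on synthetic data (no Rumale, values made up):

    xs = [1.0, 2.0, 4.0, 8.0, 16.0]
    ys = xs.map { |x| 3.0 * Math.log(x) + 1.0 } # exact fit expected

    fx = xs.map { |x| Math.log(x) }             # swap in Math.sqrt for the √x fit
    mean_f = fx.sum / fx.size
    mean_y = ys.sum / ys.size
    slope = fx.zip(ys).sum { |f, y| (f - mean_f) * (y - mean_y) } /
            fx.sum { |f| (f - mean_f)**2 }
    intercept = mean_y - slope * mean_f
    # slope ≈ 3.0, intercept ≈ 1.0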

 # Simplified plotting class with extracted common functionality
@@ -466,10 +806,10 @@ class StatsPlotter
     params(
       x_values: T::Array[Float],
       y_values: T::Array[Float],
-      results: AnalysisResults,
+      regressions: T::Array[[String, RegressionResult]],
     ).void
   end
-  def plot_combined(x_values, y_values, results)
+  def plot_combined(x_values, y_values, regressions)
     plot_with_error_handling("📈 Combined Visualization:") do
       # Base scatter plot
       plot =
@@ -485,18 +825,14 @@ class StatsPlotter
       )

       # Add regression lines
-      UnicodePlot.lineplot!(
-        plot,
-        results.linear.x_values,
-        results.linear.y_values,
-        name: "Linear (R²=#{results.linear.r_squared.round(3)})",
-      )
-      UnicodePlot.lineplot!(
-        plot,
-        results.quadratic.x_values,
-        results.quadratic.y_values,
-        name: "Quadratic (R²=#{results.quadratic.r_squared.round(3)})",
-      )
+      regressions.each do |name, result|
+        UnicodePlot.lineplot!(
+          plot,
+          result.x_values,
+          result.y_values,
+          name: "#{name} (R²=#{result.r_squared.round(3)})",
+        )
+      end
       plot
     end
   end