# redux-scraper/lib/tasks/stats.rake

# typed: strict
# frozen_string_literal: true
T.bind(self, T.all(Rake::DSL, Object))

require "unicode_plot"
require "rumale"
require "rumale/linear_model/linear_regression"
require "rumale/preprocessing/polynomial_features"
require "rumale/pipeline/pipeline"

namespace :stats do
  desc "Generate graphs of FaFavIdAndDate models with linear and quadratic regression lines. Usage: rake stats:fa_fav_graph[max_points]"
  task :fa_fav_graph, [:max_points] => :environment do |_task, args|
    puts "🔍 Analyzing FaFavIdAndDate data..."

    # max_points is optional: a blank or absent argument means "no limit".
    # Parse strictly instead of using String#to_i, which turns junk such as
    # "abc" into 0. A zero limit produces an empty sample that only fails
    # later inside DataNormalizer, and a negative one makes Array#sample raise.
    raw_max_points = args[:max_points].to_s.strip
    max_points = nil
    unless raw_max_points.empty?
      max_points = Integer(raw_max_points, 10, exception: false)
      if max_points.nil? || max_points <= 0
        puts "❌ max_points must be a positive integer (got #{raw_max_points.inspect})"
        exit 1
      end
    end

    # Query and sample data
    records_array = StatsHelpers.sample_records(max_points)

    # Create normalizer with raw data
    normalizer = DataNormalizer.new(records_array)

    puts "📈 X-axis range (fav_fa_id): #{normalizer.x_range}"
    puts "📈 Y-axis range (date): #{normalizer.y_range}"

    # Run regressions using normalized data
    results = RegressionAnalyzer.new(normalizer).analyze

    # Display results (equations and curves are already in the original scale)
    puts "\n📊 Linear Regression Results:"
    puts "   #{results.linear.equation}"
    puts "   R² = #{StatsHelpers.format_r_squared(results.linear.r_squared)}"

    puts "\n📊 Quadratic Regression Results:"
    puts "   #{results.quadratic.equation}"
    puts "   R² = #{StatsHelpers.format_r_squared(results.quadratic.r_squared)}"

    # Generate visualizations
    puts "\n🎨 Generating visualizations with UnicodePlot..."
    plotter = StatsPlotter.new

    plotter.plot_scatter(
      "Original Data",
      normalizer.x_values,
      normalizer.y_values,
    )
    plotter.plot_regression("Linear Regression", results.linear)
    plotter.plot_regression("Quadratic Regression", results.quadratic)
    plotter.plot_combined(normalizer.x_values, normalizer.y_values, results)

    puts "\n✅ Graph generation completed!"
  end
end

# Helper methods extracted to avoid private method issues in Rake context
module StatsHelpers
  extend T::Sig

  # Seed for the sampling generator, fixed so repeated runs pick the same
  # subset of records.
  SAMPLE_SEED = 42

  # Loads the complete FaFavIdAndDate records and, when max_points is given
  # and smaller than the number of records, reduces them to a reproducible
  # random sample of that size.
  #
  # Prints progress to stdout. Exits the process with status 1 when there are
  # no complete records at all.
  #
  # @param max_points [Integer, nil] upper bound on returned records; nil
  #   means no limit
  # @return [Array<Domain::FaFavIdAndDate>] records with only id, fav_fa_id
  #   and date selected
  sig do
    params(max_points: T.nilable(Integer)).returns(
      T::Array[Domain::FaFavIdAndDate],
    )
  end
  def self.sample_records(max_points)
    records = Domain::FaFavIdAndDate.complete

    if records.empty?
      puts "❌ No complete FaFavIdAndDate records found"
      exit 1
    end

    total_records = records.count
    puts "📊 Found #{total_records} complete records"
    records_array = records.select(:id, :fav_fa_id, :date).to_a

    if max_points && total_records > max_points
      puts "🎲 Randomly sampling #{max_points} points from #{total_records} total records"
      # Use a dedicated generator rather than Kernel#srand: the sample stays
      # reproducible without reseeding the process-wide RNG for every other
      # caller of rand/sample/shuffle.
      records_array =
        T.cast(
          records_array.sample(max_points, random: Random.new(SAMPLE_SEED)),
          T::Array[Domain::FaFavIdAndDate],
        )
      puts "📊 Using #{records_array.length} sampled records for analysis"
    else
      message =
        if max_points
          "within max_points limit of #{max_points}"
        else
          "no sampling limit specified"
        end
      puts "📊 Using all #{records_array.length} records (#{message})"
    end

    records_array
  end

  # Rounds an R² score to three decimal places for display.
  #
  # @param value [Float] raw score
  # @return [Float] the score rounded to 3 decimals
  sig { params(value: Float).returns(Float) }
  def self.format_r_squared(value)
    value.round(3).to_f
  end
end

# Handles data normalization and denormalization to prevent numerical instability.
#
# Each record becomes an (x, y) pair of floats: x is fav_fa_id and y is the
# record's date as seconds since the Unix epoch. Both axes are rescaled to
# [0, 1] for fitting, and fitted coefficients are mapped back to the original
# units for display.
#
# NOTE: when every x (or every y) is identical the axis scale is 0.0, so
# normalization divides by zero and produces NaN/Infinity instead of raising.
class DataNormalizer
  extend T::Sig

  # Closed interval [min, max] of one axis, with helpers to map values to and
  # from the unit interval.
  class Range < T::ImmutableStruct
    extend T::Sig

    const :min, Float
    const :max, Float

    # Width of the interval (max - min).
    sig { returns(Float) }
    def scale
      max - min
    end

    # The interval as a Ruby range, min..max.
    sig { returns(T::Range[Float]) }
    def range
      min..max
    end

    # Maps value linearly so that min becomes 0.0 and max becomes 1.0.
    sig { params(value: Float).returns(Float) }
    def normalize(value)
      (value - min) / scale
    end

    # Inverse of #normalize.
    sig { params(value: Float).returns(Float) }
    def denormalize(value)
      value * scale + min
    end

    # Renders the interval as "<min> to <max>". The optional block formats
    # each endpoint; without a block the endpoints are rendered with #to_s.
    sig do
      params(
        mapper: T.nilable(T.proc.params(arg: Float).returns(String)),
      ).returns(String)
    end
    def as_string(&mapper)
      # Return a String, as the signature promises; interpolation produces
      # the same text either way.
      mapper ||= ->(x) { x.to_s }
      "#{mapper.call(min)} to #{mapper.call(max)}"
    end
  end

  # Raw x values (fav_fa_id), sorted ascending.
  sig { returns(T::Array[Float]) }
  attr_reader :x_values

  # Raw y values (epoch seconds), in the same order as x_values.
  sig { returns(T::Array[Float]) }
  attr_reader :y_values

  # @param records [Array<Domain::FaFavIdAndDate>] records to analyse; each
  #   must have a non-nil date (the T.cast below fails otherwise)
  sig { params(records: T::Array[Domain::FaFavIdAndDate]).void }
  def initialize(records)
    data_points =
      records.map do |record|
        {
          x: record.fav_fa_id.to_f,
          y: T.cast(record.date&.to_time&.to_i&.to_f, Float),
        }
      end

    # Sort by x so the scatter data and the fitted curves share one ordering.
    data_points.sort_by! { |point| point[:x] }
    @x_values = T.let(data_points.map { |p| p[:x] }, T::Array[Float])
    @y_values = T.let(data_points.map { |p| p[:y] }, T::Array[Float])

    # Calculate min/max for normalization
    x_minmax = T.cast(@x_values.minmax, [Float, Float])
    y_minmax = T.cast(@y_values.minmax, [Float, Float])
    @x = T.let(Range.new(min: x_minmax[0], max: x_minmax[1]), Range)
    @y = T.let(Range.new(min: y_minmax[0], max: y_minmax[1]), Range)
  end

  # Human-readable span of the x axis.
  sig { returns(String) }
  def x_range
    @x.as_string
  end

  # Human-readable span of the y axis, with epoch seconds shown as times.
  sig { returns(String) }
  def y_range
    @y.as_string { |x| Time.at(x).to_s }
  end

  # Convert raw data to normalized [0,1] scale for Rumale
  # (one single-feature row per sample).
  sig { returns(T::Array[T::Array[Float]]) }
  def normalized_x_matrix
    @x_values.map { |x| [@x.normalize(x)] }
  end

  # Raw y values rescaled to [0, 1].
  sig { returns(T::Array[Float]) }
  def normalized_y_vector
    @y_values.map { |y| @y.normalize(y) }
  end

  # Generate regression line points in original scale: evenly spaced x values
  # from min to max in steps of 1/50 of the x span.
  sig { returns(T::Array[Float]) }
  def regression_x_range
    step_size = @x.scale / 50.0
    @x.range.step(step_size).to_a
  end

  # Denormalize linear regression results back to original scale.
  #
  # @param regression_x [Array<Float>] x positions in original units
  # @param norm_slope [Float] slope fitted in normalized space
  # @param norm_intercept [Float] intercept fitted in normalized space
  # @return [Array<Float>] fitted y values in original units
  sig do
    params(
      regression_x: T::Array[Float],
      norm_slope: Float,
      norm_intercept: Float,
    ).returns(T::Array[Float])
  end
  def denormalize_linear(regression_x, norm_slope, norm_intercept)
    regression_x.map do |x|
      x_norm = @x.normalize(x)
      y_norm = norm_slope * x_norm + norm_intercept
      @y.denormalize(y_norm)
    end
  end

  # Denormalize quadratic regression results back to original scale.
  #
  # @param regression_x [Array<Float>] x positions in original units
  # @param norm_a [Float] x² coefficient fitted in normalized space
  # @param norm_b [Float] x coefficient fitted in normalized space
  # @param norm_c [Float] constant term fitted in normalized space
  # @return [Array<Float>] fitted y values in original units
  sig do
    params(
      regression_x: T::Array[Float],
      norm_a: Float,
      norm_b: Float,
      norm_c: Float,
    ).returns(T::Array[Float])
  end
  def denormalize_quadratic(regression_x, norm_a, norm_b, norm_c)
    regression_x.map do |x|
      x_norm = @x.normalize(x)
      y_norm = norm_a * x_norm * x_norm + norm_b * x_norm + norm_c
      @y.denormalize(y_norm)
    end
  end

  # Generate equation strings with coefficients in original scale.
  #
  # Substituting x_norm = (x - x_min) / x_scale into the normalized line and
  # denormalizing y gives the slope and intercept computed below.
  sig { params(norm_slope: Float, norm_intercept: Float).returns(String) }
  def linear_equation(norm_slope, norm_intercept)
    slope_orig = norm_slope * @y.scale / @x.scale
    intercept_orig = (norm_intercept * @y.scale + @y.min) - slope_orig * @x.min

    # polynomial_equation expects coefficients indexed by power (constant
    # term first), so the intercept goes before the slope.
    "y = #{polynomial_equation([intercept_orig, slope_orig])}"
  end

  # Quadratic counterpart of #linear_equation: expands
  # a·x_norm² + b·x_norm + c in terms of the original x and y units.
  sig { params(norm_a: Float, norm_b: Float, norm_c: Float).returns(String) }
  def quadratic_equation(norm_a, norm_b, norm_c)
    a_orig = norm_a * @y.scale / (@x.scale * @x.scale)
    b_orig = norm_b * @y.scale / @x.scale - 2 * a_orig * @x.min
    c_orig =
      (norm_c * @y.scale + @y.min) - b_orig * @x.min - a_orig * @x.min * @x.min

    # Ascending-power order, as polynomial_equation requires.
    "y = #{polynomial_equation([c_orig, b_orig, a_orig])}"
  end

  # Convert array of coefficients into polynomial equation string.
  #
  # @param coefficients [Array<Float>] coefficients indexed by power:
  #   coefficients[0] is the constant term, coefficients[1] multiplies x, and
  #   so on
  # @return [String] terms from highest power to lowest, zero terms omitted;
  #   "0.0" when every coefficient is zero
  sig { params(coefficients: T::Array[Float]).returns(String) }
  def polynomial_equation(coefficients)
    terms =
      coefficients.each_with_index.map do |coeff, power|
        next if coeff.zero?

        term = format_number(coeff)
        case power
        when 0
          term
        when 1
          "#{term}x"
        else
          "#{term}x#{power.to_s.tr("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")}"
        end
      end

    # Highest power first; fold "+ -" into a plain minus sign.
    rendered = terms.compact.reverse.join(" + ").gsub("+ -", "- ")
    rendered.empty? ? "0.0" : rendered
  end

  # Format a number with significant figures and scientific notation when needed.
  #
  # @param num [Float] value to format
  # @param sig_figs [Integer] number of significant figures to keep
  # @return [String] plain decimal for magnitudes in [1e-2, 1e6), otherwise
  #   "<mantissa>e<exponent>"
  sig { params(num: Float, sig_figs: Integer).returns(String) }
  def format_number(num, sig_figs = 3)
    # Handle zero case
    return "0.0" if num.zero?
    # log10 of NaN/Infinity cannot be floored; print such values verbatim.
    return num.to_s unless num.finite?

    # Get order of scale
    order = Math.log10(num.abs).floor

    # Use scientific notation for very large or small numbers
    if order >= 6 || order <= -3
      # Scale number between 1 and 10
      scaled = num / (10.0**order)
      # Round to sig figs
      rounded = scaled.round(sig_figs - 1)
      # Rounding can carry the mantissa up to 10.0 (e.g. 9.999 -> 10.0);
      # renormalize so the result reads 1.0e7 rather than 10.0e6.
      if rounded.abs >= 10.0
        rounded /= 10.0
        order += 1
      end
      "#{rounded}e#{order}"
    else
      # For normal range numbers, just round to appropriate decimal places
      decimal_places = sig_figs - (order + 1)
      decimal_places = 0 if decimal_places < 0
      num.round(decimal_places).to_s
    end
  end
end

# Immutable struct representing a single regression analysis result:
# one fitted model together with the points needed to draw its curve.
class RegressionResult < T::ImmutableStruct
  extend T::Sig

  # Human-readable equation of the fitted model (e.g. "y = ...").
  const :equation, String
  # Goodness-of-fit score reported for the fitted model.
  const :r_squared, Float
  # X positions at which the fitted curve was evaluated.
  const :x_values, T::Array[Float]
  # Fitted y values, one per entry of x_values.
  const :y_values, T::Array[Float]
end

# Immutable struct representing the complete analysis results:
# the linear and the quadratic fit of the same data set.
class AnalysisResults < T::ImmutableStruct
  extend T::Sig

  # Result of the degree-1 (straight line) fit.
  const :linear, RegressionResult
  # Result of the degree-2 (parabola) fit.
  const :quadratic, RegressionResult
end

# Handles regression analysis using Rumale with normalized data.
#
# Fits a linear and a quadratic polynomial to the normalizer's [0, 1]-scaled
# data and reports each fit (equation, R², curve points) in original units.
class RegressionAnalyzer
  extend T::Sig

  # @param normalizer [DataNormalizer] source of the normalized training data
  #   and of the conversions back to the original scale
  sig { params(normalizer: DataNormalizer).void }
  def initialize(normalizer)
    @normalizer = normalizer
  end

  # Runs both regressions over the same data.
  #
  # @return [AnalysisResults] linear and quadratic fits
  sig { returns(AnalysisResults) }
  def analyze
    # Use normalized data for Rumale calculations to prevent numerical instability
    x_matrix = @normalizer.normalized_x_matrix
    y_vector = @normalizer.normalized_y_vector
    regression_x = @normalizer.regression_x_range

    AnalysisResults.new(
      linear: analyze_linear(x_matrix, y_vector, regression_x),
      quadratic: analyze_quadratic(x_matrix, y_vector, regression_x),
    )
  end

  private

  # Fits y = slope·x + intercept in normalized space.
  #
  # @param x_matrix [Array<Array<Float>>] normalized samples, one feature each
  # @param y_vector [Array<Float>] normalized targets
  # @param regression_x [Array<Float>] original-scale x positions for the curve
  # @return [RegressionResult]
  sig do
    params(
      x_matrix: T::Array[T::Array[Float]],
      y_vector: T::Array[Float],
      regression_x: T::Array[Float],
    ).returns(RegressionResult)
  end
  def analyze_linear(x_matrix, y_vector, regression_x)
    poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 1)
    regressor = Rumale::LinearModel::LinearRegression.new
    pipeline =
      Rumale::Pipeline::Pipeline.new(
        steps: {
          transformer: poly_features,
          estimator: regressor,
        },
      )

    pipeline.fit(x_matrix, y_vector)

    # Extract normalized coefficients. The polynomial features are laid out
    # as [1, x], so index 1 is the slope.
    estimator = pipeline.steps[:estimator]
    norm_intercept = fitted_intercept(estimator)
    norm_slope = estimator.weight_vec[1]
    r_squared = pipeline.score(x_matrix, y_vector)

    # Generate regression line data in original scale
    linear_y =
      @normalizer.denormalize_linear(regression_x, norm_slope, norm_intercept)

    RegressionResult.new(
      equation: @normalizer.linear_equation(norm_slope, norm_intercept),
      r_squared: r_squared,
      x_values: regression_x,
      y_values: linear_y,
    )
  end

  # Fits y = a·x² + b·x + c in normalized space.
  #
  # @param x_matrix [Array<Array<Float>>] normalized samples, one feature each
  # @param y_vector [Array<Float>] normalized targets
  # @param regression_x [Array<Float>] original-scale x positions for the curve
  # @return [RegressionResult]
  sig do
    params(
      x_matrix: T::Array[T::Array[Float]],
      y_vector: T::Array[Float],
      regression_x: T::Array[Float],
    ).returns(RegressionResult)
  end
  def analyze_quadratic(x_matrix, y_vector, regression_x)
    # Use pipeline approach as recommended in documentation
    poly_features = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
    regressor = Rumale::LinearModel::LinearRegression.new(fit_bias: true)

    pipeline =
      Rumale::Pipeline::Pipeline.new(
        steps: {
          transformer: poly_features,
          estimator: regressor,
        },
      )

    # Fit the pipeline
    pipeline.fit(x_matrix, y_vector)
    r_squared = pipeline.score(x_matrix, y_vector)

    # Polynomial features are laid out as [1, x, x²].
    estimator = pipeline.steps[:estimator]
    norm_c = fitted_intercept(estimator) # constant term
    norm_b = estimator.weight_vec[1] # x coefficient
    norm_a = estimator.weight_vec[2] # x² coefficient

    # Generate regression line data in original scale
    quadratic_y =
      @normalizer.denormalize_quadratic(regression_x, norm_a, norm_b, norm_c)

    RegressionResult.new(
      equation: @normalizer.quadratic_equation(norm_a, norm_b, norm_c),
      r_squared: r_squared,
      x_values: regression_x,
      y_values: quadratic_y,
    )
  end

  # Total constant term of a fitted estimator.
  #
  # The polynomial transformer already supplies a constant feature (weight
  # index 0). Per the Rumale API docs, a linear model fitted with a bias also
  # keeps a separate bias_term that it adds when predicting. The effective
  # intercept is the sum of both; reading weight_vec[0] alone would make the
  # plotted curve and printed equation disagree with the model that produced
  # the reported R².
  #
  # @param estimator [Object] fitted Rumale linear model exposing weight_vec
  #   and bias_term
  # @return [Float] intercept in normalized space
  sig { params(estimator: T.untyped).returns(Float) }
  def fitted_intercept(estimator)
    # to_f also covers an estimator that reports no bias (nil -> 0.0).
    estimator.weight_vec[0] + estimator.bias_term.to_f
  end
end

# Terminal chart renderer: draws the raw data and the fitted curves with
# UnicodePlot and prints them to stdout. Plot failures are reported as a
# warning line instead of aborting the task.
class StatsPlotter
  extend T::Sig

  # Prints a scatter plot of the given points under the heading +title+.
  sig do
    params(
      title: String,
      x_values: T::Array[Float],
      y_values: T::Array[Float],
    ).void
  end
  def plot_scatter(title, x_values, y_values)
    plot_with_error_handling(title) do
      # Built inside the block so a failure here is reported like any other
      # plotting error.
      y_label = date_axis_label(y_values)
      UnicodePlot.scatterplot(
        x_values,
        y_values,
        title: title,
        width: 80,
        height: 20,
        xlabel: "fav_fa_id",
        ylabel: y_label,
      )
    end
  end

  # Prints a line plot of one fitted curve. The heading shows the title plus
  # the fit's R² rounded to three decimals.
  sig { params(title: String, result: RegressionResult).void }
  def plot_regression(title, result)
    heading =
      "#{title} - #{title.split.first} fit (R² = #{result.r_squared.round(3)})"

    plot_with_error_handling(heading) do
      y_label = date_axis_label(result.y_values)
      UnicodePlot.lineplot(
        result.x_values,
        result.y_values,
        title: title,
        width: 80,
        height: 20,
        xlabel: "fav_fa_id",
        ylabel: y_label,
      )
    end
  end

  # Prints one chart holding the scatter data with both fitted curves drawn
  # over it.
  sig do
    params(
      x_values: T::Array[Float],
      y_values: T::Array[Float],
      results: AnalysisResults,
    ).void
  end
  def plot_combined(x_values, y_values, results)
    plot_with_error_handling("📈 Combined Visualization:") do
      y_label = date_axis_label(y_values)
      canvas =
        UnicodePlot.scatterplot(
          x_values,
          y_values,
          title: "FaFavIdAndDate Analysis: Original Data vs Regression Models",
          name: "Original Data",
          width: 100,
          height: 25,
          xlabel: "fav_fa_id",
          ylabel: y_label,
        )

      # Overlay each fit, linear first, labelled with its rounded R².
      { "Linear" => results.linear, "Quadratic" => results.quadratic }.each do |label, fit|
        UnicodePlot.lineplot!(
          canvas,
          fit.x_values,
          fit.y_values,
          name: "#{label} (R²=#{fit.r_squared.round(3)})",
        )
      end

      canvas
    end
  end

  private

  # Builds the y-axis caption from the smallest and largest value, each
  # interpreted as epoch seconds and shown as YYYY-MM-DD.
  sig { params(y_values: T::Array[Float]).returns(String) }
  def date_axis_label(y_values)
    first_day, last_day =
      y_values.minmax.map { |seconds| Time.at(seconds).strftime("%Y-%m-%d") }
    "Date (#{first_day} to #{last_day})"
  end

  # Prints +title+, then renders and prints whatever plot the block builds.
  # Errors raised while building or rendering are reported on stdout and
  # swallowed.
  sig { params(title: String, block: T.proc.returns(T.untyped)).void }
  def plot_with_error_handling(title, &block)
    puts "\n#{title}"
    begin
      rendered = block.call.render
      puts rendered
    rescue LoadError
      puts "⚠️  UnicodePlot gem not available. Install with: gem install unicode_plot"
    rescue StandardError => e
      puts "⚠️  Error generating plot: #{e.message}"
    end
  end
end