redux-scraper/lib/tasks/stats.rake

# typed: strict
# frozen_string_literal: true
# Sorbet: declare the static type of the top-level `self` in this rake file as
# both Rake::DSL and Object, so calls to the Rake DSL made below are resolved
# against Rake::DSL by the type checker.
T.bind(self, T.all(Rake::DSL, Object))

require "unicode_plot"
require "rumale"
require "rumale/linear_model/linear_regression"
require "rumale/preprocessing/polynomial_features"
require "rumale/pipeline/pipeline"

namespace :stats do
  # stats:fa_fav_graph
  #
  # Samples FaUserPostFav-derived records, fits the regression models produced
  # by Stats::RegressionAnalyzer, renders terminal plots through
  # Stats::Plotter, and then replaces the TrainedRegressionModel rows stored
  # under the name "fa_fav_id_and_date" with the freshly fitted models.
  #
  # Task arguments:
  #   max_points - optional; converted with String#to_i and handed to
  #                Stats::Helpers.sample_records. When omitted, nil is passed
  #                (presumably meaning "no limit" - confirm against the helper).
  #
  # The database replacement runs inside a single transaction, so a failing
  # create! rolls back the preceding destroy_all and the previously stored
  # models are kept.
  desc "Generate graphs of FaUserPostFav models with linear, quadratic, logarithmic, and square root regression lines. Usage: rake stats:fa_fav_graph[max_points]"
  task :fa_fav_graph, [:max_points] => :environment do |_task, args|
    puts "🔍 Analyzing FaUserPostFav data..."

    # nil when the argument was not supplied; non-numeric text becomes 0 via to_i.
    max_points = args[:max_points]&.to_i

    # Query and sample data
    records_array = Stats::Helpers.sample_records(max_points)

    # Normalizer over the full sample; used for the printed axis ranges and
    # for the combined plot further down.
    base_normalizer = Stats::LinearNormalizer.new(records_array)
    puts "📈 X-axis range (fav_id): #{base_normalizer.x_range}"
    puts "📈 Y-axis range (explicit_time): #{base_normalizer.y_range}"

    # Train/eval split, used here for the scatter plot and the persisted counts.
    split = Stats::Helpers.split_train_test(records_array)
    train_normalizer = Stats::LinearNormalizer.new(split.training_records)
    eval_normalizer = Stats::LinearNormalizer.new(split.evaluation_records)

    # NOTE(review): the analyzer receives the full record set, not `split`;
    # presumably it performs an equivalent split internally - confirm.
    regressions = Stats::RegressionAnalyzer.new(records_array).analyze

    # Print the fitted equation and score summary for each model type.
    regressions.each do |model_type, result|
      puts "\n📊 #{model_type.serialize.humanize} Regression Results:"
      puts "   #{result.equation_string}"
      puts "   #{result.score_summary}"
    end

    # Generate visualizations
    puts "\n🎨 Generating visualizations with UnicodePlot..."
    plotter = Stats::Plotter.new

    plotter.plot_train_eval_scatter(
      "Original Data (Train/Eval)",
      train_normalizer.x_values,
      train_normalizer.y_values,
      eval_normalizer.x_values,
      eval_normalizer.y_values,
    )

    # One plot per regression, followed by a combined plot over the full sample.
    regressions.each do |model_type, result|
      plotter.plot_regression(
        "#{model_type.serialize.humanize} Regression",
        result,
      )
    end
    plotter.plot_combined(
      base_normalizer.x_values,
      base_normalizer.y_values,
      regressions,
    )

    puts "\n✅ Graph generation completed!"

    model_name = "fa_fav_id_and_date"

    # Replace the stored models atomically: the delete of the old rows and the
    # inserts of the new ones either all commit or all roll back. Without the
    # transaction a failing create! would leave this model name with no rows
    # (or only a partial set).
    TrainedRegressionModel.transaction do
      TrainedRegressionModel.where(name: model_name).destroy_all

      regressions.each do |model_type, result|
        equation = result.equation
        TrainedRegressionModel.create!(
          name: model_name,
          model_type: model_type.serialize,
          description:
            "Trained on FaUserPostFav with #{model_type.serialize} regression.",
          total_records_count: records_array.size,
          training_records_count: split.training_records.size,
          evaluation_records_count: split.evaluation_records.size,
          train_test_split_ratio: 0.8, # hardcoded, see split_train_test default
          random_seed: 42, # hardcoded, see split_train_test
          max_points_limit: max_points,
          x_min: equation.x.min,
          x_max: equation.x.max,
          y_min: equation.y.min,
          y_max: equation.y.max,
          coefficients: equation.coefficients,
          training_r_squared: result.training_r_squared,
          evaluation_r_squared: result.evaluation_r_squared,
          equation_string: result.equation_string,
          metadata: {
            x_range: equation.x.range,
            y_range: equation.y.range,
          },
        )
      end
    end

    # Report only after the transaction block has finished, so a "Saved"
    # line is never printed for a row that ends up rolled back.
    regressions.each_key do |model_type|
      puts "💾 Saved #{model_type.serialize.humanize} regression model to DB."
    end
  end
end