From ad467188f4d1ddf8e8033200e79768b175b342ed Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 26 Jul 2021 13:30:42 -0700 Subject: [PATCH] First commit --- .github/workflows/build.yml | 11 ++ .gitignore | 2 + .gitmodules | 3 + CHANGELOG.md | 3 + Cargo.toml | 19 +++ LICENSE.txt | 33 +++++ README.md | 150 +++++++++++++++++++++ build.rs | 10 ++ src/bindings.rs | 63 +++++++++ src/lib.rs | 10 ++ src/matrix.rs | 53 ++++++++ src/model.rs | 262 ++++++++++++++++++++++++++++++++++++ vendor/libmf | 1 + 13 files changed, 620 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 CHANGELOG.md create mode 100644 Cargo.toml create mode 100644 LICENSE.txt create mode 100644 README.md create mode 100644 build.rs create mode 100644 src/bindings.rs create mode 100644 src/lib.rs create mode 100644 src/matrix.rs create mode 100644 src/model.rs create mode 160000 vendor/libmf diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..66c9eed --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,11 @@ +name: build +on: [push, pull_request] +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + - run: cargo test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96ef6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4ddec76 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vendor/libmf"] + path = vendor/libmf + url = https://github.com/ankane/libmf-1.git diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..fdbb576 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,3 @@ +## 0.1.0 (unreleased) + +- First release diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e776e7e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "libmf" +version = "0.1.0" +description = "Large-scale sparse matrix factorization for Rust" +repository = "https://github.com/ankane/libmf-rust" +license = "BSD-3-Clause" +authors = ["Andrew Kane "] +edition = "2018" +readme = "README.md" +exclude = ["vendor/libmf/demo/*", "vendor/libmf/windows/*"] +links = "mf" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2" + +[build-dependencies] +cc = "1.0" diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..cfdf506 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,33 @@ +BSD 3-Clause License + +Copyright (c) 2014-2015 The LIBMF Project. +Copyright (c) 2021, Andrew Kane. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..3e31014 --- /dev/null +++ b/README.md @@ -0,0 +1,150 @@ +# LIBMF Rust + +[LIBMF](https://github.com/cjlin1/libmf) - large-scale sparse matrix factorization - for Rust + +[![Build Status](https://github.com/ankane/libmf-rust/workflows/build/badge.svg?branch=master)](https://github.com/ankane/libmf-rust/actions) + +## Installation + +Add this line to your application’s `Cargo.toml` under `[dependencies]`: + +```toml +libmf = { version = "0.1" } +``` + +## Getting Started + +Prep your data in the format `row_index, column_index, value` + +```rust +let mut data = libmf::Matrix::new(); +data.push(0, 0, 5.0); +data.push(0, 2, 3.5); +data.push(1, 1, 4.0); +``` + +Create a model + +```rust +let mut model = libmf::Model::new(); +model.fit(&data); +``` + +Make predictions + +```rust +model.predict(row_index, column_index); +``` + +Get the latent factors (these approximate the training matrix) + +```rust +model.p_factors(); +model.q_factors(); +``` + +Get the bias (average of all elements in the training matrix) + +```rust +model.bias(); +``` + +Save the model to a file + +```rust +model.save("model.txt"); +``` + +Load the model from a file + +```rust +let model = libmf::Model::load("model.txt"); +``` + +Pass a validation set + +```rust +model.fit_eval(&train_set, &eval_set); +``` + +## Cross-Validation + +Perform cross-validation + +```rust +model.cv(&data, 5); +``` + +## Parameters + +Set parameters - default values below + +```rust +model.loss = 0; // loss function +model.factors = 8; // number of latent factors +model.threads = 12; // number of threads used +model.bins = 25; // number of bins +model.iterations = 20; // number of iterations +model.lambda_p1 = 0; // coefficient of L1-norm regularization on P +model.lambda_p2 = 0.1; // coefficient of L2-norm regularization on P +model.lambda_q1 = 0; // coefficient of L1-norm regularization on Q +model.lambda_q2 = 0.1; // coefficient of L2-norm regularization on Q +model.learning_rate = 0.1; // learning rate +model.alpha = 0.1; // importance of negative entries +model.c = 0.0001; // desired value of negative entries +model.nmf = false; // perform non-negative MF (NMF) +model.quiet = false; // no outputs to stdout +``` + +### Loss Functions + +For real-valued matrix factorization + +- 0 - squared error (L2-norm) +- 1 - absolute error (L1-norm) +- 2 - generalized KL-divergence + +For binary matrix factorization + +- 5 - logarithmic error +- 6 - squared hinge loss +- 7 - hinge loss + +For one-class matrix factorization + +- 10 - row-oriented pair-wise logarithmic loss +- 11 - column-oriented pair-wise logarithmic loss +- 12 - squared error (L2-norm) + +## Reference + +Specify the initial capacity for a matrix + +```rust +let mut data = libmf::Matrix::with_capacity(3); +``` + +## Resources + +- [LIBMF: A Library for Parallel Matrix Factorization in Shared-memory Systems](https://www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_open_source.pdf) + +## History + +View the [changelog](https://github.com/ankane/libmf-rust/blob/master/CHANGELOG.md) + +## Contributing + +Everyone is encouraged to help improve this project. Here are a few ways you can help: + +- [Report bugs](https://github.com/ankane/libmf-rust/issues) +- Fix bugs and [submit pull requests](https://github.com/ankane/libmf-rust/pulls) +- Write, clarify, or fix documentation +- Suggest or add new features + +To get started with development: + +```sh +git clone --recursive https://github.com/ankane/libmf-rust.git +cd libmf-rust +cargo test +``` diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..504cbf4 --- /dev/null +++ b/build.rs @@ -0,0 +1,10 @@ +extern crate cc; + +fn main() { + cc::Build::new() + .cpp(true) + .flag("-std=c++11") + .flag("-Wno-unused-parameter") + .file("vendor/libmf/mf.cpp") + .compile("mf"); +} diff --git a/src/bindings.rs b/src/bindings.rs new file mode 100644 index 0000000..6cdf40e --- /dev/null +++ b/src/bindings.rs @@ -0,0 +1,63 @@ +extern crate libc; + +use std::os::raw::{c_char, c_double, c_float, c_int, c_longlong}; + +#[repr(C)] +pub struct MfNode +{ + pub u: c_int, + pub v: c_int, + pub r: c_float +} + +#[repr(C)] +pub struct MfProblem +{ + pub m: c_int, + pub n: c_int, + pub nnz: c_longlong, + pub r: *const MfNode +} + +#[repr(C)] +pub struct MfParameter { + pub fun: c_int, + pub k: c_int, + pub nr_threads: c_int, + pub nr_bins: c_int, + pub nr_iters: c_int, + pub lambda_p1: c_float, + pub lambda_p2: c_float, + pub lambda_q1: c_float, + pub lambda_q2: c_float, + pub eta: c_float, + pub alpha: c_float, + pub c: c_float, + pub do_nmf: bool, + pub quiet: bool, + pub copy_data: bool +} + +#[repr(C)] +pub struct MfModel +{ + pub fun: c_int, + pub m: c_int, + pub n: c_int, + pub k: c_int, + pub b: c_float, + pub p: *const c_float, + pub q: *const c_float +} + +extern "C" { + pub fn mf_get_default_param() -> MfParameter; + pub fn mf_save_model(model: *const MfModel, path: *const c_char) -> c_int; + pub fn mf_load_model(path: *const c_char) -> *mut MfModel; + pub fn mf_train(prob: *const MfProblem, param: MfParameter) -> *mut MfModel; + pub fn mf_train_with_validation(tr: *const MfProblem, va: *const MfProblem, param: MfParameter) -> *mut MfModel; + pub fn mf_cross_validation(prob: *const MfProblem, nr_folds: c_int, param: MfParameter) -> c_double; + pub fn mf_predict(model: *const MfModel, u: c_int, v: c_int) -> c_float; + pub fn calc_rmse(prob: *const MfProblem, model: *const MfModel) -> c_double; + pub fn calc_mae(prob: *const MfProblem, model: *const MfModel) -> c_double; +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..9a53b08 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,10 @@ +//! Large-scale sparse matrix factorization for Rust +//! +//! [View the docs](https://github.com/ankane/libmf-rust) + +mod bindings; +mod matrix; +mod model; + +pub use matrix::Matrix; +pub use model::Model; diff --git a/src/matrix.rs b/src/matrix.rs new file mode 100644 index 0000000..1d41c50 --- /dev/null +++ b/src/matrix.rs @@ -0,0 +1,53 @@ +use crate::bindings::{MfNode, MfProblem}; + +pub struct Matrix { + data: Vec +} + +impl Matrix { + pub fn new() -> Self { + Self { + data: Vec::new() + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + data: Vec::with_capacity(capacity) + } + } + + pub fn push(&mut self, row_index: i32, column_index: i32, value: f32) { + self.data.push(MfNode { u: row_index, v: column_index, r: value }); + } + + pub(crate) fn to_problem(&self) -> MfProblem { + let data = &self.data; + let m = data.iter().map(|x| x.u).max().unwrap_or(-1) + 1; + let n = data.iter().map(|x| x.v).max().unwrap_or(-1) + 1; + + MfProblem { + m: m, + n: n, + nnz: data.len() as i64, + r: data.as_ptr() + } + } +} + +#[cfg(test)] +mod tests { + use crate::Matrix; + + #[test] + fn test_new() { + let mut data = Matrix::new(); + data.push(0, 0, 1.0); + } + + #[test] + fn test_with_capacity() { + let mut data = Matrix::with_capacity(1); + data.push(0, 0, 1.0); + } +} diff --git a/src/model.rs b/src/model.rs new file mode 100644 index 0000000..c7e7a01 --- /dev/null +++ b/src/model.rs @@ -0,0 +1,262 @@ +use crate::bindings::*; +use crate::Matrix; +use std::ffi::CString; + +pub struct Model { + model: *const MfModel, + pub loss: i32, + pub factors: i32, + pub threads: i32, + pub bins: i32, + pub iterations: i32, + pub lambda_p1: f32, + pub lambda_p2: f32, + pub lambda_q1: f32, + pub lambda_q2: f32, + pub learning_rate: f32, + pub alpha: f32, + pub c: f32, + pub nmf: bool, + pub quiet: bool +} + +impl Model { + pub fn new() -> Self { + Self::with_model(std::ptr::null()) + } + + pub fn load(path: &str) -> Self { + let cpath = CString::new(path).expect("CString::new failed"); + Self::with_model(unsafe { mf_load_model(cpath.as_ptr()) }) + } + + pub fn fit(&mut self, data: &Matrix) { + let prob = data.to_problem(); + self.model = unsafe { mf_train(&prob, self.param()) }; + } + + pub fn fit_eval(&mut self, train_set: &Matrix, eval_set: &Matrix) { + let tr = train_set.to_problem(); + let va = eval_set.to_problem(); + self.model = unsafe { mf_train_with_validation(&tr, &va, self.param()) }; + } + + pub fn cv(&mut self, data: &Matrix, folds: i32) { + let prob = data.to_problem(); + unsafe { mf_cross_validation(&prob, folds, self.param()); } + } + + pub fn predict(&self, row_index: i32, column_index: i32) -> f32 { + assert!(self.is_fit()); + unsafe { mf_predict(self.model, row_index, column_index) } + } + + pub fn save(&self, path: &str) { + assert!(self.is_fit()); + let cpath = CString::new(path).expect("CString::new failed"); + unsafe { mf_save_model(self.model, cpath.as_ptr()); } + } + + pub fn rows(&self) -> i32 { + if self.is_fit() { + unsafe { (*self.model).m } + } else { + 0 + } + } + + pub fn columns(&self) -> i32 { + if self.is_fit() { + unsafe { (*self.model).n } + } else { + 0 + } + } + + pub fn factors(&self) -> i32 { + if self.is_fit() { + unsafe { (*self.model).k } + } else { + self.factors + } + } + + pub fn bias(&self) -> f32 { + if self.is_fit() { + unsafe { (*self.model).b } + } else { + 0.0 + } + } + + pub fn p_factors(&self) -> &[f32] { + if self.is_fit() { + unsafe { std::slice::from_raw_parts((*self.model).p, (self.rows() * self.factors()) as usize) } + } else { + &[] + } + } + + pub fn q_factors(&self) -> &[f32] { + if self.is_fit() { + unsafe { std::slice::from_raw_parts((*self.model).q, (self.columns() * self.factors()) as usize) } + } else { + &[] + } + } + + pub fn rmse(&self, data: &Matrix) -> f64 { + assert!(self.is_fit()); + let prob = data.to_problem(); + unsafe { calc_rmse(&prob, self.model) } + } + + pub fn mae(&self, data: &Matrix) -> f64 { + assert!(self.is_fit()); + let prob = data.to_problem(); + unsafe { calc_mae(&prob, self.model) } + } + + fn with_model(model: *const MfModel) -> Self { + let param = unsafe { mf_get_default_param() }; + Self { + model: model, + loss: param.fun, + factors: param.k, + threads: param.nr_threads, + bins: 25, // prevent warning + iterations: param.nr_iters, + lambda_p1: param.lambda_p1, + lambda_p2: param.lambda_p2, + lambda_q1: param.lambda_q1, + lambda_q2: param.lambda_q2, + learning_rate: param.eta, + alpha: param.alpha, + c: param.c, + nmf: param.do_nmf, + quiet: param.quiet + } + } + + fn param(&self) -> MfParameter { + let mut param = unsafe { mf_get_default_param() }; + param.fun = self.loss; + param.k = self.factors; + param.nr_threads = self.threads; + param.nr_bins = self.bins; + param.nr_iters = self.iterations; + param.lambda_p1 = self.lambda_p1; + param.lambda_p2 = self.lambda_p2; + param.lambda_q1 = self.lambda_q1; + param.lambda_q2 = self.lambda_q2; + param.eta = self.learning_rate; + param.alpha = self.alpha; + param.c = self.c; + param.do_nmf = self.nmf; + param.quiet = self.quiet; + param + } + + fn is_fit(&self) -> bool { + !self.model.is_null() + } +} + +#[cfg(test)] +mod tests { + use crate::{Matrix, Model}; + + fn generate_data() -> Matrix { + let mut data = Matrix::new(); + data.push(0, 0, 1.0); + data.push(1, 0, 2.0); + data.push(1, 1, 1.0); + data + } + + #[test] + fn test_fit() { + let data = generate_data(); + let mut model = Model::new(); + model.quiet = true; + model.fit(&data); + model.predict(0, 1); + + model.p_factors(); + model.q_factors(); + model.bias(); + } + + #[test] + fn test_fit_eval() { + let data = generate_data(); + let mut model = Model::new(); + model.quiet = true; + model.fit_eval(&data, &data); + } + + #[test] + fn test_cv() { + let data = generate_data(); + let mut model = Model::new(); + model.quiet = true; + model.cv(&data, 5); + } + + #[test] + fn test_save_load() { + let data = generate_data(); + let mut model = Model::new(); + model.quiet = true; + model.fit(&data); + + model.save("/tmp/model.txt"); + let model = Model::load("/tmp/model.txt"); + + model.p_factors(); + model.q_factors(); + model.bias(); + } + + #[test] + fn test_metrics() { + let data = generate_data(); + let mut model = Model::new(); + model.quiet = true; + model.fit(&data); + + assert!(model.rmse(&data) < 0.15); + assert!(model.mae(&data) < 0.15); + } + + #[test] + fn test_not_fit() { + let model = Model::new(); + assert_eq!(0.0, model.bias()); + assert!(model.p_factors().is_empty()); + assert!(model.q_factors().is_empty()); + } + + #[test] + #[should_panic(expected = "assertion failed: self.is_fit()")] + fn test_predict_not_fit() { + let model = Model::new(); + model.predict(0, 1); + } + + #[test] + #[should_panic(expected = "assertion failed: self.is_fit()")] + fn test_save_not_fit() { + let model = Model::new(); + model.save("/tmp/model.txt"); + } + + #[test] + fn test_predict_out_of_range() { + let data = generate_data(); + let mut model = Model::new(); + model.quiet = true; + model.fit(&data); + assert_eq!(model.bias(), model.predict(1000, 1000)); + } +} diff --git a/vendor/libmf b/vendor/libmf new file mode 160000 index 0000000..6fa534d --- /dev/null +++ b/vendor/libmf @@ -0,0 +1 @@ +Subproject commit 6fa534dae7d6b5b052e363fd1833183d2a552c30