From ac347f2c0e11f11fbed9f61a2f4f1f4535c3a562 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sat, 28 Sep 2024 21:21:24 +0200
Subject: [PATCH 01/17] Added rayon to Cargo.toml

- Will be used for parallelization
---
 Cargo.toml | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 89ef53a9..18f5455b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,14 @@ categories = ["science"]
 readme = "README.md"
 documentation = "https://axect.github.io/Peroxide_Doc"
 keywords = ["Numeric", "Science", "Dataframe", "Plot", "LinearAlgebra"]
-exclude = ["example_data/", "src/bin/", "benches/", "example/", "test_data/", "peroxide-ad2"]
+exclude = [
+    "example_data/",
+    "src/bin/",
+    "benches/",
+    "example/",
+    "test_data/",
+    "peroxide-ad2",
+]
 
 [badges]
 travis-ci = { repository = "axect/peroxide" }
@@ -32,17 +39,27 @@ anyhow = "1.0"
 paste = "1.0"
 #num-complex = "0.3"
 netcdf = { version = "0.7", optional = true, default-features = false }
-pyo3 = { version = "0.22", optional = true, features = ["auto-initialize", "gil-refs"] }
+pyo3 = { version = "0.22", optional = true, features = [
+    "auto-initialize",
+    "gil-refs",
+] }
 blas = { version = "0.22", optional = true }
 lapack = { version = "0.19", optional = true }
 serde = { version = "1.0", features = ["derive"], optional = true }
 json = { version = "0.12", optional = true }
-arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"], optional = true }
+arrow2 = { version = "0.18", features = [
+    "io_parquet",
+    "io_parquet_compression",
+], optional = true }
 num-complex = { version = "0.4", optional = true }
-lambert_w = { version = "0.3.0", default-features = false, features = ["24bits", "50bits"] }
+lambert_w = { version = "0.3.0", default-features = false, features = [
+    "24bits",
+    "50bits",
+] }
+rayon = "1.10"
 
 [package.metadata.docs.rs]
-rustdoc-args = [ "--html-in-header", "katex-header.html", "--cfg", "docsrs"]
+rustdoc-args = ["--html-in-header", "katex-header.html", "--cfg", "docsrs"]
 
 [features]
 default = []

From 36fb8bfe3f5b2c9295a1186d4c517ce7cdfbbdef Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sun, 29 Sep 2024 00:21:45 +0200
Subject: [PATCH 02/17] Use rayon for FPVector

- Parallelized impl FPVector for Vec<f64>
using Rayon.
- Added Sync+Send traits are needed
---
 src/lib.rs                |   2 +
 src/numerical/integral.rs |  48 +++++++-------
 src/prelude/simpler.rs    |  24 +++----
 src/statistics/dist.rs    | 131 ++++++++++++++++++++++----------------
 src/statistics/rand.rs    |  23 +++++--
 src/structure/vector.rs   |  67 ++++++++++++++-----
 src/traits/fp.rs          |  10 +--
 src/traits/sugar.rs       |  22 ++++---
 8 files changed, 199 insertions(+), 128 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index e1929555..916878b5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -208,3 +208,5 @@ pub mod util;
 
 #[cfg(feature = "complex")]
 pub mod complex;
+
+extern crate rayon;
diff --git a/src/numerical/integral.rs b/src/numerical/integral.rs
index ac2165dd..a04d745f 100644
--- a/src/numerical/integral.rs
+++ b/src/numerical/integral.rs
@@ -161,19 +161,19 @@ impl Integral {
 ///     * `G30K61R`
 pub fn integrate<F>(f: F, (a, b): (f64, f64), method: Integral) -> f64
 where
-    F: Fn(f64) -> f64 + Copy,
+    F: Fn(f64) -> f64 + Copy + Send + Sync,
 {
     match method {
         Integral::GaussLegendre(n) => gauss_legendre_quadrature(f, n, (a, b)),
         Integral::NewtonCotes(n) => newton_cotes_quadrature(f, n, (a, b)),
-        method => gauss_kronrod_quadrature(f, (a,b), method),
+        method => gauss_kronrod_quadrature(f, (a, b), method),
     }
 }
 
 /// Newton Cotes Quadrature
 pub fn newton_cotes_quadrature<F>(f: F, n: usize, (a, b): (f64, f64)) -> f64
 where
-    F: Fn(f64) -> f64,
+    F: Fn(f64) -> f64 + Send + Sync,
 {
     let h = (b - a) / (n as f64);
     let node_x = seq(a, b, h);
@@ -217,9 +217,9 @@ where
 #[allow(non_snake_case)]
 pub fn gauss_kronrod_quadrature<F, T, S>(f: F, (a, b): (T, S), method: Integral) -> f64
 where
-     F: Fn(f64) -> f64 + Copy,
-     T: Into<f64>,
-     S: Into<f64>,
+    F: Fn(f64) -> f64 + Copy,
+    T: Into<f64>,
+    S: Into<f64>,
 {
     let (g, k) = method.get_gauss_kronrod_order();
     let tol = method.get_tol();
@@ -234,13 +234,9 @@ where
                 let G = gauss_legendre_quadrature(f, g as usize, (a, b));
                 let K = kronrod_quadrature(f, k as usize, (a, b));
                 let c = (a + b) / 2f64;
-                let tol_curr = if method.is_relative() {
-                    tol * G
-                } else {
-                    tol
-                };
+                let tol_curr = if method.is_relative() { tol * G } else { tol };
                 if (G - K).abs() < tol_curr || a == b || max_iter == 0 {
-                    if ! G.is_finite() {
+                    if !G.is_finite() {
                         return G;
                     }
                     I += G;
@@ -255,11 +251,11 @@ where
     I
 }
 
-pub fn kronrod_quadrature<F>(f: F, n: usize, (a, b): (f64, f64)) -> f64 
+pub fn kronrod_quadrature<F>(f: F, n: usize, (a, b): (f64, f64)) -> f64
 where
     F: Fn(f64) -> f64,
 {
-    (b - a) / 2f64 * unit_kronrod_quadrature(|x| f(x * (b-a) / 2f64 + (a + b) / 2f64), n)   
+    (b - a) / 2f64 * unit_kronrod_quadrature(|x| f(x * (b - a) / 2f64 + (a + b) / 2f64), n)
 }
 
 // =============================================================================
@@ -382,9 +378,9 @@ fn unit_kronrod_quadrature<F>(f: F, n: usize) -> f64
 where
     F: Fn(f64) -> f64,
 {
-    let (a,x) = kronrod_table(n);
+    let (a, x) = kronrod_table(n);
     let mut s = 0f64;
-    for i in 0 .. a.len() {
+    for i in 0..a.len() {
         s += a[i] * f(x[i]);
     }
     s
@@ -414,28 +410,28 @@ fn kronrod_table(n: usize) -> (Vec<f64>, Vec<f64>) {
 
     match n % 2 {
         0 => {
-            for i in 0 .. ref_node.len() {
+            for i in 0..ref_node.len() {
                 result_node[i] = ref_node[i];
                 result_weight[i] = ref_weight[i];
             }
 
-            for i in ref_node.len() .. n {
-                result_node[i] = -ref_node[n-i-1];
-                result_weight[i] = ref_weight[n-i-1];
+            for i in ref_node.len()..n {
+                result_node[i] = -ref_node[n - i - 1];
+                result_weight[i] = ref_weight[n - i - 1];
             }
         }
         1 => {
-            for i in 0 .. ref_node.len() {
+            for i in 0..ref_node.len() {
                 result_node[i] = ref_node[i];
                 result_weight[i] = ref_weight[i];
             }
 
-            for i in ref_node.len() .. n {
-                result_node[i] = -ref_node[n-i];
-                result_weight[i] = ref_weight[n-i];
+            for i in ref_node.len()..n {
+                result_node[i] = -ref_node[n - i];
+                result_weight[i] = ref_weight[n - i];
             }
         }
-        _ => unreachable!()
+        _ => unreachable!(),
     }
     (result_weight, result_node)
 }
@@ -922,7 +918,7 @@ const LEGENDRE_WEIGHT_25: [f64; 13] = [
     0.054904695975835192,
     0.040939156701306313,
     0.026354986615032137,
-    0.011393798501026288,    
+    0.011393798501026288,
 ];
 const LEGENDRE_WEIGHT_26: [f64; 13] = [
     0.118321415279262277,
diff --git a/src/prelude/simpler.rs b/src/prelude/simpler.rs
index 819bc38f..5b786b6a 100644
--- a/src/prelude/simpler.rs
+++ b/src/prelude/simpler.rs
@@ -1,5 +1,3 @@
-#[cfg(feature="parquet")]
-use std::error::Error;
 use crate::numerical::{
     eigen,
     eigen::{Eigen, EigenMethod::Jacobi},
@@ -8,13 +6,15 @@ use crate::numerical::{
     spline,
     spline::{CubicHermiteSpline, SlopeMethod::Quadratic},
 };
+#[cfg(feature = "parquet")]
+use crate::structure::dataframe::{DataFrame, WithParquet};
 use crate::structure::matrix::{self, Matrix};
 use crate::structure::polynomial;
 use crate::traits::math::{Norm, Normed};
-#[cfg(feature="parquet")]
-use crate::structure::dataframe::{DataFrame, WithParquet};
-#[cfg(feature="parquet")]
+#[cfg(feature = "parquet")]
 use arrow2::io::parquet::write::CompressionOptions;
+#[cfg(feature = "parquet")]
+use std::error::Error;
 
 /// Simple Norm
 pub trait SimpleNorm: Normed {
@@ -23,7 +23,7 @@ pub trait SimpleNorm: Normed {
 }
 
 /// Simple integrate
-pub fn integrate<F: Fn(f64) -> f64 + Copy>(f: F, (a, b): (f64, f64)) -> f64 {
+pub fn integrate<F: Fn(f64) -> f64 + Copy + Send + Sync>(f: F, (a, b): (f64, f64)) -> f64 {
     integral::integrate(f, (a, b), G7K15R(1e-4, 20))
 }
 
@@ -35,7 +35,7 @@ pub trait SimplerLinearAlgebra {
     fn waz_diag(&self) -> Option<matrix::WAZD>;
     fn waz(&self) -> Option<matrix::WAZD>;
     fn qr(&self) -> matrix::QR;
-    #[cfg(feature="O3")]
+    #[cfg(feature = "O3")]
     fn cholesky(&self) -> Matrix;
     fn rref(&self) -> Matrix;
     fn det(&self) -> f64;
@@ -99,7 +99,7 @@ impl SimplerLinearAlgebra for Matrix {
         matrix::LinearAlgebra::qr(self)
     }
 
-    #[cfg(feature="O3")]
+    #[cfg(feature = "O3")]
     fn cholesky(&self) -> Matrix {
         matrix::LinearAlgebra::cholesky(self, matrix::UPLO::Lower)
     }
@@ -161,7 +161,7 @@ use crate::special::function::{
 /// Returns [`NAN`](f64::NAN) if the given input is smaller than -1/e (≈ -0.36787944117144233).
 ///
 /// Accurate to 50 bits.
-/// 
+///
 /// Wrapper of `lambert_w_0` function of `lambert_w` crate.
 ///
 /// # Reference
@@ -176,7 +176,7 @@ pub fn lambert_w0(z: f64) -> f64 {
 /// Returns [`NAN`](f64::NAN) if the given input is positive or smaller than -1/e (≈ -0.36787944117144233).
 ///
 /// Accurate to 50 bits.
-/// 
+///
 /// Wrapper of `lambert_w_m1` function of `lambert_w` crate.
 ///
 /// # Reference
@@ -187,13 +187,13 @@ pub fn lambert_wm1(z: f64) -> f64 {
 }
 
 /// Simple handle parquet
-#[cfg(feature="parquet")]
+#[cfg(feature = "parquet")]
 pub trait SimpleParquet: Sized {
     fn write_parquet(&self, path: &str) -> Result<(), Box<dyn Error>>;
     fn read_parquet(path: &str) -> Result<Self, Box<dyn Error>>;
 }
 
-#[cfg(feature="parquet")]
+#[cfg(feature = "parquet")]
 impl SimpleParquet for DataFrame {
     fn write_parquet(&self, path: &str) -> Result<(), Box<dyn Error>> {
         WithParquet::write_parquet(self, path, CompressionOptions::Uncompressed)
diff --git a/src/statistics/dist.rs b/src/statistics/dist.rs
index b409c197..d3f1f6cb 100644
--- a/src/statistics/dist.rs
+++ b/src/statistics/dist.rs
@@ -234,15 +234,15 @@ use self::rand::distributions::uniform::SampleUniform;
 use self::rand::prelude::*;
 pub use self::OPDist::*;
 pub use self::TPDist::*;
-use crate::traits::fp::FPVector;
 use crate::special::function::*;
+use crate::traits::fp::FPVector;
 //use statistics::rand::ziggurat;
+use self::WeightedUniformError::*;
 use crate::statistics::{ops::C, stat::Statistics};
 use crate::util::non_macro::{linspace, seq};
 use crate::util::useful::{auto_zip, find_interval};
+use anyhow::{bail, Result};
 use std::f64::consts::E;
-use self::WeightedUniformError::*;
-use anyhow::{Result, bail};
 
 /// One parameter distribution
 ///
@@ -271,7 +271,7 @@ pub enum TPDist<T: PartialOrd + SampleUniform + Copy + Into<f64>> {
 pub struct WeightedUniform<T: PartialOrd + SampleUniform + Copy + Into<f64>> {
     weights: Vec<T>,
     sum: T,
-    intervals: Vec<(T, T)>
+    intervals: Vec<(T, T)>,
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -286,7 +286,9 @@ impl std::fmt::Display for WeightedUniformError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             WeightedUniformError::AllZeroWeightError => write!(f, "all weights are zero"),
-            WeightedUniformError::LengthMismatchError => write!(f, "weights and intervals have different length"),
+            WeightedUniformError::LengthMismatchError => {
+                write!(f, "weights and intervals have different length")
+            }
             WeightedUniformError::NoNonZeroIntervalError => write!(f, "no non-zero interval found"),
             WeightedUniformError::EmptyWeightError => write!(f, "weights are empty"),
         }
@@ -295,11 +297,11 @@ impl std::fmt::Display for WeightedUniformError {
 
 impl WeightedUniform<f64> {
     /// Create a new weighted uniform distribution
-    /// 
+    ///
     /// # Examples
     /// ```
     /// use peroxide::fuga::*;
-    /// 
+    ///
     /// fn main() -> Result<(), Box<dyn Error>> {
     ///     let weights = vec![1f64, 3f64, 0f64, 2f64];
     ///     let intervals = vec![0f64, 1f64, 2f64, 4f64, 5f64];
@@ -336,32 +338,31 @@ impl WeightedUniform<f64> {
             }
         }
 
-        let sum = weights.iter()
+        let sum = weights
+            .iter()
             .zip(intervals.iter())
             .fold(0f64, |acc, (w, (a, b))| acc + w * (b - a));
 
-        Ok(
-            WeightedUniform {
-                weights,
-                sum,
-                intervals
-            }
-        )
+        Ok(WeightedUniform {
+            weights,
+            sum,
+            intervals,
+        })
     }
 
     /// Create WeightedUniform from max pooling
-    /// 
+    ///
     /// # Examples
     /// ```
     /// use peroxide::fuga::*;
-    /// 
+    ///
     /// fn main() -> Result<(), Box<dyn Error>> {
     ///     let w = WeightedUniform::from_max_pool_1d(f, (-2f64, 3f64), 10, 1e-3)?;
     ///     w.weights().print();
     ///
     ///     Ok(())
     /// }
-    /// 
+    ///
     /// fn f(x: f64) -> f64 {
     ///     if x.abs() < 1f64 {
     ///         1f64 - x.abs()
@@ -371,40 +372,39 @@ impl WeightedUniform<f64> {
     /// }
     /// ```
     pub fn from_max_pool_1d<F>(f: F, (a, b): (f64, f64), n: usize, eps: f64) -> Result<Self>
-    where F: Fn(f64) -> f64 + Copy {
+    where
+        F: Fn(f64) -> f64 + Copy + Send + Sync,
+    {
         // Find non-zero intervals
         let mut a = a;
         let mut b = b;
         let trial = seq(a, b, eps);
-        for i in 0 .. trial.len() {
+        for i in 0..trial.len() {
             let x = trial[i];
             if f(x) > 0f64 {
-                a = if i > 0 { trial[i-1] } else { x };
+                a = if i > 0 { trial[i - 1] } else { x };
                 break;
             }
         }
-        for i in (0 .. trial.len()).rev() {
+        for i in (0..trial.len()).rev() {
             let x = trial[i];
             if f(x) > 0f64 {
-                b = if i < trial.len() - 1 { trial[i+1] } else { x };
+                b = if i < trial.len() - 1 { trial[i + 1] } else { x };
                 break;
             }
         }
         if a >= b {
             bail!(NoNonZeroIntervalError);
         }
-        let domain = linspace(a, b, n+1);
+        let domain = linspace(a, b, n + 1);
 
         // Find intervals
         let intervals = auto_zip(&domain);
 
         // Find weights
-        let weights: Vec<f64> = intervals.iter()
-            .map(
-                |(a, b)| {
-                    seq(*a, *b+eps, eps).reduce(0f64, |acc, x| acc.max(f(x)))
-                }
-            )
+        let weights: Vec<f64> = intervals
+            .iter()
+            .map(|(a, b)| seq(*a, *b + eps, eps).reduce(0f64, |acc, x| acc.max(f(x))))
             .collect();
 
         Self::new(weights, domain)
@@ -419,11 +419,19 @@ impl WeightedUniform<f64> {
     }
 
     pub fn domain_linspace(&self, n: usize) -> Vec<f64> {
-        linspace(self.intervals[0].0, self.intervals[self.intervals.len()-1].1, n)
+        linspace(
+            self.intervals[0].0,
+            self.intervals[self.intervals.len() - 1].1,
+            n,
+        )
     }
 
     pub fn domain_seq(&self, step: f64) -> Vec<f64> {
-        seq(self.intervals[0].0, self.intervals[self.intervals.len()-1].1, step)
+        seq(
+            self.intervals[0].0,
+            self.intervals[self.intervals.len() - 1].1,
+            step,
+        )
     }
 
     pub fn sum(&self) -> f64 {
@@ -433,15 +441,19 @@ impl WeightedUniform<f64> {
     pub fn update_weights(&mut self, weights: Vec<f64>) {
         assert_eq!(self.intervals.len(), weights.len());
         self.weights = weights;
-        self.sum = self.weights.iter()
+        self.sum = self
+            .weights
+            .iter()
             .zip(self.intervals.iter())
             .fold(0f64, |acc, (w, (a, b))| acc + w * (b - a));
     }
 
     pub fn update_intervals(&mut self, intervals: Vec<f64>) {
-        assert_eq!(self.weights.len()+1, intervals.len());
+        assert_eq!(self.weights.len() + 1, intervals.len());
         self.intervals = auto_zip(&intervals);
-        self.sum = self.weights.iter()
+        self.sum = self
+            .weights
+            .iter()
             .zip(self.intervals.iter())
             .fold(0f64, |acc, (w, (a, b))| acc + w * (b - a));
     }
@@ -493,7 +505,11 @@ impl<T: PartialOrd + SampleUniform + Copy + Into<f64>> ParametricDist for Weight
 
     fn params(&self) -> Self::Parameter {
         let weights = self.weights.iter().map(|x| (*x).into()).collect();
-        let intervals = self.intervals.iter().map(|(x, y)| ((*x).into(), (*y).into())).collect();
+        let intervals = self
+            .intervals
+            .iter()
+            .map(|(x, y)| ((*x).into(), (*y).into()))
+            .collect();
         (weights, intervals)
     }
 }
@@ -619,11 +635,7 @@ impl<T: PartialOrd + SampleUniform + Copy + Into<f64>> RNG for TPDist<T> {
 
             Binomial(num, mu) => {
                 let binom = rand_distr::Binomial::new(*num as u64, (*mu).into()).unwrap();
-                binom
-                    .sample_iter(rng)
-                    .take(n)
-                    .map(|t| t as f64)
-                    .collect()
+                binom.sample_iter(rng).take(n).map(|t| t as f64).collect()
             }
 
             Normal(m, s) => {
@@ -789,10 +801,12 @@ impl RNG for WeightedUniform<f64> {
         let ics: Vec<usize> = w.sample_iter(&mut rng_clip).take(n).collect();
         *rng = rng_clip;
 
-        ics.into_iter().map(|idx| {
-            let (l, r) = self.intervals[idx];
-            rng.gen_range(l ..=r)
-        }).collect::<Vec<f64>>()
+        ics.into_iter()
+            .map(|idx| {
+                let (l, r) = self.intervals[idx];
+                rng.gen_range(l..=r)
+            })
+            .collect::<Vec<f64>>()
     }
 
     fn pdf<S: PartialOrd + SampleUniform + Copy + Into<f64>>(&self, x: S) -> f64 {
@@ -812,11 +826,11 @@ impl RNG for WeightedUniform<f64> {
             return 1f64;
         }
         let idx = find_interval(self.intervals(), x);
-        self.weights[0 ..=idx].iter()
-            .zip(self.intervals[0 ..=idx].iter())
-            .fold(0f64, |acc, (w, (a, b))| {
-                acc + w * (b - a)
-            }) / self.sum
+        self.weights[0..=idx]
+            .iter()
+            .zip(self.intervals[0..=idx].iter())
+            .fold(0f64, |acc, (w, (a, b))| acc + w * (b - a))
+            / self.sum
     }
 }
 
@@ -912,16 +926,23 @@ impl Statistics for WeightedUniform<f64> {
     type Value = f64;
 
     fn mean(&self) -> Self::Value {
-        self.intervals().iter().zip(self.weights().iter())
-            .map(|((l, r), w)| (r.powi(2)-l.powi(2))/2f64 * w)
-            .sum::<f64>() / self.sum
+        self.intervals()
+            .iter()
+            .zip(self.weights().iter())
+            .map(|((l, r), w)| (r.powi(2) - l.powi(2)) / 2f64 * w)
+            .sum::<f64>()
+            / self.sum
     }
 
     fn var(&self) -> Self::Value {
         let mean = self.mean();
-        self.intervals().iter().zip(self.weights().iter())
+        self.intervals()
+            .iter()
+            .zip(self.weights().iter())
             .map(|((l, r), w)| w * (r.powi(3) - l.powi(3)) / 3f64)
-            .sum::<f64>() / self.sum - mean * mean
+            .sum::<f64>()
+            / self.sum
+            - mean * mean
     }
 
     fn sd(&self) -> Self::Value {
diff --git a/src/statistics/rand.rs b/src/statistics/rand.rs
index 36590085..7b36bc1d 100644
--- a/src/statistics/rand.rs
+++ b/src/statistics/rand.rs
@@ -24,9 +24,9 @@ extern crate rand;
 use self::rand::distributions::uniform::SampleUniform;
 use self::rand::prelude::*;
 
+use crate::statistics::dist::{WeightedUniform, RNG};
 #[allow(unused_imports)]
 use crate::structure::matrix::*;
-use crate::statistics::dist::{RNG, WeightedUniform};
 
 /// Small random number generator from seed
 ///
@@ -466,7 +466,9 @@ pub fn ziggurat(rng: &mut ThreadRng, sigma: f64) -> f64 {
 ///     Ok(())
 /// }
 pub fn prs<F>(f: F, n: usize, (a, b): (f64, f64), m: usize, eps: f64) -> anyhow::Result<Vec<f64>>
-where F: Fn(f64) -> f64 + Copy {
+where
+    F: Fn(f64) -> f64 + Copy + Send + Sync,
+{
     let mut rng = thread_rng();
 
     let mut result = vec![0f64; n];
@@ -482,7 +484,7 @@ where F: Fn(f64) -> f64 + Copy {
             if weight <= 0f64 {
                 continue;
             } else {
-                let y = rng.gen_range(0f64 ..=weight);
+                let y = rng.gen_range(0f64..=weight);
 
                 if y <= f(x) {
                     result[n - left_num] = x;
@@ -527,8 +529,17 @@ where F: Fn(f64) -> f64 + Copy {
 ///
 ///     Ok(())
 /// }
-pub fn prs_with_rng<F, R: Rng + Clone>(f: F, n: usize, (a, b): (f64, f64), m: usize, eps: f64, rng: &mut R) -> anyhow::Result<Vec<f64>>
-    where F: Fn(f64) -> f64 + Copy {
+pub fn prs_with_rng<F, R: Rng + Clone>(
+    f: F,
+    n: usize,
+    (a, b): (f64, f64),
+    m: usize,
+    eps: f64,
+    rng: &mut R,
+) -> anyhow::Result<Vec<f64>>
+where
+    F: Fn(f64) -> f64 + Copy + Send + Sync,
+{
     let mut result = vec![0f64; n];
 
     let w = WeightedUniform::from_max_pool_1d(f, (a, b), m, eps)?;
@@ -542,7 +553,7 @@ pub fn prs_with_rng<F, R: Rng + Clone>(f: F, n: usize, (a, b): (f64, f64), m: us
             if weight <= 0f64 {
                 continue;
             } else {
-                let y = rng.gen_range(0f64 ..=weight);
+                let y = rng.gen_range(0f64..=weight);
 
                 if y <= f(x) {
                     result[n - left_num] = x;
diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index 0e977361..0d57ae42 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -267,6 +267,8 @@ extern crate blas;
 #[cfg(feature = "O3")]
 use blas::{daxpy, ddot, dnrm2, idamax};
 
+use rayon::prelude::*;
+
 use crate::structure::matrix::{matrix, Matrix, Row};
 use crate::traits::{
     fp::FPVector,
@@ -295,10 +297,14 @@ impl FPVector for Vec<f64> {
     /// ```
     fn fmap<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(f64) -> f64,
+        F: Fn(f64) -> f64 + Sync + Send,
     {
+        // let mut v = self.clone();
+        // v.iter_mut().for_each(|x| *x = f(*x));
+        // v
+
         let mut v = self.clone();
-        v.iter_mut().for_each(|x| *x = f(*x));
+        v.par_iter_mut().for_each(|x| *x = f(*x));
         v
     }
 
@@ -317,17 +323,27 @@ impl FPVector for Vec<f64> {
     /// ```
     fn reduce<F, T>(&self, init: T, f: F) -> f64
     where
-        F: Fn(f64, f64) -> f64,
-        T: Into<f64>,
+        F: Fn(f64, f64) -> f64 + Send + Sync,
+        T: Into<f64> + Send + Sync + Copy,
     {
         self.iter().fold(init.into(), |x, &y| f(x, y))
+        // self.par_iter()
+        //     .cloned()
+        //     .fold(|| init.into(), |x, y| f(x, y))
+        //     .sum::<f64>() // Combining fold and reduce to produce a single value
+        //                   // .reduce(|| init.into(), |x, y| f(x, y))  // can not use reduce instead of .sum(), since the fn f used might not be associative (e.g. check test_max_pool_1d)
+        //                   // https://docs.rs/rayon/latest/rayon/iter/trait.ParallelIterator.html#method.reduce
     }
 
     fn zip_with<F>(&self, f: F, other: &Vec<f64>) -> Vec<f64>
     where
-        F: Fn(f64, f64) -> f64,
+        F: Fn(f64, f64) -> f64 + Send + Sync,
     {
-        self.iter()
+        // self.iter()
+        //     .zip(other)
+        //     .map(|(x, y)| f(*x, *y))
+        //     .collect::<Vec<f64>>()
+        self.par_iter()
             .zip(other)
             .map(|(x, y)| f(*x, *y))
             .collect::<Vec<f64>>()
@@ -349,10 +365,14 @@ impl FPVector for Vec<f64> {
     /// ```
     fn filter<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(f64) -> bool,
+        F: Fn(f64) -> bool + Send + Sync,
     {
+        // self.clone()
+        //     .into_iter()
+        //     .filter(|x| f(*x))
+        //     .collect::<Vec<f64>>()
         self.clone()
-            .into_iter()
+            .into_par_iter()
             .filter(|x| f(*x))
             .collect::<Vec<f64>>()
     }
@@ -372,8 +392,16 @@ impl FPVector for Vec<f64> {
     /// }
     /// ```
     fn take(&self, n: usize) -> Vec<f64> {
-        let mut v = vec![0f64; n];
-        v[..n].copy_from_slice(&self[..n]);
+        // let mut v = vec![0f64; n];
+        // v[..n].copy_from_slice(&self[..n]);
+        // v
+        let mut v = vec![0_f64; n];
+        v[..n]
+            .par_iter_mut()
+            .zip(self[..n].par_iter())
+            .for_each(|(dest, src)| {
+                *dest = *src;
+            }); // parallel equivalent of copy_from_slice()
         v
     }
 
@@ -392,20 +420,29 @@ impl FPVector for Vec<f64> {
     /// }
     /// ```
     fn skip(&self, n: usize) -> Vec<f64> {
+        // let l = self.len();
+        // let mut v = vec![0f64; l - n];
+        // for (i, j) in (n..l).enumerate() {
+        //     v[i] = self[j];
+        // }
+        // v
         let l = self.len();
         let mut v = vec![0f64; l - n];
-        for (i, j) in (n..l).enumerate() {
-            v[i] = self[j];
-        }
+        v[..(l - n)]
+            .par_iter_mut()
+            .zip(self[n..l].par_iter())
+            .for_each(|(dest, src)| *dest = *src);
         v
     }
 
     fn sum(&self) -> f64 {
-        self.iter().sum()
+        // self.iter().sum()
+        self.par_iter().sum()
     }
 
     fn prod(&self) -> f64 {
-        self.iter().product()
+        // self.iter().product()
+        self.par_iter().product()
     }
 }
 
diff --git a/src/traits/fp.rs b/src/traits/fp.rs
index 41f11463..9204df38 100644
--- a/src/traits/fp.rs
+++ b/src/traits/fp.rs
@@ -6,17 +6,17 @@ pub trait FPVector {
 
     fn fmap<F>(&self, f: F) -> Self
     where
-        F: Fn(Self::Scalar) -> Self::Scalar;
+        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync;
     fn reduce<F, T>(&self, init: T, f: F) -> Self::Scalar
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar,
-        T: Into<Self::Scalar>;
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync,
+        T: Into<Self::Scalar> + Send + Sync + Copy;
     fn zip_with<F>(&self, f: F, other: &Self) -> Self
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar;
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync;
     fn filter<F>(&self, f: F) -> Self
     where
-        F: Fn(Self::Scalar) -> bool;
+        F: Fn(Self::Scalar) -> bool + Send + Sync;
     fn take(&self, n: usize) -> Self;
     fn skip(&self, n: usize) -> Self;
     fn sum(&self) -> Self::Scalar;
diff --git a/src/traits/sugar.rs b/src/traits/sugar.rs
index a51d9269..7bb9b989 100644
--- a/src/traits/sugar.rs
+++ b/src/traits/sugar.rs
@@ -1,15 +1,19 @@
-use crate::structure::matrix::{Matrix, Shape, matrix};
+use crate::structure::matrix::{matrix, Matrix, Shape};
 use crate::traits::fp::FPVector;
 use crate::util::non_macro::zeros_shape;
-use std::ops::{Add, Sub, Mul, Div};
+use std::ops::{Add, Div, Mul, Sub};
 
 /// Syntactic sugar for Vector operations
-pub trait VecOps: Sized + FPVector 
-where Self::Scalar: Copy + Clone
-    + Add<Self::Scalar, Output=Self::Scalar>
-    + Sub<Self::Scalar, Output=Self::Scalar> 
-    + Mul<Self::Scalar, Output=Self::Scalar>
-    + Div<Self::Scalar, Output=Self::Scalar>
+pub trait VecOps: Sized + FPVector
+where
+    Self::Scalar: Copy
+        + Clone
+        + Add<Self::Scalar, Output = Self::Scalar>
+        + Sub<Self::Scalar, Output = Self::Scalar>
+        + Mul<Self::Scalar, Output = Self::Scalar>
+        + Div<Self::Scalar, Output = Self::Scalar>
+        + Send
+        + Sync,
 {
     //type Scalar;
     fn add_v(&self, v: &Self) -> Self {
@@ -449,7 +453,7 @@ impl ConvToMat for Vec<f64> {
     fn to_col(&self) -> Matrix {
         matrix(self.clone(), self.len(), 1, Shape::Col)
     }
-    
+
     fn to_row(&self) -> Matrix {
         matrix(self.clone(), 1, self.len(), Shape::Row)
     }

From e916c8858c4047d21c5a0553e2a12e93ee8ffeaf Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sun, 29 Sep 2024 11:13:20 +0200
Subject: [PATCH 03/17] Added parallelism to other VecOps

- Used rayon to parallelize vector operations
like innerprod, crossprod
---
 src/structure/vector.rs | 118 ++++++++++++++++++++++++++--------------
 src/traits/mutable.rs   |   4 +-
 2 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index 0d57ae42..58c4315e 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -277,7 +277,7 @@ use crate::traits::{
     mutable::MutFP,
     pointer::{Oxide, Redox, RedoxCommon},
 };
-use std::cmp::min;
+// use std::cmp::min;
 
 impl FPVector for Vec<f64> {
     type Scalar = f64;
@@ -449,14 +449,17 @@ impl FPVector for Vec<f64> {
 /// Explicit version of `map`
 pub fn map<F, T>(f: F, xs: &[T]) -> Vec<T>
 where
-    F: Fn(T) -> T,
-    T: Default + Copy,
+    F: Fn(T) -> T + Send + Sync,
+    T: Default + Copy + Send + Sync,
 {
-    let l = xs.len();
-    let mut result = vec![T::default(); l];
-    for i in 0..l {
-        result[i] = f(xs[i]);
-    }
+    // let l = xs.len();
+    // let mut result = vec![T::default(); l];
+    // for i in 0..l {
+    //     result[i] = f(xs[i]);
+    // }
+    // result
+    let mut result = vec![T::default(); xs.len()];
+    result.par_iter_mut().for_each(|x| *x = f(*x));
     result
 }
 
@@ -471,20 +474,25 @@ where
         s = f(s, x);
     }
     s
+    // Parallel version: !unimplemented()
 }
 
 /// Explicit version of `zip_with`
 pub fn zip_with<F, T>(f: F, xs: &[T], ys: &[T]) -> Vec<T>
 where
-    F: Fn(T, T) -> T,
-    T: Default + Copy,
+    F: Fn(T, T) -> T + Send + Sync,
+    T: Default + Copy + Send + Sync,
 {
-    let l = min(xs.len(), ys.len());
-    let mut result = vec![T::default(); l];
-    for i in 0..l {
-        result[i] = f(xs[i], ys[i]);
-    }
-    result
+    // let l = min(xs.len(), ys.len());
+    // let mut result = vec![T::default(); l];
+    // for i in 0..l {
+    //     result[i] = f(xs[i], ys[i]);
+    // }
+    // result
+    xs.par_iter()
+        .zip(ys.into_par_iter())
+        .map(|(x, y)| f(*x, *y))
+        .collect::<Vec<T>>()
 }
 
 impl MutFP for Vec<f64> {
@@ -492,20 +500,24 @@ impl MutFP for Vec<f64> {
 
     fn mut_map<F>(&mut self, f: F)
     where
-        F: Fn(Self::Scalar) -> Self::Scalar,
+        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync,
     {
-        for x in self.iter_mut() {
-            *x = f(*x);
-        }
+        // for x in self.iter_mut() {
+        //     *x = f(*x);
+        // }
+        self.par_iter_mut().for_each(|x| *x = f(*x));
     }
 
     fn mut_zip_with<F>(&mut self, f: F, other: &Self)
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar,
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync,
     {
-        for i in 0..self.len() {
-            self[i] = f(self[i], other[i]);
-        }
+        // for i in 0..self.len() {
+        //     self[i] = f(self[i], other[i]);
+        // }
+        self.par_iter_mut()
+            .zip(other.into_par_iter())
+            .for_each(|(x, y)| *x = f(*x, *y));
     }
 }
 
@@ -705,7 +717,11 @@ impl Normed for Vec<f64> {
                 }
                 #[cfg(not(feature = "O3"))]
                 {
-                    self.iter().map(|x| x.powi(2)).sum::<f64>().sqrt()
+                    // self.iter().map(|x| x.powi(2)).sum::<f64>().sqrt()
+                    self.par_iter()
+                        .map(|x| x.powi(2))
+                        .sum::<Self::UnsignedScalar>()
+                        .sqrt()
                 }
             }
             Norm::Lp(p) => {
@@ -714,7 +730,11 @@ impl Normed for Vec<f64> {
                     "lp norm is only defined for p>=1, the given value was p={}",
                     p
                 );
-                self.iter().map(|x| x.powf(p)).sum::<f64>().powf(1f64 / p)
+                // self.iter().map(|x| x.powf(p)).sum::<f64>().powf(1f64 / p)
+                self.par_iter()
+                    .map(|x| x.powf(p))
+                    .sum::<Self::UnsignedScalar>()
+                    .powf(1_f64 / p)
             }
             Norm::LInf => self.iter().fold(0f64, |x, y| x.max(y.abs())),
             Norm::F => unimplemented!(),
@@ -743,9 +763,13 @@ impl InnerProduct for Vec<f64> {
         }
         #[cfg(not(feature = "O3"))]
         {
-            self.iter()
-                .zip(rhs.iter())
-                .fold(0f64, |x, (y1, y2)| x + y1 * y2)
+            // self.iter()
+            //     .zip(rhs.iter())
+            //     .fold(0f64, |x, (y1, y2)| x + y1 * y2)
+            self.par_iter()
+                .zip(rhs.into_par_iter())
+                .fold(|| 0_f64, |x, (y1, y2)| x + y1 * y2)
+                .sum::<f64>()
         }
     }
 }
@@ -766,18 +790,30 @@ impl VectorProduct for Vec<f64> {
     fn cross(&self, other: &Self) -> Self {
         assert_eq!(self.len(), other.len());
         // 2D cross product is ill-defined
-        if self.len() == 2 {
-            let mut v = vec![0f64; 1];
-            v[0] = self[0] * other[1] - self[1] * other[0];
-            v
-        } else if self.len() == 3 {
-            let mut v = vec![0f64; 3];
-            v[0] = self[1] * other[2] - self[2] * other[1];
-            v[1] = self[2] * other[0] - self[0] * other[2];
-            v[2] = self[0] * other[1] - self[1] * other[0];
-            v
-        } else {
-            panic!("Cross product can be defined only in 2 or 3 dimension")
+        match self.len() {
+            2 => {
+                let mut v = vec![0f64; 1];
+                v[0] = self[0] * other[1] - self[1] * other[0];
+                v
+            }
+            3 => {
+                // let mut v = vec![0f64; 3];
+                // v[0] = self[1] * other[2] - self[2] * other[1];
+                // v[1] = self[2] * other[0] - self[0] * other[2];
+                // v[2] = self[0] * other[1] - self[1] * other[0];
+                // v
+                let v = (0..3)
+                    .into_par_iter()
+                    .map(|index| {
+                        self[(index + 1) % 3] * other[(index + 2) % 3]
+                            - self[(index + 2) % 3] * other[(index + 1) % 3]
+                    })
+                    .collect::<Vec<f64>>();
+                v
+            }
+            _ => {
+                panic!("Cross product can be defined only in 2 or 3 dimension")
+            }
         }
     }
 
diff --git a/src/traits/mutable.rs b/src/traits/mutable.rs
index 7b0932fc..a8ef80a9 100644
--- a/src/traits/mutable.rs
+++ b/src/traits/mutable.rs
@@ -4,10 +4,10 @@ pub trait MutFP {
     type Scalar;
     fn mut_map<F>(&mut self, f: F)
     where
-        F: Fn(Self::Scalar) -> Self::Scalar;
+        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync;
     fn mut_zip_with<F>(&mut self, f: F, other: &Self)
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar;
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync;
 }
 
 pub trait MutMatrix {

From 454f82c2d2dde5031166051a9cc33497a0b36184 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sun, 29 Sep 2024 13:21:23 +0200
Subject: [PATCH 04/17] Intial attempt to parallelize matrix

- Used rayon to parallelize impl Matrix
---
 src/structure/matrix.rs | 368 +++++++++++++++++++++++++---------------
 src/structure/vector.rs |   2 +-
 2 files changed, 228 insertions(+), 142 deletions(-)

diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index 797792ca..4fa1f00f 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -192,7 +192,7 @@
 //! To write columns or rows, `DataFrame` and `nc` feature could be the best choice.
 //!
 //! * `nc` feature should be required - `netcdf` or `libnetcdf` are prerequisites.
-//! 
+//!
 //! ```no_run
 //! #[macro_use]
 //! extern crate peroxide;
@@ -596,10 +596,10 @@
 //!     }
 //!     ```
 
-#[cfg(feature="csv")]
+#[cfg(feature = "csv")]
 extern crate csv;
 
-#[cfg(feature="csv")]
+#[cfg(feature = "csv")]
 use self::csv::{ReaderBuilder, StringRecord, WriterBuilder};
 
 #[cfg(feature = "O3")]
@@ -609,30 +609,32 @@ extern crate lapack;
 #[cfg(feature = "O3")]
 use blas::{daxpy, dgemm, dgemv};
 #[cfg(feature = "O3")]
-use lapack::{dgecon, dgeqrf, dgetrf, dgetri, dgetrs, dorgqr, dgesvd, dpotrf};
+use lapack::{dgecon, dgeqrf, dgesvd, dgetrf, dgetri, dgetrs, dorgqr, dpotrf};
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 
+use crate::rayon::prelude::*;
+
 pub use self::Shape::{Col, Row};
 use crate::numerical::eigen::{eigen, EigenMethod};
+use crate::structure::dataframe::{Series, TypedVector};
+use crate::traits::sugar::ScalableMut;
 use crate::traits::{
-    general::Algorithm,
     fp::{FPMatrix, FPVector},
+    general::Algorithm,
     math::{InnerProduct, LinearOp, MatrixProduct, Norm, Normed, Vector},
     mutable::MutMatrix,
 };
 use crate::util::{
-    low_level::{swap_vec_ptr, copy_vec_ptr},
+    low_level::{copy_vec_ptr, swap_vec_ptr},
     non_macro::{cbind, eye, rbind, zeros},
     useful::{nearly_eq, tab},
 };
-use crate::structure::dataframe::{Series, TypedVector};
+use peroxide_num::{ExpLogOps, Numeric, PowOps, TrigOps};
 use std::cmp::{max, min};
 pub use std::error::Error;
 use std::fmt;
 use std::ops::{Add, Div, Index, IndexMut, Mul, Neg, Sub};
-use crate::traits::sugar::ScalableMut;
-use peroxide_num::{ExpLogOps, PowOps, TrigOps, Numeric};
 
 pub type Perms = Vec<(usize, usize)>;
 
@@ -711,10 +713,10 @@ pub struct Matrix {
 /// ```
 pub fn matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
 where
-    T: Into<f64>,
+    T: Into<f64> + Send + Sync,
 {
     Matrix {
-        data: v.into_iter().map(|t| t.into()).collect::<Vec<f64>>(),
+        data: v.into_par_iter().map(|t| t.into()).collect::<Vec<f64>>(),
         row: r,
         col: c,
         shape,
@@ -724,7 +726,7 @@ where
 /// R-like matrix constructor (Explicit ver.)
 pub fn r_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
 where
-    T: Into<f64>,
+    T: Into<f64> + Send + Sync,
 {
     matrix(v, r, c, shape)
 }
@@ -745,17 +747,30 @@ where
 /// ```
 pub fn py_matrix<T>(v: Vec<Vec<T>>) -> Matrix
 where
-    T: Into<f64> + Copy,
+    T: Into<f64> + Copy + Send + Sync,
 {
+    // let r = v.len();
+    // let c = v[0].len();
+    // let mut data = vec![0f64; r * c];
+    // for i in 0..r {
+    //     for j in 0..c {
+    //         let idx = i * c + j;
+    //         data[idx] = v[i][j].into();
+    //     }
+    // }
+    // matrix(data, r, c, Row)
+
     let r = v.len();
     let c = v[0].len();
-    let mut data = vec![0f64; r * c];
-    for i in 0..r {
-        for j in 0..c {
-            let idx = i * c + j;
-            data[idx] = v[i][j].into();
-        }
-    }
+    let data = (0..r)
+        .into_par_iter()
+        .flat_map(|i| {
+            (0..c)
+                .into_par_iter()
+                .map(|j| v[i][j].into())
+                .collect::<Vec<f64>>()
+        })
+        .collect::<Vec<f64>>();
     matrix(data, r, c, Row)
 }
 
@@ -774,6 +789,23 @@ where
 /// }
 /// ```
 pub fn ml_matrix(s: &str) -> Matrix where {
+    // let str_rows: Vec<&str> = s.split(';').collect();
+    // let r = str_rows.len();
+    // let str_data = str_rows
+    //     .into_iter()
+    //     .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
+    //     .collect::<Vec<Vec<&str>>>();
+    // let c = str_data[0].len();
+    // let data = str_data
+    //     .into_iter()
+    //     .flat_map(|t| {
+    //         t.into_iter()
+    //             .map(|x| x.parse::<f64>().unwrap())
+    //             .collect::<Vec<f64>>()
+    //     })
+    //     .collect::<Vec<f64>>();
+    // matrix(data, r, c, Row)
+
     let str_rows: Vec<&str> = s.split(';').collect();
     let r = str_rows.len();
     let str_data = str_rows
@@ -781,10 +813,11 @@ pub fn ml_matrix(s: &str) -> Matrix where {
         .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
         .collect::<Vec<Vec<&str>>>();
     let c = str_data[0].len();
+
     let data = str_data
-        .into_iter()
+        .into_par_iter()
         .flat_map(|t| {
-            t.into_iter()
+            t.into_par_iter()
                 .map(|x| x.parse::<f64>().unwrap())
                 .collect::<Vec<f64>>()
         })
@@ -803,9 +836,15 @@ impl fmt::Display for Matrix {
 impl PartialEq for Matrix {
     fn eq(&self, other: &Matrix) -> bool {
         if self.shape == other.shape {
+            // self.data
+            //     .clone()
+            //     .into_iter()
+            //     .zip(other.data.clone())
+            //     .all(|(x, y)| nearly_eq(x, y))
+            //     && self.row == other.row
             self.data
                 .clone()
-                .into_iter()
+                .into_par_iter()
                 .zip(other.data.clone())
                 .all(|(x, y)| nearly_eq(x, y))
                 && self.row == other.row
@@ -876,23 +915,35 @@ impl Matrix {
         let c = self.col;
         assert_eq!(r * c, self.data.len());
         let l = r * c - 1;
-        let mut data: Vec<f64> = self.data.clone();
+        // let mut data: Vec<f64> = self.data.clone();
         let ref_data = &self.data;
 
         match self.shape {
             Row => {
-                for i in 0..l {
-                    let s = (i * c) % l;
-                    data[i] = ref_data[s];
-                }
+                // for i in 0..l {
+                //     let s = (i * c) % l;
+                //     data[i] = ref_data[s];
+                // }
+                // data[l] = ref_data[l];
+                // matrix(data, r, c, Col)
+                let mut data = (0..=l)
+                    .into_par_iter()
+                    .map(|i| ref_data[(i * c) % l])
+                    .collect::<Vec<f64>>();
                 data[l] = ref_data[l];
                 matrix(data, r, c, Col)
             }
             Col => {
-                for i in 0..l {
-                    let s = (i * r) % l;
-                    data[i] = ref_data[s];
-                }
+                // for i in 0..l {
+                //     let s = (i * r) % l;
+                //     data[i] = ref_data[s];
+                // }
+                // data[l] = ref_data[l];
+                // matrix(data, r, c, Row)
+                let mut data = (0..=l)
+                    .into_par_iter()
+                    .map(|i| ref_data[(i * r) % l])
+                    .collect::<Vec<f64>>();
                 data[l] = ref_data[l];
                 matrix(data, r, c, Row)
             }
@@ -907,10 +958,11 @@ impl Matrix {
     /// ```
     /// use peroxide::fuga::*;
     ///
-    /// let a = matrix(vec![1,2,3,4],2,2,Row);
+    /// let mut a = matrix(vec![1,2,3,4,5,6,7,8,9],3,3,Row);
     /// assert_eq!(a.shape, Row);
-    /// let b = a.change_shape();
-    /// assert_eq!(b.shape, Col);
+    /// a.change_shape_mut();
+    /// assert_eq!(a, matrix(vec![1,4,7,2,5,8,3,6,9],3,3,Col));
+    /// assert_eq!(a.shape, Col);
     /// ```
     pub fn change_shape_mut(&mut self) {
         let r = self.row;
@@ -921,18 +973,32 @@ impl Matrix {
 
         match self.shape {
             Row => {
-                for i in 0..l {
-                    let s = (i * c) % l;
-                    self.data[i] = ref_data[s];
-                }
+                // for i in 0..l {
+                //     let s = (i * c) % l;
+                //     self.data[i] = ref_data[s];
+                // }
+                // self.data[l] = ref_data[l];
+                // self.shape = Col;
+
+                self.data = (0..=l)
+                    .into_par_iter()
+                    .map(|i| ref_data[(i * c) % l])
+                    .collect::<Vec<f64>>();
                 self.data[l] = ref_data[l];
                 self.shape = Col;
             }
             Col => {
-                for i in 0..l {
-                    let s = (i * r) % l;
-                    self.data[i] = ref_data[s];
-                }
+                // for i in 0..l {
+                //     let s = (i * r) % l;
+                //     self.data[i] = ref_data[s];
+                // }
+                // self.data[l] = ref_data[l];
+                // self.shape = Row;
+
+                self.data = (0..=l)
+                    .into_par_iter()
+                    .map(|i| ref_data[(i * r) % l])
+                    .collect::<Vec<f64>>();
                 self.data[l] = ref_data[l];
                 self.shape = Row;
             }
@@ -1041,10 +1107,14 @@ impl Matrix {
     /// ```
     pub fn col(&self, index: usize) -> Vec<f64> {
         assert!(index < self.col);
-        let mut container: Vec<f64> = vec![0f64; self.row];
-        for i in 0..self.row {
-            container[i] = self[(i, index)];
-        }
+        // let mut container: Vec<f64> = vec![0f64; self.row];
+        // for i in 0..self.row {
+        //     container[i] = self[(i, index)];
+        // }
+        let container = (0..self.row)
+            .into_par_iter()
+            .map(|i| self[(i, index)])
+            .collect::<Vec<f64>>();
         container
     }
 
@@ -1063,10 +1133,16 @@ impl Matrix {
     /// ```
     pub fn row(&self, index: usize) -> Vec<f64> {
         assert!(index < self.row);
-        let mut container: Vec<f64> = vec![0f64; self.col];
-        for i in 0..self.col {
-            container[i] = self[(index, i)];
-        }
+        // let mut container: Vec<f64> = vec![0f64; self.col];
+        // for i in 0..self.col {
+        //     container[i] = self[(index, i)];
+        // }
+        // container
+
+        let container = (0..self.col)
+            .into_par_iter()
+            .map(|i| self[(index, i)])
+            .collect::<Vec<f64>>();
         container
     }
 
@@ -1084,14 +1160,17 @@ impl Matrix {
     /// }
     /// ```
     pub fn diag(&self) -> Vec<f64> {
-        let mut container = vec![0f64; self.row];
-        let r = self.row;
-        let c = self.col;
-        assert_eq!(r, c);
-        let c2 = c + 1;
-        for i in 0..r {
-            container[i] = self.data[i * c2];
-        }
+        assert_eq!(self.row, self.col);
+        // let mut container = vec![0f64; self.row];
+        // for i in 0..self.row {
+        //     container[i] = self.data[i * (self.col + 1)];
+        // }
+        // container
+
+        let container = (0..self.row)
+            .into_par_iter()
+            .map(|i| self.data[i * (self.col + 1)])
+            .collect::<Vec<f64>>();
         container
     }
 
@@ -1102,7 +1181,8 @@ impl Matrix {
     /// use peroxide::fuga::*;
     ///
     /// let a = matrix(vec![1,2,3,4], 2, 2, Row);
-    /// println!("{}", a); // [[1,3],[2,4]]
+    /// println!("{}", a.transpose()); // [[1,3],[2,4]]
+    /// assert_eq!(a.transpose(), matrix(vec![1,3,2,4], 2, 2, Row))
     /// ```
     pub fn transpose(&self) -> Self {
         match self.shape {
@@ -1143,7 +1223,7 @@ impl Matrix {
     ///     Ok(())
     /// }
     /// ```
-    #[cfg(feature="csv")]
+    #[cfg(feature = "csv")]
     pub fn write(&self, file_path: &str) -> Result<(), Box<dyn Error>> {
         let mut wtr = WriterBuilder::new().from_path(file_path)?;
         let r = self.row;
@@ -1174,7 +1254,7 @@ impl Matrix {
     ///     Ok(())
     /// }
     /// ```
-    #[cfg(feature="csv")]
+    #[cfg(feature = "csv")]
     pub fn write_round(&self, file_path: &str, round: usize) -> Result<(), Box<dyn Error>> {
         let mut wtr = WriterBuilder::new().from_path(file_path)?;
         let r = self.row;
@@ -1190,7 +1270,7 @@ impl Matrix {
         Ok(())
     }
 
-    #[cfg(feature="csv")]
+    #[cfg(feature = "csv")]
     pub fn write_with_header(
         &self,
         file_path: &str,
@@ -1212,7 +1292,7 @@ impl Matrix {
         Ok(())
     }
 
-    #[cfg(feature="csv")]
+    #[cfg(feature = "csv")]
     pub fn write_with_header_round(
         &self,
         file_path: &str,
@@ -1260,7 +1340,7 @@ impl Matrix {
     ///     Ok(())
     /// }
     /// ```
-    #[cfg(feature="csv")]
+    #[cfg(feature = "csv")]
     pub fn read(file_path: &str, header: bool, delimiter: char) -> Result<Matrix, Box<dyn Error>> {
         let mut rdr = ReaderBuilder::new()
             .has_headers(header)
@@ -1324,20 +1404,31 @@ impl Matrix {
     /// From index operations
     pub fn from_index<F, G>(f: F, size: (usize, usize)) -> Matrix
     where
-        F: Fn(usize, usize) -> G + Copy,
+        F: Fn(usize, usize) -> G + Copy + Send + Sync,
         G: Into<f64>,
     {
         let row = size.0;
         let col = size.1;
 
-        let mut mat = matrix(vec![0f64; row * col], row, col, Row);
-
-        for i in 0..row {
-            for j in 0..col {
-                mat[(i, j)] = f(i, j).into();
-            }
-        }
-        mat
+        // let mut mat = matrix(vec![0f64; row * col], row, col, Row);
+
+        // for i in 0..row {
+        //     for j in 0..col {
+        //         mat[(i, j)] = f(i, j).into();
+        //     }
+        // }
+        // mat
+
+        let data = (0..row)
+            .into_par_iter()
+            .flat_map(|i| {
+                (0..col)
+                    .into_par_iter()
+                    .map(|j| f(i, j).into())
+                    .collect::<Vec<f64>>()
+            })
+            .collect::<Vec<f64>>();
+        matrix(data, row, col, Row)
     }
 
     /// Matrix to `Vec<Vec<f64>>`
@@ -1365,7 +1456,7 @@ impl Matrix {
     ///
     /// # Description
     /// Return below elements of matrix to new matrix
-    /// 
+    ///
     /// $$
     /// \begin{pmatrix}
     /// \\ddots & & & & \\\\
@@ -1380,7 +1471,7 @@ impl Matrix {
     /// ```
     /// extern crate peroxide;
     /// use peroxide::fuga::*;
-    /// 
+    ///
     /// fn main() {
     ///     let a = ml_matrix("1 2 3;4 5 6;7 8 9");
     ///     let b = ml_matrix("5 6;8 9");
@@ -1392,8 +1483,8 @@ impl Matrix {
         let row = end.0 - start.0 + 1;
         let col = end.1 - start.1 + 1;
         let mut result = matrix(vec![0f64; row * col], row, col, self.shape);
-        for i in 0 .. row {
-            for j in 0 .. col {
+        for i in 0..row {
+            for j in 0..col {
                 result[(i, j)] = self[(start.0 + i, start.1 + j)];
             }
         }
@@ -1404,7 +1495,7 @@ impl Matrix {
     ///
     /// # Description
     /// Substitute below elements of matrix
-    /// 
+    ///
     /// $$
     /// \begin{pmatrix}
     /// \\ddots & & & & \\\\
@@ -1414,12 +1505,12 @@ impl Matrix {
     ///   & & & & \\ddots
     /// \end{pmatrix}
     /// $$
-    /// 
+    ///
     /// # Examples
     /// ```
     /// extern crate peroxide;
     /// use peroxide::fuga::*;
-    /// 
+    ///
     /// fn main() {
     ///     let mut a = ml_matrix("1 2 3;4 5 6;7 8 9");
     ///     let b = ml_matrix("1 2;3 4");
@@ -1431,8 +1522,8 @@ impl Matrix {
     pub fn subs_mat(&mut self, start: (usize, usize), end: (usize, usize), m: &Matrix) {
         let row = end.0 - start.0 + 1;
         let col = end.1 - start.1 + 1;
-        for i in 0 .. row {
-            for j in 0 .. col {
+        for i in 0..row {
+            for j in 0..col {
                 self[(start.0 + i, start.1 + j)] = m[(i, j)];
             }
         }
@@ -2918,7 +3009,7 @@ impl SVD {
 
     pub fn s_mat(&self) -> Matrix {
         let mut mat = zeros(self.u.col, self.vt.row);
-        for i in 0 .. mat.row.min(mat.col) {
+        for i in 0..mat.row.min(mat.col) {
             mat[(i, i)] = self.s[i];
         }
         mat
@@ -2961,11 +3052,7 @@ impl SVD {
             }
         }
 
-        SVD {
-            s,
-            u,
-            vt,
-        }
+        SVD { s, u, vt }
     }
 }
 
@@ -3151,7 +3238,7 @@ impl LinearAlgebra for Matrix {
     #[allow(non_snake_case)]
     fn qr(&self) -> QR {
         match () {
-            #[cfg(feature="O3")]
+            #[cfg(feature = "O3")]
             () => {
                 let opt_dgeqrf = lapack_dgeqrf(self);
                 match opt_dgeqrf {
@@ -3159,10 +3246,7 @@ impl LinearAlgebra for Matrix {
                     Some(dgeqrf) => {
                         let q = dgeqrf.get_Q();
                         let r = dgeqrf.get_R();
-                        QR {
-                            q,
-                            r
-                        }
+                        QR { q, r }
                     }
                 }
             }
@@ -3209,7 +3293,7 @@ impl LinearAlgebra for Matrix {
     /// ```
     fn svd(&self) -> SVD {
         match () {
-            #[cfg(feature="O3")]
+            #[cfg(feature = "O3")]
             () => {
                 let opt_dgesvd = lapack_dgesvd(self);
                 match opt_dgesvd {
@@ -3218,14 +3302,12 @@ impl LinearAlgebra for Matrix {
                         SVD_STATUS::Diverge(i) => {
                             panic!("Divergent occurs in SVD - {} iterations", i)
                         }
-                        SVD_STATUS::Success => {
-                            SVD {
-                                s: dgesvd.s,
-                                u: dgesvd.u,
-                                vt: dgesvd.vt,
-                            }
-                        }
-                    }
+                        SVD_STATUS::Success => SVD {
+                            s: dgesvd.s,
+                            u: dgesvd.u,
+                            vt: dgesvd.vt,
+                        },
+                    },
                 }
             }
             _ => {
@@ -3465,7 +3547,9 @@ impl LinearAlgebra for Matrix {
                     None => panic!("Singular matrix (opt_dgrf)"),
                     Some(dgrf) => match dgrf.status {
                         LAPACK_STATUS::NonSingular => lapack_dgetri(&dgrf).unwrap(),
-                        LAPACK_STATUS::Singular => panic!("Singular matrix (LAPACK_STATUS Singular)"),
+                        LAPACK_STATUS::Singular => {
+                            panic!("Singular matrix (LAPACK_STATUS Singular)")
+                        }
                     },
                 }
             }
@@ -3500,13 +3584,13 @@ impl LinearAlgebra for Matrix {
     /// ```
     fn pseudo_inv(&self) -> Self {
         match () {
-            #[cfg(feature="O3")]
+            #[cfg(feature = "O3")]
             () => {
                 let svd = self.svd();
                 let row = svd.vt.row;
                 let col = svd.u.row;
                 let mut sp = zeros(row, col);
-                for i in 0 .. row.min(col) {
+                for i in 0..row.min(col) {
                     sp[(i, i)] = 1f64 / svd.s[i];
                 }
                 svd.vt.t() * sp * svd.u.t()
@@ -3620,9 +3704,9 @@ impl LinearAlgebra for Matrix {
             return false;
         }
 
-        for i in 0 .. self.row {
-            for j in i .. self.col {
-                if !nearly_eq(self[(i,j)], self[(j,i)]) {
+        for i in 0..self.row {
+            for j in i..self.col {
+                if !nearly_eq(self[(i, j)], self[(j, i)]) {
                     return false;
                 }
             }
@@ -3722,7 +3806,6 @@ impl Div<Matrix> for Matrix {
     }
 }
 
-
 impl ExpLogOps for Matrix {
     type Float = f64;
     fn exp(&self) -> Self {
@@ -4248,7 +4331,7 @@ pub enum POSITIVE_STATUS {
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 pub enum UPLO {
     Upper,
-    Lower
+    Lower,
 }
 
 /// Temporary data structure from `dgetrf`
@@ -4279,7 +4362,7 @@ pub struct DGESVD {
 pub struct DPOTRF {
     pub fact_mat: Matrix,
     pub uplo: UPLO,
-    pub status: POSITIVE_STATUS
+    pub status: POSITIVE_STATUS,
 }
 
 ///// Temporary data structure from `dgeev`
@@ -4473,7 +4556,22 @@ pub fn lapack_dgesvd(mat: &Matrix) -> Option<DGESVD> {
 
             // Workspace query
             unsafe {
-                dgesvd(jobu, jobvt, m, n, &mut A.data, lda, &mut s, &mut u, ldu, &mut vt, ldvt, &mut work, lwork, &mut info);
+                dgesvd(
+                    jobu,
+                    jobvt,
+                    m,
+                    n,
+                    &mut A.data,
+                    lda,
+                    &mut s,
+                    &mut u,
+                    ldu,
+                    &mut vt,
+                    ldvt,
+                    &mut work,
+                    lwork,
+                    &mut info,
+                );
             }
 
             let optimal_lwork = work[0] as usize;
@@ -4532,35 +4630,23 @@ pub fn lapack_dpotrf(mat: &Matrix, UPLO: UPLO) -> Option<DPOTRF> {
             let mut info = 0i32;
             let uplo = match UPLO {
                 UPLO::Upper => b'U',
-                UPLO::Lower => b'L'
+                UPLO::Lower => b'L',
             };
 
-            unsafe {
-                dpotrf(
-                    uplo,
-                    N,
-                    &mut A.data,
-                    lda,
-                    &mut info,
-                )
-            }
+            unsafe { dpotrf(uplo, N, &mut A.data, lda, &mut info) }
 
             if info == 0 {
-                Some(
-                    DPOTRF {
-                        fact_mat: matrix(A.data, mat.row, mat.col, Col),
-                        uplo: UPLO,
-                        status: POSITIVE_STATUS::Success
-                    }
-                )
+                Some(DPOTRF {
+                    fact_mat: matrix(A.data, mat.row, mat.col, Col),
+                    uplo: UPLO,
+                    status: POSITIVE_STATUS::Success,
+                })
             } else if info > 0 {
-                Some(
-                    DPOTRF {
-                        fact_mat: matrix(A.data, mat.row, mat.col, Col),
-                        uplo: UPLO,
-                        status: POSITIVE_STATUS::Failed(info)
-                    }
-                )
+                Some(DPOTRF {
+                    fact_mat: matrix(A.data, mat.row, mat.col, Col),
+                    uplo: UPLO,
+                    status: POSITIVE_STATUS::Failed(info),
+                })
             } else {
                 None
             }
@@ -4674,8 +4760,8 @@ impl DPOTRF {
         let mat = &self.fact_mat;
         let n = mat.col;
         let mut result = matrix(vec![0f64; n.pow(2)], n, n, mat.shape);
-        for i in 0 .. n {
-            for j in i .. n {
+        for i in 0..n {
+            for j in i..n {
                 result[(i, j)] = mat[(i, j)];
             }
         }
@@ -4691,8 +4777,8 @@ impl DPOTRF {
         let n = mat.col;
         let mut result = matrix(vec![0f64; n.pow(2)], n, n, mat.shape);
 
-        for i in 0 .. n {
-            for j in 0 .. i+1 {
+        for i in 0..n {
+            for j in 0..i + 1 {
                 result[(i, j)] = mat[(i, j)];
             }
         }
diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index 58c4315e..0300bc78 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -267,7 +267,7 @@ extern crate blas;
 #[cfg(feature = "O3")]
 use blas::{daxpy, ddot, dnrm2, idamax};
 
-use rayon::prelude::*;
+use crate::rayon::prelude::*;
 
 use crate::structure::matrix::{matrix, Matrix, Row};
 use crate::traits::{

From 1c4e0b305dfdf793af90b9e4716d733a615f8029 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sun, 29 Sep 2024 17:18:46 +0200
Subject: [PATCH 05/17] Modified further traits for Matrix

- Added rayon support for Matrix
for traits like LinearOps, Normed etc
---
 src/structure/matrix.rs | 198 +++++++++++++++++++++++++++++-----------
 1 file changed, 147 insertions(+), 51 deletions(-)

diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index 4fa1f00f..de290215 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -1435,21 +1435,49 @@ impl Matrix {
     ///
     /// To send `Matrix` to `inline-python`
     pub fn to_vec(&self) -> Vec<Vec<f64>> {
-        let mut result = vec![vec![0f64; self.col]; self.row];
-        for i in 0..self.row {
-            result[i] = self.row(i);
-        }
-        result
+        // let mut result = vec![vec![0f64; self.col]; self.row];
+        // for i in 0..self.row {
+        //     result[i] = self.row(i);
+        // }
+        // result
+
+        (0..self.row)
+            .into_par_iter()
+            .map(|i| self.row(i))
+            .collect::<Vec<Vec<f64>>>()
     }
 
+    /// To diagonal components
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = matrix!(1;4;1, 2, 2, Row);
+    ///     assert_eq!(a.to_diag(), matrix(c!(1,0,0,4), 2, 2, Row));
+    /// }
     pub fn to_diag(&self) -> Matrix {
         assert_eq!(self.row, self.col, "Should be square matrix");
-        let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
-        let diag = self.diag();
-        for i in 0..self.row {
-            result[(i, i)] = diag[i];
-        }
-        result
+        // let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
+        // let diag = self.diag();
+        // for i in 0..self.row {
+        //     result[(i, i)] = diag[i];
+        // }
+        // result
+
+        let data = (0..self.row)
+            .into_par_iter()
+            .flat_map(|i| {
+                (0..self.col)
+                    .into_par_iter()
+                    .map(|j| if i == j { self.diag()[i] } else { 0_f64 })
+                    .collect::<Vec<f64>>()
+            })
+            .collect::<Vec<f64>>();
+        matrix(data, self.row, self.col, Row)
     }
 
     /// Submatrix
@@ -1482,13 +1510,24 @@ impl Matrix {
     pub fn submat(&self, start: (usize, usize), end: (usize, usize)) -> Matrix {
         let row = end.0 - start.0 + 1;
         let col = end.1 - start.1 + 1;
-        let mut result = matrix(vec![0f64; row * col], row, col, self.shape);
-        for i in 0..row {
-            for j in 0..col {
-                result[(i, j)] = self[(start.0 + i, start.1 + j)];
-            }
-        }
-        result
+        // let mut result = matrix(vec![0f64; row * col], row, col, self.shape);
+        // for i in 0..row {
+        //     for j in 0..col {
+        //         result[(i, j)] = self[(start.0 + i, start.1 + j)];
+        //     }
+        // }
+        // result
+
+        let data = (0..row)
+            .into_par_iter()
+            .flat_map(|i| {
+                (0..col)
+                    .into_par_iter()
+                    .map(|j| self[(start.0 + i, start.1 + j)])
+                    .collect::<Vec<f64>>()
+            })
+            .collect::<Vec<f64>>();
+        matrix(data, row, col, self.shape)
     }
 
     /// Substitute matrix to specific position
@@ -1576,12 +1615,26 @@ impl Vector for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
+                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                // for i in 0..self.row {
+                //     for j in 0..self.col {
+                //         result[(i, j)] += other[(i, j)];
+                //     }
+                // }
+                // result
+
                 let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                for i in 0..self.row {
-                    for j in 0..self.col {
-                        result[(i, j)] += other[(i, j)];
-                    }
-                }
+                result
+                    .data
+                    .par_iter_mut()
+                    .enumerate()
+                    .for_each(|(idx, value)| {
+                        let i = idx / self.col;
+                        let j = idx % self.col;
+
+                        *value += other[(i, j)];
+                    });
+
                 result
             }
         }
@@ -1606,12 +1659,25 @@ impl Vector for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
+                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                // for i in 0..self.row {
+                //     for j in 0..self.col {
+                //         result[(i, j)] -= other[(i, j)];
+                //     }
+                // }
+                // result
+
                 let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                for i in 0..self.row {
-                    for j in 0..self.col {
-                        result[(i, j)] -= other[(i, j)];
-                    }
-                }
+                result
+                    .data
+                    .par_iter_mut()
+                    .enumerate()
+                    .for_each(|(idx, val)| {
+                        let i = idx / self.col;
+                        let j = idx % self.col;
+                        *val -= other[(i, j)];
+                    });
+
                 result
             }
         }
@@ -1644,21 +1710,39 @@ impl Normed for Matrix {
     fn norm(&self, kind: Norm) -> f64 {
         match kind {
             Norm::F => {
-                let mut s = 0f64;
-                for i in 0..self.data.len() {
-                    s += self.data[i].powi(2);
-                }
+                // let mut s = 0f64;
+                // for i in 0..self.data.len() {
+                //     s += self.data[i].powi(2);
+                // }
+                // s.sqrt()
+                let s = self
+                    .data
+                    .clone()
+                    .into_par_iter()
+                    .fold(|| 0_f64, |acc, el| acc + el.powi(2))
+                    .sum::<f64>();
                 s.sqrt()
             }
             Norm::Lpq(p, q) => {
-                let mut s = 0f64;
-                for j in 0..self.col {
-                    let mut s_row = 0f64;
-                    for i in 0..self.row {
-                        s_row += self[(i, j)].powi(p as i32);
-                    }
-                    s += s_row.powf(q / p);
-                }
+                // let mut s = 0f64;
+                // for j in 0..self.col {
+                //     let mut s_row = 0f64;
+                //     for i in 0..self.row {
+                //         s_row += self[(i, j)].powi(p as i32);
+                //     }
+                //     s += s_row.powf(q / p);
+                // }
+                // s.powf(1f64 / q)
+                let s = (0..self.col)
+                    .into_par_iter()
+                    .map(|j| {
+                        let s_row = (0..self.row)
+                            .into_iter()
+                            .map(|i| self[(i, j)].powi(p as i32))
+                            .sum::<f64>();
+                        s_row.powf(q / p)
+                    })
+                    .sum::<f64>();
                 s.powf(1f64 / q)
             }
             Norm::L1 => {
@@ -1667,7 +1751,8 @@ impl Normed for Matrix {
                     Row => self.change_shape().norm(Norm::L1),
                     Col => {
                         for c in 0..self.col {
-                            let s = self.col(c).iter().sum();
+                            // let s = self.col(c).iter().sum();
+                            let s = self.col(c).par_iter().sum();
                             if s > m {
                                 m = s;
                             }
@@ -1682,7 +1767,8 @@ impl Normed for Matrix {
                     Col => self.change_shape().norm(Norm::LInf),
                     Row => {
                         for r in 0..self.row {
-                            let s = self.row(r).iter().sum();
+                            // let s = self.row(r).iter().sum();
+                            let s = self.row(r).par_iter().sum();
                             if s > m {
                                 m = s;
                             }
@@ -1779,16 +1865,26 @@ impl MatrixProduct for Matrix {
         assert_eq!(self.row, other.row);
         assert_eq!(self.col, other.col);
 
-        let r = self.row;
-        let c = self.col;
+        // let r = self.row;
+        // let c = self.col;
+        // let mut m = matrix(vec![0f64; r * c], r, c, self.shape);
+        // for i in 0..r {
+        //     for j in 0..c {
+        //         m[(i, j)] = self[(i, j)] * other[(i, j)]
+        //     }
+        // }
+        // m
 
-        let mut m = matrix(vec![0f64; r * c], r, c, self.shape);
-        for i in 0..r {
-            for j in 0..c {
-                m[(i, j)] = self[(i, j)] * other[(i, j)]
-            }
-        }
-        m
+        let m = (0..self.row)
+            .into_par_iter()
+            .flat_map(|i| {
+                (0..self.col)
+                    .into_par_iter()
+                    .map(|j| self[(i, j)] * other[(i, j)])
+                    .collect::<Vec<f64>>()
+            })
+            .collect::<Vec<f64>>();
+        matrix(m, self.row, self.col, self.shape)
     }
 }
 

From 9d3a64931e8db78c34545d705dbd5692335d0d9f Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sun, 29 Sep 2024 19:13:24 +0200
Subject: [PATCH 06/17] Modified FPMatrix to use Rayon

- Used rayon to parallelize Functional
Programming for Matrix
---
 src/structure/matrix.rs | 197 ++++++++++++++++++++++++++++------------
 src/traits/fp.rs        |  12 +--
 2 files changed, 145 insertions(+), 64 deletions(-)

diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index de290215..b53212ff 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -1952,12 +1952,24 @@ impl Add<Matrix> for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
+                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                // for i in 0..self.row {
+                //     for j in 0..self.col {
+                //         result[(i, j)] += other[(i, j)];
+                //     }
+                // }
+                // result
                 let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                for i in 0..self.row {
-                    for j in 0..self.col {
-                        result[(i, j)] += other[(i, j)];
-                    }
-                }
+                result
+                    .data
+                    .par_iter_mut()
+                    .enumerate()
+                    .for_each(|(idx, value)| {
+                        let i = idx / self.col;
+                        let j = idx % self.col;
+
+                        *value += other[(i, j)];
+                    });
                 result
             }
         }
@@ -1987,7 +1999,7 @@ impl<'a, 'b> Add<&'b Matrix> for &'a Matrix {
 /// ```
 impl<T> Add<T> for Matrix
 where
-    T: Into<f64> + Copy,
+    T: Into<f64> + Copy + Send + Sync,
 {
     type Output = Self;
     fn add(self, other: T) -> Self {
@@ -2010,7 +2022,7 @@ where
 /// Element-wise addition between &Matrix & f64
 impl<'a, T> Add<T> for &'a Matrix
 where
-    T: Into<f64> + Copy,
+    T: Into<f64> + Copy + Send + Sync,
 {
     type Output = Matrix;
     fn add(self, other: T) -> Self::Output {
@@ -2149,7 +2161,11 @@ impl Neg for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => matrix(
-                self.data.into_iter().map(|x: f64| -x).collect::<Vec<f64>>(),
+                // self.data.into_iter().map(|x: f64| -x).collect::<Vec<f64>>(),
+                self.data
+                    .into_par_iter()
+                    .map(|x: f64| -x)
+                    .collect::<Vec<f64>>(),
                 self.row,
                 self.col,
                 self.shape,
@@ -2175,9 +2191,14 @@ impl<'a> Neg for &'a Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => matrix(
+                // self.data
+                //     .clone()
+                //     .into_iter()
+                //     .map(|x: f64| -x)
+                //     .collect::<Vec<f64>>(),
                 self.data
                     .clone()
-                    .into_iter()
+                    .into_par_iter()
                     .map(|x: f64| -x)
                     .collect::<Vec<f64>>(),
                 self.row,
@@ -2224,12 +2245,24 @@ impl Sub<Matrix> for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
+                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                // for i in 0..self.row {
+                //     for j in 0..self.col {
+                //         result[(i, j)] -= other[(i, j)];
+                //     }
+                // }
+                // result
                 let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                for i in 0..self.row {
-                    for j in 0..self.col {
-                        result[(i, j)] -= other[(i, j)];
-                    }
-                }
+                result
+                    .data
+                    .par_iter_mut()
+                    .enumerate()
+                    .for_each(|(idx, value)| {
+                        let i = idx / self.col;
+                        let j = idx % self.col;
+
+                        *value -= other[(i, j)];
+                    });
                 result
             }
         }
@@ -2247,7 +2280,7 @@ impl<'a, 'b> Sub<&'b Matrix> for &'a Matrix {
 /// Subtraction between Matrix & f64
 impl<T> Sub<T> for Matrix
 where
-    T: Into<f64> + Copy,
+    T: Into<f64> + Copy + Send + Sync,
 {
     type Output = Self;
 
@@ -2271,7 +2304,7 @@ where
 /// Subtraction between &Matrix & f64
 impl<'a, T> Sub<T> for &'a Matrix
 where
-    T: Into<f64> + Copy,
+    T: Into<f64> + Copy + Send + Sync,
 {
     type Output = Matrix;
 
@@ -2757,16 +2790,23 @@ impl FPMatrix for Matrix {
                 let new_data = self
                     .data
                     .clone()
-                    .into_iter()
+                    // .into_iter()
+                    .into_par_iter()
                     .take(n * self.col)
                     .collect::<Vec<f64>>();
                 matrix(new_data, n, self.col, Row)
             }
             Col => {
-                let mut temp_data: Vec<f64> = Vec::new();
-                for i in 0..n {
-                    temp_data.extend(self.row(i));
-                }
+                // let mut temp_data: Vec<f64> = Vec::new();
+                // for i in 0..n {
+                //     temp_data.extend(self.row(i));
+                // }
+                // matrix(temp_data, n, self.col, Row)
+
+                let temp_data = (0..n)
+                    .into_par_iter()
+                    .flat_map(|i| self.row(i))
+                    .collect::<Vec<f64>>();
                 matrix(temp_data, n, self.col, Row)
             }
         }
@@ -2781,16 +2821,23 @@ impl FPMatrix for Matrix {
                 let new_data = self
                     .data
                     .clone()
-                    .into_iter()
+                    // .into_iter()
+                    .into_par_iter()
                     .take(n * self.row)
                     .collect::<Vec<f64>>();
                 matrix(new_data, self.row, n, Col)
             }
             Row => {
-                let mut temp_data: Vec<f64> = Vec::new();
-                for i in 0..n {
-                    temp_data.extend(self.col(i));
-                }
+                // let mut temp_data: Vec<f64> = Vec::new();
+                // for i in 0..n {
+                //     temp_data.extend(self.col(i));
+                // }
+                // matrix(temp_data, self.row, n, Col)
+
+                let temp_data = (0..n)
+                    .into_par_iter()
+                    .flat_map(|i| self.col(i))
+                    .collect::<Vec<f64>>();
                 matrix(temp_data, self.row, n, Col)
             }
         }
@@ -2799,28 +2846,43 @@ impl FPMatrix for Matrix {
     fn skip_row(&self, n: usize) -> Self {
         assert!(n < self.row, "Skip range is larger than row of matrix");
 
-        let mut temp_data: Vec<f64> = Vec::new();
-        for i in n..self.row {
-            temp_data.extend(self.row(i));
-        }
+        // let mut temp_data: Vec<f64> = Vec::new();
+        // for i in n..self.row {
+        //     temp_data.extend(self.row(i));
+        // }
+        // matrix(temp_data, self.row - n, self.col, Row)
+
+        let temp_data = (n..self.row)
+            .into_par_iter()
+            .flat_map(|i| self.row(i))
+            .collect::<Vec<f64>>();
         matrix(temp_data, self.row - n, self.col, Row)
     }
 
     fn skip_col(&self, n: usize) -> Self {
         assert!(n < self.col, "Skip range is larger than col of matrix");
 
-        let mut temp_data: Vec<f64> = Vec::new();
-        for i in n..self.col {
-            temp_data.extend(self.col(i));
-        }
+        // let mut temp_data: Vec<f64> = Vec::new();
+        // for i in n..self.col {
+        //     temp_data.extend(self.col(i));
+        // }
+        // matrix(temp_data, self.row, self.col - n, Col)
+
+        let temp_data = (n..self.col)
+            .into_par_iter()
+            .flat_map(|i| self.col(i))
+            .collect::<Vec<f64>>();
         matrix(temp_data, self.row, self.col - n, Col)
     }
 
     fn fmap<F>(&self, f: F) -> Matrix
     where
-        F: Fn(f64) -> f64,
+        F: Fn(f64) -> f64 + Send + Sync,
     {
-        let result = self.data.iter().map(|x| f(*x)).collect::<Vec<f64>>();
+        // let result = self.data.iter().map(|x| f(*x)).collect::<Vec<f64>>();
+        // matrix(result, self.row, self.col, self.shape)
+
+        let result = self.data.par_iter().map(|x| f(*x)).collect::<Vec<f64>>();
         matrix(result, self.row, self.col, self.shape)
     }
 
@@ -2842,11 +2904,9 @@ impl FPMatrix for Matrix {
         F: Fn(Vec<f64>) -> Vec<f64>,
     {
         let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Col);
-
         for i in 0..self.col {
             result.subs_col(i, &f(self.col(i)));
         }
-
         result
     }
 
@@ -2868,11 +2928,9 @@ impl FPMatrix for Matrix {
         F: Fn(Vec<f64>) -> Vec<f64>,
     {
         let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
-
         for i in 0..self.row {
             result.subs_row(i, &f(self.row(i)));
         }
-
         result
     }
 
@@ -2908,25 +2966,38 @@ impl FPMatrix for Matrix {
 
     fn reduce<F, T>(&self, init: T, f: F) -> f64
     where
-        F: Fn(f64, f64) -> f64,
-        T: Into<f64>,
+        F: Fn(f64, f64) -> f64 + Send + Sync,
+        T: Into<f64> + Send + Sync + Copy + Clone,
     {
-        self.data.iter().fold(init.into(), |x, y| f(x, *y))
+        // self.data.iter().fold(init.into(), |x, y| f(x, *y))
+        self.data
+            .clone()
+            .into_par_iter()
+            .fold(|| init.into(), |acc, y| f(acc, y))
+            .sum()
     }
 
     fn zip_with<F>(&self, f: F, other: &Matrix) -> Self
     where
-        F: Fn(f64, f64) -> f64,
+        F: Fn(f64, f64) -> f64 + Send + Sync,
     {
         assert_eq!(self.data.len(), other.data.len());
         let mut a = other.clone();
         if self.shape != other.shape {
             a = a.change_shape();
         }
+        // let result = self
+        //     .data
+        //     .iter()
+        //     .zip(a.data.iter())
+        //     .map(|(x, y)| f(*x, *y))
+        //     .collect::<Vec<f64>>();
+        // matrix(result, self.row, self.col, self.shape)
+
         let result = self
             .data
-            .iter()
-            .zip(a.data.iter())
+            .par_iter()
+            .zip(a.data.par_iter())
             .map(|(x, y)| f(*x, *y))
             .collect::<Vec<f64>>();
         matrix(result, self.row, self.col, self.shape)
@@ -2934,24 +3005,34 @@ impl FPMatrix for Matrix {
 
     fn col_reduce<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(Vec<f64>) -> f64,
+        F: Fn(Vec<f64>) -> f64 + Send + Sync,
     {
-        let mut v = vec![0f64; self.col];
-        for i in 0..self.col {
-            v[i] = f(self.col(i));
-        }
-        v
+        // let mut v = vec![0f64; self.col];
+        // for i in 0..self.col {
+        //     v[i] = f(self.col(i));
+        // }
+        // v
+
+        (0..self.col)
+            .into_par_iter()
+            .map(|i| f(self.col(i)))
+            .collect::<Vec<f64>>()
     }
 
     fn row_reduce<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(Vec<f64>) -> f64,
+        F: Fn(Vec<f64>) -> f64 + Send + Sync,
     {
-        let mut v = vec![0f64; self.row];
-        for i in 0..self.row {
-            v[i] = f(self.row(i));
-        }
-        v
+        // let mut v = vec![0f64; self.row];
+        // for i in 0..self.row {
+        //     v[i] = f(self.row(i));
+        // }
+        // v
+
+        (0..self.row)
+            .into_par_iter()
+            .map(|i| f(self.row(i)))
+            .collect::<Vec<f64>>()
     }
 }
 
diff --git a/src/traits/fp.rs b/src/traits/fp.rs
index 9204df38..1106f452 100644
--- a/src/traits/fp.rs
+++ b/src/traits/fp.rs
@@ -31,7 +31,7 @@ pub trait FPMatrix {
     fn skip_col(&self, n: usize) -> Matrix;
     fn fmap<F>(&self, f: F) -> Matrix
     where
-        F: Fn(f64) -> f64;
+        F: Fn(f64) -> f64 + Send + Sync;
     fn col_map<F>(&self, f: F) -> Matrix
     where
         F: Fn(Vec<f64>) -> Vec<f64>;
@@ -46,15 +46,15 @@ pub trait FPMatrix {
         F: Fn(Vec<f64>) -> Vec<f64>;
     fn reduce<F, T>(&self, init: T, f: F) -> f64
     where
-        F: Fn(f64, f64) -> f64,
-        T: Into<f64>;
+        F: Fn(f64, f64) -> f64 + Send + Sync,
+        T: Into<f64> + Copy + Clone + Send + Sync;
     fn zip_with<F>(&self, f: F, other: &Matrix) -> Matrix
     where
-        F: Fn(f64, f64) -> f64;
+        F: Fn(f64, f64) -> f64 + Send + Sync;
     fn col_reduce<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(Vec<f64>) -> f64;
+        F: Fn(Vec<f64>) -> f64 + Send + Sync;
     fn row_reduce<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(Vec<f64>) -> f64;
+        F: Fn(Vec<f64>) -> f64 + Send + Sync;
 }

From bc070a7339d0c27cf2b8f70c4db477435e6239cd Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 30 Sep 2024 10:57:23 +0200
Subject: [PATCH 07/17] Parallelized vec and matrix using Rayon

---
 src/structure/matrix.rs |  46 +++++++------
 src/structure/vector.rs | 143 +++++++++++++++++++++++++++++++++-------
 2 files changed, 144 insertions(+), 45 deletions(-)

diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index b53212ff..bcb8bf49 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -789,13 +789,14 @@ where
 /// }
 /// ```
 pub fn ml_matrix(s: &str) -> Matrix where {
-    // let str_rows: Vec<&str> = s.split(';').collect();
-    // let r = str_rows.len();
-    // let str_data = str_rows
-    //     .into_iter()
-    //     .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
-    //     .collect::<Vec<Vec<&str>>>();
-    // let c = str_data[0].len();
+    let str_rows: Vec<&str> = s.split(';').collect();
+    let r = str_rows.len();
+    let str_data = str_rows
+        .into_iter()
+        .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
+        .collect::<Vec<Vec<&str>>>();
+    let c = str_data[0].len();
+
     // let data = str_data
     //     .into_iter()
     //     .flat_map(|t| {
@@ -806,14 +807,6 @@ pub fn ml_matrix(s: &str) -> Matrix where {
     //     .collect::<Vec<f64>>();
     // matrix(data, r, c, Row)
 
-    let str_rows: Vec<&str> = s.split(';').collect();
-    let r = str_rows.len();
-    let str_data = str_rows
-        .into_iter()
-        .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
-        .collect::<Vec<Vec<&str>>>();
-    let c = str_data[0].len();
-
     let data = str_data
         .into_par_iter()
         .flat_map(|t| {
@@ -2970,6 +2963,7 @@ impl FPMatrix for Matrix {
         T: Into<f64> + Send + Sync + Copy + Clone,
     {
         // self.data.iter().fold(init.into(), |x, y| f(x, *y))
+
         self.data
             .clone()
             .into_par_iter()
@@ -3061,11 +3055,23 @@ pub trait LinearAlgebra {
 }
 
 pub fn diag(n: usize) -> Matrix {
-    let mut v: Vec<f64> = vec![0f64; n * n];
-    for i in 0..n {
-        let idx = i * (n + 1);
-        v[idx] = 1f64;
-    }
+    // let mut v: Vec<f64> = vec![0f64; n * n];
+    // for i in 0..n {
+    //     let idx = i * (n + 1);
+    //     v[idx] = 1f64;
+    // }
+    // matrix(v, n, n, Row)
+
+    let v = (0..n * n)
+        .into_par_iter()
+        .map(|i| {
+            if i % (n + 1) == 0 {
+                1_f64 // Set diagonal elements to 1
+            } else {
+                0_f64 // All other elements are 0
+            }
+        })
+        .collect::<Vec<f64>>();
     matrix(v, n, n, Row)
 }
 
diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index 0300bc78..ad00d09d 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -327,6 +327,7 @@ impl FPVector for Vec<f64> {
         T: Into<f64> + Send + Sync + Copy,
     {
         self.iter().fold(init.into(), |x, &y| f(x, y))
+
         // self.par_iter()
         //     .cloned()
         //     .fold(|| init.into(), |x, y| f(x, y))
@@ -539,15 +540,35 @@ impl Algorithm for Vec<f64> {
         let l = self.len();
         let idx = (1..(l + 1)).collect::<Vec<usize>>();
 
+        // let mut vec_tup = self
+        //     .clone()
+        //     .into_iter()
+        //     .zip(idx.clone())
+        //     .collect::<Vec<(f64, usize)>>();
+        // vec_tup.sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap().reverse());
+        // let indices = vec_tup.into_iter().map(|(_, y)| y).collect::<Vec<usize>>();
+        // idx.into_iter()
+        //     .map(|x| indices.clone().into_iter().position(|t| t == x).unwrap())
+        //     .collect::<Vec<usize>>()
+
         let mut vec_tup = self
             .clone()
-            .into_iter()
+            .into_par_iter()
             .zip(idx.clone())
             .collect::<Vec<(f64, usize)>>();
         vec_tup.sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap().reverse());
-        let indices = vec_tup.into_iter().map(|(_, y)| y).collect::<Vec<usize>>();
-        idx.into_iter()
-            .map(|x| indices.clone().into_iter().position(|t| t == x).unwrap())
+        let indices = vec_tup
+            .into_par_iter()
+            .map(|(_, y)| y)
+            .collect::<Vec<usize>>();
+        idx.into_par_iter()
+            .map(|x| {
+                indices
+                    .clone()
+                    .into_par_iter()
+                    .position_any(|t| t == x) // Note: position() is deprecated by rayon
+                    .unwrap()
+            })
             .collect::<Vec<usize>>()
     }
 
@@ -613,15 +634,41 @@ impl Algorithm for Vec<f64> {
         }
         #[cfg(not(feature = "O3"))]
         {
-            self.into_iter()
+            // self.into_iter()
+            //     .enumerate()
+            //     .fold((0usize, f64::MIN), |acc, (ics, &val)| {
+            //         if acc.1 < val {
+            //             (ics, val)
+            //         } else {
+            //             acc
+            //         }
+            //     })
+            //     .0
+
+            self.into_par_iter()
                 .enumerate()
-                .fold((0usize, f64::MIN), |acc, (ics, &val)| {
-                    if acc.1 < val {
-                        (ics, val)
-                    } else {
-                        acc
-                    }
-                })
+                .fold(
+                    || (0usize, f64::MIN),
+                    |acc, (ics, &val)| {
+                        if acc.1 < val {
+                            (ics, val)
+                        } else {
+                            acc
+                        }
+                    },
+                )
+                .reduce(
+                    // Note: need reduce and can nto simply do .max() because rayon::..max() requires Ord, which is stricter than PartialOrd
+                    // Hence combine the results from the parallel fold
+                    || (0usize, f64::MIN), // Identity element
+                    |acc1, acc2| {
+                        if acc1.1 < acc2.1 {
+                            acc2 // Return the pair with the larger value
+                        } else {
+                            acc1
+                        }
+                    },
+                )
                 .0
         }
     }
@@ -642,15 +689,39 @@ impl Algorithm for Vec<f64> {
     ///     assert_eq!(v2.arg_min(),4);
     /// }
     fn arg_min(&self) -> usize {
-        self.iter()
+        // self.iter()
+        //     .enumerate()
+        //     .fold((0usize, f64::MAX), |acc, (ics, &val)| {
+        //         if acc.1 > val {
+        //             (ics, val)
+        //         } else {
+        //             acc
+        //         }
+        //     })
+        //     .0
+
+        self.into_par_iter()
             .enumerate()
-            .fold((0usize, f64::MAX), |acc, (ics, &val)| {
-                if acc.1 > val {
-                    (ics, val)
-                } else {
-                    acc
-                }
-            })
+            .fold(
+                || (0usize, f64::MAX),
+                |acc, (ics, &val)| {
+                    if acc.1 > val {
+                        (ics, val)
+                    } else {
+                        acc
+                    }
+                },
+            )
+            .reduce(
+                || (0usize, f64::MAX),
+                |acc1, acc2| {
+                    if acc1.1 > acc2.1 {
+                        acc2 // Return the pair with the smaller value
+                    } else {
+                        acc1
+                    }
+                },
+            )
             .0
     }
 
@@ -666,14 +737,28 @@ impl Algorithm for Vec<f64> {
         }
         #[cfg(not(feature = "O3"))]
         {
-            self.into_iter()
-                .fold(f64::MIN, |acc, &val| if acc < val { val } else { acc })
+            // self.into_iter()
+            //     .fold(f64::MIN, |acc, &val| if acc < val { val } else { acc })
+
+            self.into_par_iter()
+                .fold(|| f64::MIN, |acc, &val| if acc < val { val } else { acc })
+                .reduce(
+                    || f64::MIN,
+                    |acc1, acc2| if acc1 < acc2 { acc2 } else { acc1 },
+                )
         }
     }
 
     fn min(&self) -> f64 {
-        self.iter()
-            .fold(f64::MAX, |acc, &val| if acc > val { val } else { acc })
+        // self.iter()
+        //     .fold(f64::MAX, |acc, &val| if acc > val { val } else { acc })
+
+        self.into_par_iter()
+            .fold(|| f64::MAX, |acc, &val| if acc > val { val } else { acc })
+            .reduce(
+                || f64::MAX,
+                |acc1, acc2| if acc1 > acc2 { acc2 } else { acc1 },
+            )
     }
 
     fn swap_with_perm(&mut self, p: &Vec<(usize, usize)>) {
@@ -718,6 +803,7 @@ impl Normed for Vec<f64> {
                 #[cfg(not(feature = "O3"))]
                 {
                     // self.iter().map(|x| x.powi(2)).sum::<f64>().sqrt()
+
                     self.par_iter()
                         .map(|x| x.powi(2))
                         .sum::<Self::UnsignedScalar>()
@@ -731,12 +817,17 @@ impl Normed for Vec<f64> {
                     p
                 );
                 // self.iter().map(|x| x.powf(p)).sum::<f64>().powf(1f64 / p)
+
                 self.par_iter()
                     .map(|x| x.powf(p))
                     .sum::<Self::UnsignedScalar>()
                     .powf(1_f64 / p)
             }
-            Norm::LInf => self.iter().fold(0f64, |x, y| x.max(y.abs())),
+            // Norm::LInf => self.iter().fold(0f64, |x, y| x.max(y.abs())),
+            Norm::LInf => self
+                .par_iter()
+                .fold(|| 0f64, |x, y| x.max(y.abs()))
+                .reduce(|| 0f64, |acc1, acc2| acc1.max(acc2.abs())),
             Norm::F => unimplemented!(),
             Norm::Lpq(_, _) => unimplemented!(),
         }
@@ -766,6 +857,7 @@ impl InnerProduct for Vec<f64> {
             // self.iter()
             //     .zip(rhs.iter())
             //     .fold(0f64, |x, (y1, y2)| x + y1 * y2)
+
             self.par_iter()
                 .zip(rhs.into_par_iter())
                 .fold(|| 0_f64, |x, (y1, y2)| x + y1 * y2)
@@ -802,6 +894,7 @@ impl VectorProduct for Vec<f64> {
                 // v[1] = self[2] * other[0] - self[0] * other[2];
                 // v[2] = self[0] * other[1] - self[1] * other[0];
                 // v
+
                 let v = (0..3)
                     .into_par_iter()
                     .map(|index| {

From 60037918e1ca06142cb6692b557e701f604e3683 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 30 Sep 2024 21:23:07 +0200
Subject: [PATCH 08/17] [Feature] Added Parallel trait

- Added parallel version of required traits
for vec and matrix
- Adjusted commented sections of the code
---
 src/numerical/integral.rs |   4 +-
 src/prelude/simpler.rs    |   2 +-
 src/statistics/dist.rs    |   2 +-
 src/statistics/rand.rs    |   4 +-
 src/structure/matrix.rs   | 931 ++++++++++++++++++++++++++++----------
 src/structure/vector.rs   | 593 ++++++++++++++++++------
 src/traits/fp.rs          |  74 ++-
 src/traits/general.rs     |  13 +
 src/traits/math.rs        |  32 ++
 src/traits/mod.rs         |   2 +-
 src/traits/mutable.rs     |  15 +-
 src/traits/sugar.rs       |   4 +-
 12 files changed, 1262 insertions(+), 414 deletions(-)

diff --git a/src/numerical/integral.rs b/src/numerical/integral.rs
index a04d745f..d7650f3c 100644
--- a/src/numerical/integral.rs
+++ b/src/numerical/integral.rs
@@ -161,7 +161,7 @@ impl Integral {
 ///     * `G30K61R`
 pub fn integrate<F>(f: F, (a, b): (f64, f64), method: Integral) -> f64
 where
-    F: Fn(f64) -> f64 + Copy + Send + Sync,
+    F: Fn(f64) -> f64 + Copy,
 {
     match method {
         Integral::GaussLegendre(n) => gauss_legendre_quadrature(f, n, (a, b)),
@@ -173,7 +173,7 @@ where
 /// Newton Cotes Quadrature
 pub fn newton_cotes_quadrature<F>(f: F, n: usize, (a, b): (f64, f64)) -> f64
 where
-    F: Fn(f64) -> f64 + Send + Sync,
+    F: Fn(f64) -> f64,
 {
     let h = (b - a) / (n as f64);
     let node_x = seq(a, b, h);
diff --git a/src/prelude/simpler.rs b/src/prelude/simpler.rs
index 5b786b6a..c2c0c7d7 100644
--- a/src/prelude/simpler.rs
+++ b/src/prelude/simpler.rs
@@ -23,7 +23,7 @@ pub trait SimpleNorm: Normed {
 }
 
 /// Simple integrate
-pub fn integrate<F: Fn(f64) -> f64 + Copy + Send + Sync>(f: F, (a, b): (f64, f64)) -> f64 {
+pub fn integrate<F: Fn(f64) -> f64 + Copy>(f: F, (a, b): (f64, f64)) -> f64 {
     integral::integrate(f, (a, b), G7K15R(1e-4, 20))
 }
 
diff --git a/src/statistics/dist.rs b/src/statistics/dist.rs
index d3f1f6cb..ee150aca 100644
--- a/src/statistics/dist.rs
+++ b/src/statistics/dist.rs
@@ -373,7 +373,7 @@ impl WeightedUniform<f64> {
     /// ```
     pub fn from_max_pool_1d<F>(f: F, (a, b): (f64, f64), n: usize, eps: f64) -> Result<Self>
     where
-        F: Fn(f64) -> f64 + Copy + Send + Sync,
+        F: Fn(f64) -> f64 + Copy,
     {
         // Find non-zero intervals
         let mut a = a;
diff --git a/src/statistics/rand.rs b/src/statistics/rand.rs
index 7b36bc1d..1e577fbe 100644
--- a/src/statistics/rand.rs
+++ b/src/statistics/rand.rs
@@ -467,7 +467,7 @@ pub fn ziggurat(rng: &mut ThreadRng, sigma: f64) -> f64 {
 /// }
 pub fn prs<F>(f: F, n: usize, (a, b): (f64, f64), m: usize, eps: f64) -> anyhow::Result<Vec<f64>>
 where
-    F: Fn(f64) -> f64 + Copy + Send + Sync,
+    F: Fn(f64) -> f64 + Copy,
 {
     let mut rng = thread_rng();
 
@@ -538,7 +538,7 @@ pub fn prs_with_rng<F, R: Rng + Clone>(
     rng: &mut R,
 ) -> anyhow::Result<Vec<f64>>
 where
-    F: Fn(f64) -> f64 + Copy + Send + Sync,
+    F: Fn(f64) -> f64 + Copy,
 {
     let mut result = vec![0f64; n];
 
diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index bcb8bf49..af3d98e8 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -614,6 +614,8 @@ use lapack::{dgecon, dgeqrf, dgesvd, dgetrf, dgetri, dgetrs, dorgqr, dpotrf};
 use serde::{Deserialize, Serialize};
 
 use crate::rayon::prelude::*;
+use crate::traits::fp::ParallelFPMatrix;
+use crate::traits::math::{ParallelMatrixProduct, ParallelNormed, ParallelVector};
 
 pub use self::Shape::{Col, Row};
 use crate::numerical::eigen::{eigen, EigenMethod};
@@ -712,6 +714,31 @@ pub struct Matrix {
 /// }
 /// ```
 pub fn matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
+where
+    T: Into<f64>,
+{
+    Matrix {
+        data: v.into_iter().map(|t| t.into()).collect::<Vec<f64>>(),
+        row: r,
+        col: c,
+        shape,
+    }
+}
+
+/// R-like matrix constructor in parallel
+///
+/// # Examples
+/// ```
+/// #[macro_use]
+/// extern crate peroxide;
+/// use peroxide::fuga::*;
+///
+/// fn main() {
+///     let a = par_matrix(c!(1,2,3,4), 2, 2, Row);
+///     a.print(); // Print matrix
+/// }
+/// ```
+pub fn par_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
 where
     T: Into<f64> + Send + Sync,
 {
@@ -726,11 +753,19 @@ where
 /// R-like matrix constructor (Explicit ver.)
 pub fn r_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
 where
-    T: Into<f64> + Send + Sync,
+    T: Into<f64>,
 {
     matrix(v, r, c, shape)
 }
 
+/// R-like matrix constructor (Explicit ver.) in parallel
+pub fn par_r_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
+where
+    T: Into<f64> + Send + Sync,
+{
+    par_matrix(v, r, c, shape)
+}
+
 /// Python-like matrix constructor
 ///
 /// # Examples
@@ -747,19 +782,38 @@ where
 /// ```
 pub fn py_matrix<T>(v: Vec<Vec<T>>) -> Matrix
 where
-    T: Into<f64> + Copy + Send + Sync,
+    T: Into<f64> + Copy,
 {
-    // let r = v.len();
-    // let c = v[0].len();
-    // let mut data = vec![0f64; r * c];
-    // for i in 0..r {
-    //     for j in 0..c {
-    //         let idx = i * c + j;
-    //         data[idx] = v[i][j].into();
-    //     }
-    // }
-    // matrix(data, r, c, Row)
+    let r = v.len();
+    let c = v[0].len();
+    let mut data = vec![0f64; r * c];
+    for i in 0..r {
+        for j in 0..c {
+            let idx = i * c + j;
+            data[idx] = v[i][j].into();
+        }
+    }
+    matrix(data, r, c, Row)
+}
 
+/// Python-like matrix constructor in parallel
+///
+/// # Examples
+/// ```
+/// #[macro_use]
+/// extern crate peroxide;
+/// use peroxide::fuga::*;
+///
+/// fn main() {
+///     let a = par_py_matrix(vec![c!(1,2), c!(3,4)]);
+///     let b = matrix(c!(1,2,3,4), 2, 2, Row);
+///     assert_eq!(a, b);
+/// }
+/// ```
+pub fn par_py_matrix<T>(v: Vec<Vec<T>>) -> Matrix
+where
+    T: Into<f64> + Copy + Send + Sync,
+{
     let r = v.len();
     let c = v[0].len();
     let data = (0..r)
@@ -771,7 +825,7 @@ where
                 .collect::<Vec<f64>>()
         })
         .collect::<Vec<f64>>();
-    matrix(data, r, c, Row)
+    par_matrix(data, r, c, Row)
 }
 
 /// Matlab-like matrix constructor
@@ -797,15 +851,40 @@ pub fn ml_matrix(s: &str) -> Matrix where {
         .collect::<Vec<Vec<&str>>>();
     let c = str_data[0].len();
 
-    // let data = str_data
-    //     .into_iter()
-    //     .flat_map(|t| {
-    //         t.into_iter()
-    //             .map(|x| x.parse::<f64>().unwrap())
-    //             .collect::<Vec<f64>>()
-    //     })
-    //     .collect::<Vec<f64>>();
-    // matrix(data, r, c, Row)
+    let data = str_data
+        .into_iter()
+        .flat_map(|t| {
+            t.into_iter()
+                .map(|x| x.parse::<f64>().unwrap())
+                .collect::<Vec<f64>>()
+        })
+        .collect::<Vec<f64>>();
+
+    matrix(data, r, c, Row)
+}
+
+/// Matlab-like matrix constructor in parallel
+///
+/// # Examples
+/// ```
+/// #[macro_use]
+/// extern crate peroxide;
+/// use peroxide::fuga::*;
+///
+/// fn main() {
+///     let a = par_ml_matrix("1 2; 3 4");
+///     let b = matrix(c!(1,2,3,4), 2, 2, Row);
+///     assert_eq!(a, b);
+/// }
+/// ```
+pub fn par_ml_matrix(s: &str) -> Matrix where {
+    let str_rows: Vec<&str> = s.split(';').collect();
+    let r = str_rows.len();
+    let str_data = str_rows
+        .into_iter()
+        .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
+        .collect::<Vec<Vec<&str>>>();
+    let c = str_data[0].len();
 
     let data = str_data
         .into_par_iter()
@@ -815,7 +894,8 @@ pub fn ml_matrix(s: &str) -> Matrix where {
                 .collect::<Vec<f64>>()
         })
         .collect::<Vec<f64>>();
-    matrix(data, r, c, Row)
+
+    par_matrix(data, r, c, Row)
 }
 
 /// Pretty Print
@@ -835,6 +915,8 @@ impl PartialEq for Matrix {
             //     .zip(other.data.clone())
             //     .all(|(x, y)| nearly_eq(x, y))
             //     && self.row == other.row
+
+            // Note: the sequential version in this case can be replaced by the parallel version?
             self.data
                 .clone()
                 .into_par_iter()
@@ -908,37 +990,65 @@ impl Matrix {
         let c = self.col;
         assert_eq!(r * c, self.data.len());
         let l = r * c - 1;
-        // let mut data: Vec<f64> = self.data.clone();
+        let mut data: Vec<f64> = self.data.clone();
+        let ref_data = &self.data;
+
+        match self.shape {
+            Row => {
+                for i in 0..l {
+                    let s = (i * c) % l;
+                    data[i] = ref_data[s];
+                }
+                data[l] = ref_data[l];
+                matrix(data, r, c, Col)
+            }
+            Col => {
+                for i in 0..l {
+                    let s = (i * r) % l;
+                    data[i] = ref_data[s];
+                }
+                data[l] = ref_data[l];
+                matrix(data, r, c, Row)
+            }
+        }
+    }
+
+    /// Change Bindings in Parallel
+    ///
+    /// `Row` -> `Col` or `Col` -> `Row`
+    ///
+    /// # Examples
+    /// ```
+    /// use peroxide::fuga::*;
+    ///
+    /// let a = matrix(vec![1,2,3,4],2,2,Row);
+    /// assert_eq!(a.shape, Row);
+    /// let b = a.par_change_shape();
+    /// assert_eq!(b.shape, Col);
+    /// ```
+    pub fn par_change_shape(&self) -> Self {
+        let r = self.row;
+        let c = self.col;
+        assert_eq!(r * c, self.data.len());
+        let l = r * c - 1;
         let ref_data = &self.data;
 
         match self.shape {
             Row => {
-                // for i in 0..l {
-                //     let s = (i * c) % l;
-                //     data[i] = ref_data[s];
-                // }
-                // data[l] = ref_data[l];
-                // matrix(data, r, c, Col)
                 let mut data = (0..=l)
                     .into_par_iter()
                     .map(|i| ref_data[(i * c) % l])
                     .collect::<Vec<f64>>();
                 data[l] = ref_data[l];
-                matrix(data, r, c, Col)
+                par_matrix(data, r, c, Col)
             }
             Col => {
-                // for i in 0..l {
-                //     let s = (i * r) % l;
-                //     data[i] = ref_data[s];
-                // }
-                // data[l] = ref_data[l];
-                // matrix(data, r, c, Row)
                 let mut data = (0..=l)
                     .into_par_iter()
                     .map(|i| ref_data[(i * r) % l])
                     .collect::<Vec<f64>>();
                 data[l] = ref_data[l];
-                matrix(data, r, c, Row)
+                par_matrix(data, r, c, Row)
             }
         }
     }
@@ -966,13 +1076,47 @@ impl Matrix {
 
         match self.shape {
             Row => {
-                // for i in 0..l {
-                //     let s = (i * c) % l;
-                //     self.data[i] = ref_data[s];
-                // }
-                // self.data[l] = ref_data[l];
-                // self.shape = Col;
+                for i in 0..l {
+                    let s = (i * c) % l;
+                    self.data[i] = ref_data[s];
+                }
+                self.data[l] = ref_data[l];
+                self.shape = Col;
+            }
+            Col => {
+                for i in 0..l {
+                    let s = (i * r) % l;
+                    self.data[i] = ref_data[s];
+                }
+                self.data[l] = ref_data[l];
+                self.shape = Row;
+            }
+        }
+    }
+
+    /// Change Bindings Mutably in Parallel
+    ///
+    /// `Row` -> `Col` or `Col` -> `Row`
+    ///
+    /// # Examples
+    /// ```
+    /// use peroxide::fuga::*;
+    ///
+    /// let mut a = matrix(vec![1,2,3,4,5,6,7,8,9],3,3,Row);
+    /// assert_eq!(a.shape, Row);
+    /// a.par_change_shape_mut();
+    /// assert_eq!(a, matrix(vec![1,4,7,2,5,8,3,6,9],3,3,Col));
+    /// assert_eq!(a.shape, Col);
+    /// ```
+    pub fn par_change_shape_mut(&mut self) {
+        let r = self.row;
+        let c = self.col;
+        assert_eq!(r * c, self.data.len());
+        let l = r * c - 1;
+        let ref_data = self.data.clone();
 
+        match self.shape {
+            Row => {
                 self.data = (0..=l)
                     .into_par_iter()
                     .map(|i| ref_data[(i * c) % l])
@@ -981,13 +1125,6 @@ impl Matrix {
                 self.shape = Col;
             }
             Col => {
-                // for i in 0..l {
-                //     let s = (i * r) % l;
-                //     self.data[i] = ref_data[s];
-                // }
-                // self.data[l] = ref_data[l];
-                // self.shape = Row;
-
                 self.data = (0..=l)
                     .into_par_iter()
                     .map(|i| ref_data[(i * r) % l])
@@ -1100,10 +1237,29 @@ impl Matrix {
     /// ```
     pub fn col(&self, index: usize) -> Vec<f64> {
         assert!(index < self.col);
-        // let mut container: Vec<f64> = vec![0f64; self.row];
-        // for i in 0..self.row {
-        //     container[i] = self[(i, index)];
-        // }
+        let mut container: Vec<f64> = vec![0f64; self.row];
+        for i in 0..self.row {
+            container[i] = self[(i, index)];
+        }
+        container
+    }
+
+    /// Extract Column in parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = matrix(c!(1,2,3,4), 2, 2, Row);
+    ///     assert_eq!(a.pub_col(0), c!(1,3));
+    /// }
+    /// ```
+    pub fn pub_col(&self, index: usize) -> Vec<f64> {
+        assert!(index < self.col);
+
         let container = (0..self.row)
             .into_par_iter()
             .map(|i| self[(i, index)])
@@ -1126,11 +1282,28 @@ impl Matrix {
     /// ```
     pub fn row(&self, index: usize) -> Vec<f64> {
         assert!(index < self.row);
-        // let mut container: Vec<f64> = vec![0f64; self.col];
-        // for i in 0..self.col {
-        //     container[i] = self[(index, i)];
-        // }
-        // container
+        let mut container: Vec<f64> = vec![0f64; self.col];
+        for i in 0..self.col {
+            container[i] = self[(index, i)];
+        }
+        container
+    }
+
+    /// Extract Row in Parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = matrix(c!(1,2,3,4), 2, 2, Row);
+    ///     assert_eq!(a.pub_row(0), c!(1,2));
+    /// }
+    /// ```
+    pub fn pub_row(&self, index: usize) -> Vec<f64> {
+        assert!(index < self.row);
 
         let container = (0..self.col)
             .into_par_iter()
@@ -1154,11 +1327,28 @@ impl Matrix {
     /// ```
     pub fn diag(&self) -> Vec<f64> {
         assert_eq!(self.row, self.col);
-        // let mut container = vec![0f64; self.row];
-        // for i in 0..self.row {
-        //     container[i] = self.data[i * (self.col + 1)];
-        // }
-        // container
+        let mut container = vec![0f64; self.row];
+        for i in 0..self.row {
+            container[i] = self.data[i * (self.col + 1)];
+        }
+        container
+    }
+
+    /// Extract diagonal components in parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = matrix!(1;4;1, 2, 2, Row);
+    ///     assert_eq!(a.par_diag(), c!(1,4));
+    /// }
+    /// ```
+    pub fn par_diag(&self) -> Vec<f64> {
+        assert_eq!(self.row, self.col);
 
         let container = (0..self.row)
             .into_par_iter()
@@ -1397,20 +1587,30 @@ impl Matrix {
     /// From index operations
     pub fn from_index<F, G>(f: F, size: (usize, usize)) -> Matrix
     where
-        F: Fn(usize, usize) -> G + Copy + Send + Sync,
+        F: Fn(usize, usize) -> G + Copy,
         G: Into<f64>,
     {
         let row = size.0;
         let col = size.1;
 
-        // let mut mat = matrix(vec![0f64; row * col], row, col, Row);
+        let mut mat = matrix(vec![0f64; row * col], row, col, Row);
 
-        // for i in 0..row {
-        //     for j in 0..col {
-        //         mat[(i, j)] = f(i, j).into();
-        //     }
-        // }
-        // mat
+        for i in 0..row {
+            for j in 0..col {
+                mat[(i, j)] = f(i, j).into();
+            }
+        }
+        mat
+    }
+
+    /// From index operations
+    pub fn par_from_index<F, G>(f: F, size: (usize, usize)) -> Matrix
+    where
+        F: Fn(usize, usize) -> G + Copy + Send + Sync,
+        G: Into<f64>,
+    {
+        let row = size.0;
+        let col = size.1;
 
         let data = (0..row)
             .into_par_iter()
@@ -1421,19 +1621,24 @@ impl Matrix {
                     .collect::<Vec<f64>>()
             })
             .collect::<Vec<f64>>();
-        matrix(data, row, col, Row)
+        par_matrix(data, row, col, Row)
     }
 
     /// Matrix to `Vec<Vec<f64>>`
     ///
     /// To send `Matrix` to `inline-python`
     pub fn to_vec(&self) -> Vec<Vec<f64>> {
-        // let mut result = vec![vec![0f64; self.col]; self.row];
-        // for i in 0..self.row {
-        //     result[i] = self.row(i);
-        // }
-        // result
+        let mut result = vec![vec![0f64; self.col]; self.row];
+        for i in 0..self.row {
+            result[i] = self.row(i);
+        }
+        result
+    }
 
+    /// Matrix to `Vec<Vec<f64>>` in parallel
+    ///
+    /// To send `Matrix` to `inline-python`
+    pub fn par_to_vec(&self) -> Vec<Vec<f64>> {
         (0..self.row)
             .into_par_iter()
             .map(|i| self.row(i))
@@ -1454,12 +1659,28 @@ impl Matrix {
     /// }
     pub fn to_diag(&self) -> Matrix {
         assert_eq!(self.row, self.col, "Should be square matrix");
-        // let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
-        // let diag = self.diag();
-        // for i in 0..self.row {
-        //     result[(i, i)] = diag[i];
-        // }
-        // result
+        let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
+        let diag = self.diag();
+        for i in 0..self.row {
+            result[(i, i)] = diag[i];
+        }
+        result
+    }
+
+    /// To diagonal components in parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = matrix!(1;4;1, 2, 2, Row);
+    ///     assert_eq!(a.par_to_diag(), matrix(c!(1,0,0,4), 2, 2, Row));
+    /// }
+    pub fn par_to_diag(&self) -> Matrix {
+        assert_eq!(self.row, self.col, "Should be square matrix");
 
         let data = (0..self.row)
             .into_par_iter()
@@ -1470,7 +1691,8 @@ impl Matrix {
                     .collect::<Vec<f64>>()
             })
             .collect::<Vec<f64>>();
-        matrix(data, self.row, self.col, Row)
+
+        par_matrix(data, self.row, self.col, Row)
     }
 
     /// Submatrix
@@ -1503,13 +1725,45 @@ impl Matrix {
     pub fn submat(&self, start: (usize, usize), end: (usize, usize)) -> Matrix {
         let row = end.0 - start.0 + 1;
         let col = end.1 - start.1 + 1;
-        // let mut result = matrix(vec![0f64; row * col], row, col, self.shape);
-        // for i in 0..row {
-        //     for j in 0..col {
-        //         result[(i, j)] = self[(start.0 + i, start.1 + j)];
-        //     }
-        // }
-        // result
+        let mut result = matrix(vec![0f64; row * col], row, col, self.shape);
+        for i in 0..row {
+            for j in 0..col {
+                result[(i, j)] = self[(start.0 + i, start.1 + j)];
+            }
+        }
+        result
+    }
+
+    /// Submatrix in parallel
+    ///
+    /// # Description
+    /// Return below elements of matrix to new matrix
+    ///
+    /// $$
+    /// \begin{pmatrix}
+    /// \\ddots & & & & \\\\
+    ///   & start & \\cdots & end.1 & \\\\
+    ///   & \\vdots & \\ddots & \\vdots & \\\\
+    ///   & end.0 & \\cdots & end & \\\\
+    ///   & & & & \\ddots
+    /// \end{pmatrix}
+    /// $$
+    ///
+    /// # Examples
+    /// ```
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = ml_matrix("1 2 3;4 5 6;7 8 9");
+    ///     let b = ml_matrix("5 6;8 9");
+    ///     let c = a.par_submat((1, 1), (2, 2));
+    ///     assert_eq!(b, c);   
+    /// }
+    /// ```
+    pub fn par_submat(&self, start: (usize, usize), end: (usize, usize)) -> Matrix {
+        let row = end.0 - start.0 + 1;
+        let col = end.1 - start.1 + 1;
 
         let data = (0..row)
             .into_par_iter()
@@ -1520,7 +1774,7 @@ impl Matrix {
                     .collect::<Vec<f64>>()
             })
             .collect::<Vec<f64>>();
-        matrix(data, row, col, self.shape)
+        par_matrix(data, row, col, self.shape)
     }
 
     /// Substitute matrix to specific position
@@ -1582,13 +1836,99 @@ impl Matrix {
     }
 }
 
-// =============================================================================
-// Mathematics for Matrix
-// =============================================================================
-impl Vector for Matrix {
+// =============================================================================
+// Mathematics for Matrix
+// =============================================================================
+impl Vector for Matrix {
+    type Scalar = f64;
+
+    fn add_vec(&self, other: &Self) -> Self {
+        assert_eq!(self.row, other.row);
+        assert_eq!(self.col, other.col);
+
+        match () {
+            #[cfg(feature = "O3")]
+            () => {
+                if self.shape != other.shape {
+                    return self.add(&other.change_shape());
+                }
+                let x = &self.data;
+                let mut y = other.data.clone();
+                let n_i32 = x.len() as i32;
+                let a_f64 = 1f64;
+                unsafe {
+                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
+                }
+                matrix(y, self.row, self.col, self.shape)
+            }
+            _ => {
+                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                for i in 0..self.row {
+                    for j in 0..self.col {
+                        result[(i, j)] += other[(i, j)];
+                    }
+                }
+                result
+            }
+        }
+    }
+
+    fn sub_vec(&self, other: &Self) -> Self {
+        assert_eq!(self.row, other.row);
+        assert_eq!(self.col, other.col);
+        match () {
+            #[cfg(feature = "O3")]
+            () => {
+                if self.shape != other.shape {
+                    return self.sub(&other.change_shape());
+                }
+                let x = &other.data;
+                let mut y = self.data.clone();
+                let n_i32 = x.len() as i32;
+                let a_f64 = -1f64;
+                unsafe {
+                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
+                }
+                matrix(y, self.row, self.col, self.shape)
+            }
+            _ => {
+                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                for i in 0..self.row {
+                    for j in 0..self.col {
+                        result[(i, j)] -= other[(i, j)];
+                    }
+                }
+                result
+            }
+        }
+    }
+
+    fn mul_scalar(&self, other: Self::Scalar) -> Self {
+        match () {
+            #[cfg(feature = "O3")]
+            () => {
+                let x = &self.data;
+                let mut y = vec![0f64; x.len()];
+                let a_f64 = other;
+                let n_i32 = x.len() as i32;
+
+                unsafe {
+                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
+                }
+                matrix(y, self.row, self.col, self.shape)
+            }
+            _ => {
+                let scalar = other;
+                self.fmap(|x| x * scalar)
+            }
+        }
+    }
+}
+
+impl ParallelVector for Matrix {
     type Scalar = f64;
 
-    fn add_vec(&self, other: &Self) -> Self {
+    fn par_add_vec(&self, other: &Self) -> Self {
         assert_eq!(self.row, other.row);
         assert_eq!(self.col, other.col);
 
@@ -1608,14 +1948,6 @@ impl Vector for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
-                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                // for i in 0..self.row {
-                //     for j in 0..self.col {
-                //         result[(i, j)] += other[(i, j)];
-                //     }
-                // }
-                // result
-
                 let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
                 result
                     .data
@@ -1633,7 +1965,7 @@ impl Vector for Matrix {
         }
     }
 
-    fn sub_vec(&self, other: &Self) -> Self {
+    fn par_sub_vec(&self, other: &Self) -> Self {
         assert_eq!(self.row, other.row);
         assert_eq!(self.col, other.col);
         match () {
@@ -1652,14 +1984,6 @@ impl Vector for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
-                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                // for i in 0..self.row {
-                //     for j in 0..self.col {
-                //         result[(i, j)] -= other[(i, j)];
-                //     }
-                // }
-                // result
-
                 let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
                 result
                     .data
@@ -1675,39 +1999,82 @@ impl Vector for Matrix {
             }
         }
     }
+}
 
-    fn mul_scalar(&self, other: Self::Scalar) -> Self {
-        match () {
-            #[cfg(feature = "O3")]
-            () => {
-                let x = &self.data;
-                let mut y = vec![0f64; x.len()];
-                let a_f64 = other;
-                let n_i32 = x.len() as i32;
-
-                unsafe {
-                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
+impl Normed for Matrix {
+    type UnsignedScalar = f64;
+    fn norm(&self, kind: Norm) -> f64 {
+        match kind {
+            Norm::F => {
+                let mut s = 0f64;
+                for i in 0..self.data.len() {
+                    s += self.data[i].powi(2);
                 }
-                matrix(y, self.row, self.col, self.shape)
+                s.sqrt()
             }
-            _ => {
-                let scalar = other;
-                self.fmap(|x| x * scalar)
+            Norm::Lpq(p, q) => {
+                let mut s = 0f64;
+                for j in 0..self.col {
+                    let mut s_row = 0f64;
+                    for i in 0..self.row {
+                        s_row += self[(i, j)].powi(p as i32);
+                    }
+                    s += s_row.powf(q / p);
+                }
+                s.powf(1f64 / q)
+            }
+            Norm::L1 => {
+                let mut m = f64::MIN;
+                match self.shape {
+                    Row => self.change_shape().norm(Norm::L1),
+                    Col => {
+                        for c in 0..self.col {
+                            let s = self.col(c).iter().sum();
+                            if s > m {
+                                m = s;
+                            }
+                        }
+                        m
+                    }
+                }
+            }
+            Norm::LInf => {
+                let mut m = f64::MIN;
+                match self.shape {
+                    Col => self.change_shape().norm(Norm::LInf),
+                    Row => {
+                        for r in 0..self.row {
+                            let s = self.row(r).iter().sum();
+                            if s > m {
+                                m = s;
+                            }
+                        }
+                        m
+                    }
+                }
+            }
+            Norm::L2 => {
+                let a = &self.t() * self;
+                let eig = eigen(&a, EigenMethod::Jacobi);
+                eig.eigenvalue.norm(Norm::LInf)
             }
+            Norm::Lp(_) => unimplemented!(),
         }
     }
+    fn normalize(&self, _kind: Norm) -> Self
+    where
+        Self: Sized,
+    {
+        unimplemented!()
+    }
 }
 
-impl Normed for Matrix {
+impl ParallelNormed for Matrix {
     type UnsignedScalar = f64;
-    fn norm(&self, kind: Norm) -> f64 {
+
+    fn par_norm(&self, kind: Norm) -> f64 {
         match kind {
             Norm::F => {
-                // let mut s = 0f64;
-                // for i in 0..self.data.len() {
-                //     s += self.data[i].powi(2);
-                // }
-                // s.sqrt()
                 let s = self
                     .data
                     .clone()
@@ -1717,15 +2084,6 @@ impl Normed for Matrix {
                 s.sqrt()
             }
             Norm::Lpq(p, q) => {
-                // let mut s = 0f64;
-                // for j in 0..self.col {
-                //     let mut s_row = 0f64;
-                //     for i in 0..self.row {
-                //         s_row += self[(i, j)].powi(p as i32);
-                //     }
-                //     s += s_row.powf(q / p);
-                // }
-                // s.powf(1f64 / q)
                 let s = (0..self.col)
                     .into_par_iter()
                     .map(|j| {
@@ -1744,7 +2102,6 @@ impl Normed for Matrix {
                     Row => self.change_shape().norm(Norm::L1),
                     Col => {
                         for c in 0..self.col {
-                            // let s = self.col(c).iter().sum();
                             let s = self.col(c).par_iter().sum();
                             if s > m {
                                 m = s;
@@ -1760,7 +2117,6 @@ impl Normed for Matrix {
                     Col => self.change_shape().norm(Norm::LInf),
                     Row => {
                         for r in 0..self.row {
-                            // let s = self.row(r).iter().sum();
                             let s = self.row(r).par_iter().sum();
                             if s > m {
                                 m = s;
@@ -1778,12 +2134,6 @@ impl Normed for Matrix {
             Norm::Lp(_) => unimplemented!(),
         }
     }
-    fn normalize(&self, _kind: Norm) -> Self
-    where
-        Self: Sized,
-    {
-        unimplemented!()
-    }
 }
 
 /// Frobenius inner product
@@ -1858,15 +2208,22 @@ impl MatrixProduct for Matrix {
         assert_eq!(self.row, other.row);
         assert_eq!(self.col, other.col);
 
-        // let r = self.row;
-        // let c = self.col;
-        // let mut m = matrix(vec![0f64; r * c], r, c, self.shape);
-        // for i in 0..r {
-        //     for j in 0..c {
-        //         m[(i, j)] = self[(i, j)] * other[(i, j)]
-        //     }
-        // }
-        // m
+        let r = self.row;
+        let c = self.col;
+        let mut m = matrix(vec![0f64; r * c], r, c, self.shape);
+        for i in 0..r {
+            for j in 0..c {
+                m[(i, j)] = self[(i, j)] * other[(i, j)]
+            }
+        }
+        m
+    }
+}
+
+impl ParallelMatrixProduct for Matrix {
+    fn par_hadamard(&self, other: &Self) -> Matrix {
+        assert_eq!(self.row, other.row);
+        assert_eq!(self.col, other.col);
 
         let m = (0..self.row)
             .into_par_iter()
@@ -1877,7 +2234,7 @@ impl MatrixProduct for Matrix {
                     .collect::<Vec<f64>>()
             })
             .collect::<Vec<f64>>();
-        matrix(m, self.row, self.col, self.shape)
+        par_matrix(m, self.row, self.col, self.shape)
     }
 }
 
@@ -1952,7 +2309,7 @@ impl Add<Matrix> for Matrix {
                 //     }
                 // }
                 // result
-                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
                 result
                     .data
                     .par_iter_mut()
@@ -2245,7 +2602,7 @@ impl Sub<Matrix> for Matrix {
                 //     }
                 // }
                 // result
-                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
                 result
                     .data
                     .par_iter_mut()
@@ -2783,23 +3140,16 @@ impl FPMatrix for Matrix {
                 let new_data = self
                     .data
                     .clone()
-                    // .into_iter()
-                    .into_par_iter()
+                    .into_iter()
                     .take(n * self.col)
                     .collect::<Vec<f64>>();
                 matrix(new_data, n, self.col, Row)
             }
             Col => {
-                // let mut temp_data: Vec<f64> = Vec::new();
-                // for i in 0..n {
-                //     temp_data.extend(self.row(i));
-                // }
-                // matrix(temp_data, n, self.col, Row)
-
-                let temp_data = (0..n)
-                    .into_par_iter()
-                    .flat_map(|i| self.row(i))
-                    .collect::<Vec<f64>>();
+                let mut temp_data: Vec<f64> = Vec::new();
+                for i in 0..n {
+                    temp_data.extend(self.row(i));
+                }
                 matrix(temp_data, n, self.col, Row)
             }
         }
@@ -2814,23 +3164,16 @@ impl FPMatrix for Matrix {
                 let new_data = self
                     .data
                     .clone()
-                    // .into_iter()
-                    .into_par_iter()
+                    .into_iter()
                     .take(n * self.row)
                     .collect::<Vec<f64>>();
                 matrix(new_data, self.row, n, Col)
             }
             Row => {
-                // let mut temp_data: Vec<f64> = Vec::new();
-                // for i in 0..n {
-                //     temp_data.extend(self.col(i));
-                // }
-                // matrix(temp_data, self.row, n, Col)
-
-                let temp_data = (0..n)
-                    .into_par_iter()
-                    .flat_map(|i| self.col(i))
-                    .collect::<Vec<f64>>();
+                let mut temp_data: Vec<f64> = Vec::new();
+                for i in 0..n {
+                    temp_data.extend(self.col(i));
+                }
                 matrix(temp_data, self.row, n, Col)
             }
         }
@@ -2839,43 +3182,28 @@ impl FPMatrix for Matrix {
     fn skip_row(&self, n: usize) -> Self {
         assert!(n < self.row, "Skip range is larger than row of matrix");
 
-        // let mut temp_data: Vec<f64> = Vec::new();
-        // for i in n..self.row {
-        //     temp_data.extend(self.row(i));
-        // }
-        // matrix(temp_data, self.row - n, self.col, Row)
-
-        let temp_data = (n..self.row)
-            .into_par_iter()
-            .flat_map(|i| self.row(i))
-            .collect::<Vec<f64>>();
+        let mut temp_data: Vec<f64> = Vec::new();
+        for i in n..self.row {
+            temp_data.extend(self.row(i));
+        }
         matrix(temp_data, self.row - n, self.col, Row)
     }
 
     fn skip_col(&self, n: usize) -> Self {
         assert!(n < self.col, "Skip range is larger than col of matrix");
 
-        // let mut temp_data: Vec<f64> = Vec::new();
-        // for i in n..self.col {
-        //     temp_data.extend(self.col(i));
-        // }
-        // matrix(temp_data, self.row, self.col - n, Col)
-
-        let temp_data = (n..self.col)
-            .into_par_iter()
-            .flat_map(|i| self.col(i))
-            .collect::<Vec<f64>>();
+        let mut temp_data: Vec<f64> = Vec::new();
+        for i in n..self.col {
+            temp_data.extend(self.col(i));
+        }
         matrix(temp_data, self.row, self.col - n, Col)
     }
 
     fn fmap<F>(&self, f: F) -> Matrix
     where
-        F: Fn(f64) -> f64 + Send + Sync,
+        F: Fn(f64) -> f64,
     {
-        // let result = self.data.iter().map(|x| f(*x)).collect::<Vec<f64>>();
-        // matrix(result, self.row, self.col, self.shape)
-
-        let result = self.data.par_iter().map(|x| f(*x)).collect::<Vec<f64>>();
+        let result = self.data.iter().map(|x| f(*x)).collect::<Vec<f64>>();
         matrix(result, self.row, self.col, self.shape)
     }
 
@@ -2958,12 +3286,136 @@ impl FPMatrix for Matrix {
     }
 
     fn reduce<F, T>(&self, init: T, f: F) -> f64
+    where
+        F: Fn(f64, f64) -> f64,
+        T: Into<f64> + Copy + Clone,
+    {
+        self.data.iter().fold(init.into(), |x, y| f(x, *y))
+    }
+
+    fn zip_with<F>(&self, f: F, other: &Matrix) -> Self
+    where
+        F: Fn(f64, f64) -> f64,
+    {
+        assert_eq!(self.data.len(), other.data.len());
+        let mut a = other.clone();
+        if self.shape != other.shape {
+            a = a.change_shape();
+        }
+        let result = self
+            .data
+            .iter()
+            .zip(a.data.iter())
+            .map(|(x, y)| f(*x, *y))
+            .collect::<Vec<f64>>();
+        matrix(result, self.row, self.col, self.shape)
+    }
+
+    fn col_reduce<F>(&self, f: F) -> Vec<f64>
+    where
+        F: Fn(Vec<f64>) -> f64,
+    {
+        let mut v = vec![0f64; self.col];
+        for i in 0..self.col {
+            v[i] = f(self.col(i));
+        }
+        v
+    }
+
+    fn row_reduce<F>(&self, f: F) -> Vec<f64>
+    where
+        F: Fn(Vec<f64>) -> f64,
+    {
+        let mut v = vec![0f64; self.row];
+        for i in 0..self.row {
+            v[i] = f(self.row(i));
+        }
+        v
+    }
+}
+
+impl ParallelFPMatrix for Matrix {
+    fn par_take_row(&self, n: usize) -> Self {
+        if n >= self.row {
+            return self.clone();
+        }
+        match self.shape {
+            Row => {
+                let new_data = self
+                    .data
+                    .clone()
+                    .into_par_iter()
+                    .take(n * self.col)
+                    .collect::<Vec<f64>>();
+                par_matrix(new_data, n, self.col, Row)
+            }
+            Col => {
+                let temp_data = (0..n)
+                    .into_par_iter()
+                    .flat_map(|i| self.row(i))
+                    .collect::<Vec<f64>>();
+                par_matrix(temp_data, n, self.col, Row)
+            }
+        }
+    }
+
+    fn par_take_col(&self, n: usize) -> Self {
+        if n >= self.col {
+            return self.clone();
+        }
+        match self.shape {
+            Col => {
+                let new_data = self
+                    .data
+                    .clone()
+                    .into_par_iter()
+                    .take(n * self.row)
+                    .collect::<Vec<f64>>();
+                par_matrix(new_data, self.row, n, Col)
+            }
+            Row => {
+                let temp_data = (0..n)
+                    .into_par_iter()
+                    .flat_map(|i| self.col(i))
+                    .collect::<Vec<f64>>();
+                par_matrix(temp_data, self.row, n, Col)
+            }
+        }
+    }
+
+    fn par_skip_row(&self, n: usize) -> Self {
+        assert!(n < self.row, "Skip range is larger than row of matrix");
+
+        let temp_data = (n..self.row)
+            .into_par_iter()
+            .flat_map(|i| self.row(i))
+            .collect::<Vec<f64>>();
+        par_matrix(temp_data, self.row - n, self.col, Row)
+    }
+
+    fn par_skip_col(&self, n: usize) -> Self {
+        assert!(n < self.col, "Skip range is larger than col of matrix");
+
+        let temp_data = (n..self.col)
+            .into_par_iter()
+            .flat_map(|i| self.col(i))
+            .collect::<Vec<f64>>();
+        par_matrix(temp_data, self.row, self.col - n, Col)
+    }
+
+    fn par_fmap<F>(&self, f: F) -> Matrix
+    where
+        F: Fn(f64) -> f64 + Send + Sync,
+    {
+        let result = self.data.par_iter().map(|x| f(*x)).collect::<Vec<f64>>();
+        par_matrix(result, self.row, self.col, self.shape)
+    }
+
+    fn par_reduce<F, T>(&self, init: T, f: F) -> f64
     where
         F: Fn(f64, f64) -> f64 + Send + Sync,
         T: Into<f64> + Send + Sync + Copy + Clone,
     {
-        // self.data.iter().fold(init.into(), |x, y| f(x, *y))
-
         self.data
             .clone()
             .into_par_iter()
@@ -2971,7 +3423,7 @@ impl FPMatrix for Matrix {
             .sum()
     }
 
-    fn zip_with<F>(&self, f: F, other: &Matrix) -> Self
+    fn par_zip_with<F>(&self, f: F, other: &Matrix) -> Self
     where
         F: Fn(f64, f64) -> f64 + Send + Sync,
     {
@@ -2980,13 +3432,6 @@ impl FPMatrix for Matrix {
         if self.shape != other.shape {
             a = a.change_shape();
         }
-        // let result = self
-        //     .data
-        //     .iter()
-        //     .zip(a.data.iter())
-        //     .map(|(x, y)| f(*x, *y))
-        //     .collect::<Vec<f64>>();
-        // matrix(result, self.row, self.col, self.shape)
 
         let result = self
             .data
@@ -2994,35 +3439,23 @@ impl FPMatrix for Matrix {
             .zip(a.data.par_iter())
             .map(|(x, y)| f(*x, *y))
             .collect::<Vec<f64>>();
-        matrix(result, self.row, self.col, self.shape)
+        par_matrix(result, self.row, self.col, self.shape)
     }
 
-    fn col_reduce<F>(&self, f: F) -> Vec<f64>
+    fn par_col_reduce<F>(&self, f: F) -> Vec<f64>
     where
         F: Fn(Vec<f64>) -> f64 + Send + Sync,
     {
-        // let mut v = vec![0f64; self.col];
-        // for i in 0..self.col {
-        //     v[i] = f(self.col(i));
-        // }
-        // v
-
         (0..self.col)
             .into_par_iter()
             .map(|i| f(self.col(i)))
             .collect::<Vec<f64>>()
     }
 
-    fn row_reduce<F>(&self, f: F) -> Vec<f64>
+    fn par_row_reduce<F>(&self, f: F) -> Vec<f64>
     where
         F: Fn(Vec<f64>) -> f64 + Send + Sync,
     {
-        // let mut v = vec![0f64; self.row];
-        // for i in 0..self.row {
-        //     v[i] = f(self.row(i));
-        // }
-        // v
-
         (0..self.row)
             .into_par_iter()
             .map(|i| f(self.row(i)))
@@ -3055,13 +3488,15 @@ pub trait LinearAlgebra {
 }
 
 pub fn diag(n: usize) -> Matrix {
-    // let mut v: Vec<f64> = vec![0f64; n * n];
-    // for i in 0..n {
-    //     let idx = i * (n + 1);
-    //     v[idx] = 1f64;
-    // }
-    // matrix(v, n, n, Row)
+    let mut v: Vec<f64> = vec![0f64; n * n];
+    for i in 0..n {
+        let idx = i * (n + 1);
+        v[idx] = 1f64;
+    }
+    matrix(v, n, n, Row)
+}
 
+pub fn par_diag(n: usize) -> Matrix {
     let v = (0..n * n)
         .into_par_iter()
         .map(|i| {
@@ -3072,7 +3507,7 @@ pub fn diag(n: usize) -> Matrix {
             }
         })
         .collect::<Vec<f64>>();
-    matrix(v, n, n, Row)
+    par_matrix(v, n, n, Row)
 }
 
 /// Data structure for Complete Pivoting LU decomposition
diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index ad00d09d..aaa939d5 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -270,6 +270,10 @@ use blas::{daxpy, ddot, dnrm2, idamax};
 use crate::rayon::prelude::*;
 
 use crate::structure::matrix::{matrix, Matrix, Row};
+use crate::traits::fp::ParallelFPVector;
+use crate::traits::general::ParallelAlgorithm;
+use crate::traits::math::{ParallelInnerProduct, ParallelNormed, ParallelVectorProduct};
+use crate::traits::mutable::ParallelMutFP;
 use crate::traits::{
     fp::FPVector,
     general::Algorithm,
@@ -277,7 +281,7 @@ use crate::traits::{
     mutable::MutFP,
     pointer::{Oxide, Redox, RedoxCommon},
 };
-// use std::cmp::min;
+use std::cmp::min;
 
 impl FPVector for Vec<f64> {
     type Scalar = f64;
@@ -297,14 +301,10 @@ impl FPVector for Vec<f64> {
     /// ```
     fn fmap<F>(&self, f: F) -> Vec<f64>
     where
-        F: Fn(f64) -> f64 + Sync + Send,
+        F: Fn(f64) -> f64,
     {
-        // let mut v = self.clone();
-        // v.iter_mut().for_each(|x| *x = f(*x));
-        // v
-
         let mut v = self.clone();
-        v.par_iter_mut().for_each(|x| *x = f(*x));
+        v.iter_mut().for_each(|x| *x = f(*x));
         v
     }
 
@@ -322,12 +322,148 @@ impl FPVector for Vec<f64> {
     /// }
     /// ```
     fn reduce<F, T>(&self, init: T, f: F) -> f64
+    where
+        F: Fn(f64, f64) -> f64,
+        T: Into<f64> + Copy,
+    {
+        self.iter().fold(init.into(), |x, &y| f(x, y))
+    }
+
+    fn zip_with<F>(&self, f: F, other: &Vec<f64>) -> Vec<f64>
+    where
+        F: Fn(f64, f64) -> f64,
+    {
+        self.iter()
+            .zip(other)
+            .map(|(x, y)| f(*x, *y))
+            .collect::<Vec<f64>>()
+    }
+
+    /// Filter for `Vec<f64>`
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = c!(1,2,3,4,5);
+    ///     let b = a.filter(|x| x > 3.);
+    ///     assert_eq!(b, c!(4,5));
+    /// }
+    /// ```
+    fn filter<F>(&self, f: F) -> Vec<f64>
+    where
+        F: Fn(f64) -> bool,
+    {
+        self.clone()
+            .into_iter()
+            .filter(|x| f(*x))
+            .collect::<Vec<f64>>()
+    }
+
+    /// Take for `Vec<f64>`
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = c!(1,2,3,4,5);
+    ///     let b = a.take(3);
+    ///     assert_eq!(b, c!(1,2,3));
+    /// }
+    /// ```
+    fn take(&self, n: usize) -> Vec<f64> {
+        let mut v = vec![0f64; n];
+        v[..n].copy_from_slice(&self[..n]);
+        v
+    }
+
+    /// Skip for `Vec<f64>`
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = c!(1,2,3,4,5);
+    ///     let b = a.skip(3);
+    ///     assert_eq!(b, c!(4,5));
+    /// }
+    /// ```
+    fn skip(&self, n: usize) -> Vec<f64> {
+        let l = self.len();
+        let mut v = vec![0f64; l - n];
+        for (i, j) in (n..l).enumerate() {
+            v[i] = self[j];
+        }
+        v
+    }
+
+    fn sum(&self) -> f64 {
+        self.iter().sum()
+    }
+
+    fn prod(&self) -> f64 {
+        self.iter().product()
+    }
+}
+
+impl ParallelFPVector for Vec<f64> {
+    type Scalar = f64;
+
+    /// par_fmap for `Vec<f64>`
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    /// use peroxide::traits::fp::ParallelFPVector;
+    ///
+    /// fn main() {
+    ///     let a = c!(1,2,3,4,5);
+    ///     assert_eq!(a.par_fmap(|x| x*2f64), seq!(2,10,2));
+    /// }
+    /// ```
+    fn par_fmap<F>(&self, f: F) -> Vec<f64>
+    where
+        F: Fn(f64) -> f64 + Sync + Send,
+    {
+        let mut v = self.clone();
+        v.par_iter_mut().for_each(|x| *x = f(*x));
+        v
+    }
+
+    /// reduce for `Vec<f64>`
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    /// use peroxide::traits::fp::ParallelFPVector;
+    ///
+    /// fn main() {
+    ///     let a = seq!(1,100,1);
+    ///     assert_eq!(a.par_reduce(0, |x,y| x + y), 5050f64);
+    /// }
+    /// ```
+    fn par_reduce<F, T>(&self, init: T, f: F) -> f64
     where
         F: Fn(f64, f64) -> f64 + Send + Sync,
         T: Into<f64> + Send + Sync + Copy,
     {
         self.iter().fold(init.into(), |x, &y| f(x, y))
 
+        // Parallel version unimplemented
+
         // self.par_iter()
         //     .cloned()
         //     .fold(|| init.into(), |x, y| f(x, y))
@@ -336,14 +472,10 @@ impl FPVector for Vec<f64> {
         //                   // https://docs.rs/rayon/latest/rayon/iter/trait.ParallelIterator.html#method.reduce
     }
 
-    fn zip_with<F>(&self, f: F, other: &Vec<f64>) -> Vec<f64>
+    fn par_zip_with<F>(&self, f: F, other: &Vec<f64>) -> Vec<f64>
     where
         F: Fn(f64, f64) -> f64 + Send + Sync,
     {
-        // self.iter()
-        //     .zip(other)
-        //     .map(|(x, y)| f(*x, *y))
-        //     .collect::<Vec<f64>>()
         self.par_iter()
             .zip(other)
             .map(|(x, y)| f(*x, *y))
@@ -357,45 +489,40 @@ impl FPVector for Vec<f64> {
     /// #[macro_use]
     /// extern crate peroxide;
     /// use peroxide::fuga::*;
+    /// use peroxide::traits::fp::ParallelFPVector;
     ///
     /// fn main() {
     ///     let a = c!(1,2,3,4,5);
-    ///     let b = a.filter(|x| x > 3.);
+    ///     let b = a.par_filter(|x| x > 3.);
     ///     assert_eq!(b, c!(4,5));
     /// }
     /// ```
-    fn filter<F>(&self, f: F) -> Vec<f64>
+    fn par_filter<F>(&self, f: F) -> Vec<f64>
     where
         F: Fn(f64) -> bool + Send + Sync,
     {
-        // self.clone()
-        //     .into_iter()
-        //     .filter(|x| f(*x))
-        //     .collect::<Vec<f64>>()
         self.clone()
             .into_par_iter()
             .filter(|x| f(*x))
             .collect::<Vec<f64>>()
     }
 
-    /// Take for `Vec<f64>`
+    /// Take for `Vec<f64>` in Parallel
     ///
     /// # Examples
     /// ```
     /// #[macro_use]
     /// extern crate peroxide;
     /// use peroxide::fuga::*;
+    /// use peroxide::traits::fp::ParallelFPVector;
     ///
     /// fn main() {
     ///     let a = c!(1,2,3,4,5);
-    ///     let b = a.take(3);
+    ///     let b = a.par_take(3);
     ///     assert_eq!(b, c!(1,2,3));
     /// }
     /// ```
-    fn take(&self, n: usize) -> Vec<f64> {
-        // let mut v = vec![0f64; n];
-        // v[..n].copy_from_slice(&self[..n]);
-        // v
+    fn par_take(&self, n: usize) -> Vec<f64> {
         let mut v = vec![0_f64; n];
         v[..n]
             .par_iter_mut()
@@ -406,27 +533,22 @@ impl FPVector for Vec<f64> {
         v
     }
 
-    /// Skip for `Vec<f64>`
+    /// Skip for `Vec<f64>` in Parallel
     ///
     /// # Examples
     /// ```
     /// #[macro_use]
     /// extern crate peroxide;
     /// use peroxide::fuga::*;
+    /// use peroxide::traits::fp::ParallelFPVector;
     ///
     /// fn main() {
     ///     let a = c!(1,2,3,4,5);
-    ///     let b = a.skip(3);
+    ///     let b = a.par_skip(3);
     ///     assert_eq!(b, c!(4,5));
     /// }
     /// ```
-    fn skip(&self, n: usize) -> Vec<f64> {
-        // let l = self.len();
-        // let mut v = vec![0f64; l - n];
-        // for (i, j) in (n..l).enumerate() {
-        //     v[i] = self[j];
-        // }
-        // v
+    fn par_skip(&self, n: usize) -> Vec<f64> {
         let l = self.len();
         let mut v = vec![0f64; l - n];
         v[..(l - n)]
@@ -436,29 +558,35 @@ impl FPVector for Vec<f64> {
         v
     }
 
-    fn sum(&self) -> f64 {
-        // self.iter().sum()
+    fn par_sum(&self) -> f64 {
         self.par_iter().sum()
     }
 
-    fn prod(&self) -> f64 {
-        // self.iter().product()
+    fn par_prod(&self) -> f64 {
         self.par_iter().product()
     }
 }
 
 /// Explicit version of `map`
 pub fn map<F, T>(f: F, xs: &[T]) -> Vec<T>
+where
+    F: Fn(T) -> T,
+    T: Default + Copy,
+{
+    let l = xs.len();
+    let mut result = vec![T::default(); l];
+    for i in 0..l {
+        result[i] = f(xs[i]);
+    }
+    result
+}
+
+/// Explicit version of `map` in parallel
+pub fn par_map<F, T>(f: F, xs: &[T]) -> Vec<T>
 where
     F: Fn(T) -> T + Send + Sync,
     T: Default + Copy + Send + Sync,
 {
-    // let l = xs.len();
-    // let mut result = vec![T::default(); l];
-    // for i in 0..l {
-    //     result[i] = f(xs[i]);
-    // }
-    // result
     let mut result = vec![T::default(); xs.len()];
     result.par_iter_mut().for_each(|x| *x = f(*x));
     result
@@ -480,16 +608,24 @@ where
 
 /// Explicit version of `zip_with`
 pub fn zip_with<F, T>(f: F, xs: &[T], ys: &[T]) -> Vec<T>
+where
+    F: Fn(T, T) -> T,
+    T: Default + Copy,
+{
+    let l = min(xs.len(), ys.len());
+    let mut result = vec![T::default(); l];
+    for i in 0..l {
+        result[i] = f(xs[i], ys[i]);
+    }
+    result
+}
+
+/// Explicit version of `zip_with` in parallel
+pub fn par_zip_with<F, T>(f: F, xs: &[T], ys: &[T]) -> Vec<T>
 where
     F: Fn(T, T) -> T + Send + Sync,
     T: Default + Copy + Send + Sync,
 {
-    // let l = min(xs.len(), ys.len());
-    // let mut result = vec![T::default(); l];
-    // for i in 0..l {
-    //     result[i] = f(xs[i], ys[i]);
-    // }
-    // result
     xs.par_iter()
         .zip(ys.into_par_iter())
         .map(|(x, y)| f(*x, *y))
@@ -500,22 +636,38 @@ impl MutFP for Vec<f64> {
     type Scalar = f64;
 
     fn mut_map<F>(&mut self, f: F)
+    where
+        F: Fn(Self::Scalar) -> Self::Scalar,
+    {
+        for x in self.iter_mut() {
+            *x = f(*x);
+        }
+    }
+
+    fn mut_zip_with<F>(&mut self, f: F, other: &Self)
+    where
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar,
+    {
+        for i in 0..self.len() {
+            self[i] = f(self[i], other[i]);
+        }
+    }
+}
+
+impl ParallelMutFP for Vec<f64> {
+    type Scalar = f64;
+
+    fn par_mut_map<F>(&mut self, f: F)
     where
         F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync,
     {
-        // for x in self.iter_mut() {
-        //     *x = f(*x);
-        // }
         self.par_iter_mut().for_each(|x| *x = f(*x));
     }
 
-    fn mut_zip_with<F>(&mut self, f: F, other: &Self)
+    fn par_mut_zip_with<F>(&mut self, f: F, other: &Self)
     where
         F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync,
     {
-        // for i in 0..self.len() {
-        //     self[i] = f(self[i], other[i]);
-        // }
         self.par_iter_mut()
             .zip(other.into_par_iter())
             .for_each(|(x, y)| *x = f(*x, *y));
@@ -540,35 +692,15 @@ impl Algorithm for Vec<f64> {
         let l = self.len();
         let idx = (1..(l + 1)).collect::<Vec<usize>>();
 
-        // let mut vec_tup = self
-        //     .clone()
-        //     .into_iter()
-        //     .zip(idx.clone())
-        //     .collect::<Vec<(f64, usize)>>();
-        // vec_tup.sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap().reverse());
-        // let indices = vec_tup.into_iter().map(|(_, y)| y).collect::<Vec<usize>>();
-        // idx.into_iter()
-        //     .map(|x| indices.clone().into_iter().position(|t| t == x).unwrap())
-        //     .collect::<Vec<usize>>()
-
         let mut vec_tup = self
             .clone()
-            .into_par_iter()
+            .into_iter()
             .zip(idx.clone())
             .collect::<Vec<(f64, usize)>>();
         vec_tup.sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap().reverse());
-        let indices = vec_tup
-            .into_par_iter()
-            .map(|(_, y)| y)
-            .collect::<Vec<usize>>();
-        idx.into_par_iter()
-            .map(|x| {
-                indices
-                    .clone()
-                    .into_par_iter()
-                    .position_any(|t| t == x) // Note: position() is deprecated by rayon
-                    .unwrap()
-            })
+        let indices = vec_tup.into_iter().map(|(_, y)| y).collect::<Vec<usize>>();
+        idx.into_iter()
+            .map(|x| indices.clone().into_iter().position(|t| t == x).unwrap())
             .collect::<Vec<usize>>()
     }
 
@@ -634,17 +766,145 @@ impl Algorithm for Vec<f64> {
         }
         #[cfg(not(feature = "O3"))]
         {
-            // self.into_iter()
-            //     .enumerate()
-            //     .fold((0usize, f64::MIN), |acc, (ics, &val)| {
-            //         if acc.1 < val {
-            //             (ics, val)
-            //         } else {
-            //             acc
-            //         }
-            //     })
-            //     .0
+            self.into_iter()
+                .enumerate()
+                .fold((0usize, f64::MIN), |acc, (ics, &val)| {
+                    if acc.1 < val {
+                        (ics, val)
+                    } else {
+                        acc
+                    }
+                })
+                .0
+        }
+    }
+
+    /// arg min
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let v = c!(1,3,2,4,3,7);
+    ///     assert_eq!(v.arg_min(),0);
+    ///
+    ///     let v2 = c!(4,3,2,5,1,6);
+    ///     assert_eq!(v2.arg_min(),4);
+    /// }
+    fn arg_min(&self) -> usize {
+        self.iter()
+            .enumerate()
+            .fold((0usize, f64::MAX), |acc, (ics, &val)| {
+                if acc.1 > val {
+                    (ics, val)
+                } else {
+                    acc
+                }
+            })
+            .0
+    }
+
+    fn max(&self) -> f64 {
+        #[cfg(feature = "O3")]
+        {
+            let n_i32 = self.len() as i32;
+            let idx: usize;
+            unsafe {
+                idx = idamax(n_i32, self, 1) - 1;
+            }
+            self[idx]
+        }
+        #[cfg(not(feature = "O3"))]
+        {
+            self.into_iter()
+                .fold(f64::MIN, |acc, &val| if acc < val { val } else { acc })
+        }
+    }
+
+    fn min(&self) -> f64 {
+        self.iter()
+            .fold(f64::MAX, |acc, &val| if acc > val { val } else { acc })
+    }
 
+    fn swap_with_perm(&mut self, p: &Vec<(usize, usize)>) {
+        for (i, j) in p.iter() {
+            self.swap(*i, *j);
+        }
+    }
+}
+
+impl ParallelAlgorithm for Vec<f64> {
+    /// Assign rank in parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    /// use peroxide::traits::general::ParallelAlgorithm;
+    ///
+    /// fn main() {
+    ///     let v = c!(7, 5, 9, 2, 8);
+    ///     assert_eq!(v.par_rank(), vec![2,3,0,4,1]);
+    /// }
+    /// ```
+    fn par_rank(&self) -> Vec<usize> {
+        let l = self.len();
+        let idx = (1..(l + 1)).collect::<Vec<usize>>();
+
+        let mut vec_tup = self
+            .clone()
+            .into_par_iter()
+            .zip(idx.clone())
+            .collect::<Vec<(f64, usize)>>();
+        vec_tup.sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap().reverse());
+        let indices = vec_tup
+            .into_par_iter()
+            .map(|(_, y)| y)
+            .collect::<Vec<usize>>();
+        idx.into_par_iter()
+            .map(|x| {
+                indices
+                    .clone()
+                    .into_par_iter()
+                    .position_any(|t| t == x) // Note: position() is deprecated by rayon
+                    .unwrap()
+            })
+            .collect::<Vec<usize>>()
+    }
+
+    /// arg max in Parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    /// use peroxide::traits::general::ParallelAlgorithm;
+    ///
+    /// fn main() {
+    ///     let v = c!(1,3,2,4,3,7);
+    ///     assert_eq!(v.par_arg_max(),5);
+    ///
+    ///     let v2 = c!(1,3,2,5,6,6);
+    ///     assert_eq!(v2.par_arg_max(),4);
+    /// }
+    /// ```
+    fn par_arg_max(&self) -> usize {
+        #[cfg(feature = "O3")]
+        {
+            let n_i32 = self.len() as i32;
+            let idx: usize;
+            unsafe {
+                idx = idamax(n_i32, self, 1) - 1;
+            }
+            idx
+        }
+        #[cfg(not(feature = "O3"))]
+        {
             self.into_par_iter()
                 .enumerate()
                 .fold(
@@ -658,7 +918,7 @@ impl Algorithm for Vec<f64> {
                     },
                 )
                 .reduce(
-                    // Note: need reduce and can nto simply do .max() because rayon::..max() requires Ord, which is stricter than PartialOrd
+                    // Note: need reduce and can not simply do .max() because rayon::..max() requires Ord, which is stricter than PartialOrd
                     // Hence combine the results from the parallel fold
                     || (0usize, f64::MIN), // Identity element
                     |acc1, acc2| {
@@ -673,33 +933,23 @@ impl Algorithm for Vec<f64> {
         }
     }
 
-    /// arg min
+    /// arg min in Parallel
     ///
     /// # Examples
     /// ```
     /// #[macro_use]
     /// extern crate peroxide;
     /// use peroxide::fuga::*;
+    /// use peroxide::traits::general::ParallelAlgorithm;
     ///
     /// fn main() {
     ///     let v = c!(1,3,2,4,3,7);
-    ///     assert_eq!(v.arg_min(),0);
+    ///     assert_eq!(v.par_arg_min(),0);
     ///
     ///     let v2 = c!(4,3,2,5,1,6);
-    ///     assert_eq!(v2.arg_min(),4);
+    ///     assert_eq!(v2.par_arg_min(),4);
     /// }
-    fn arg_min(&self) -> usize {
-        // self.iter()
-        //     .enumerate()
-        //     .fold((0usize, f64::MAX), |acc, (ics, &val)| {
-        //         if acc.1 > val {
-        //             (ics, val)
-        //         } else {
-        //             acc
-        //         }
-        //     })
-        //     .0
-
+    fn par_arg_min(&self) -> usize {
         self.into_par_iter()
             .enumerate()
             .fold(
@@ -725,7 +975,7 @@ impl Algorithm for Vec<f64> {
             .0
     }
 
-    fn max(&self) -> f64 {
+    fn par_max(&self) -> f64 {
         #[cfg(feature = "O3")]
         {
             let n_i32 = self.len() as i32;
@@ -737,9 +987,6 @@ impl Algorithm for Vec<f64> {
         }
         #[cfg(not(feature = "O3"))]
         {
-            // self.into_iter()
-            //     .fold(f64::MIN, |acc, &val| if acc < val { val } else { acc })
-
             self.into_par_iter()
                 .fold(|| f64::MIN, |acc, &val| if acc < val { val } else { acc })
                 .reduce(
@@ -749,10 +996,7 @@ impl Algorithm for Vec<f64> {
         }
     }
 
-    fn min(&self) -> f64 {
-        // self.iter()
-        //     .fold(f64::MAX, |acc, &val| if acc > val { val } else { acc })
-
+    fn par_min(&self) -> f64 {
         self.into_par_iter()
             .fold(|| f64::MAX, |acc, &val| if acc > val { val } else { acc })
             .reduce(
@@ -760,12 +1004,6 @@ impl Algorithm for Vec<f64> {
                 |acc1, acc2| if acc1 > acc2 { acc2 } else { acc1 },
             )
     }
-
-    fn swap_with_perm(&mut self, p: &Vec<(usize, usize)>) {
-        for (i, j) in p.iter() {
-            self.swap(*i, *j);
-        }
-    }
 }
 
 impl Vector for Vec<f64> {
@@ -802,8 +1040,49 @@ impl Normed for Vec<f64> {
                 }
                 #[cfg(not(feature = "O3"))]
                 {
-                    // self.iter().map(|x| x.powi(2)).sum::<f64>().sqrt()
+                    self.iter().map(|x| x.powi(2)).sum::<f64>().sqrt()
+                }
+            }
+            Norm::Lp(p) => {
+                assert!(
+                    p >= 1f64,
+                    "lp norm is only defined for p>=1, the given value was p={}",
+                    p
+                );
+                self.iter().map(|x| x.powf(p)).sum::<f64>().powf(1f64 / p)
+            }
+            Norm::LInf => self.iter().fold(0f64, |x, y| x.max(y.abs())),
+            Norm::F => unimplemented!(),
+            Norm::Lpq(_, _) => unimplemented!(),
+        }
+    }
+    fn normalize(&self, kind: Norm) -> Self
+    where
+        Self: Sized,
+    {
+        let denom = self.norm(kind);
+        self.fmap(|x| x / denom)
+    }
+}
+
+impl ParallelNormed for Vec<f64> {
+    type UnsignedScalar = f64;
 
+    fn par_norm(&self, kind: Norm) -> f64 {
+        match kind {
+            Norm::L1 => self.iter().map(|x| x.abs()).sum(),
+            Norm::L2 => {
+                #[cfg(feature = "O3")]
+                {
+                    let n_i32 = self.len() as i32;
+                    let res: f64;
+                    unsafe {
+                        res = dnrm2(n_i32, self, 1);
+                    }
+                    res
+                }
+                #[cfg(not(feature = "O3"))]
+                {
                     self.par_iter()
                         .map(|x| x.powi(2))
                         .sum::<Self::UnsignedScalar>()
@@ -816,14 +1095,11 @@ impl Normed for Vec<f64> {
                     "lp norm is only defined for p>=1, the given value was p={}",
                     p
                 );
-                // self.iter().map(|x| x.powf(p)).sum::<f64>().powf(1f64 / p)
-
                 self.par_iter()
                     .map(|x| x.powf(p))
                     .sum::<Self::UnsignedScalar>()
                     .powf(1_f64 / p)
             }
-            // Norm::LInf => self.iter().fold(0f64, |x, y| x.max(y.abs())),
             Norm::LInf => self
                 .par_iter()
                 .fold(|| 0f64, |x, y| x.max(y.abs()))
@@ -832,13 +1108,6 @@ impl Normed for Vec<f64> {
             Norm::Lpq(_, _) => unimplemented!(),
         }
     }
-    fn normalize(&self, kind: Norm) -> Self
-    where
-        Self: Sized,
-    {
-        let denom = self.norm(kind);
-        self.fmap(|x| x / denom)
-    }
 }
 
 impl InnerProduct for Vec<f64> {
@@ -854,10 +1123,26 @@ impl InnerProduct for Vec<f64> {
         }
         #[cfg(not(feature = "O3"))]
         {
-            // self.iter()
-            //     .zip(rhs.iter())
-            //     .fold(0f64, |x, (y1, y2)| x + y1 * y2)
+            self.iter()
+                .zip(rhs.iter())
+                .fold(0f64, |x, (y1, y2)| x + y1 * y2)
+        }
+    }
+}
 
+impl ParallelInnerProduct for Vec<f64> {
+    fn par_dot(&self, rhs: &Self) -> f64 {
+        #[cfg(feature = "O3")]
+        {
+            let n_i32 = self.len() as i32;
+            let res: f64;
+            unsafe {
+                res = ddot(n_i32, self, 1, rhs, 1);
+            }
+            res
+        }
+        #[cfg(not(feature = "O3"))]
+        {
             self.par_iter()
                 .zip(rhs.into_par_iter())
                 .fold(|| 0_f64, |x, (y1, y2)| x + y1 * y2)
@@ -889,12 +1174,36 @@ impl VectorProduct for Vec<f64> {
                 v
             }
             3 => {
-                // let mut v = vec![0f64; 3];
-                // v[0] = self[1] * other[2] - self[2] * other[1];
-                // v[1] = self[2] * other[0] - self[0] * other[2];
-                // v[2] = self[0] * other[1] - self[1] * other[0];
-                // v
+                let mut v = vec![0f64; 3];
+                v[0] = self[1] * other[2] - self[2] * other[1];
+                v[1] = self[2] * other[0] - self[0] * other[2];
+                v[2] = self[0] * other[1] - self[1] * other[0];
+                v
+            }
+            _ => {
+                panic!("Cross product can be defined only in 2 or 3 dimension")
+            }
+        }
+    }
+
+    fn outer(&self, other: &Self) -> Matrix {
+        let m: Matrix = self.into();
+        let n = matrix(other.to_owned(), 1, other.len(), Row);
+        m * n
+    }
+}
 
+impl ParallelVectorProduct for Vec<f64> {
+    fn par_cross(&self, other: &Self) -> Self {
+        assert_eq!(self.len(), other.len());
+        // 2D cross product is ill-defined
+        match self.len() {
+            2 => {
+                let mut v = vec![0f64; 1];
+                v[0] = self[0] * other[1] - self[1] * other[0];
+                v
+            }
+            3 => {
                 let v = (0..3)
                     .into_par_iter()
                     .map(|index| {
@@ -909,12 +1218,6 @@ impl VectorProduct for Vec<f64> {
             }
         }
     }
-
-    fn outer(&self, other: &Self) -> Matrix {
-        let m: Matrix = self.into();
-        let n = matrix(other.to_owned(), 1, other.len(), Row);
-        m * n
-    }
 }
 
 // /// Convenient Vec<f64> Operations (No Clone, No Copy)
diff --git a/src/traits/fp.rs b/src/traits/fp.rs
index 1106f452..fbb66767 100644
--- a/src/traits/fp.rs
+++ b/src/traits/fp.rs
@@ -6,17 +6,17 @@ pub trait FPVector {
 
     fn fmap<F>(&self, f: F) -> Self
     where
-        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync;
+        F: Fn(Self::Scalar) -> Self::Scalar;
     fn reduce<F, T>(&self, init: T, f: F) -> Self::Scalar
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync,
-        T: Into<Self::Scalar> + Send + Sync + Copy;
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar,
+        T: Into<Self::Scalar> + Copy;
     fn zip_with<F>(&self, f: F, other: &Self) -> Self
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync;
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar;
     fn filter<F>(&self, f: F) -> Self
     where
-        F: Fn(Self::Scalar) -> bool + Send + Sync;
+        F: Fn(Self::Scalar) -> bool;
     fn take(&self, n: usize) -> Self;
     fn skip(&self, n: usize) -> Self;
     fn sum(&self) -> Self::Scalar;
@@ -31,7 +31,7 @@ pub trait FPMatrix {
     fn skip_col(&self, n: usize) -> Matrix;
     fn fmap<F>(&self, f: F) -> Matrix
     where
-        F: Fn(f64) -> f64 + Send + Sync;
+        F: Fn(f64) -> f64;
     fn col_map<F>(&self, f: F) -> Matrix
     where
         F: Fn(Vec<f64>) -> Vec<f64>;
@@ -45,16 +45,72 @@ pub trait FPMatrix {
     where
         F: Fn(Vec<f64>) -> Vec<f64>;
     fn reduce<F, T>(&self, init: T, f: F) -> f64
+    where
+        F: Fn(f64, f64) -> f64,
+        T: Into<f64> + Copy + Clone;
+    fn zip_with<F>(&self, f: F, other: &Matrix) -> Matrix
+    where
+        F: Fn(f64, f64) -> f64;
+    fn col_reduce<F>(&self, f: F) -> Vec<f64>
+    where
+        F: Fn(Vec<f64>) -> f64;
+    fn row_reduce<F>(&self, f: F) -> Vec<f64>
+    where
+        F: Fn(Vec<f64>) -> f64;
+}
+
+/// Functional Programming tools for Vector in Parallel (Uses Rayon crate)
+pub trait ParallelFPVector {
+    type Scalar;
+
+    fn par_fmap<F>(&self, f: F) -> Self
+    where
+        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync;
+    fn par_reduce<F, T>(&self, init: T, f: F) -> Self::Scalar
+    where
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync,
+        T: Into<Self::Scalar> + Send + Sync + Copy;
+    fn par_zip_with<F>(&self, f: F, other: &Self) -> Self
+    where
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync;
+    fn par_filter<F>(&self, f: F) -> Self
+    where
+        F: Fn(Self::Scalar) -> bool + Send + Sync;
+
+    fn par_take(&self, n: usize) -> Self;
+    fn par_skip(&self, n: usize) -> Self;
+    fn par_sum(&self) -> Self::Scalar;
+    fn par_prod(&self) -> Self::Scalar;
+}
+
+/// Functional Programming for Matrix in Parallel (Uses Rayon crate)
+pub trait ParallelFPMatrix {
+    fn par_fmap<F>(&self, f: F) -> Matrix
+    where
+        F: Fn(f64) -> f64 + Send + Sync;
+    fn par_reduce<F, T>(&self, init: T, f: F) -> f64
     where
         F: Fn(f64, f64) -> f64 + Send + Sync,
         T: Into<f64> + Copy + Clone + Send + Sync;
-    fn zip_with<F>(&self, f: F, other: &Matrix) -> Matrix
+    fn par_zip_with<F>(&self, f: F, other: &Matrix) -> Matrix
     where
         F: Fn(f64, f64) -> f64 + Send + Sync;
-    fn col_reduce<F>(&self, f: F) -> Vec<f64>
+    fn par_col_reduce<F>(&self, f: F) -> Vec<f64>
     where
         F: Fn(Vec<f64>) -> f64 + Send + Sync;
-    fn row_reduce<F>(&self, f: F) -> Vec<f64>
+    fn par_row_reduce<F>(&self, f: F) -> Vec<f64>
     where
         F: Fn(Vec<f64>) -> f64 + Send + Sync;
+    fn par_take_row(&self, n: usize) -> Matrix;
+    fn par_take_col(&self, n: usize) -> Matrix;
+    fn par_skip_row(&self, n: usize) -> Matrix;
+    fn par_skip_col(&self, n: usize) -> Matrix;
+
+    // These are not in parallel:
+    // fn col_map<F>(&self, f: F) -> Matrix
+    // where
+    //     F: Fn(Vec<f64>) -> Vec<f64>;
+    // fn row_map<F>(&self, f: F) -> Matrix
+    // where
+    //     F: Fn(Vec<f64>) -> Vec<f64>;
 }
diff --git a/src/traits/general.rs b/src/traits/general.rs
index 2737f97b..7ec7fed1 100644
--- a/src/traits/general.rs
+++ b/src/traits/general.rs
@@ -8,3 +8,16 @@ pub trait Algorithm {
     fn min(&self) -> f64;
     fn swap_with_perm(&mut self, p: &Vec<(usize, usize)>);
 }
+
+/// Some algorithms for Vector in Parallel
+pub trait ParallelAlgorithm {
+    fn par_rank(&self) -> Vec<usize>;
+    fn par_arg_max(&self) -> usize;
+    fn par_arg_min(&self) -> usize;
+    fn par_max(&self) -> f64;
+    fn par_min(&self) -> f64;
+
+    // Not implemented in parallel:
+    // fn par_sign(&self) -> f64;
+    // fn par_swap_with_perm(&mut self, p: &Vec<(usize, usize)>);
+}
diff --git a/src/traits/math.rs b/src/traits/math.rs
index 228aad9e..9531ba05 100644
--- a/src/traits/math.rs
+++ b/src/traits/math.rs
@@ -97,3 +97,35 @@ impl Normed for f64 {
         self / self.abs()
     }
 }
+
+// =============================================================================
+// Implementation for parallel traits
+// =============================================================================
+
+/// Mathematical Vector in Parallel
+pub trait ParallelVector {
+    type Scalar;
+    fn par_add_vec(&self, rhs: &Self) -> Self;
+    fn par_sub_vec(&self, rhs: &Self) -> Self;
+}
+
+/// Normed Vector in Parallel
+pub trait ParallelNormed: Vector {
+    type UnsignedScalar;
+    fn par_norm(&self, kind: Norm) -> Self::UnsignedScalar;
+}
+
+/// Inner product Vector in Parallel
+pub trait ParallelInnerProduct: ParallelNormed {
+    fn par_dot(&self, rhs: &Self) -> Self::Scalar;
+}
+
+/// Matrix Products in Parallel
+pub trait ParallelMatrixProduct {
+    fn par_hadamard(&self, other: &Self) -> Matrix;
+}
+
+/// Vector Products in Parallel
+pub trait ParallelVectorProduct: Vector {
+    fn par_cross(&self, other: &Self) -> Self;
+}
diff --git a/src/traits/mod.rs b/src/traits/mod.rs
index 92b224a6..2709f2db 100644
--- a/src/traits/mod.rs
+++ b/src/traits/mod.rs
@@ -1,3 +1,4 @@
+pub mod float;
 pub mod fp;
 pub mod general;
 pub mod math;
@@ -6,4 +7,3 @@ pub mod num;
 pub mod pointer;
 pub mod stable;
 pub mod sugar;
-pub mod float;
diff --git a/src/traits/mutable.rs b/src/traits/mutable.rs
index a8ef80a9..fccb56ce 100644
--- a/src/traits/mutable.rs
+++ b/src/traits/mutable.rs
@@ -4,10 +4,10 @@ pub trait MutFP {
     type Scalar;
     fn mut_map<F>(&mut self, f: F)
     where
-        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync;
+        F: Fn(Self::Scalar) -> Self::Scalar;
     fn mut_zip_with<F>(&mut self, f: F, other: &Self)
     where
-        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync;
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar;
 }
 
 pub trait MutMatrix {
@@ -16,3 +16,14 @@ pub trait MutMatrix {
     unsafe fn swap(&mut self, idx1: usize, idx2: usize, shape: Shape);
     unsafe fn swap_with_perm(&mut self, p: &Vec<(usize, usize)>, shape: Shape);
 }
+
+// Mutable trait for Vector in Parallel (Uses Rayon crate)
+pub trait ParallelMutFP {
+    type Scalar;
+    fn par_mut_map<F>(&mut self, f: F)
+    where
+        F: Fn(Self::Scalar) -> Self::Scalar + Send + Sync;
+    fn par_mut_zip_with<F>(&mut self, f: F, other: &Self)
+    where
+        F: Fn(Self::Scalar, Self::Scalar) -> Self::Scalar + Send + Sync;
+}
diff --git a/src/traits/sugar.rs b/src/traits/sugar.rs
index 7bb9b989..e1ce7f1e 100644
--- a/src/traits/sugar.rs
+++ b/src/traits/sugar.rs
@@ -11,9 +11,7 @@ where
         + Add<Self::Scalar, Output = Self::Scalar>
         + Sub<Self::Scalar, Output = Self::Scalar>
         + Mul<Self::Scalar, Output = Self::Scalar>
-        + Div<Self::Scalar, Output = Self::Scalar>
-        + Send
-        + Sync,
+        + Div<Self::Scalar, Output = Self::Scalar>,
 {
     //type Scalar;
     fn add_v(&self, v: &Self) -> Self {

From 646d2d00879a9b5839a3b54425a16a9c8588d8f3 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Tue, 1 Oct 2024 00:33:31 +0200
Subject: [PATCH 09/17] Added further parallel procedures

- Using Rayon to parallelize further traits
for matrix and vector
---
 src/structure/matrix.rs | 184 ++++++++++++++++++++++++++++------------
 src/traits/math.rs      |   1 +
 2 files changed, 130 insertions(+), 55 deletions(-)

diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index af3d98e8..d671156e 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -615,7 +615,9 @@ use serde::{Deserialize, Serialize};
 
 use crate::rayon::prelude::*;
 use crate::traits::fp::ParallelFPMatrix;
-use crate::traits::math::{ParallelMatrixProduct, ParallelNormed, ParallelVector};
+use crate::traits::math::{
+    ParallelInnerProduct, ParallelMatrixProduct, ParallelNormed, ParallelVector,
+};
 
 pub use self::Shape::{Col, Row};
 use crate::numerical::eigen::{eigen, EigenMethod};
@@ -881,7 +883,7 @@ pub fn par_ml_matrix(s: &str) -> Matrix where {
     let str_rows: Vec<&str> = s.split(';').collect();
     let r = str_rows.len();
     let str_data = str_rows
-        .into_iter()
+        .into_par_iter()
         .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
         .collect::<Vec<Vec<&str>>>();
     let c = str_data[0].len();
@@ -1254,10 +1256,10 @@ impl Matrix {
     ///
     /// fn main() {
     ///     let a = matrix(c!(1,2,3,4), 2, 2, Row);
-    ///     assert_eq!(a.pub_col(0), c!(1,3));
+    ///     assert_eq!(a.par_col(0), c!(1,3));
     /// }
     /// ```
-    pub fn pub_col(&self, index: usize) -> Vec<f64> {
+    pub fn par_col(&self, index: usize) -> Vec<f64> {
         assert!(index < self.col);
 
         let container = (0..self.row)
@@ -1299,10 +1301,10 @@ impl Matrix {
     ///
     /// fn main() {
     ///     let a = matrix(c!(1,2,3,4), 2, 2, Row);
-    ///     assert_eq!(a.pub_row(0), c!(1,2));
+    ///     assert_eq!(a.par_row(0), c!(1,2));
     /// }
     /// ```
-    pub fn pub_row(&self, index: usize) -> Vec<f64> {
+    pub fn par_row(&self, index: usize) -> Vec<f64> {
         assert!(index < self.row);
 
         let container = (0..self.col)
@@ -1374,6 +1376,23 @@ impl Matrix {
         }
     }
 
+    /// Transpose in Parallel
+    ///
+    /// # Examples
+    /// ```
+    /// use peroxide::fuga::*;
+    ///
+    /// let a = par_matrix(vec![1,2,3,4], 2, 2, Row);
+    /// println!("{}", a.par_transpose()); // [[1,3],[2,4]]
+    /// assert_eq!(a.par_transpose(), matrix(vec![1,3,2,4], 2, 2, Row))
+    /// ```
+    pub fn par_transpose(&self) -> Self {
+        match self.shape {
+            Row => par_matrix(self.data.clone(), self.col, self.row, Col),
+            Col => par_matrix(self.data.clone(), self.col, self.row, Row),
+        }
+    }
+
     /// R-like transpose function
     ///
     /// # Examples
@@ -1391,6 +1410,23 @@ impl Matrix {
         self.transpose()
     }
 
+    /// R-like transpose function in parallel
+    ///
+    /// # Examples
+    /// ```
+    /// #[macro_use]
+    /// extern crate peroxide;
+    /// use peroxide::fuga::*;
+    ///
+    /// fn main() {
+    ///     let a = matrix!(1;4;1, 2, 2, Row);
+    ///     assert_eq!(a.transpose(), a.par_t());
+    /// }
+    /// ```
+    pub fn par_t(&self) -> Self {
+        self.par_transpose()
+    }
+
     /// Write to CSV
     ///
     /// # Examples
@@ -1603,7 +1639,7 @@ impl Matrix {
         mat
     }
 
-    /// From index operations
+    /// From index operations in parallel
     pub fn par_from_index<F, G>(f: F, size: (usize, usize)) -> Matrix
     where
         F: Fn(usize, usize) -> G + Copy + Send + Sync,
@@ -1948,7 +1984,7 @@ impl ParallelVector for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
-                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
                 result
                     .data
                     .par_iter_mut()
@@ -1984,7 +2020,7 @@ impl ParallelVector for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
-                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
                 result
                     .data
                     .par_iter_mut()
@@ -1999,10 +2035,32 @@ impl ParallelVector for Matrix {
             }
         }
     }
+
+    fn par_mul_scalar(&self, other: Self::Scalar) -> Self {
+        match () {
+            #[cfg(feature = "O3")]
+            () => {
+                let x = &self.data;
+                let mut y = vec![0f64; x.len()];
+                let a_f64 = other;
+                let n_i32 = x.len() as i32;
+
+                unsafe {
+                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
+                }
+                matrix(y, self.row, self.col, self.shape)
+            }
+            _ => {
+                let scalar = other;
+                self.par_fmap(|x| x * scalar)
+            }
+        }
+    }
 }
 
 impl Normed for Matrix {
     type UnsignedScalar = f64;
+
     fn norm(&self, kind: Norm) -> f64 {
         match kind {
             Norm::F => {
@@ -2147,6 +2205,17 @@ impl InnerProduct for Matrix {
     }
 }
 
+/// Frobenius inner product in parallel
+impl ParallelInnerProduct for Matrix {
+    fn par_dot(&self, rhs: &Self) -> f64 {
+        if self.shape == rhs.shape {
+            self.data.par_dot(&rhs.data)
+        } else {
+            self.data.par_dot(&rhs.change_shape().data)
+        }
+    }
+}
+
 /// TODO: Transpose
 
 /// Matrix as Linear operator for Vector
@@ -2302,25 +2371,27 @@ impl Add<Matrix> for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
-                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                // for i in 0..self.row {
-                //     for j in 0..self.col {
-                //         result[(i, j)] += other[(i, j)];
-                //     }
-                // }
-                // result
-                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
+                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                for i in 0..self.row {
+                    for j in 0..self.col {
+                        result[(i, j)] += other[(i, j)];
+                    }
+                }
                 result
-                    .data
-                    .par_iter_mut()
-                    .enumerate()
-                    .for_each(|(idx, value)| {
-                        let i = idx / self.col;
-                        let j = idx % self.col;
 
-                        *value += other[(i, j)];
-                    });
-                result
+                // Note: Parallel version - how to add the implementation below?
+                // let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
+                // result
+                //     .data
+                //     .par_iter_mut()
+                //     .enumerate()
+                //     .for_each(|(idx, value)| {
+                //         let i = idx / self.col;
+                //         let j = idx % self.col;
+
+                //         *value += other[(i, j)];
+                //     });
+                // result
             }
         }
     }
@@ -2511,11 +2582,12 @@ impl Neg for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => matrix(
-                // self.data.into_iter().map(|x: f64| -x).collect::<Vec<f64>>(),
-                self.data
-                    .into_par_iter()
-                    .map(|x: f64| -x)
-                    .collect::<Vec<f64>>(),
+                self.data.into_iter().map(|x: f64| -x).collect::<Vec<f64>>(),
+                // Note: how to add parallel implementation below?
+                // self.data
+                //     .into_par_iter()
+                //     .map(|x: f64| -x)
+                //     .collect::<Vec<f64>>(),
                 self.row,
                 self.col,
                 self.shape,
@@ -2541,16 +2613,17 @@ impl<'a> Neg for &'a Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => matrix(
-                // self.data
-                //     .clone()
-                //     .into_iter()
-                //     .map(|x: f64| -x)
-                //     .collect::<Vec<f64>>(),
                 self.data
                     .clone()
-                    .into_par_iter()
+                    .into_iter()
                     .map(|x: f64| -x)
                     .collect::<Vec<f64>>(),
+                // Note: how to add parallel implementation below?
+                // self.data
+                //     .clone()
+                //     .into_par_iter()
+                //     .map(|x: f64| -x)
+                //     .collect::<Vec<f64>>(),
                 self.row,
                 self.col,
                 self.shape,
@@ -2595,25 +2668,26 @@ impl Sub<Matrix> for Matrix {
                 matrix(y, self.row, self.col, self.shape)
             }
             _ => {
-                // let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
-                // for i in 0..self.row {
-                //     for j in 0..self.col {
-                //         result[(i, j)] -= other[(i, j)];
-                //     }
-                // }
-                // result
-                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
-                result
-                    .data
-                    .par_iter_mut()
-                    .enumerate()
-                    .for_each(|(idx, value)| {
-                        let i = idx / self.col;
-                        let j = idx % self.col;
-
-                        *value -= other[(i, j)];
-                    });
+                let mut result = matrix(self.data.clone(), self.row, self.col, self.shape);
+                for i in 0..self.row {
+                    for j in 0..self.col {
+                        result[(i, j)] -= other[(i, j)];
+                    }
+                }
                 result
+                // Note: how to add parallel implementation below?
+                // let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
+                // result
+                //     .data
+                //     .par_iter_mut()
+                //     .enumerate()
+                //     .for_each(|(idx, value)| {
+                //         let i = idx / self.col;
+                //         let j = idx % self.col;
+
+                //         *value -= other[(i, j)];
+                //     });
+                // result
             }
         }
     }
diff --git a/src/traits/math.rs b/src/traits/math.rs
index 9531ba05..2100fd1b 100644
--- a/src/traits/math.rs
+++ b/src/traits/math.rs
@@ -107,6 +107,7 @@ pub trait ParallelVector {
     type Scalar;
     fn par_add_vec(&self, rhs: &Self) -> Self;
     fn par_sub_vec(&self, rhs: &Self) -> Self;
+    fn par_mul_scalar(&self, rhs: Self::Scalar) -> Self;
 }
 
 /// Normed Vector in Parallel

From cdb8fdb3c2a1c8282cae5672877cb8c3bd568cdc Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sat, 5 Oct 2024 17:20:29 +0200
Subject: [PATCH 10/17] Added benchmarking

- Benchmarking Rayon usage for matrix
---
 Cargo.toml                                 |  8 +++-
 benches/parallel_rayon/matrix_benchmark.rs | 47 ++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 benches/parallel_rayon/matrix_benchmark.rs

diff --git a/Cargo.toml b/Cargo.toml
index 18f5455b..39c51a7a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ keywords = ["Numeric", "Science", "Dataframe", "Plot", "LinearAlgebra"]
 exclude = [
     "example_data/",
     "src/bin/",
-    "benches/",
+    # "benches/",
     "example/",
     "test_data/",
     "peroxide-ad2",
@@ -25,6 +25,7 @@ maintenance = { status = "actively-developed" }
 
 [dev-dependencies]
 float-cmp = "0.9"
+criterion = { version = "0.5.1", features = ["html_reports"] }
 
 [dependencies]
 csv = { version = "1.3", optional = true, default-features = false }
@@ -68,3 +69,8 @@ plot = ["pyo3"]
 nc = ["netcdf"]
 parquet = ["arrow2"]
 complex = ["num-complex", "matrixmultiply/cgemm"]
+
+[[bench]]
+path = "benches/parallel_rayon/matrix_benchmark.rs"
+name = "matrix_benchmark"
+harness = false
diff --git a/benches/parallel_rayon/matrix_benchmark.rs b/benches/parallel_rayon/matrix_benchmark.rs
new file mode 100644
index 00000000..16148e92
--- /dev/null
+++ b/benches/parallel_rayon/matrix_benchmark.rs
@@ -0,0 +1,47 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use peroxide::fuga::*;
+
+pub fn par_matrix_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 630.92 µs
+    cr.bench_function("ser_matrix_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row)))
+    });
+
+    // Result: 1000x1000 matrix: 9.6995 ms
+    cr.bench_function("par_matrix_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row)))
+    });
+}
+
+pub fn par_py_matrix_benchmark(cr: &mut Criterion) {
+    let v: Vec<Vec<f64>> = (0..1000)
+        .into_iter()
+        .map(|i: i32| {
+            (0..1000)
+                .into_iter()
+                .map(|j| 2.0 * (i as f64) * (j as f64))
+                .collect::<Vec<f64>>()
+        })
+        .collect::<Vec<Vec<f64>>>();
+
+    // Result: 1000x1000 matrix: 3.9858 ms
+    cr.bench_function("ser_py_matrix_bench", |b| {
+        b.iter(|| black_box(py_matrix(v.clone())))
+    });
+
+    // Result: 1000x1000 matrix: 24.654 ms
+    cr.bench_function("par_py_matrix_bench", |b| {
+        b.iter(|| black_box(par_py_matrix(v.clone())))
+    });
+}
+
+criterion_group!(
+    benches,
+    /*par_matrix_benchmark,*/ par_py_matrix_benchmark
+);
+criterion_main!(benches);

From 8a536be55bc64aefa9515d25b4c0d7b1df2c8a5a Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sat, 5 Oct 2024 17:53:32 +0200
Subject: [PATCH 11/17] Adding benchmarks for matrix

- Adding benchmark for rayon usage in matrix
---
 benches/parallel_rayon/matrix_benchmark.rs | 107 ++++++++++++++++++++-
 1 file changed, 106 insertions(+), 1 deletion(-)

diff --git a/benches/parallel_rayon/matrix_benchmark.rs b/benches/parallel_rayon/matrix_benchmark.rs
index 16148e92..c859bf1a 100644
--- a/benches/parallel_rayon/matrix_benchmark.rs
+++ b/benches/parallel_rayon/matrix_benchmark.rs
@@ -40,8 +40,113 @@ pub fn par_py_matrix_benchmark(cr: &mut Criterion) {
     });
 }
 
+pub fn par_matrix_change_shape_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 7.6453 ms
+    cr.bench_function("ser_matrix_change_shape_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).change_shape()))
+    });
+
+    // Result: 1000x1000 matrix: 9.8586 ms
+    cr.bench_function("par_matrix_change_shape_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_change_shape()))
+    });
+}
+
+pub fn par_matrix_extract_row_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 571.90 µs
+    cr.bench_function("ser_matrix_extract_row_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).row(100)))
+    });
+
+    // Result: 1000x1000 matrix: 10.440 ms
+    cr.bench_function("par_matrix_extract_row_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_row(100)))
+    });
+}
+
+pub fn par_matrix_from_index_benchmark(cr: &mut Criterion) {
+    let f = |x: usize, y: usize| 2.0 * (x as f64) * (y as f64);
+    let size: (usize, usize) = (1000, 1000);
+
+    // Result: 1000x1000 matrix: 2.3662 ms
+    cr.bench_function("ser_matrix_from_index_bench", |b| {
+        b.iter(|| black_box(Matrix::from_index(f, size)))
+    });
+
+    // Result: 1000x1000 matrix:  2.3355 ms
+    cr.bench_function("par_matrix_from_index_bench", |b| {
+        b.iter(|| black_box(Matrix::from_index(f, size)))
+    });
+}
+
+pub fn par_matrix_to_vec_and_diag_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 5.0333 ms
+    cr.bench_function("ser_matrix_to_vec_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).to_vec()))
+    });
+
+    // Result: 1000x1000 matrix: 10.625 ms
+    cr.bench_function("par_matrix_to_vec_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_to_vec()))
+    });
+
+    // Result: 1000x1000 matrix: 3.0501 ms
+    cr.bench_function("ser_matrix_to_diag_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).to_diag()))
+    });
+
+    // Result: 1000x1000 matrix: 16.362 ms
+    cr.bench_function("par_matrix_to_diag_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_to_diag()))
+    });
+}
+
+pub fn par_matrix_submat_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 1.5658 ms
+    cr.bench_function("ser_matrix_submat_bench", |b| {
+        b.iter(|| {
+            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).submat((100, 100), (900, 900)))
+        })
+    });
+
+    // Result: 1000x1000 matrix: 13.000 ms
+    cr.bench_function("par_matrix_submat_bench", |b| {
+        b.iter(|| {
+            black_box(
+                par_matrix(v.clone(), 1000, 1000, Shape::Row).par_submat((100, 100), (900, 900)),
+            )
+        })
+    });
+}
+
 criterion_group!(
     benches,
-    /*par_matrix_benchmark,*/ par_py_matrix_benchmark
+    /*par_matrix_benchmark,*/
+    /*par_py_matrix_benchmark,*/
+    /*par_matrix_change_shape_benchmark,*/
+    /*par_matrix_extract_row_benchmark,*/
+    /*par_matrix_from_index_benchmark,*/
+    /*par_matrix_to_vec_and_diag_benchmark,*/
+    par_matrix_submat_benchmark,
 );
 criterion_main!(benches);

From c104652c67f1537a9090a425926039afa7765b53 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Sat, 5 Oct 2024 18:41:47 +0200
Subject: [PATCH 12/17] Added further benchmarks for matrix

- Added benchmark for Rayon usage in matrix
- Saved results in benches/data
---
 .../data/rayon_matrix_benchmark_results.md    | 194 ++++++++++++++
 benches/parallel_rayon/matrix_benchmark.rs    | 249 +++++++++++++++++-
 2 files changed, 436 insertions(+), 7 deletions(-)
 create mode 100644 benches/data/rayon_matrix_benchmark_results.md

diff --git a/benches/data/rayon_matrix_benchmark_results.md b/benches/data/rayon_matrix_benchmark_results.md
new file mode 100644
index 00000000..f85feae8
--- /dev/null
+++ b/benches/data/rayon_matrix_benchmark_results.md
@@ -0,0 +1,194 @@
+Lib used for benchmarking: Criterion
+Matrix size: 1000x1000
+
+Running benches/parallel_rayon/matrix_benchmark.rs
+
+ser_matrix_bench        time:   [535.12 µs 544.51 µs 556.68 µs]
+Found 11 outliers among 100 measurements (11.00%)
+  4 (4.00%) high mild
+  7 (7.00%) high severe
+
+par_matrix_bench        time:   [5.0912 ms 5.1431 ms 5.1995 ms]
+Found 7 outliers among 100 measurements (7.00%)
+  1 (1.00%) low mild
+  5 (5.00%) high mild
+  1 (1.00%) high severe
+
+ser_py_matrix_bench     time:   [4.3100 ms 4.3309 ms 4.3544 ms]
+Found 7 outliers among 100 measurements (7.00%)
+  2 (2.00%) high mild
+  5 (5.00%) high severe
+
+par_py_matrix_bench     time:   [11.667 ms 11.789 ms 11.920 ms]
+Found 10 outliers among 100 measurements (10.00%)
+  6 (6.00%) high mild
+  4 (4.00%) high severe
+
+ser_matrix_change_shape_bench
+                        time:   [7.3630 ms 7.4075 ms 7.4608 ms]
+Found 5 outliers among 100 measurements (5.00%)
+  1 (1.00%) high mild
+  4 (4.00%) high severe
+
+par_matrix_change_shape_bench
+                        time:   [10.276 ms 10.385 ms 10.499 ms]
+Found 3 outliers among 100 measurements (3.00%)
+  2 (2.00%) high mild
+  1 (1.00%) high severe
+
+ser_matrix_extract_row_bench
+                        time:   [613.39 µs 622.44 µs 633.72 µs]
+Found 7 outliers among 100 measurements (7.00%)
+  7 (7.00%) high severe
+
+par_matrix_extract_row_bench
+                        time:   [5.4321 ms 5.4851 ms 5.5399 ms]
+Found 4 outliers among 100 measurements (4.00%)
+  4 (4.00%) high mild
+
+ser_matrix_from_index_bench
+                        time:   [2.4174 ms 2.4490 ms 2.4851 ms]
+Found 14 outliers among 100 measurements (14.00%)
+  1 (1.00%) high mild
+  13 (13.00%) high severe
+
+par_matrix_from_index_bench
+                        time:   [2.3912 ms 2.4090 ms 2.4304 ms]
+Found 9 outliers among 100 measurements (9.00%)
+  2 (2.00%) high mild
+  7 (7.00%) high severe
+
+ser_matrix_to_vec_bench time:   [2.4800 ms 2.5082 ms 2.5423 ms]
+Found 10 outliers among 100 measurements (10.00%)
+  4 (4.00%) high mild
+  6 (6.00%) high severe
+
+par_matrix_to_vec_bench time:   [6.4041 ms 6.4618 ms 6.5250 ms]
+Found 6 outliers among 100 measurements (6.00%)
+  5 (5.00%) high mild
+  1 (1.00%) high severe
+
+ser_matrix_to_diag_bench
+                        time:   [2.4335 ms 2.4526 ms 2.4750 ms]
+Found 14 outliers among 100 measurements (14.00%)
+  6 (6.00%) high mild
+  8 (8.00%) high severe
+
+par_matrix_to_diag_bench
+                        time:   [13.514 ms 13.684 ms 13.868 ms]
+Found 10 outliers among 100 measurements (10.00%)
+  7 (7.00%) high mild
+  3 (3.00%) high severe
+
+Benchmarking ser_matrix_submat_bench: Warming up for 3.0000 s
+Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 8.3s, enable flat sampling, or reduce sample count to 50.
+ser_matrix_submat_bench time:   [1.6077 ms 1.6243 ms 1.6451 ms]
+Found 16 outliers among 100 measurements (16.00%)
+  3 (3.00%) high mild
+  13 (13.00%) high severe
+
+par_matrix_submat_bench time:   [10.611 ms 10.761 ms 10.942 ms]
+Found 5 outliers among 100 measurements (5.00%)
+  3 (3.00%) high mild
+  2 (2.00%) high severe
+
+ser_matrix_add_vec_bench
+                        time:   [7.3077 ms 7.3485 ms 7.3946 ms]
+Found 12 outliers among 100 measurements (12.00%)
+  2 (2.00%) high mild
+  10 (10.00%) high severe
+
+par_matrix_add_vec_bench
+                        time:   [11.331 ms 11.480 ms 11.636 ms]
+Found 2 outliers among 100 measurements (2.00%)
+  2 (2.00%) high mild
+
+ser_matrix_norm_bench   time:   [5.1600 ms 5.1864 ms 5.2165 ms]
+Found 7 outliers among 100 measurements (7.00%)
+  1 (1.00%) high mild
+  6 (6.00%) high severe
+
+par_matrix_norm_bench   time:   [2.6565 ms 2.6810 ms 2.7091 ms]
+Found 5 outliers among 100 measurements (5.00%)
+  2 (2.00%) high mild
+  3 (3.00%) high severe
+
+Benchmarking ser_matrix_norm_bench #2: Warming up for 3.0000 s
+Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 8.9s, enable flat sampling, or reduce sample count to 50.
+ser_matrix_norm_bench #2
+                        time:   [1.7262 ms 1.7391 ms 1.7541 ms]
+Found 15 outliers among 100 measurements (15.00%)
+  10 (10.00%) high mild
+  5 (5.00%) high severe
+
+par_matrix_norm_bench #2
+                        time:   [6.7071 ms 6.7883 ms 6.8703 ms]
+Found 1 outliers among 100 measurements (1.00%)
+  1 (1.00%) high mild
+
+ser_matrix_norm_bench #3
+                        time:   [9.7582 ms 9.9006 ms 10.057 ms]
+Found 12 outliers among 100 measurements (12.00%)
+  5 (5.00%) high mild
+  7 (7.00%) high severe
+
+par_matrix_norm_bench #3
+                        time:   [9.3004 ms 9.4088 ms 9.5239 ms]
+Found 1 outliers among 100 measurements (1.00%)
+  1 (1.00%) high mild
+
+ser_matrix_inner_prod_bench
+                        time:   [5.2730 ms 5.3590 ms 5.4583 ms]
+Found 14 outliers among 100 measurements (14.00%)
+  3 (3.00%) high mild
+  11 (11.00%) high severe
+
+par_matrix_inner_prod_bench
+                        time:   [5.0987 ms 5.1644 ms 5.2402 ms]
+Found 7 outliers among 100 measurements (7.00%)
+  3 (3.00%) high mild
+  4 (4.00%) high severe
+
+ser_matrix_hadamard_bench
+                        time:   [5.6521 ms 5.6870 ms 5.7262 ms]
+Found 12 outliers among 100 measurements (12.00%)
+  3 (3.00%) high mild
+  9 (9.00%) high severe
+
+par_matrix_hadamard_bench
+                        time:   [14.155 ms 14.335 ms 14.527 ms]
+Found 4 outliers among 100 measurements (4.00%)
+  3 (3.00%) high mild
+  1 (1.00%) high severe
+
+ser_matrix_take_row_bench
+                        time:   [3.7894 ms 3.8234 ms 3.8613 ms]
+Found 15 outliers among 100 measurements (15.00%)
+  7 (7.00%) high mild
+  8 (8.00%) high severe
+
+par_matrix_take_row_bench
+                        time:   [8.4008 ms 8.5171 ms 8.6523 ms]
+Found 9 outliers among 100 measurements (9.00%)
+  6 (6.00%) high mild
+  3 (3.00%) high severe
+
+ser_matrix_fpmap_bench  time:   [3.2526 ms 3.2739 ms 3.2977 ms]
+Found 12 outliers among 100 measurements (12.00%)
+  2 (2.00%) high mild
+  10 (10.00%) high severe
+
+par_matrix_fpmap_bench  time:   [10.604 ms 10.765 ms 10.937 ms]
+Found 11 outliers among 100 measurements (11.00%)
+  8 (8.00%) high mild
+  3 (3.00%) high severe
+
+ser_matrix_reduce_bench time:   [2.6748 ms 2.6964 ms 2.7201 ms]
+Found 9 outliers among 100 measurements (9.00%)
+  6 (6.00%) high mild
+  3 (3.00%) high severe
+
+par_matrix_reduce_bench time:   [6.2453 ms 6.3198 ms 6.4034 ms]
+Found 6 outliers among 100 measurements (6.00%)
+  4 (4.00%) high mild
+  2 (2.00%) high severe
diff --git a/benches/parallel_rayon/matrix_benchmark.rs b/benches/parallel_rayon/matrix_benchmark.rs
index c859bf1a..a62cd539 100644
--- a/benches/parallel_rayon/matrix_benchmark.rs
+++ b/benches/parallel_rayon/matrix_benchmark.rs
@@ -1,5 +1,11 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use peroxide::fuga::*;
+use peroxide::{
+    fuga::*,
+    traits::{
+        fp::ParallelFPMatrix,
+        math::{ParallelInnerProduct, ParallelMatrixProduct, ParallelNormed, ParallelVector},
+    },
+};
 
 pub fn par_matrix_benchmark(cr: &mut Criterion) {
     let v: Vec<f64> = (0..1000000)
@@ -139,14 +145,243 @@ pub fn par_matrix_submat_benchmark(cr: &mut Criterion) {
     });
 }
 
+pub fn par_matrix_add_vec_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    let w: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 3.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 7.1774 ms
+    cr.bench_function("ser_matrix_add_vec_bench", |b| {
+        b.iter(|| {
+            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).add_vec(&matrix(
+                w.clone(),
+                1000,
+                1000,
+                Shape::Row,
+            )))
+        })
+    });
+
+    // Result: 1000x1000 matrix: 11.565 ms
+    cr.bench_function("par_matrix_add_vec_bench", |b| {
+        b.iter(|| {
+            black_box(
+                matrix(v.clone(), 1000, 1000, Shape::Row).par_add_vec(&matrix(
+                    w.clone(),
+                    1000,
+                    1000,
+                    Shape::Row,
+                )),
+            )
+        })
+    });
+}
+
+// Check: better parallel results (ran test 6 times)
+pub fn par_matrix_norm_lpq_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: [5.5969 ms 5.7555 ms 5.9515 ms 6.0843 ms 6.3072 ms 6.5636 ms]
+    cr.bench_function("ser_matrix_norm_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).norm(Norm::Lpq(4.0, 2.0))))
+    });
+
+    // Result: 1000x1000 matrix: [3.1796 ms 3.2714 ms 3.3714 ms 3.6123 ms 3.7398 ms 3.8761 ms]
+    cr.bench_function("par_matrix_norm_bench", |b| {
+        b.iter(|| {
+            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).par_norm(Norm::Lpq(4.0, 2.0)))
+        })
+    });
+}
+
+pub fn par_matrix_norm_f_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 1.7414 ms
+    cr.bench_function("ser_matrix_norm_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).norm(Norm::F)))
+    });
+
+    // Result: 1000x1000 matrix: 9.0499 ms
+    cr.bench_function("par_matrix_norm_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).par_norm(Norm::F)))
+    });
+}
+
+pub fn par_matrix_norm_l1_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 9.0287 ms
+    cr.bench_function("ser_matrix_norm_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).norm(Norm::L1)))
+    });
+
+    // Result: 1000x1000 matrix: 10.393 ms
+    cr.bench_function("par_matrix_norm_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).par_norm(Norm::L1)))
+    });
+}
+
+// Check: better parallel results (ran test 6 times)
+pub fn par_matrix_inner_prod_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    let w: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 3.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix:  [5.1075 ms 5.1505 ms 5.2013 ms 5.7617 ms 6.0196 ms 6.3009 ms]
+    cr.bench_function("ser_matrix_inner_prod_bench", |b| {
+        b.iter(|| {
+            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).dot(&matrix(
+                w.clone(),
+                1000,
+                1000,
+                Shape::Row,
+            )))
+        })
+    });
+
+    // Result: 1000x1000 matrix: [4.9931 ms 5.0244 ms 5.0642 ms 5.0322 ms 5.0819 ms 5.1404 ms]
+    cr.bench_function("par_matrix_inner_prod_bench", |b| {
+        b.iter(|| {
+            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).par_dot(&matrix(
+                w.clone(),
+                1000,
+                1000,
+                Shape::Row,
+            )))
+        })
+    });
+}
+
+pub fn par_matrix_hadamard_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    let w: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 3.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix:  5.8664 ms
+    cr.bench_function("ser_matrix_hadamard_bench", |b| {
+        b.iter(|| {
+            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).hadamard(&matrix(
+                w.clone(),
+                1000,
+                1000,
+                Shape::Row,
+            )))
+        })
+    });
+
+    // Result: 1000x1000 matrix: 13.982 ms
+    cr.bench_function("par_matrix_hadamard_bench", |b| {
+        b.iter(|| {
+            black_box(
+                matrix(v.clone(), 1000, 1000, Shape::Row).par_hadamard(&matrix(
+                    w.clone(),
+                    1000,
+                    1000,
+                    Shape::Row,
+                )),
+            )
+        })
+    });
+}
+
+pub fn par_matrix_take_row_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+
+    // Result: 1000x1000 matrix: 3.7958 ms
+    cr.bench_function("ser_matrix_take_row_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).take_row(500)))
+    });
+
+    // Result: 1000x1000 matrix: 9.7897 ms
+    cr.bench_function("par_matrix_take_row_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_take_row(500)))
+    });
+}
+
+pub fn par_matrix_fpmap_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+    let f = |x: f64| 2.0 * x;
+
+    // Result: 1000x1000 matrix: 3.3385 ms
+    cr.bench_function("ser_matrix_fpmap_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).fmap(f)))
+    });
+
+    // Result: 1000x1000 matrix: 9.6310 ms
+    cr.bench_function("par_matrix_fpmap_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_fmap(f)))
+    });
+}
+
+pub fn par_matrix_reduce_benchmark(cr: &mut Criterion) {
+    let v: Vec<f64> = (0..1000000)
+        .into_iter()
+        .map(|i: i32| 2.0 * (i as f64))
+        .collect::<Vec<f64>>();
+    let f = |x: f64, y: f64| 2.0 * x * y;
+
+    // Result: 1000x1000 matrix:  2.8077 ms
+    cr.bench_function("ser_matrix_reduce_bench", |b| {
+        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).reduce(0.0, f)))
+    });
+
+    // Result: 1000x1000 matrix: 9.4877 ms
+    cr.bench_function("par_matrix_reduce_bench", |b| {
+        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_reduce(0.0, f)))
+    });
+}
+
 criterion_group!(
     benches,
-    /*par_matrix_benchmark,*/
-    /*par_py_matrix_benchmark,*/
-    /*par_matrix_change_shape_benchmark,*/
-    /*par_matrix_extract_row_benchmark,*/
-    /*par_matrix_from_index_benchmark,*/
-    /*par_matrix_to_vec_and_diag_benchmark,*/
+    par_matrix_benchmark,
+    par_py_matrix_benchmark,
+    par_matrix_change_shape_benchmark,
+    par_matrix_extract_row_benchmark,
+    par_matrix_from_index_benchmark,
+    par_matrix_to_vec_and_diag_benchmark,
     par_matrix_submat_benchmark,
+    par_matrix_add_vec_benchmark,
+    par_matrix_norm_lpq_benchmark,
+    par_matrix_norm_f_benchmark,
+    par_matrix_norm_l1_benchmark,
+    par_matrix_inner_prod_benchmark,
+    par_matrix_hadamard_benchmark,
+    par_matrix_take_row_benchmark,
+    par_matrix_fpmap_benchmark,
+    par_matrix_reduce_benchmark,
 );
 criterion_main!(benches);

From af76bfcf24a46bed477cfdb5145245927f83041b Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 7 Oct 2024 22:41:12 +0200
Subject: [PATCH 13/17] Removed parallelized code based on benchmarks

---
 benches/parallel_rayon/matrix_benchmark.rs | 268 +-------
 src/structure/matrix.rs                    | 713 +--------------------
 2 files changed, 28 insertions(+), 953 deletions(-)

diff --git a/benches/parallel_rayon/matrix_benchmark.rs b/benches/parallel_rayon/matrix_benchmark.rs
index a62cd539..eaaa1e13 100644
--- a/benches/parallel_rayon/matrix_benchmark.rs
+++ b/benches/parallel_rayon/matrix_benchmark.rs
@@ -1,10 +1,7 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use peroxide::{
     fuga::*,
-    traits::{
-        fp::ParallelFPMatrix,
-        math::{ParallelInnerProduct, ParallelMatrixProduct, ParallelNormed, ParallelVector},
-    },
+    traits::math::{ParallelInnerProduct, ParallelNormed},
 };
 
 pub fn par_matrix_benchmark(cr: &mut Criterion) {
@@ -24,62 +21,6 @@ pub fn par_matrix_benchmark(cr: &mut Criterion) {
     });
 }
 
-pub fn par_py_matrix_benchmark(cr: &mut Criterion) {
-    let v: Vec<Vec<f64>> = (0..1000)
-        .into_iter()
-        .map(|i: i32| {
-            (0..1000)
-                .into_iter()
-                .map(|j| 2.0 * (i as f64) * (j as f64))
-                .collect::<Vec<f64>>()
-        })
-        .collect::<Vec<Vec<f64>>>();
-
-    // Result: 1000x1000 matrix: 3.9858 ms
-    cr.bench_function("ser_py_matrix_bench", |b| {
-        b.iter(|| black_box(py_matrix(v.clone())))
-    });
-
-    // Result: 1000x1000 matrix: 24.654 ms
-    cr.bench_function("par_py_matrix_bench", |b| {
-        b.iter(|| black_box(par_py_matrix(v.clone())))
-    });
-}
-
-pub fn par_matrix_change_shape_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 7.6453 ms
-    cr.bench_function("ser_matrix_change_shape_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).change_shape()))
-    });
-
-    // Result: 1000x1000 matrix: 9.8586 ms
-    cr.bench_function("par_matrix_change_shape_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_change_shape()))
-    });
-}
-
-pub fn par_matrix_extract_row_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 571.90 µs
-    cr.bench_function("ser_matrix_extract_row_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).row(100)))
-    });
-
-    // Result: 1000x1000 matrix: 10.440 ms
-    cr.bench_function("par_matrix_extract_row_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_row(100)))
-    });
-}
-
 pub fn par_matrix_from_index_benchmark(cr: &mut Criterion) {
     let f = |x: usize, y: usize| 2.0 * (x as f64) * (y as f64);
     let size: (usize, usize) = (1000, 1000);
@@ -95,94 +36,6 @@ pub fn par_matrix_from_index_benchmark(cr: &mut Criterion) {
     });
 }
 
-pub fn par_matrix_to_vec_and_diag_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 5.0333 ms
-    cr.bench_function("ser_matrix_to_vec_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).to_vec()))
-    });
-
-    // Result: 1000x1000 matrix: 10.625 ms
-    cr.bench_function("par_matrix_to_vec_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_to_vec()))
-    });
-
-    // Result: 1000x1000 matrix: 3.0501 ms
-    cr.bench_function("ser_matrix_to_diag_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).to_diag()))
-    });
-
-    // Result: 1000x1000 matrix: 16.362 ms
-    cr.bench_function("par_matrix_to_diag_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_to_diag()))
-    });
-}
-
-pub fn par_matrix_submat_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 1.5658 ms
-    cr.bench_function("ser_matrix_submat_bench", |b| {
-        b.iter(|| {
-            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).submat((100, 100), (900, 900)))
-        })
-    });
-
-    // Result: 1000x1000 matrix: 13.000 ms
-    cr.bench_function("par_matrix_submat_bench", |b| {
-        b.iter(|| {
-            black_box(
-                par_matrix(v.clone(), 1000, 1000, Shape::Row).par_submat((100, 100), (900, 900)),
-            )
-        })
-    });
-}
-
-pub fn par_matrix_add_vec_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    let w: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 3.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 7.1774 ms
-    cr.bench_function("ser_matrix_add_vec_bench", |b| {
-        b.iter(|| {
-            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).add_vec(&matrix(
-                w.clone(),
-                1000,
-                1000,
-                Shape::Row,
-            )))
-        })
-    });
-
-    // Result: 1000x1000 matrix: 11.565 ms
-    cr.bench_function("par_matrix_add_vec_bench", |b| {
-        b.iter(|| {
-            black_box(
-                matrix(v.clone(), 1000, 1000, Shape::Row).par_add_vec(&matrix(
-                    w.clone(),
-                    1000,
-                    1000,
-                    Shape::Row,
-                )),
-            )
-        })
-    });
-}
-
 // Check: better parallel results (ran test 6 times)
 pub fn par_matrix_norm_lpq_benchmark(cr: &mut Criterion) {
     let v: Vec<f64> = (0..1000000)
@@ -203,23 +56,6 @@ pub fn par_matrix_norm_lpq_benchmark(cr: &mut Criterion) {
     });
 }
 
-pub fn par_matrix_norm_f_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 1.7414 ms
-    cr.bench_function("ser_matrix_norm_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).norm(Norm::F)))
-    });
-
-    // Result: 1000x1000 matrix: 9.0499 ms
-    cr.bench_function("par_matrix_norm_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).par_norm(Norm::F)))
-    });
-}
-
 pub fn par_matrix_norm_l1_benchmark(cr: &mut Criterion) {
     let v: Vec<f64> = (0..1000000)
         .into_iter()
@@ -274,114 +110,12 @@ pub fn par_matrix_inner_prod_benchmark(cr: &mut Criterion) {
     });
 }
 
-pub fn par_matrix_hadamard_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    let w: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 3.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix:  5.8664 ms
-    cr.bench_function("ser_matrix_hadamard_bench", |b| {
-        b.iter(|| {
-            black_box(matrix(v.clone(), 1000, 1000, Shape::Row).hadamard(&matrix(
-                w.clone(),
-                1000,
-                1000,
-                Shape::Row,
-            )))
-        })
-    });
-
-    // Result: 1000x1000 matrix: 13.982 ms
-    cr.bench_function("par_matrix_hadamard_bench", |b| {
-        b.iter(|| {
-            black_box(
-                matrix(v.clone(), 1000, 1000, Shape::Row).par_hadamard(&matrix(
-                    w.clone(),
-                    1000,
-                    1000,
-                    Shape::Row,
-                )),
-            )
-        })
-    });
-}
-
-pub fn par_matrix_take_row_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 3.7958 ms
-    cr.bench_function("ser_matrix_take_row_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).take_row(500)))
-    });
-
-    // Result: 1000x1000 matrix: 9.7897 ms
-    cr.bench_function("par_matrix_take_row_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_take_row(500)))
-    });
-}
-
-pub fn par_matrix_fpmap_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-    let f = |x: f64| 2.0 * x;
-
-    // Result: 1000x1000 matrix: 3.3385 ms
-    cr.bench_function("ser_matrix_fpmap_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).fmap(f)))
-    });
-
-    // Result: 1000x1000 matrix: 9.6310 ms
-    cr.bench_function("par_matrix_fpmap_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_fmap(f)))
-    });
-}
-
-pub fn par_matrix_reduce_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-    let f = |x: f64, y: f64| 2.0 * x * y;
-
-    // Result: 1000x1000 matrix:  2.8077 ms
-    cr.bench_function("ser_matrix_reduce_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row).reduce(0.0, f)))
-    });
-
-    // Result: 1000x1000 matrix: 9.4877 ms
-    cr.bench_function("par_matrix_reduce_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row).par_reduce(0.0, f)))
-    });
-}
-
 criterion_group!(
     benches,
     par_matrix_benchmark,
-    par_py_matrix_benchmark,
-    par_matrix_change_shape_benchmark,
-    par_matrix_extract_row_benchmark,
     par_matrix_from_index_benchmark,
-    par_matrix_to_vec_and_diag_benchmark,
-    par_matrix_submat_benchmark,
-    par_matrix_add_vec_benchmark,
     par_matrix_norm_lpq_benchmark,
-    par_matrix_norm_f_benchmark,
     par_matrix_norm_l1_benchmark,
     par_matrix_inner_prod_benchmark,
-    par_matrix_hadamard_benchmark,
-    par_matrix_take_row_benchmark,
-    par_matrix_fpmap_benchmark,
-    par_matrix_reduce_benchmark,
 );
 criterion_main!(benches);
diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index d671156e..b66267e6 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -610,18 +610,14 @@ extern crate lapack;
 use blas::{daxpy, dgemm, dgemv};
 #[cfg(feature = "O3")]
 use lapack::{dgecon, dgeqrf, dgesvd, dgetrf, dgetri, dgetrs, dorgqr, dpotrf};
+use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator};
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 
-use crate::rayon::prelude::*;
-use crate::traits::fp::ParallelFPMatrix;
-use crate::traits::math::{
-    ParallelInnerProduct, ParallelMatrixProduct, ParallelNormed, ParallelVector,
-};
-
 pub use self::Shape::{Col, Row};
 use crate::numerical::eigen::{eigen, EigenMethod};
 use crate::structure::dataframe::{Series, TypedVector};
+use crate::traits::math::{ParallelInnerProduct, ParallelNormed};
 use crate::traits::sugar::ScalableMut;
 use crate::traits::{
     fp::{FPMatrix, FPVector},
@@ -760,14 +756,6 @@ where
     matrix(v, r, c, shape)
 }
 
-/// R-like matrix constructor (Explicit ver.) in parallel
-pub fn par_r_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
-where
-    T: Into<f64> + Send + Sync,
-{
-    par_matrix(v, r, c, shape)
-}
-
 /// Python-like matrix constructor
 ///
 /// # Examples
@@ -798,38 +786,6 @@ where
     matrix(data, r, c, Row)
 }
 
-/// Python-like matrix constructor in parallel
-///
-/// # Examples
-/// ```
-/// #[macro_use]
-/// extern crate peroxide;
-/// use peroxide::fuga::*;
-///
-/// fn main() {
-///     let a = par_py_matrix(vec![c!(1,2), c!(3,4)]);
-///     let b = matrix(c!(1,2,3,4), 2, 2, Row);
-///     assert_eq!(a, b);
-/// }
-/// ```
-pub fn par_py_matrix<T>(v: Vec<Vec<T>>) -> Matrix
-where
-    T: Into<f64> + Copy + Send + Sync,
-{
-    let r = v.len();
-    let c = v[0].len();
-    let data = (0..r)
-        .into_par_iter()
-        .flat_map(|i| {
-            (0..c)
-                .into_par_iter()
-                .map(|j| v[i][j].into())
-                .collect::<Vec<f64>>()
-        })
-        .collect::<Vec<f64>>();
-    par_matrix(data, r, c, Row)
-}
-
 /// Matlab-like matrix constructor
 ///
 /// # Examples
@@ -852,7 +808,6 @@ pub fn ml_matrix(s: &str) -> Matrix where {
         .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
         .collect::<Vec<Vec<&str>>>();
     let c = str_data[0].len();
-
     let data = str_data
         .into_iter()
         .flat_map(|t| {
@@ -861,45 +816,9 @@ pub fn ml_matrix(s: &str) -> Matrix where {
                 .collect::<Vec<f64>>()
         })
         .collect::<Vec<f64>>();
-
     matrix(data, r, c, Row)
 }
 
-/// Matlab-like matrix constructor in parallel
-///
-/// # Examples
-/// ```
-/// #[macro_use]
-/// extern crate peroxide;
-/// use peroxide::fuga::*;
-///
-/// fn main() {
-///     let a = par_ml_matrix("1 2; 3 4");
-///     let b = matrix(c!(1,2,3,4), 2, 2, Row);
-///     assert_eq!(a, b);
-/// }
-/// ```
-pub fn par_ml_matrix(s: &str) -> Matrix where {
-    let str_rows: Vec<&str> = s.split(';').collect();
-    let r = str_rows.len();
-    let str_data = str_rows
-        .into_par_iter()
-        .map(|x| x.trim().split(' ').collect::<Vec<&str>>())
-        .collect::<Vec<Vec<&str>>>();
-    let c = str_data[0].len();
-
-    let data = str_data
-        .into_par_iter()
-        .flat_map(|t| {
-            t.into_par_iter()
-                .map(|x| x.parse::<f64>().unwrap())
-                .collect::<Vec<f64>>()
-        })
-        .collect::<Vec<f64>>();
-
-    par_matrix(data, r, c, Row)
-}
-
 /// Pretty Print
 impl fmt::Display for Matrix {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -911,17 +830,9 @@ impl fmt::Display for Matrix {
 impl PartialEq for Matrix {
     fn eq(&self, other: &Matrix) -> bool {
         if self.shape == other.shape {
-            // self.data
-            //     .clone()
-            //     .into_iter()
-            //     .zip(other.data.clone())
-            //     .all(|(x, y)| nearly_eq(x, y))
-            //     && self.row == other.row
-
-            // Note: the sequential version in this case can be replaced by the parallel version?
             self.data
                 .clone()
-                .into_par_iter()
+                .into_iter()
                 .zip(other.data.clone())
                 .all(|(x, y)| nearly_eq(x, y))
                 && self.row == other.row
@@ -1015,7 +926,7 @@ impl Matrix {
         }
     }
 
-    /// Change Bindings in Parallel
+    /// Change Bindings Mutably
     ///
     /// `Row` -> `Col` or `Col` -> `Row`
     ///
@@ -1025,50 +936,9 @@ impl Matrix {
     ///
     /// let a = matrix(vec![1,2,3,4],2,2,Row);
     /// assert_eq!(a.shape, Row);
-    /// let b = a.par_change_shape();
+    /// let b = a.change_shape();
     /// assert_eq!(b.shape, Col);
     /// ```
-    pub fn par_change_shape(&self) -> Self {
-        let r = self.row;
-        let c = self.col;
-        assert_eq!(r * c, self.data.len());
-        let l = r * c - 1;
-        let ref_data = &self.data;
-
-        match self.shape {
-            Row => {
-                let mut data = (0..=l)
-                    .into_par_iter()
-                    .map(|i| ref_data[(i * c) % l])
-                    .collect::<Vec<f64>>();
-                data[l] = ref_data[l];
-                par_matrix(data, r, c, Col)
-            }
-            Col => {
-                let mut data = (0..=l)
-                    .into_par_iter()
-                    .map(|i| ref_data[(i * r) % l])
-                    .collect::<Vec<f64>>();
-                data[l] = ref_data[l];
-                par_matrix(data, r, c, Row)
-            }
-        }
-    }
-
-    /// Change Bindings Mutably
-    ///
-    /// `Row` -> `Col` or `Col` -> `Row`
-    ///
-    /// # Examples
-    /// ```
-    /// use peroxide::fuga::*;
-    ///
-    /// let mut a = matrix(vec![1,2,3,4,5,6,7,8,9],3,3,Row);
-    /// assert_eq!(a.shape, Row);
-    /// a.change_shape_mut();
-    /// assert_eq!(a, matrix(vec![1,4,7,2,5,8,3,6,9],3,3,Col));
-    /// assert_eq!(a.shape, Col);
-    /// ```
     pub fn change_shape_mut(&mut self) {
         let r = self.row;
         let c = self.col;
@@ -1096,47 +966,6 @@ impl Matrix {
         }
     }
 
-    /// Change Bindings Mutably in Parallel
-    ///
-    /// `Row` -> `Col` or `Col` -> `Row`
-    ///
-    /// # Examples
-    /// ```
-    /// use peroxide::fuga::*;
-    ///
-    /// let mut a = matrix(vec![1,2,3,4,5,6,7,8,9],3,3,Row);
-    /// assert_eq!(a.shape, Row);
-    /// a.par_change_shape_mut();
-    /// assert_eq!(a, matrix(vec![1,4,7,2,5,8,3,6,9],3,3,Col));
-    /// assert_eq!(a.shape, Col);
-    /// ```
-    pub fn par_change_shape_mut(&mut self) {
-        let r = self.row;
-        let c = self.col;
-        assert_eq!(r * c, self.data.len());
-        let l = r * c - 1;
-        let ref_data = self.data.clone();
-
-        match self.shape {
-            Row => {
-                self.data = (0..=l)
-                    .into_par_iter()
-                    .map(|i| ref_data[(i * c) % l])
-                    .collect::<Vec<f64>>();
-                self.data[l] = ref_data[l];
-                self.shape = Col;
-            }
-            Col => {
-                self.data = (0..=l)
-                    .into_par_iter()
-                    .map(|i| ref_data[(i * r) % l])
-                    .collect::<Vec<f64>>();
-                self.data[l] = ref_data[l];
-                self.shape = Row;
-            }
-        }
-    }
-
     /// Spread data(1D vector) to 2D formatted String
     ///
     /// # Examples
@@ -1246,29 +1075,6 @@ impl Matrix {
         container
     }
 
-    /// Extract Column in parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = matrix(c!(1,2,3,4), 2, 2, Row);
-    ///     assert_eq!(a.par_col(0), c!(1,3));
-    /// }
-    /// ```
-    pub fn par_col(&self, index: usize) -> Vec<f64> {
-        assert!(index < self.col);
-
-        let container = (0..self.row)
-            .into_par_iter()
-            .map(|i| self[(i, index)])
-            .collect::<Vec<f64>>();
-        container
-    }
-
     /// Extract Row
     ///
     /// # Examples
@@ -1291,29 +1097,6 @@ impl Matrix {
         container
     }
 
-    /// Extract Row in Parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = matrix(c!(1,2,3,4), 2, 2, Row);
-    ///     assert_eq!(a.par_row(0), c!(1,2));
-    /// }
-    /// ```
-    pub fn par_row(&self, index: usize) -> Vec<f64> {
-        assert!(index < self.row);
-
-        let container = (0..self.col)
-            .into_par_iter()
-            .map(|i| self[(index, i)])
-            .collect::<Vec<f64>>();
-        container
-    }
-
     /// Extract diagonal components
     ///
     /// # Examples
@@ -1328,37 +1111,17 @@ impl Matrix {
     /// }
     /// ```
     pub fn diag(&self) -> Vec<f64> {
-        assert_eq!(self.row, self.col);
         let mut container = vec![0f64; self.row];
-        for i in 0..self.row {
-            container[i] = self.data[i * (self.col + 1)];
+        let r = self.row;
+        let c = self.col;
+        assert_eq!(r, c);
+        let c2 = c + 1;
+        for i in 0..r {
+            container[i] = self.data[i * c2];
         }
         container
     }
 
-    /// Extract diagonal components in parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = matrix!(1;4;1, 2, 2, Row);
-    ///     assert_eq!(a.par_diag(), c!(1,4));
-    /// }
-    /// ```
-    pub fn par_diag(&self) -> Vec<f64> {
-        assert_eq!(self.row, self.col);
-
-        let container = (0..self.row)
-            .into_par_iter()
-            .map(|i| self.data[i * (self.col + 1)])
-            .collect::<Vec<f64>>();
-        container
-    }
-
     /// Transpose
     ///
     /// # Examples
@@ -1366,8 +1129,7 @@ impl Matrix {
     /// use peroxide::fuga::*;
     ///
     /// let a = matrix(vec![1,2,3,4], 2, 2, Row);
-    /// println!("{}", a.transpose()); // [[1,3],[2,4]]
-    /// assert_eq!(a.transpose(), matrix(vec![1,3,2,4], 2, 2, Row))
+    /// println!("{}", a); // [[1,3],[2,4]]
     /// ```
     pub fn transpose(&self) -> Self {
         match self.shape {
@@ -1376,23 +1138,6 @@ impl Matrix {
         }
     }
 
-    /// Transpose in Parallel
-    ///
-    /// # Examples
-    /// ```
-    /// use peroxide::fuga::*;
-    ///
-    /// let a = par_matrix(vec![1,2,3,4], 2, 2, Row);
-    /// println!("{}", a.par_transpose()); // [[1,3],[2,4]]
-    /// assert_eq!(a.par_transpose(), matrix(vec![1,3,2,4], 2, 2, Row))
-    /// ```
-    pub fn par_transpose(&self) -> Self {
-        match self.shape {
-            Row => par_matrix(self.data.clone(), self.col, self.row, Col),
-            Col => par_matrix(self.data.clone(), self.col, self.row, Row),
-        }
-    }
-
     /// R-like transpose function
     ///
     /// # Examples
@@ -1410,23 +1155,6 @@ impl Matrix {
         self.transpose()
     }
 
-    /// R-like transpose function in parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = matrix!(1;4;1, 2, 2, Row);
-    ///     assert_eq!(a.transpose(), a.par_t());
-    /// }
-    /// ```
-    pub fn par_t(&self) -> Self {
-        self.par_transpose()
-    }
-
     /// Write to CSV
     ///
     /// # Examples
@@ -1671,28 +1399,6 @@ impl Matrix {
         result
     }
 
-    /// Matrix to `Vec<Vec<f64>>` in parallel
-    ///
-    /// To send `Matrix` to `inline-python`
-    pub fn par_to_vec(&self) -> Vec<Vec<f64>> {
-        (0..self.row)
-            .into_par_iter()
-            .map(|i| self.row(i))
-            .collect::<Vec<Vec<f64>>>()
-    }
-
-    /// To diagonal components
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = matrix!(1;4;1, 2, 2, Row);
-    ///     assert_eq!(a.to_diag(), matrix(c!(1,0,0,4), 2, 2, Row));
-    /// }
     pub fn to_diag(&self) -> Matrix {
         assert_eq!(self.row, self.col, "Should be square matrix");
         let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
@@ -1703,34 +1409,6 @@ impl Matrix {
         result
     }
 
-    /// To diagonal components in parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = matrix!(1;4;1, 2, 2, Row);
-    ///     assert_eq!(a.par_to_diag(), matrix(c!(1,0,0,4), 2, 2, Row));
-    /// }
-    pub fn par_to_diag(&self) -> Matrix {
-        assert_eq!(self.row, self.col, "Should be square matrix");
-
-        let data = (0..self.row)
-            .into_par_iter()
-            .flat_map(|i| {
-                (0..self.col)
-                    .into_par_iter()
-                    .map(|j| if i == j { self.diag()[i] } else { 0_f64 })
-                    .collect::<Vec<f64>>()
-            })
-            .collect::<Vec<f64>>();
-
-        par_matrix(data, self.row, self.col, Row)
-    }
-
     /// Submatrix
     ///
     /// # Description
@@ -1770,49 +1448,6 @@ impl Matrix {
         result
     }
 
-    /// Submatrix in parallel
-    ///
-    /// # Description
-    /// Return below elements of matrix to new matrix
-    ///
-    /// $$
-    /// \begin{pmatrix}
-    /// \\ddots & & & & \\\\
-    ///   & start & \\cdots & end.1 & \\\\
-    ///   & \\vdots & \\ddots & \\vdots & \\\\
-    ///   & end.0 & \\cdots & end & \\\\
-    ///   & & & & \\ddots
-    /// \end{pmatrix}
-    /// $$
-    ///
-    /// # Examples
-    /// ```
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    ///
-    /// fn main() {
-    ///     let a = ml_matrix("1 2 3;4 5 6;7 8 9");
-    ///     let b = ml_matrix("5 6;8 9");
-    ///     let c = a.par_submat((1, 1), (2, 2));
-    ///     assert_eq!(b, c);   
-    /// }
-    /// ```
-    pub fn par_submat(&self, start: (usize, usize), end: (usize, usize)) -> Matrix {
-        let row = end.0 - start.0 + 1;
-        let col = end.1 - start.1 + 1;
-
-        let data = (0..row)
-            .into_par_iter()
-            .flat_map(|i| {
-                (0..col)
-                    .into_par_iter()
-                    .map(|j| self[(start.0 + i, start.1 + j)])
-                    .collect::<Vec<f64>>()
-            })
-            .collect::<Vec<f64>>();
-        par_matrix(data, row, col, self.shape)
-    }
-
     /// Substitute matrix to specific position
     ///
     /// # Description
@@ -1961,106 +1596,8 @@ impl Vector for Matrix {
     }
 }
 
-impl ParallelVector for Matrix {
-    type Scalar = f64;
-
-    fn par_add_vec(&self, other: &Self) -> Self {
-        assert_eq!(self.row, other.row);
-        assert_eq!(self.col, other.col);
-
-        match () {
-            #[cfg(feature = "O3")]
-            () => {
-                if self.shape != other.shape {
-                    return self.add(&other.change_shape());
-                }
-                let x = &self.data;
-                let mut y = other.data.clone();
-                let n_i32 = x.len() as i32;
-                let a_f64 = 1f64;
-                unsafe {
-                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
-                }
-                matrix(y, self.row, self.col, self.shape)
-            }
-            _ => {
-                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
-                result
-                    .data
-                    .par_iter_mut()
-                    .enumerate()
-                    .for_each(|(idx, value)| {
-                        let i = idx / self.col;
-                        let j = idx % self.col;
-
-                        *value += other[(i, j)];
-                    });
-
-                result
-            }
-        }
-    }
-
-    fn par_sub_vec(&self, other: &Self) -> Self {
-        assert_eq!(self.row, other.row);
-        assert_eq!(self.col, other.col);
-        match () {
-            #[cfg(feature = "O3")]
-            () => {
-                if self.shape != other.shape {
-                    return self.sub(&other.change_shape());
-                }
-                let x = &other.data;
-                let mut y = self.data.clone();
-                let n_i32 = x.len() as i32;
-                let a_f64 = -1f64;
-                unsafe {
-                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
-                }
-                matrix(y, self.row, self.col, self.shape)
-            }
-            _ => {
-                let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
-                result
-                    .data
-                    .par_iter_mut()
-                    .enumerate()
-                    .for_each(|(idx, val)| {
-                        let i = idx / self.col;
-                        let j = idx % self.col;
-                        *val -= other[(i, j)];
-                    });
-
-                result
-            }
-        }
-    }
-
-    fn par_mul_scalar(&self, other: Self::Scalar) -> Self {
-        match () {
-            #[cfg(feature = "O3")]
-            () => {
-                let x = &self.data;
-                let mut y = vec![0f64; x.len()];
-                let a_f64 = other;
-                let n_i32 = x.len() as i32;
-
-                unsafe {
-                    daxpy(n_i32, a_f64, x, 1, &mut y, 1);
-                }
-                matrix(y, self.row, self.col, self.shape)
-            }
-            _ => {
-                let scalar = other;
-                self.par_fmap(|x| x * scalar)
-            }
-        }
-    }
-}
-
 impl Normed for Matrix {
     type UnsignedScalar = f64;
-
     fn norm(&self, kind: Norm) -> f64 {
         match kind {
             Norm::F => {
@@ -2133,12 +1670,10 @@ impl ParallelNormed for Matrix {
     fn par_norm(&self, kind: Norm) -> f64 {
         match kind {
             Norm::F => {
-                let s = self
-                    .data
-                    .clone()
-                    .into_par_iter()
-                    .fold(|| 0_f64, |acc, el| acc + el.powi(2))
-                    .sum::<f64>();
+                let mut s = 0f64;
+                for i in 0..self.data.len() {
+                    s += self.data[i].powi(2);
+                }
                 s.sqrt()
             }
             Norm::Lpq(p, q) => {
@@ -2175,7 +1710,7 @@ impl ParallelNormed for Matrix {
                     Col => self.change_shape().norm(Norm::LInf),
                     Row => {
                         for r in 0..self.row {
-                            let s = self.row(r).par_iter().sum();
+                            let s = self.row(r).iter().sum();
                             if s > m {
                                 m = s;
                             }
@@ -2279,6 +1814,7 @@ impl MatrixProduct for Matrix {
 
         let r = self.row;
         let c = self.col;
+
         let mut m = matrix(vec![0f64; r * c], r, c, self.shape);
         for i in 0..r {
             for j in 0..c {
@@ -2289,24 +1825,6 @@ impl MatrixProduct for Matrix {
     }
 }
 
-impl ParallelMatrixProduct for Matrix {
-    fn par_hadamard(&self, other: &Self) -> Matrix {
-        assert_eq!(self.row, other.row);
-        assert_eq!(self.col, other.col);
-
-        let m = (0..self.row)
-            .into_par_iter()
-            .flat_map(|i| {
-                (0..self.col)
-                    .into_par_iter()
-                    .map(|j| self[(i, j)] * other[(i, j)])
-                    .collect::<Vec<f64>>()
-            })
-            .collect::<Vec<f64>>();
-        par_matrix(m, self.row, self.col, self.shape)
-    }
-}
-
 // =============================================================================
 // Common Properties of Matrix & Vec<f64>
 // =============================================================================
@@ -2378,20 +1896,6 @@ impl Add<Matrix> for Matrix {
                     }
                 }
                 result
-
-                // Note: Parallel version - how to add the implementation below?
-                // let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
-                // result
-                //     .data
-                //     .par_iter_mut()
-                //     .enumerate()
-                //     .for_each(|(idx, value)| {
-                //         let i = idx / self.col;
-                //         let j = idx % self.col;
-
-                //         *value += other[(i, j)];
-                //     });
-                // result
             }
         }
     }
@@ -2420,7 +1924,7 @@ impl<'a, 'b> Add<&'b Matrix> for &'a Matrix {
 /// ```
 impl<T> Add<T> for Matrix
 where
-    T: Into<f64> + Copy + Send + Sync,
+    T: Into<f64> + Copy,
 {
     type Output = Self;
     fn add(self, other: T) -> Self {
@@ -2443,7 +1947,7 @@ where
 /// Element-wise addition between &Matrix & f64
 impl<'a, T> Add<T> for &'a Matrix
 where
-    T: Into<f64> + Copy + Send + Sync,
+    T: Into<f64> + Copy,
 {
     type Output = Matrix;
     fn add(self, other: T) -> Self::Output {
@@ -2583,11 +2087,6 @@ impl Neg for Matrix {
             }
             _ => matrix(
                 self.data.into_iter().map(|x: f64| -x).collect::<Vec<f64>>(),
-                // Note: how to add parallel implementation below?
-                // self.data
-                //     .into_par_iter()
-                //     .map(|x: f64| -x)
-                //     .collect::<Vec<f64>>(),
                 self.row,
                 self.col,
                 self.shape,
@@ -2618,12 +2117,6 @@ impl<'a> Neg for &'a Matrix {
                     .into_iter()
                     .map(|x: f64| -x)
                     .collect::<Vec<f64>>(),
-                // Note: how to add parallel implementation below?
-                // self.data
-                //     .clone()
-                //     .into_par_iter()
-                //     .map(|x: f64| -x)
-                //     .collect::<Vec<f64>>(),
                 self.row,
                 self.col,
                 self.shape,
@@ -2675,19 +2168,6 @@ impl Sub<Matrix> for Matrix {
                     }
                 }
                 result
-                // Note: how to add parallel implementation below?
-                // let mut result = par_matrix(self.data.clone(), self.row, self.col, self.shape);
-                // result
-                //     .data
-                //     .par_iter_mut()
-                //     .enumerate()
-                //     .for_each(|(idx, value)| {
-                //         let i = idx / self.col;
-                //         let j = idx % self.col;
-
-                //         *value -= other[(i, j)];
-                //     });
-                // result
             }
         }
     }
@@ -2704,7 +2184,7 @@ impl<'a, 'b> Sub<&'b Matrix> for &'a Matrix {
 /// Subtraction between Matrix & f64
 impl<T> Sub<T> for Matrix
 where
-    T: Into<f64> + Copy + Send + Sync,
+    T: Into<f64> + Copy,
 {
     type Output = Self;
 
@@ -2728,7 +2208,7 @@ where
 /// Subtraction between &Matrix & f64
 impl<'a, T> Sub<T> for &'a Matrix
 where
-    T: Into<f64> + Copy + Send + Sync,
+    T: Into<f64> + Copy,
 {
     type Output = Matrix;
 
@@ -3299,9 +2779,11 @@ impl FPMatrix for Matrix {
         F: Fn(Vec<f64>) -> Vec<f64>,
     {
         let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Col);
+
         for i in 0..self.col {
             result.subs_col(i, &f(self.col(i)));
         }
+
         result
     }
 
@@ -3323,9 +2805,11 @@ impl FPMatrix for Matrix {
         F: Fn(Vec<f64>) -> Vec<f64>,
     {
         let mut result = matrix(vec![0f64; self.row * self.col], self.row, self.col, Row);
+
         for i in 0..self.row {
             result.subs_row(i, &f(self.row(i)));
         }
+
         result
     }
 
@@ -3362,7 +2846,7 @@ impl FPMatrix for Matrix {
     fn reduce<F, T>(&self, init: T, f: F) -> f64
     where
         F: Fn(f64, f64) -> f64,
-        T: Into<f64> + Copy + Clone,
+        T: Into<f64>,
     {
         self.data.iter().fold(init.into(), |x, y| f(x, *y))
     }
@@ -3408,135 +2892,6 @@ impl FPMatrix for Matrix {
     }
 }
 
-impl ParallelFPMatrix for Matrix {
-    fn par_take_row(&self, n: usize) -> Self {
-        if n >= self.row {
-            return self.clone();
-        }
-        match self.shape {
-            Row => {
-                let new_data = self
-                    .data
-                    .clone()
-                    .into_par_iter()
-                    .take(n * self.col)
-                    .collect::<Vec<f64>>();
-                par_matrix(new_data, n, self.col, Row)
-            }
-            Col => {
-                let temp_data = (0..n)
-                    .into_par_iter()
-                    .flat_map(|i| self.row(i))
-                    .collect::<Vec<f64>>();
-                par_matrix(temp_data, n, self.col, Row)
-            }
-        }
-    }
-
-    fn par_take_col(&self, n: usize) -> Self {
-        if n >= self.col {
-            return self.clone();
-        }
-        match self.shape {
-            Col => {
-                let new_data = self
-                    .data
-                    .clone()
-                    .into_par_iter()
-                    .take(n * self.row)
-                    .collect::<Vec<f64>>();
-                par_matrix(new_data, self.row, n, Col)
-            }
-            Row => {
-                let temp_data = (0..n)
-                    .into_par_iter()
-                    .flat_map(|i| self.col(i))
-                    .collect::<Vec<f64>>();
-                par_matrix(temp_data, self.row, n, Col)
-            }
-        }
-    }
-
-    fn par_skip_row(&self, n: usize) -> Self {
-        assert!(n < self.row, "Skip range is larger than row of matrix");
-
-        let temp_data = (n..self.row)
-            .into_par_iter()
-            .flat_map(|i| self.row(i))
-            .collect::<Vec<f64>>();
-        par_matrix(temp_data, self.row - n, self.col, Row)
-    }
-
-    fn par_skip_col(&self, n: usize) -> Self {
-        assert!(n < self.col, "Skip range is larger than col of matrix");
-
-        let temp_data = (n..self.col)
-            .into_par_iter()
-            .flat_map(|i| self.col(i))
-            .collect::<Vec<f64>>();
-        par_matrix(temp_data, self.row, self.col - n, Col)
-    }
-
-    fn par_fmap<F>(&self, f: F) -> Matrix
-    where
-        F: Fn(f64) -> f64 + Send + Sync,
-    {
-        let result = self.data.par_iter().map(|x| f(*x)).collect::<Vec<f64>>();
-        par_matrix(result, self.row, self.col, self.shape)
-    }
-
-    fn par_reduce<F, T>(&self, init: T, f: F) -> f64
-    where
-        F: Fn(f64, f64) -> f64 + Send + Sync,
-        T: Into<f64> + Send + Sync + Copy + Clone,
-    {
-        self.data
-            .clone()
-            .into_par_iter()
-            .fold(|| init.into(), |acc, y| f(acc, y))
-            .sum()
-    }
-
-    fn par_zip_with<F>(&self, f: F, other: &Matrix) -> Self
-    where
-        F: Fn(f64, f64) -> f64 + Send + Sync,
-    {
-        assert_eq!(self.data.len(), other.data.len());
-        let mut a = other.clone();
-        if self.shape != other.shape {
-            a = a.change_shape();
-        }
-
-        let result = self
-            .data
-            .par_iter()
-            .zip(a.data.par_iter())
-            .map(|(x, y)| f(*x, *y))
-            .collect::<Vec<f64>>();
-        par_matrix(result, self.row, self.col, self.shape)
-    }
-
-    fn par_col_reduce<F>(&self, f: F) -> Vec<f64>
-    where
-        F: Fn(Vec<f64>) -> f64 + Send + Sync,
-    {
-        (0..self.col)
-            .into_par_iter()
-            .map(|i| f(self.col(i)))
-            .collect::<Vec<f64>>()
-    }
-
-    fn par_row_reduce<F>(&self, f: F) -> Vec<f64>
-    where
-        F: Fn(Vec<f64>) -> f64 + Send + Sync,
-    {
-        (0..self.row)
-            .into_par_iter()
-            .map(|i| f(self.row(i)))
-            .collect::<Vec<f64>>()
-    }
-}
-
 // =============================================================================
 // Linear Algebra
 // =============================================================================
@@ -3570,20 +2925,6 @@ pub fn diag(n: usize) -> Matrix {
     matrix(v, n, n, Row)
 }
 
-pub fn par_diag(n: usize) -> Matrix {
-    let v = (0..n * n)
-        .into_par_iter()
-        .map(|i| {
-            if i % (n + 1) == 0 {
-                1_f64 // Set diagonal elements to 1
-            } else {
-                0_f64 // All other elements are 0
-            }
-        })
-        .collect::<Vec<f64>>();
-    par_matrix(v, n, n, Row)
-}
-
 /// Data structure for Complete Pivoting LU decomposition
 ///
 /// # Usage

From f6ce0d5a07f1deb94d14cc373ab4414ebbe1fdfa Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 14 Oct 2024 16:43:38 +0200
Subject: [PATCH 14/17] Removed ParallelAlgorithm trait

- Removed methods in ParallelAlgorithm trait
since they don't seem to offer significant
benefits over their serial counterparts in
the current implementation.
- Removed some ParallelFPVector trait methods
for same reason as above
---
 src/structure/vector.rs | 230 ----------------------------------------
 src/traits/fp.rs        |  17 ---
 src/traits/general.rs   |  13 ---
 3 files changed, 260 deletions(-)

diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index aaa939d5..438c0a1d 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -271,7 +271,6 @@ use crate::rayon::prelude::*;
 
 use crate::structure::matrix::{matrix, Matrix, Row};
 use crate::traits::fp::ParallelFPVector;
-use crate::traits::general::ParallelAlgorithm;
 use crate::traits::math::{ParallelInnerProduct, ParallelNormed, ParallelVectorProduct};
 use crate::traits::mutable::ParallelMutFP;
 use crate::traits::{
@@ -506,65 +505,6 @@ impl ParallelFPVector for Vec<f64> {
             .filter(|x| f(*x))
             .collect::<Vec<f64>>()
     }
-
-    /// Take for `Vec<f64>` in Parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    /// use peroxide::traits::fp::ParallelFPVector;
-    ///
-    /// fn main() {
-    ///     let a = c!(1,2,3,4,5);
-    ///     let b = a.par_take(3);
-    ///     assert_eq!(b, c!(1,2,3));
-    /// }
-    /// ```
-    fn par_take(&self, n: usize) -> Vec<f64> {
-        let mut v = vec![0_f64; n];
-        v[..n]
-            .par_iter_mut()
-            .zip(self[..n].par_iter())
-            .for_each(|(dest, src)| {
-                *dest = *src;
-            }); // parallel equivalent of copy_from_slice()
-        v
-    }
-
-    /// Skip for `Vec<f64>` in Parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    /// use peroxide::traits::fp::ParallelFPVector;
-    ///
-    /// fn main() {
-    ///     let a = c!(1,2,3,4,5);
-    ///     let b = a.par_skip(3);
-    ///     assert_eq!(b, c!(4,5));
-    /// }
-    /// ```
-    fn par_skip(&self, n: usize) -> Vec<f64> {
-        let l = self.len();
-        let mut v = vec![0f64; l - n];
-        v[..(l - n)]
-            .par_iter_mut()
-            .zip(self[n..l].par_iter())
-            .for_each(|(dest, src)| *dest = *src);
-        v
-    }
-
-    fn par_sum(&self) -> f64 {
-        self.par_iter().sum()
-    }
-
-    fn par_prod(&self) -> f64 {
-        self.par_iter().product()
-    }
 }
 
 /// Explicit version of `map`
@@ -836,176 +776,6 @@ impl Algorithm for Vec<f64> {
     }
 }
 
-impl ParallelAlgorithm for Vec<f64> {
-    /// Assign rank in parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    /// use peroxide::traits::general::ParallelAlgorithm;
-    ///
-    /// fn main() {
-    ///     let v = c!(7, 5, 9, 2, 8);
-    ///     assert_eq!(v.par_rank(), vec![2,3,0,4,1]);
-    /// }
-    /// ```
-    fn par_rank(&self) -> Vec<usize> {
-        let l = self.len();
-        let idx = (1..(l + 1)).collect::<Vec<usize>>();
-
-        let mut vec_tup = self
-            .clone()
-            .into_par_iter()
-            .zip(idx.clone())
-            .collect::<Vec<(f64, usize)>>();
-        vec_tup.sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap().reverse());
-        let indices = vec_tup
-            .into_par_iter()
-            .map(|(_, y)| y)
-            .collect::<Vec<usize>>();
-        idx.into_par_iter()
-            .map(|x| {
-                indices
-                    .clone()
-                    .into_par_iter()
-                    .position_any(|t| t == x) // Note: position() is deprecated by rayon
-                    .unwrap()
-            })
-            .collect::<Vec<usize>>()
-    }
-
-    /// arg max in Parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    /// use peroxide::traits::general::ParallelAlgorithm;
-    ///
-    /// fn main() {
-    ///     let v = c!(1,3,2,4,3,7);
-    ///     assert_eq!(v.par_arg_max(),5);
-    ///
-    ///     let v2 = c!(1,3,2,5,6,6);
-    ///     assert_eq!(v2.par_arg_max(),4);
-    /// }
-    /// ```
-    fn par_arg_max(&self) -> usize {
-        #[cfg(feature = "O3")]
-        {
-            let n_i32 = self.len() as i32;
-            let idx: usize;
-            unsafe {
-                idx = idamax(n_i32, self, 1) - 1;
-            }
-            idx
-        }
-        #[cfg(not(feature = "O3"))]
-        {
-            self.into_par_iter()
-                .enumerate()
-                .fold(
-                    || (0usize, f64::MIN),
-                    |acc, (ics, &val)| {
-                        if acc.1 < val {
-                            (ics, val)
-                        } else {
-                            acc
-                        }
-                    },
-                )
-                .reduce(
-                    // Note: need reduce and can not simply do .max() because rayon::..max() requires Ord, which is stricter than PartialOrd
-                    // Hence combine the results from the parallel fold
-                    || (0usize, f64::MIN), // Identity element
-                    |acc1, acc2| {
-                        if acc1.1 < acc2.1 {
-                            acc2 // Return the pair with the larger value
-                        } else {
-                            acc1
-                        }
-                    },
-                )
-                .0
-        }
-    }
-
-    /// arg min in Parallel
-    ///
-    /// # Examples
-    /// ```
-    /// #[macro_use]
-    /// extern crate peroxide;
-    /// use peroxide::fuga::*;
-    /// use peroxide::traits::general::ParallelAlgorithm;
-    ///
-    /// fn main() {
-    ///     let v = c!(1,3,2,4,3,7);
-    ///     assert_eq!(v.par_arg_min(),0);
-    ///
-    ///     let v2 = c!(4,3,2,5,1,6);
-    ///     assert_eq!(v2.par_arg_min(),4);
-    /// }
-    fn par_arg_min(&self) -> usize {
-        self.into_par_iter()
-            .enumerate()
-            .fold(
-                || (0usize, f64::MAX),
-                |acc, (ics, &val)| {
-                    if acc.1 > val {
-                        (ics, val)
-                    } else {
-                        acc
-                    }
-                },
-            )
-            .reduce(
-                || (0usize, f64::MAX),
-                |acc1, acc2| {
-                    if acc1.1 > acc2.1 {
-                        acc2 // Return the pair with the smaller value
-                    } else {
-                        acc1
-                    }
-                },
-            )
-            .0
-    }
-
-    fn par_max(&self) -> f64 {
-        #[cfg(feature = "O3")]
-        {
-            let n_i32 = self.len() as i32;
-            let idx: usize;
-            unsafe {
-                idx = idamax(n_i32, self, 1) - 1;
-            }
-            self[idx]
-        }
-        #[cfg(not(feature = "O3"))]
-        {
-            self.into_par_iter()
-                .fold(|| f64::MIN, |acc, &val| if acc < val { val } else { acc })
-                .reduce(
-                    || f64::MIN,
-                    |acc1, acc2| if acc1 < acc2 { acc2 } else { acc1 },
-                )
-        }
-    }
-
-    fn par_min(&self) -> f64 {
-        self.into_par_iter()
-            .fold(|| f64::MAX, |acc, &val| if acc > val { val } else { acc })
-            .reduce(
-                || f64::MAX,
-                |acc1, acc2| if acc1 > acc2 { acc2 } else { acc1 },
-            )
-    }
-}
-
 impl Vector for Vec<f64> {
     type Scalar = f64;
 
diff --git a/src/traits/fp.rs b/src/traits/fp.rs
index fbb66767..07782cb0 100644
--- a/src/traits/fp.rs
+++ b/src/traits/fp.rs
@@ -76,11 +76,6 @@ pub trait ParallelFPVector {
     fn par_filter<F>(&self, f: F) -> Self
     where
         F: Fn(Self::Scalar) -> bool + Send + Sync;
-
-    fn par_take(&self, n: usize) -> Self;
-    fn par_skip(&self, n: usize) -> Self;
-    fn par_sum(&self) -> Self::Scalar;
-    fn par_prod(&self) -> Self::Scalar;
 }
 
 /// Functional Programming for Matrix in Parallel (Uses Rayon crate)
@@ -101,16 +96,4 @@ pub trait ParallelFPMatrix {
     fn par_row_reduce<F>(&self, f: F) -> Vec<f64>
     where
         F: Fn(Vec<f64>) -> f64 + Send + Sync;
-    fn par_take_row(&self, n: usize) -> Matrix;
-    fn par_take_col(&self, n: usize) -> Matrix;
-    fn par_skip_row(&self, n: usize) -> Matrix;
-    fn par_skip_col(&self, n: usize) -> Matrix;
-
-    // These are not in parallel:
-    // fn col_map<F>(&self, f: F) -> Matrix
-    // where
-    //     F: Fn(Vec<f64>) -> Vec<f64>;
-    // fn row_map<F>(&self, f: F) -> Matrix
-    // where
-    //     F: Fn(Vec<f64>) -> Vec<f64>;
 }
diff --git a/src/traits/general.rs b/src/traits/general.rs
index 7ec7fed1..2737f97b 100644
--- a/src/traits/general.rs
+++ b/src/traits/general.rs
@@ -8,16 +8,3 @@ pub trait Algorithm {
     fn min(&self) -> f64;
     fn swap_with_perm(&mut self, p: &Vec<(usize, usize)>);
 }
-
-/// Some algorithms for Vector in Parallel
-pub trait ParallelAlgorithm {
-    fn par_rank(&self) -> Vec<usize>;
-    fn par_arg_max(&self) -> usize;
-    fn par_arg_min(&self) -> usize;
-    fn par_max(&self) -> f64;
-    fn par_min(&self) -> f64;
-
-    // Not implemented in parallel:
-    // fn par_sign(&self) -> f64;
-    // fn par_swap_with_perm(&mut self, p: &Vec<(usize, usize)>);
-}

From a239f98ddf174124dade3a943881a614b6ad8f4a Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 14 Oct 2024 16:57:15 +0200
Subject: [PATCH 15/17] Implemented par_reduce

- Implemented par_reduce operation using
reduce_with () from rayon crate
---
 src/structure/vector.rs | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/structure/vector.rs b/src/structure/vector.rs
index 438c0a1d..8d584942 100644
--- a/src/structure/vector.rs
+++ b/src/structure/vector.rs
@@ -454,21 +454,15 @@ impl ParallelFPVector for Vec<f64> {
     ///     assert_eq!(a.par_reduce(0, |x,y| x + y), 5050f64);
     /// }
     /// ```
-    fn par_reduce<F, T>(&self, init: T, f: F) -> f64
+    fn par_reduce<F, T>(&self, _init: T, f: F) -> f64
     where
         F: Fn(f64, f64) -> f64 + Send + Sync,
         T: Into<f64> + Send + Sync + Copy,
     {
-        self.iter().fold(init.into(), |x, &y| f(x, y))
-
-        // Parallel version unimplemented
-
-        // self.par_iter()
-        //     .cloned()
-        //     .fold(|| init.into(), |x, y| f(x, y))
-        //     .sum::<f64>() // Combining fold and reduce to produce a single value
-        //                   // .reduce(|| init.into(), |x, y| f(x, y))  // can not use reduce instead of .sum(), since the fn f used might not be associative (e.g. check test_max_pool_1d)
-        //                   // https://docs.rs/rayon/latest/rayon/iter/trait.ParallelIterator.html#method.reduce
+        self.par_iter()
+            .cloned()
+            .reduce_with(|x, y| f(x, y))
+            .expect("Unable to perform parallel reduce operation")
     }
 
     fn par_zip_with<F>(&self, f: F, other: &Vec<f64>) -> Vec<f64>

From 3737099dda63bb97014816003984bd868ab3e00d Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 14 Oct 2024 17:11:21 +0200
Subject: [PATCH 16/17] Removed formatting from Cargo.toml file

- Added rayon and criterion as dependencies
---
 Cargo.toml | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 39c51a7a..0c8f0cc7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,14 +10,7 @@ categories = ["science"]
 readme = "README.md"
 documentation = "https://axect.github.io/Peroxide_Doc"
 keywords = ["Numeric", "Science", "Dataframe", "Plot", "LinearAlgebra"]
-exclude = [
-    "example_data/",
-    "src/bin/",
-    # "benches/",
-    "example/",
-    "test_data/",
-    "peroxide-ad2",
-]
+exclude = ["example_data/", "src/bin/", "benches/", "example/", "test_data/", "peroxide-ad2"]
 
 [badges]
 travis-ci = { repository = "axect/peroxide" }
@@ -40,27 +33,18 @@ anyhow = "1.0"
 paste = "1.0"
 #num-complex = "0.3"
 netcdf = { version = "0.7", optional = true, default-features = false }
-pyo3 = { version = "0.22", optional = true, features = [
-    "auto-initialize",
-    "gil-refs",
-] }
+pyo3 = { version = "0.22", optional = true, features = ["auto-initialize", "gil-refs"] }
 blas = { version = "0.22", optional = true }
 lapack = { version = "0.19", optional = true }
 serde = { version = "1.0", features = ["derive"], optional = true }
 json = { version = "0.12", optional = true }
-arrow2 = { version = "0.18", features = [
-    "io_parquet",
-    "io_parquet_compression",
-], optional = true }
+arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"], optional = true }
 num-complex = { version = "0.4", optional = true }
-lambert_w = { version = "0.3.0", default-features = false, features = [
-    "24bits",
-    "50bits",
-] }
+lambert_w = { version = "0.3.0", default-features = false, features = ["24bits", "50bits"] }
 rayon = "1.10"
 
 [package.metadata.docs.rs]
-rustdoc-args = ["--html-in-header", "katex-header.html", "--cfg", "docsrs"]
+rustdoc-args = [ "--html-in-header", "katex-header.html", "--cfg", "docsrs"]
 
 [features]
 default = []

From 863ad6a5b34b247f100797fbd4ef9c99ef61c396 Mon Sep 17 00:00:00 2001
From: Soumya <soumyasen1809@gmail.com>
Date: Mon, 14 Oct 2024 17:21:19 +0200
Subject: [PATCH 17/17] Removed par_matrix implementation

- par_matrix removed from matrix.rs
and benchmark
---
 benches/parallel_rayon/matrix_benchmark.rs | 18 ---------------
 src/structure/matrix.rs                    | 27 +---------------------
 2 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/benches/parallel_rayon/matrix_benchmark.rs b/benches/parallel_rayon/matrix_benchmark.rs
index eaaa1e13..86b39bc9 100644
--- a/benches/parallel_rayon/matrix_benchmark.rs
+++ b/benches/parallel_rayon/matrix_benchmark.rs
@@ -4,23 +4,6 @@ use peroxide::{
     traits::math::{ParallelInnerProduct, ParallelNormed},
 };
 
-pub fn par_matrix_benchmark(cr: &mut Criterion) {
-    let v: Vec<f64> = (0..1000000)
-        .into_iter()
-        .map(|i: i32| 2.0 * (i as f64))
-        .collect::<Vec<f64>>();
-
-    // Result: 1000x1000 matrix: 630.92 µs
-    cr.bench_function("ser_matrix_bench", |b| {
-        b.iter(|| black_box(matrix(v.clone(), 1000, 1000, Shape::Row)))
-    });
-
-    // Result: 1000x1000 matrix: 9.6995 ms
-    cr.bench_function("par_matrix_bench", |b| {
-        b.iter(|| black_box(par_matrix(v.clone(), 1000, 1000, Shape::Row)))
-    });
-}
-
 pub fn par_matrix_from_index_benchmark(cr: &mut Criterion) {
     let f = |x: usize, y: usize| 2.0 * (x as f64) * (y as f64);
     let size: (usize, usize) = (1000, 1000);
@@ -112,7 +95,6 @@ pub fn par_matrix_inner_prod_benchmark(cr: &mut Criterion) {
 
 criterion_group!(
     benches,
-    par_matrix_benchmark,
     par_matrix_from_index_benchmark,
     par_matrix_norm_lpq_benchmark,
     par_matrix_norm_l1_benchmark,
diff --git a/src/structure/matrix.rs b/src/structure/matrix.rs
index b66267e6..226e7958 100644
--- a/src/structure/matrix.rs
+++ b/src/structure/matrix.rs
@@ -723,31 +723,6 @@ where
     }
 }
 
-/// R-like matrix constructor in parallel
-///
-/// # Examples
-/// ```
-/// #[macro_use]
-/// extern crate peroxide;
-/// use peroxide::fuga::*;
-///
-/// fn main() {
-///     let a = par_matrix(c!(1,2,3,4), 2, 2, Row);
-///     a.print(); // Print matrix
-/// }
-/// ```
-pub fn par_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
-where
-    T: Into<f64> + Send + Sync,
-{
-    Matrix {
-        data: v.into_par_iter().map(|t| t.into()).collect::<Vec<f64>>(),
-        row: r,
-        col: c,
-        shape,
-    }
-}
-
 /// R-like matrix constructor (Explicit ver.)
 pub fn r_matrix<T>(v: Vec<T>, r: usize, c: usize, shape: Shape) -> Matrix
 where
@@ -1385,7 +1360,7 @@ impl Matrix {
                     .collect::<Vec<f64>>()
             })
             .collect::<Vec<f64>>();
-        par_matrix(data, row, col, Row)
+        matrix(data, row, col, Row)
     }
 
     /// Matrix to `Vec<Vec<f64>>`