use std::{mem, ops}; use crate::stats::float::Float; use crate::stats::tuple::{Tuple, TupledDistributionsBuilder}; use crate::stats::univariate::Percentiles; use crate::stats::univariate::Resamples; #[cfg(feature = "rayon")] use rayon::prelude::*; /// A collection of data points drawn from a population /// /// Invariants: /// /// - The sample contains at least 2 data points /// - The sample contains no `NaN`s #[repr(transparent)] pub struct Sample([A]); // TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module impl Sample where A: Float, { /// Creates a new sample from an existing slice /// /// # Panics /// /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements #[cfg_attr(feature = "cargo-clippy", allow(clippy::new_ret_no_self))] pub fn new(slice: &[A]) -> &Sample { assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan())); unsafe { mem::transmute(slice) } } /// Returns the biggest element in the sample /// /// - Time: `O(length)` pub fn max(&self) -> A { let mut elems = self.iter(); match elems.next() { Some(&head) => elems.fold(head, |a, &b| a.max(b)), // NB `unreachable!` because `Sample` is guaranteed to have at least one data point None => unreachable!(), } } /// Returns the arithmetic average of the sample /// /// - Time: `O(length)` pub fn mean(&self) -> A { let n = self.len(); self.sum() / A::cast(n) } /// Returns the median absolute deviation /// /// The `median` can be optionally passed along to speed up (2X) the computation /// /// - Time: `O(length)` /// - Memory: `O(length)` pub fn median_abs_dev(&self, median: Option) -> A where usize: cast::From>, { let median = median.unwrap_or_else(|| self.percentiles().median()); // NB Although this operation can be SIMD accelerated, the gain is negligible because the // bottle neck is the sorting operation which is part of the computation of the median let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::>(); let abs_devs: &Self = Self::new(&abs_devs); abs_devs.percentiles().median() * A::cast(1.4826) } /// Returns the median absolute deviation as a percentage of the median /// /// - Time: `O(length)` /// - Memory: `O(length)` pub fn median_abs_dev_pct(&self) -> A where usize: cast::From>, { let _100 = A::cast(100); let median = self.percentiles().median(); let mad = self.median_abs_dev(Some(median)); (mad / median) * _100 } /// Returns the smallest element in the sample /// /// - Time: `O(length)` pub fn min(&self) -> A { let mut elems = self.iter(); match elems.next() { Some(&elem) => elems.fold(elem, |a, &b| a.min(b)), // NB `unreachable!` because `Sample` is guaranteed to have at least one data point None => unreachable!(), } } /// Returns a "view" into the percentiles of the sample /// /// This "view" makes consecutive computations of percentiles much faster (`O(1)`) /// /// - Time: `O(N log N) where N = length` /// - Memory: `O(length)` pub fn percentiles(&self) -> Percentiles where usize: cast::From>, { use std::cmp::Ordering; // NB This function assumes that there are no `NaN`s in the sample fn cmp(a: &T, b: &T) -> Ordering where T: PartialOrd, { match a.partial_cmp(b) { Some(o) => o, // Arbitrary way to handle NaNs that should never happen None => Ordering::Equal, } } let mut v = self.to_vec().into_boxed_slice(); #[cfg(feature = "rayon")] v.par_sort_unstable_by(cmp); #[cfg(not(feature = "rayon"))] v.sort_unstable_by(cmp); // NB :-1: to intra-crate privacy rules unsafe { mem::transmute(v) } } /// Returns the standard deviation of the sample /// /// The `mean` can be optionally passed along to speed up (2X) the computation /// /// - Time: `O(length)` pub fn std_dev(&self, mean: Option) -> A { self.var(mean).sqrt() } /// Returns the standard deviation as a percentage of the mean /// /// - Time: `O(length)` pub fn std_dev_pct(&self) -> A { let _100 = A::cast(100); let mean = self.mean(); let std_dev = self.std_dev(Some(mean)); (std_dev / mean) * _100 } /// Returns the sum of all the elements of the sample /// /// - Time: `O(length)` pub fn sum(&self) -> A { crate::stats::sum(self) } /// Returns the t score between these two samples /// /// - Time: `O(length)` pub fn t(&self, other: &Sample) -> A { let (x_bar, y_bar) = (self.mean(), other.mean()); let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar))); let n_x = A::cast(self.len()); let n_y = A::cast(other.len()); let num = x_bar - y_bar; let den = (s2_x / n_x + s2_y / n_y).sqrt(); num / den } /// Returns the variance of the sample /// /// The `mean` can be optionally passed along to speed up (2X) the computation /// /// - Time: `O(length)` pub fn var(&self, mean: Option) -> A { use std::ops::Add; let mean = mean.unwrap_or_else(|| self.mean()); let slice = self; let sum = slice .iter() .map(|&x| (x - mean).powi(2)) .fold(A::cast(0), Add::add); sum / A::cast(slice.len() - 1) } // TODO Remove the `T` parameter in favor of `S::Output` /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic /// /// - Multi-threaded /// - Time: `O(nresamples)` /// - Memory: `O(nresamples)` pub fn bootstrap(&self, nresamples: usize, statistic: S) -> T::Distributions where S: Fn(&Sample) -> T + Sync, T: Tuple + Send, T::Distributions: Send, T::Builder: Send, { #[cfg(feature = "rayon")] { (0..nresamples) .into_par_iter() .map_init( || Resamples::new(self), |resamples, _| statistic(resamples.next()), ) .fold( || T::Builder::new(0), |mut sub_distributions, sample| { sub_distributions.push(sample); sub_distributions }, ) .reduce( || T::Builder::new(0), |mut a, mut b| { a.extend(&mut b); a }, ) .complete() } #[cfg(not(feature = "rayon"))] { let mut resamples = Resamples::new(self); (0..nresamples) .map(|_| statistic(resamples.next())) .fold(T::Builder::new(0), |mut sub_distributions, sample| { sub_distributions.push(sample); sub_distributions }) .complete() } } #[cfg(test)] pub fn iqr(&self) -> A where usize: cast::From>, { self.percentiles().iqr() } #[cfg(test)] pub fn median(&self) -> A where usize: cast::From>, { self.percentiles().median() } } impl ops::Deref for Sample { type Target = [A]; fn deref(&self) -> &[A] { &self.0 } }