1 use std::{mem, ops};
2 
3 use crate::stats::float::Float;
4 use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
5 use crate::stats::univariate::Percentiles;
6 use crate::stats::univariate::Resamples;
7 #[cfg(feature = "rayon")]
8 use rayon::prelude::*;
9 
10 /// A collection of data points drawn from a population
11 ///
12 /// Invariants:
13 ///
14 /// - The sample contains at least 2 data points
15 /// - The sample contains no `NaN`s
16 #[repr(transparent)]
17 pub struct Sample<A>([A]);
18 
19 // TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module
20 impl<A> Sample<A>
21 where
22     A: Float,
23 {
24     /// Creates a new sample from an existing slice
25     ///
26     /// # Panics
27     ///
28     /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements
29     #[cfg_attr(feature = "cargo-clippy", allow(clippy::new_ret_no_self))]
new(slice: &[A]) -> &Sample<A>30     pub fn new(slice: &[A]) -> &Sample<A> {
31         assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan()));
32 
33         unsafe { mem::transmute(slice) }
34     }
35 
36     /// Returns the biggest element in the sample
37     ///
38     /// - Time: `O(length)`
max(&self) -> A39     pub fn max(&self) -> A {
40         let mut elems = self.iter();
41 
42         match elems.next() {
43             Some(&head) => elems.fold(head, |a, &b| a.max(b)),
44             // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
45             None => unreachable!(),
46         }
47     }
48 
49     /// Returns the arithmetic average of the sample
50     ///
51     /// - Time: `O(length)`
mean(&self) -> A52     pub fn mean(&self) -> A {
53         let n = self.len();
54 
55         self.sum() / A::cast(n)
56     }
57 
58     /// Returns the median absolute deviation
59     ///
60     /// The `median` can be optionally passed along to speed up (2X) the computation
61     ///
62     /// - Time: `O(length)`
63     /// - Memory: `O(length)`
median_abs_dev(&self, median: Option<A>) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,64     pub fn median_abs_dev(&self, median: Option<A>) -> A
65     where
66         usize: cast::From<A, Output = Result<usize, cast::Error>>,
67     {
68         let median = median.unwrap_or_else(|| self.percentiles().median());
69 
70         // NB Although this operation can be SIMD accelerated, the gain is negligible because the
71         // bottle neck is the sorting operation which is part of the computation of the median
72         let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>();
73 
74         let abs_devs: &Self = Self::new(&abs_devs);
75 
76         abs_devs.percentiles().median() * A::cast(1.4826)
77     }
78 
79     /// Returns the median absolute deviation as a percentage of the median
80     ///
81     /// - Time: `O(length)`
82     /// - Memory: `O(length)`
median_abs_dev_pct(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,83     pub fn median_abs_dev_pct(&self) -> A
84     where
85         usize: cast::From<A, Output = Result<usize, cast::Error>>,
86     {
87         let _100 = A::cast(100);
88         let median = self.percentiles().median();
89         let mad = self.median_abs_dev(Some(median));
90 
91         (mad / median) * _100
92     }
93 
94     /// Returns the smallest element in the sample
95     ///
96     /// - Time: `O(length)`
min(&self) -> A97     pub fn min(&self) -> A {
98         let mut elems = self.iter();
99 
100         match elems.next() {
101             Some(&elem) => elems.fold(elem, |a, &b| a.min(b)),
102             // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
103             None => unreachable!(),
104         }
105     }
106 
107     /// Returns a "view" into the percentiles of the sample
108     ///
109     /// This "view" makes consecutive computations of percentiles much faster (`O(1)`)
110     ///
111     /// - Time: `O(N log N) where N = length`
112     /// - Memory: `O(length)`
percentiles(&self) -> Percentiles<A> where usize: cast::From<A, Output = Result<usize, cast::Error>>,113     pub fn percentiles(&self) -> Percentiles<A>
114     where
115         usize: cast::From<A, Output = Result<usize, cast::Error>>,
116     {
117         use std::cmp::Ordering;
118 
119         // NB This function assumes that there are no `NaN`s in the sample
120         fn cmp<T>(a: &T, b: &T) -> Ordering
121         where
122             T: PartialOrd,
123         {
124             match a.partial_cmp(b) {
125                 Some(o) => o,
126                 // Arbitrary way to handle NaNs that should never happen
127                 None => Ordering::Equal,
128             }
129         }
130 
131         let mut v = self.to_vec().into_boxed_slice();
132         #[cfg(feature = "rayon")]
133         v.par_sort_unstable_by(cmp);
134         #[cfg(not(feature = "rayon"))]
135         v.sort_unstable_by(cmp);
136 
137         // NB :-1: to intra-crate privacy rules
138         unsafe { mem::transmute(v) }
139     }
140 
141     /// Returns the standard deviation of the sample
142     ///
143     /// The `mean` can be optionally passed along to speed up (2X) the computation
144     ///
145     /// - Time: `O(length)`
std_dev(&self, mean: Option<A>) -> A146     pub fn std_dev(&self, mean: Option<A>) -> A {
147         self.var(mean).sqrt()
148     }
149 
150     /// Returns the standard deviation as a percentage of the mean
151     ///
152     /// - Time: `O(length)`
std_dev_pct(&self) -> A153     pub fn std_dev_pct(&self) -> A {
154         let _100 = A::cast(100);
155         let mean = self.mean();
156         let std_dev = self.std_dev(Some(mean));
157 
158         (std_dev / mean) * _100
159     }
160 
161     /// Returns the sum of all the elements of the sample
162     ///
163     /// - Time: `O(length)`
sum(&self) -> A164     pub fn sum(&self) -> A {
165         crate::stats::sum(self)
166     }
167 
168     /// Returns the t score between these two samples
169     ///
170     /// - Time: `O(length)`
t(&self, other: &Sample<A>) -> A171     pub fn t(&self, other: &Sample<A>) -> A {
172         let (x_bar, y_bar) = (self.mean(), other.mean());
173         let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar)));
174         let n_x = A::cast(self.len());
175         let n_y = A::cast(other.len());
176         let num = x_bar - y_bar;
177         let den = (s2_x / n_x + s2_y / n_y).sqrt();
178 
179         num / den
180     }
181 
182     /// Returns the variance of the sample
183     ///
184     /// The `mean` can be optionally passed along to speed up (2X) the computation
185     ///
186     /// - Time: `O(length)`
var(&self, mean: Option<A>) -> A187     pub fn var(&self, mean: Option<A>) -> A {
188         use std::ops::Add;
189 
190         let mean = mean.unwrap_or_else(|| self.mean());
191         let slice = self;
192 
193         let sum = slice
194             .iter()
195             .map(|&x| (x - mean).powi(2))
196             .fold(A::cast(0), Add::add);
197 
198         sum / A::cast(slice.len() - 1)
199     }
200 
201     // TODO Remove the `T` parameter in favor of `S::Output`
202     /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic
203     ///
204     /// - Multi-threaded
205     /// - Time: `O(nresamples)`
206     /// - Memory: `O(nresamples)`
bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions where S: Fn(&Sample<A>) -> T + Sync, T: Tuple + Send, T::Distributions: Send, T::Builder: Send,207     pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions
208     where
209         S: Fn(&Sample<A>) -> T + Sync,
210         T: Tuple + Send,
211         T::Distributions: Send,
212         T::Builder: Send,
213     {
214         #[cfg(feature = "rayon")]
215         {
216             (0..nresamples)
217                 .into_par_iter()
218                 .map_init(
219                     || Resamples::new(self),
220                     |resamples, _| statistic(resamples.next()),
221                 )
222                 .fold(
223                     || T::Builder::new(0),
224                     |mut sub_distributions, sample| {
225                         sub_distributions.push(sample);
226                         sub_distributions
227                     },
228                 )
229                 .reduce(
230                     || T::Builder::new(0),
231                     |mut a, mut b| {
232                         a.extend(&mut b);
233                         a
234                     },
235                 )
236                 .complete()
237         }
238         #[cfg(not(feature = "rayon"))]
239         {
240             let mut resamples = Resamples::new(self);
241             (0..nresamples)
242                 .map(|_| statistic(resamples.next()))
243                 .fold(T::Builder::new(0), |mut sub_distributions, sample| {
244                     sub_distributions.push(sample);
245                     sub_distributions
246                 })
247                 .complete()
248         }
249     }
250 
251     #[cfg(test)]
iqr(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,252     pub fn iqr(&self) -> A
253     where
254         usize: cast::From<A, Output = Result<usize, cast::Error>>,
255     {
256         self.percentiles().iqr()
257     }
258 
259     #[cfg(test)]
median(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,260     pub fn median(&self) -> A
261     where
262         usize: cast::From<A, Output = Result<usize, cast::Error>>,
263     {
264         self.percentiles().median()
265     }
266 }
267 
268 impl<A> ops::Deref for Sample<A> {
269     type Target = [A];
270 
deref(&self) -> &[A]271     fn deref(&self) -> &[A] {
272         &self.0
273     }
274 }
275