"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  ==================================================
Function            Description
==================  ==================================================
mean                Arithmetic mean (average) of data.
fmean               Fast, floating point arithmetic mean.
geometric_mean      Geometric mean of data.
harmonic_mean       Harmonic mean of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
multimode           List of modes (most common values of data).
quantiles           Divide data into intervals with equal probability.
==================  ==================================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...
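
Divide data into four intervals of equal probability (quartiles), returning
the three cut points:

>>> quantiles([1, 2, 3, 4, 5])
[1.5, 3.0, 4.5]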


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Statistics for relations between two inputs
-------------------------------------------

==================  ====================================================
Function            Description
==================  ====================================================
covariance          Sample covariance for two variables.
correlation         Pearson's correlation coefficient for two variables.
linear_regression   Intercept and slope for simple linear regression.
==================  ====================================================

Calculate covariance, Pearson's correlation, and simple linear regression
for two inputs:

>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
>>> covariance(x, y)
0.75
>>> correlation(x, y)  #doctest: +ELLIPSIS
0.31622776601...
>>> linear_regression(x, y)
LinearRegression(slope=0.1, intercept=1.5)


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.
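
For example, it can be caught as either exception type:

>>> issubclass(StatisticsError, ValueError)
True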

"""

__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]

import math
import numbers
import random
import sys

from fractions import Fraction
from decimal import Decimal
from itertools import groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from functools import reduce
from operator import mul
from collections import Counter, namedtuple, defaultdict

_SQRT2 = sqrt(2.0)

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data):
    """_sum(data) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    types = set()
    types_add = types.add
    partials = {}
    partials_get = partials.get
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        total = sum(Fraction(n, d) for d, n in partials.items())
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, total, count)


def _ss(data, c=None):
    """Return the exact mean and sum of square deviations of sequence data.

    Calculations are done in a single pass, allowing the input to be an iterator.

    If given, *c* is used as the mean; otherwise, it is calculated from the
    data.  Use the *c* argument with care, as it can lead to garbage results.
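
    For example, the mean of [1, 2, 3] is 2 and the sum of squared
    deviations about it is 2:

    >>> _ss([1, 2, 3])
    (<class 'int'>, Fraction(2, 1), Fraction(2, 1), 3)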

    """
    if c is not None:
        T, ssd, count = _sum((d := x - c) * d for x in data)
        return (T, ssd, c, count)
    count = 0
    types = set()
    types_add = types.add
    sx_partials = defaultdict(int)
    sxx_partials = defaultdict(int)
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            sx_partials[d] += n
            sxx_partials[d] += n * n
    if not count:
        ssd = c = Fraction(0)
    elif None in sx_partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        ssd = c = sx_partials[None]
        assert not _isfinite(ssd)
    else:
        sx = sum(Fraction(n, d) for d, n in sx_partials.items())
        sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items())
        # This formula has poor numeric properties for floats,
        # but with fractions it is exact.
        ssd = (count * sxx - sx * sx) / count
        c = sx / count
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, ssd, c, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
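
    For example, ints coerce to the other type, and mixed Fraction and
    float coerce to float:

    >>> _coerce(int, float)
    <class 'float'>
    >>> _coerce(float, Fraction)
    <class 'float'>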
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S:  return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool:  return T
    if T is int:  return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T):  return S
    if issubclass(T, S):  return T
    # Ints coerce to the other type.
    if issubclass(T, int):  return S
    if issubclass(S, int):  return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)
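
    Integers are returned exactly, with a denominator of 1:

    >>> _exact_ratio(7)
    (7, 1)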

    x is expected to be an int, Fraction, Decimal or float.
    """

    # XXX We should revisit whether using fractions to accumulate exact
    # ratios is the right way to go.

    # The integer ratios for binary floats can have numerators or
    # denominators with over 300 decimal digits.  The problem is more
    # acute with decimal floats where the default decimal context
    # supports a huge range of exponents from Emin=-999999 to
    # Emax=999999.  When expanded with as_integer_ratio(), numbers like
    # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large
    # numerators or denominators that will slow computation.

    # When the integer ratios are accumulated as fractions, the size
    # grows to cover the full range from the smallest magnitude to the
    # largest.  For example, Fraction(3.14E+300) + Fraction(3.14E-300)
    # has a 616 digit numerator.  Likewise,
    # Fraction(Decimal('3.14E+5000')) + Fraction(Decimal('3.14E-5000'))
    # has a 10,003 digit numerator.

    # This doesn't seem to have been a problem in practice, but it is a
    # potential pitfall.

    try:
        return x.as_integer_ratio()
    except AttributeError:
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    try:
        # x may be an Integral ABC.
        return (x.numerator, x.denominator)
    except AttributeError:
        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
        raise TypeError(msg)


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
    """Square root of n/m, rounded to the nearest integer using round-to-odd."""
    # Reference: https://www.lri.fr/~melquion/doc/05-imacs17_1-expose.pdf
    a = math.isqrt(n // m)
    return a | (a*a*m != n)


# For 53 bit precision floats, the bit width used in
# _float_sqrt_of_frac() is 109.
_sqrt_bit_width: int = 2 * sys.float_info.mant_dig + 3


def _float_sqrt_of_frac(n: int, m: int) -> float:
    """Square root of n/m as a float, correctly rounded."""
    # See principle and proof sketch at: https://bugs.python.org/msg407078
    q = (n.bit_length() - m.bit_length() - _sqrt_bit_width) // 2
    if q >= 0:
        numerator = _integer_sqrt_of_frac_rto(n, m << 2 * q) << q
        denominator = 1
    else:
        numerator = _integer_sqrt_of_frac_rto(n << -2 * q, m)
        denominator = 1 << -q
    return numerator / denominator   # Convert to float


def _decimal_sqrt_of_frac(n: int, m: int) -> Decimal:
    """Square root of n/m as a Decimal, correctly rounded."""
    # Premise:  For decimal, computing (n/m).sqrt() can be off
    #           by 1 ulp from the correctly rounded result.
    # Method:   Check the result, moving up or down a step if needed.
    if n <= 0:
        if not n:
            return Decimal('0.0')
        n, m = -n, -m

    root = (Decimal(n) / Decimal(m)).sqrt()
    nr, dr = root.as_integer_ratio()

    plus = root.next_plus()
    np, dp = plus.as_integer_ratio()
    # test: n / m > ((root + plus) / 2) ** 2
    if 4 * n * (dr*dp)**2 > m * (dr*np + dp*nr)**2:
        return plus

    minus = root.next_minus()
    nm, dm = minus.as_integer_ratio()
    # test: n / m < ((root + minus) / 2) ** 2
    if 4 * n * (dr*dm)**2 < m * (dr*nm + dm*nr)**2:
        return minus

    return root


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    T, total, n = _sum(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _convert(total / n, T)


def fmean(data, weights=None):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
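
    Optional weighting is supported.  A small example with integer weights
    (chosen so the result is exact in binary floating point):

    >>> fmean([3.5, 4.0, 5.25], weights=[2, 1, 1])
    4.0625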
    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(iterable):
            nonlocal n
            for n, x in enumerate(iterable, start=1):
                yield x
        data = count(data)
    if weights is None:
        total = fsum(data)
        if not n:
            raise StatisticsError('fmean requires at least one data point')
        return total / n
    try:
        num_weights = len(weights)
    except TypeError:
        weights = list(weights)
        num_weights = len(weights)
    num = fsum(map(mul, data, weights))
    if n != num_weights:
        raise StatisticsError('data and weights must be the same length')
    den = fsum(weights)
    if not den:
        raise StatisticsError('sum of weights must be non-zero')
    return num / den


def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None


def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean is the reciprocal of the arithmetic mean of the
    reciprocals of the data.  It can be used for averaging ratios or
    rates, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds up to
    60 km/hr for another 5 km. What is the average speed?

        >>> harmonic_mean([40, 60])
        48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

        >>> harmonic_mean([40, 60], weights=[5, 30])
        56.0

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1.0):
    """Estimates the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin.  At least one value must be present.

    The *interval* is the width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

        >>> demographics = Counter({
        ...    25: 172,   # 20 to 30 years old
        ...    35: 484,   # 30 to 40 years old
        ...    45: 387,   # 40 to 50 years old
        ...    55:  22,   # 50 to 60 years old
        ...    65:   6,   # 60 to 70 years old
        ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort.  That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old.  A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40.  For that, we use median_grouped().

        >>> data = list(demographics.elements())
        >>> median(data)
        35
        >>> round(median_grouped(data, interval=10), 1)
        37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*.  This is essential for getting a
    correct result.  The function does not check this precondition.

    Inputs may be any numeric type that can be coerced to a float during
    the interpolation step.

    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")

    # Find the value at the midpoint. Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    # Coerce to floats, raising a TypeError if not possible
    try:
        interval = float(interval)
        x = float(x)
    except ValueError:
        raise TypeError('Value cannot be converted to a float')

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    L = x - interval / 2.0    # Lower limit of the median interval
    cf = i                    # Cumulative frequency of the preceding interval
    f = j - i                 # Number of elements in the median interval
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3

    This also works with nominal (non-numeric) data:

        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'

    If there are multiple modes with the same frequency, return the first one
    encountered:

        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]


# Notes on methods for computing quantiles
# ----------------------------------------
#
# There is no one perfect way to compute quantiles.  Here we offer
# two methods that serve common needs.  Most other packages
# surveyed offered at least one or both of these two, making them
# "standard" in the sense of "widely-adopted and reproducible".
# They are also easy to explain, easy to compute manually, and have
# straight-forward interpretations that aren't surprising.

# The default method is known as "R6", "PERCENTILE.EXC", or "expected
# value of rank order statistics". The alternative method is known as
# "R7", "PERCENTILE.INC", or "mode of rank order statistics".

# For sample data where there is a positive probability for values
# beyond the range of the data, the R6 exclusive method is a
# reasonable choice.  Consider a random sample of nine values from a
# population with a uniform distribution from 0.0 to 1.0.  The
# distribution of the third ranked sample point is described by
# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
# mean=0.300.  Only the latter (which corresponds with R6) gives the
# desired cut point with 30% of the population falling below that
# value, making it comparable to a result from an inv_cdf() function.
# The R6 exclusive method is also idempotent.

# For describing population data where the end points are known to
# be included in the data, the R7 inclusive method is a reasonable
# choice.  Instead of the mean, it uses the mode of the beta
# distribution for the interior points.  Per Hyndman & Fan, "One nice
# property is that the vertices of Q7(p) divide the range into n - 1
# intervals, and exactly 100p% of the intervals lie to the left of
# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."

# If needed, other methods could be added.  However, for now, the
# position is that fewer options make for easier choices and that
# external packages can be used for anything more advanced.

def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cut points that
    separate *data* into 100 equal-sized groups.

    The *data* can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
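
    A worked example for both methods:

    >>> quantiles([1, 2, 3, 4, 5], n=4)
    [1.5, 3.0, 4.5]
    >>> quantiles([1, 2, 3, 4, 5], n=4, method='inclusive')
    [2.0, 3.0, 4.0]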
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    return _convert(ss / (n - 1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pstdev requires at least one data point')
    mss = ss / n
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def _mean_stdev(data):
    """In one pass, compute the mean and sample standard deviation as floats."""
    T, ss, xbar, n = _ss(data)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    try:
        return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
    except AttributeError:
        # Handle NaNs and Infs gracefully
        return float(xbar), float(xbar) / float(ss)


# === Statistics for relations between two inputs ===

# See https://en.wikipedia.org/wiki/Covariance
#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
#     https://en.wikipedia.org/wiki/Simple_linear_regression


def covariance(x, y, /):
    """Covariance

    Return the sample covariance of two inputs *x* and *y*. Covariance
    is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5
    >>> covariance(z, x)
    -7.5

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have the same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    return sxy / (n - 1)


def correlation(x, y, /):
    """Pearson's correlation coefficient

    Return Pearson's correlation coefficient for two inputs. Pearson's
    correlation coefficient *r* takes values between -1 and +1. It measures
    the strength and direction of the linear relationship, where +1 means a
    very strong, positive linear relationship, -1 a very strong, negative
    linear relationship, and 0 no linear relationship.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have the same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    sxx = fsum((d := xi - xbar) * d for xi in x)
    syy = fsum((d := yi - ybar) * d for yi in y)
    try:
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')


LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of simple linear regression
    parameters estimated using ordinary least squares. Simple linear
    regression describes the relationship between an independent variable
    *x* and a dependent variable *y* in terms of a linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin.

    Since the *intercept* will always be 0.0, the underlying linear
    function simplifies to:

        y = slope * x + noise

    >>> y = [3 * x[i] + noise[i] for i in range(5)]
    >>> linear_regression(x, y, proportional=True)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.02447542484..., intercept=0.0)

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have the same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if proportional:
        sxy = fsum(xi * yi for xi, yi in zip(x, y))
        sxx = fsum(xi * xi for xi in x)
    else:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((d := xi - xbar) * d for xi in x)
    try:
        slope = sxy / sxx   # equivalent to:  covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        return cls(*_mean_stdev(data))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
        variance = self._sigma * self._sigma
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        diff = x - self._mu
        return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)

    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * _SQRT2)))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
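
        For example, the median (50th percentile) of the standard normal
        distribution is zero:

        >>> NormalDist().inv_cdf(0.5)
        0.0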
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
        Set *n* to 100 for percentiles which gives the 99 cut points that
        separate the normal distribution into 100 equal-sized groups.
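
        For example, the quartile cut points of the standard normal
        distribution (rounded here for brevity):

        >>> [round(q, 4) for q in NormalDist().quantiles()]
        [-0.6745, 0.0, 0.6745]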
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X._sigma * _SQRT2))
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    def zscore(self, x):
        """Compute the Standard Score.  (x - mean) / stdev

        Describes *x* in terms of the number of standard deviations
        above or below the mean of the normal distribution.
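
        For example:

        >>> NormalDist(1040, 195).zscore(1200)
        0.8205128205128205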
        """
        # https://www.statisticshowto.com/probability-and-statistics/z-score/
        if not self._sigma:
            raise StatisticsError('zscore() not defined when sigma is zero')
        return (x - self._mu) / self._sigma

    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def median(self):
        "Return the median of the normal distribution."
        return self._mu

    @property
    def mode(self):
        """Return the mode of the normal distribution.

        The mode is the value x at which the probability density
        function (pdf) takes its maximum value.
        """
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma * self._sigma

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
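
        For example, adding two independent distributions with sigmas of
        3.0 and 4.0 gives a combined sigma of 5.0 (a 3-4-5 triangle):

        >>> NormalDist(0, 3) + NormalDist(0, 4)
        NormalDist(mu=0.0, sigma=5.0)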
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return x1._mu == x2._mu and x1._sigma == x2._sigma

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'

    def __getstate__(self):
        return self._mu, self._sigma

    def __setstate__(self, state):
        self._mu, self._sigma = state