"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  ==================================================
Function            Description
==================  ==================================================
mean                Arithmetic mean (average) of data.
fmean               Fast, floating point arithmetic mean.
geometric_mean      Geometric mean of data.
harmonic_mean       Harmonic mean of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
multimode           List of modes (most common values of data).
quantiles           Divide data into intervals with equal probability.
==================  ==================================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Statistics for relations between two inputs
-------------------------------------------

==================  ====================================================
Function            Description
==================  ====================================================
covariance          Sample covariance for two variables.
correlation         Pearson's correlation coefficient for two variables.
linear_regression   Intercept and slope for simple linear regression.
==================  ====================================================

Calculate covariance, Pearson's correlation, and simple linear regression
for two inputs:

>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
>>> covariance(x, y)
0.75
>>> correlation(x, y)  #doctest: +ELLIPSIS
0.31622776601...
>>> linear_regression(x, y)
LinearRegression(slope=0.1, intercept=1.5)


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]

import math
import numbers
import random
import sys

from fractions import Fraction
from decimal import Decimal
from itertools import groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from functools import reduce
from operator import mul
from collections import Counter, namedtuple, defaultdict

_SQRT2 = sqrt(2.0)

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data):
    """_sum(data) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    types = set()
    types_add = types.add
    # Accumulate exact partial sums keyed by denominator so that values
    # with the same denominator are summed with exact integer arithmetic.
    partials = {}
    partials_get = partials.get
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        total = sum(Fraction(n, d) for d, n in partials.items())
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, total, count)


def _ss(data, c=None):
    """Return the exact mean and sum of square deviations of sequence data.

    Calculations are done in a single pass, allowing the input to be an iterator.

    If given, *c* is used as the mean; otherwise, it is calculated from the
    data. Use the *c* argument with care, as it can lead to garbage results.

    """
    if c is not None:
        T, ssd, count = _sum((d := x - c) * d for x in data)
        return (T, ssd, c, count)
    count = 0
    types = set()
    types_add = types.add
    # Partial sums of x and of x, keyed by denominator (see _sum).
    sx_partials = defaultdict(int)
    sxx_partials = defaultdict(int)
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            sx_partials[d] += n
            sxx_partials[d] += n * n
    if not count:
        ssd = c = Fraction(0)
    elif None in sx_partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        ssd = c = sx_partials[None]
        assert not _isfinite(ssd)
    else:
        sx = sum(Fraction(n, d) for d, n in sx_partials.items())
        sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items())
        # This formula has poor numeric properties for floats,
        # but with fractions it is exact.
        ssd = (count * sxx - sx * sx) / count
        c = sx / count
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, ssd, c, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S:  return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool:  return T
    if T is int:  return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T):  return S
    if issubclass(T, S):  return T
    # Ints coerce to the other type.
    if issubclass(T, int):  return S
    if issubclass(S, int):  return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """

    # XXX We should revisit whether using fractions to accumulate exact
    # ratios is the right way to go.

    # The integer ratios for binary floats can have numerators or
    # denominators with over 300 decimal digits.  The problem is more
    # acute with decimal floats where the default decimal context
    # supports a huge range of exponents from Emin=-999999 to
    # Emax=999999.  When expanded with as_integer_ratio(), numbers like
    # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large
    # numerators or denominators that will slow computation.

    # When the integer ratios are accumulated as fractions, the size
    # grows to cover the full range from the smallest magnitude to the
    # largest.  For example, Fraction(3.14E+300) + Fraction(3.14E-300),
    # has a 616 digit numerator.  Likewise,
    # Fraction(Decimal('3.14E+5000')) + Fraction(Decimal('3.14E-5000'))
    # has 10,003 digit numerator.

    # This doesn't seem to have been a problem in practice, but it is a
    # potential pitfall.

    try:
        return x.as_integer_ratio()
    except AttributeError:
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    try:
        # x may be an Integral ABC.
        return (x.numerator, x.denominator)
    except AttributeError:
        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
        raise TypeError(msg)


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
    """Square root of n/m, rounded to the nearest integer using round-to-odd."""
    # Reference: https://www.lri.fr/~melquion/doc/05-imacs17_1-expose.pdf
    a = math.isqrt(n // m)
    # The bitwise-or sets the low bit when the result is inexact,
    # implementing round-to-odd.
    return a | (a*a*m != n)


# For 53 bit precision floats, the bit width used in
# _float_sqrt_of_frac() is 109.
_sqrt_bit_width: int = 2 * sys.float_info.mant_dig + 3


def _float_sqrt_of_frac(n: int, m: int) -> float:
    """Square root of n/m as a float, correctly rounded."""
    # See principle and proof sketch at: https://bugs.python.org/msg407078
    q = (n.bit_length() - m.bit_length() - _sqrt_bit_width) // 2
    if q >= 0:
        numerator = _integer_sqrt_of_frac_rto(n, m << 2 * q) << q
        denominator = 1
    else:
        numerator = _integer_sqrt_of_frac_rto(n << -2 * q, m)
        denominator = 1 << -q
    return numerator / denominator  # Convert to float


def _decimal_sqrt_of_frac(n: int, m: int) -> Decimal:
    """Square root of n/m as a Decimal, correctly rounded."""
    # Premise:  For decimal, computing (n/m).sqrt() can be off
    #           by 1 ulp from the correctly rounded result.
    # Method:   Check the result, moving up or down a step if needed.
    if n <= 0:
        if not n:
            return Decimal('0.0')
        n, m = -n, -m

    root = (Decimal(n) / Decimal(m)).sqrt()
    nr, dr = root.as_integer_ratio()

    plus = root.next_plus()
    np, dp = plus.as_integer_ratio()
    # test: n / m > ((root + plus) / 2) ** 2
    if 4 * n * (dr*dp)**2 > m * (dr*np + dp*nr)**2:
        return plus

    minus = root.next_minus()
    nm, dm = minus.as_integer_ratio()
    # test: n / m < ((root + minus) / 2) ** 2
    if 4 * n * (dr*dm)**2 < m * (dr*nm + dm*nr)**2:
        return minus

    return root


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    T, total, n = _sum(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _convert(total / n, T)


def fmean(data, weights=None):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(iterable):
            nonlocal n
            for n, x in enumerate(iterable, start=1):
                yield x
        data = count(data)
    if weights is None:
        total = fsum(data)
        if not n:
            raise StatisticsError('fmean requires at least one data point')
        return total / n
    try:
        num_weights = len(weights)
    except TypeError:
        weights = list(weights)
        num_weights = len(weights)
    num = fsum(map(mul, data, weights))
    if n != num_weights:
        raise StatisticsError('data and weights must be the same length')
    den = fsum(weights)
    if not den:
        raise StatisticsError('sum of weights must be non-zero')
    return num / den


def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None


def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean is the reciprocal of the arithmetic mean of the
    reciprocals of the data.  It can be used for averaging ratios or
    rates, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds-up to
    60 km/hr for another 5 km. What is the average speed?

    >>> harmonic_mean([40, 60])
    48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds-up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

    >>> harmonic_mean([40, 60], weights=[5, 30])
    56.0

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)

# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1.0):
    """Estimates the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin.  At least one value must be present.

    The *interval* is width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

    >>> demographics = Counter({
    ...    25: 172,   # 20 to 30 years old
    ...    35: 484,   # 30 to 40 years old
    ...    45: 387,   # 40 to 50 years old
    ...    55:  22,   # 50 to 60 years old
    ...    65:   6,   # 60 to 70 years old
    ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort.  That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old.  A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40.  For that, we use median_grouped().

    >>> data = list(demographics.elements())
    >>> median(data)
    35
    >>> round(median_grouped(data, interval=10), 1)
    37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*.  This is essential for getting a
    correct result.  The function does not check this precondition.

    Inputs may be any numeric type that can be coerced to a float during
    the interpolation step.

    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")

    # Find the value at the midpoint. Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    # Coerce to floats, raising a TypeError if not possible
    try:
        interval = float(interval)
        x = float(x)
    except ValueError:
        raise TypeError('Value cannot be converted to a float')

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    L = x - interval / 2.0    # Lower limit of the median interval
    cf = i                    # Cumulative frequency of the preceding interval
    f = j - i                 # Number of elements in the median interval
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes with same frequency, return the first one
    encountered:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]


# Notes on methods for computing quantiles
# ----------------------------------------
#
# There is no one perfect way to compute quantiles.  Here we offer
# two methods that serve common needs.  Most other packages
# surveyed offered at least one or both of these two, making them
# "standard" in the sense of "widely-adopted and reproducible".
# They are also easy to explain, easy to compute manually, and have
# straight-forward interpretations that aren't surprising.

# The default method is known as "R6", "PERCENTILE.EXC", or "expected
# value of rank order statistics". The alternative method is known as
# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
# For sample data where there is a positive probability for values
# beyond the range of the data, the R6 exclusive method is a
# reasonable choice.  Consider a random sample of nine values from a
# population with a uniform distribution from 0.0 to 1.0.  The
# distribution of the third ranked sample point is described by
# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
# mean=0.300.  Only the latter (which corresponds with R6) gives the
# desired cut point with 30% of the population falling below that
# value, making it comparable to a result from an inv_cdf() function.
# The R6 exclusive method is also idempotent.

# For describing population data where the end points are known to
# be included in the data, the R7 inclusive method is a reasonable
# choice.  Instead of the mean, it uses the mode of the beta
# distribution for the interior points.  Per Hyndman & Fan, "One nice
# property is that the vertices of Q7(p) divide the range into n - 1
# intervals, and exactly 100p% of the intervals lie to the left of
# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."

# If needed, other methods could be added.  However, for now, the
# position is that fewer options make for easier choices and that
# external packages can be used for anything more advanced.

def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cut points that
    separate *data* into 100 equal sized groups.

    The *data* can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            # j is the integer part of the rescaled rank; delta the remainder.
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values.  The optional argument xbar, if given, should be the mean of
    the data.  If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``.  Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    # Bessel's correction: divide by n - 1 for an unbiased sample estimate.
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    return _convert(ss / (n - 1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value.  The optional argument mu, if given, should be the mean of
    the data.  If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    # Take the square root of the exact fraction, correctly rounded
    # for the result type.
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pstdev requires at least one data point')
    mss = ss / n
    # Take the square root of the exact fraction, correctly rounded
    # for the result type.
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def _mean_stdev(data):
    """In one pass, compute the mean and sample standard deviation as floats."""
    T, ss, xbar, n = _ss(data)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    try:
        return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
    except AttributeError:
        # Handle Nans and Infs gracefully
        return float(xbar), float(xbar) / float(ss)


# === Statistics for relations between two inputs ===

# See https://en.wikipedia.org/wiki/Covariance
#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
#     https://en.wikipedia.org/wiki/Simple_linear_regression


def covariance(x, y, /):
    """Covariance

    Return the sample covariance of two inputs *x* and *y*. Covariance
    is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5
    >>> covariance(z, x)
    -7.5

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    # fsum keeps the summation accurate for float inputs.
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    return sxy / (n - 1)


def correlation(x, y, /):
    """Pearson's correlation coefficient

    Return the Pearson's correlation coefficient for two inputs. Pearson's
    correlation coefficient *r* takes values between -1 and +1. It measures the
    strength and direction of the linear relationship, where +1 means very
    strong, positive linear relationship, -1 very strong, negative linear
    relationship, and 0 no linear relationship.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    sxx = fsum((d := xi - xbar) * d for xi in x)
    syy = fsum((d := yi - ybar) * d for yi in y)
    try:
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')


LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of simple linear regression
    parameters estimated using ordinary least squares. Simple linear
    regression describes relationship between an independent variable
    *x* and a dependent variable *y* in terms of a linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin.

    Since the *intercept* will always be 0.0, the underlying linear
    function simplifies to:

        y = slope * x + noise

    >>> y = [3 * x[i] + noise[i] for i in range(5)]
    >>> linear_regression(x, y, proportional=True)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.02447542484..., intercept=0.0)

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if proportional:
        sxy = fsum(xi * yi for xi, yi in zip(x, y))
        sxx = fsum(xi * xi for xi in x)
    else:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((d := xi - xbar) * d for xi in x)
    try:
        slope = sxy / sxx   # equivalent to:  covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
1095 q = p - 0.5 1096 if fabs(q) <= 0.425: 1097 r = 0.180625 - q * q 1098 # Hash sum: 55.88319_28806_14901_4439 1099 num = (((((((2.50908_09287_30122_6727e+3 * r + 1100 3.34305_75583_58812_8105e+4) * r + 1101 6.72657_70927_00870_0853e+4) * r + 1102 4.59219_53931_54987_1457e+4) * r + 1103 1.37316_93765_50946_1125e+4) * r + 1104 1.97159_09503_06551_4427e+3) * r + 1105 1.33141_66789_17843_7745e+2) * r + 1106 3.38713_28727_96366_6080e+0) * q 1107 den = (((((((5.22649_52788_52854_5610e+3 * r + 1108 2.87290_85735_72194_2674e+4) * r + 1109 3.93078_95800_09271_0610e+4) * r + 1110 2.12137_94301_58659_5867e+4) * r + 1111 5.39419_60214_24751_1077e+3) * r + 1112 6.87187_00749_20579_0830e+2) * r + 1113 4.23133_30701_60091_1252e+1) * r + 1114 1.0) 1115 x = num / den 1116 return mu + (x * sigma) 1117 r = p if q <= 0.0 else 1.0 - p 1118 r = sqrt(-log(r)) 1119 if r <= 5.0: 1120 r = r - 1.6 1121 # Hash sum: 49.33206_50330_16102_89036 1122 num = (((((((7.74545_01427_83414_07640e-4 * r + 1123 2.27238_44989_26918_45833e-2) * r + 1124 2.41780_72517_74506_11770e-1) * r + 1125 1.27045_82524_52368_38258e+0) * r + 1126 3.64784_83247_63204_60504e+0) * r + 1127 5.76949_72214_60691_40550e+0) * r + 1128 4.63033_78461_56545_29590e+0) * r + 1129 1.42343_71107_49683_57734e+0) 1130 den = (((((((1.05075_00716_44416_84324e-9 * r + 1131 5.47593_80849_95344_94600e-4) * r + 1132 1.51986_66563_61645_71966e-2) * r + 1133 1.48103_97642_74800_74590e-1) * r + 1134 6.89767_33498_51000_04550e-1) * r + 1135 1.67638_48301_83803_84940e+0) * r + 1136 2.05319_16266_37758_82187e+0) * r + 1137 1.0) 1138 else: 1139 r = r - 5.0 1140 # Hash sum: 47.52583_31754_92896_71629 1141 num = (((((((2.01033_43992_92288_13265e-7 * r + 1142 2.71155_55687_43487_57815e-5) * r + 1143 1.24266_09473_88078_43860e-3) * r + 1144 2.65321_89526_57612_30930e-2) * r + 1145 2.96560_57182_85048_91230e-1) * r + 1146 1.78482_65399_17291_33580e+0) * r + 1147 5.46378_49111_64114_36990e+0) * r + 1148 6.65790_46435_01103_77720e+0) 1149 den = 
(((((((2.04426_31033_89939_78564e-15 * r + 1150 1.42151_17583_16445_88870e-7) * r + 1151 1.84631_83175_10054_68180e-5) * r + 1152 7.86869_13114_56132_59100e-4) * r + 1153 1.48753_61290_85061_48525e-2) * r + 1154 1.36929_88092_27358_05310e-1) * r + 1155 5.99832_20655_58879_37690e-1) * r + 1156 1.0) 1157 x = num / den 1158 if q < 0.0: 1159 x = -x 1160 return mu + (x * sigma) 1161 1162 1163# If available, use C implementation 1164try: 1165 from _statistics import _normal_dist_inv_cdf 1166except ImportError: 1167 pass 1168 1169 1170class NormalDist: 1171 "Normal distribution of a random variable" 1172 # https://en.wikipedia.org/wiki/Normal_distribution 1173 # https://en.wikipedia.org/wiki/Variance#Properties 1174 1175 __slots__ = { 1176 '_mu': 'Arithmetic mean of a normal distribution', 1177 '_sigma': 'Standard deviation of a normal distribution', 1178 } 1179 1180 def __init__(self, mu=0.0, sigma=1.0): 1181 "NormalDist where mu is the mean and sigma is the standard deviation." 1182 if sigma < 0.0: 1183 raise StatisticsError('sigma must be non-negative') 1184 self._mu = float(mu) 1185 self._sigma = float(sigma) 1186 1187 @classmethod 1188 def from_samples(cls, data): 1189 "Make a normal distribution instance from sample data." 1190 return cls(*_mean_stdev(data)) 1191 1192 def samples(self, n, *, seed=None): 1193 "Generate *n* samples for a given mean and standard deviation." 1194 gauss = random.gauss if seed is None else random.Random(seed).gauss 1195 mu, sigma = self._mu, self._sigma 1196 return [gauss(mu, sigma) for i in range(n)] 1197 1198 def pdf(self, x): 1199 "Probability density function. P(x <= X < x+dx) / dx" 1200 variance = self._sigma * self._sigma 1201 if not variance: 1202 raise StatisticsError('pdf() not defined when sigma is zero') 1203 diff = x - self._mu 1204 return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance) 1205 1206 def cdf(self, x): 1207 "Cumulative distribution function. 
P(X <= x)" 1208 if not self._sigma: 1209 raise StatisticsError('cdf() not defined when sigma is zero') 1210 return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * _SQRT2))) 1211 1212 def inv_cdf(self, p): 1213 """Inverse cumulative distribution function. x : P(X <= x) = p 1214 1215 Finds the value of the random variable such that the probability of 1216 the variable being less than or equal to that value equals the given 1217 probability. 1218 1219 This function is also called the percent point function or quantile 1220 function. 1221 """ 1222 if p <= 0.0 or p >= 1.0: 1223 raise StatisticsError('p must be in the range 0.0 < p < 1.0') 1224 if self._sigma <= 0.0: 1225 raise StatisticsError('cdf() not defined when sigma at or below zero') 1226 return _normal_dist_inv_cdf(p, self._mu, self._sigma) 1227 1228 def quantiles(self, n=4): 1229 """Divide into *n* continuous intervals with equal probability. 1230 1231 Returns a list of (n - 1) cut points separating the intervals. 1232 1233 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. 1234 Set *n* to 100 for percentiles which gives the 99 cuts points that 1235 separate the normal distribution in to 100 equal sized groups. 1236 """ 1237 return [self.inv_cdf(i / n) for i in range(1, n)] 1238 1239 def overlap(self, other): 1240 """Compute the overlapping coefficient (OVL) between two normal distributions. 1241 1242 Measures the agreement between two normal probability distributions. 1243 Returns a value between 0.0 and 1.0 giving the overlapping area in 1244 the two underlying probability density functions. 1245 1246 >>> N1 = NormalDist(2.4, 1.6) 1247 >>> N2 = NormalDist(3.2, 2.0) 1248 >>> N1.overlap(N2) 1249 0.8035050657330205 1250 """ 1251 # See: "The overlapping coefficient as a measure of agreement between 1252 # probability distributions and point estimation of the overlap of two 1253 # normal densities" -- Henry F. Inman and Edwin L. 
Bradley Jr 1254 # http://dx.doi.org/10.1080/03610928908830127 1255 if not isinstance(other, NormalDist): 1256 raise TypeError('Expected another NormalDist instance') 1257 X, Y = self, other 1258 if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity 1259 X, Y = Y, X 1260 X_var, Y_var = X.variance, Y.variance 1261 if not X_var or not Y_var: 1262 raise StatisticsError('overlap() not defined when sigma is zero') 1263 dv = Y_var - X_var 1264 dm = fabs(Y._mu - X._mu) 1265 if not dv: 1266 return 1.0 - erf(dm / (2.0 * X._sigma * _SQRT2)) 1267 a = X._mu * Y_var - Y._mu * X_var 1268 b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var)) 1269 x1 = (a + b) / dv 1270 x2 = (a - b) / dv 1271 return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2))) 1272 1273 def zscore(self, x): 1274 """Compute the Standard Score. (x - mean) / stdev 1275 1276 Describes *x* in terms of the number of standard deviations 1277 above or below the mean of the normal distribution. 1278 """ 1279 # https://www.statisticshowto.com/probability-and-statistics/z-score/ 1280 if not self._sigma: 1281 raise StatisticsError('zscore() not defined when sigma is zero') 1282 return (x - self._mu) / self._sigma 1283 1284 @property 1285 def mean(self): 1286 "Arithmetic mean of the normal distribution." 1287 return self._mu 1288 1289 @property 1290 def median(self): 1291 "Return the median of the normal distribution" 1292 return self._mu 1293 1294 @property 1295 def mode(self): 1296 """Return the mode of the normal distribution 1297 1298 The mode is the value x where which the probability density 1299 function (pdf) takes its maximum value. 1300 """ 1301 return self._mu 1302 1303 @property 1304 def stdev(self): 1305 "Standard deviation of the normal distribution." 1306 return self._sigma 1307 1308 @property 1309 def variance(self): 1310 "Square of the standard deviation." 
1311 return self._sigma * self._sigma 1312 1313 def __add__(x1, x2): 1314 """Add a constant or another NormalDist instance. 1315 1316 If *other* is a constant, translate mu by the constant, 1317 leaving sigma unchanged. 1318 1319 If *other* is a NormalDist, add both the means and the variances. 1320 Mathematically, this works only if the two distributions are 1321 independent or if they are jointly normally distributed. 1322 """ 1323 if isinstance(x2, NormalDist): 1324 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma)) 1325 return NormalDist(x1._mu + x2, x1._sigma) 1326 1327 def __sub__(x1, x2): 1328 """Subtract a constant or another NormalDist instance. 1329 1330 If *other* is a constant, translate by the constant mu, 1331 leaving sigma unchanged. 1332 1333 If *other* is a NormalDist, subtract the means and add the variances. 1334 Mathematically, this works only if the two distributions are 1335 independent or if they are jointly normally distributed. 1336 """ 1337 if isinstance(x2, NormalDist): 1338 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma)) 1339 return NormalDist(x1._mu - x2, x1._sigma) 1340 1341 def __mul__(x1, x2): 1342 """Multiply both mu and sigma by a constant. 1343 1344 Used for rescaling, perhaps to change measurement units. 1345 Sigma is scaled with the absolute value of the constant. 1346 """ 1347 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2)) 1348 1349 def __truediv__(x1, x2): 1350 """Divide both mu and sigma by a constant. 1351 1352 Used for rescaling, perhaps to change measurement units. 1353 Sigma is scaled with the absolute value of the constant. 1354 """ 1355 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2)) 1356 1357 def __pos__(x1): 1358 "Return a copy of the instance." 1359 return NormalDist(x1._mu, x1._sigma) 1360 1361 def __neg__(x1): 1362 "Negates mu while keeping sigma the same." 
1363 return NormalDist(-x1._mu, x1._sigma) 1364 1365 __radd__ = __add__ 1366 1367 def __rsub__(x1, x2): 1368 "Subtract a NormalDist from a constant or another NormalDist." 1369 return -(x1 - x2) 1370 1371 __rmul__ = __mul__ 1372 1373 def __eq__(x1, x2): 1374 "Two NormalDist objects are equal if their mu and sigma are both equal." 1375 if not isinstance(x2, NormalDist): 1376 return NotImplemented 1377 return x1._mu == x2._mu and x1._sigma == x2._sigma 1378 1379 def __hash__(self): 1380 "NormalDist objects hash equal if their mu and sigma are both equal." 1381 return hash((self._mu, self._sigma)) 1382 1383 def __repr__(self): 1384 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})' 1385 1386 def __getstate__(self): 1387 return self._mu, self._sigma 1388 1389 def __setstate__(self, state): 1390 self._mu, self._sigma = state 1391