// aten/src/ATen/native/cpu/DistributionKernels.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/Generator.h>
#include <ATen/core/DistributionsHelper.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/cpu/DistributionTemplates.h>

#include <ATen/native/UnaryOps.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif

#include <cmath>
#include <limits>
#include <type_traits>

#if AT_MKL_ENABLED()
#include <mkl.h>
#include <cpuinfo.h>
#endif

namespace at::native {
namespace {

static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::cauchy_kernel(iter, median, sigma, generator);
}

void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::bernoulli_kernel(self, p_, generator);
}

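// Scalar-probability Bernoulli: use MKL VSL when it is available, otherwise
// fall back to the generic templates::cpu implementation.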
#if !AT_MKL_ENABLED()
void bernoulli_scalar_kernel_default(const TensorBase &self, double p, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::bernoulli_kernel(self, p, generator);
}

void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional<Generator> gen) {
  bernoulli_scalar_kernel_default(self, p, gen);
}
#else
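// MKL-accelerated path: draw a single seed from the ATen generator (under its
// mutex), then sample Bernoulli variates in parallel chunks with MKL VSL.
// Non-int or non-contiguous outputs are staged through an int buffer, since
// viRngBernoulli produces int samples.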
void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  int64_t seed;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(generator->mutex_);
    seed = generator->random();
  }
  int64_t n = self.numel();
  bool contig = self.is_contiguous();

  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
  self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
    at::Tensor tmp_int_tensor;
    if (std::is_same<scalar_t, int>::value && contig) {
      tmp_int_tensor = self;
    } else {
      tmp_int_tensor = at::empty(self.sizes(), self.options().dtype(at::kInt));
    }

    scalar_t *self_ptr = self.data_ptr<scalar_t>();
    int *sample_int_ptr = tmp_int_tensor.data_ptr<int>();

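    // Each chunk creates its own VSL stream from the same seed and skips
    // ahead by `begin`, so the generated sequence does not depend on how
    // parallel_for partitions [0, n).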
    auto sample = [&](int64_t begin, int64_t end) {
      int64_t len = end - begin;
      if (len > 0) {
        VSLStreamStatePtr stream;
        vslNewStream(&stream, VSL_BRNG_MCG31, seed);
        vslSkipAheadStream(stream, begin);
        viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, len,
          sample_int_ptr + begin, p);
        vslDeleteStream(&stream);

        // vectorized copy if using buffer and contiguous, i.e., being non-int
        // type and contiguous
        if (!std::is_same<scalar_t, int>::value && contig) {
          scalar_t *self_seg = self_ptr + begin;
          int* tmp_seg = sample_int_ptr + begin;
          at::vec::convert<int, scalar_t>(tmp_seg, self_seg, len);
        }
      }
    };

    parallel_for(0, n, /* grain_size= */ 800, sample);

    // copy_ if using buffer and non contiguous
    if (!contig) {
      OptionalTensorRef(self)->copy_(tmp_int_tensor);
    }
  });
}
#endif

static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::exponential_kernel(iter, lambda, generator);
}

#if (!AT_MKL_ENABLED() || defined(FBCODE_CAFFE2))
void exponential_kernel(TensorIteratorBase& iter, double lambda, std::optional<Generator> gen) {
  exponential_kernel_default(iter, lambda, gen);
}
#else
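// MKL-accelerated path: sample exponential variates with MKL VSL in parallel
// chunks. Degenerate rates (lambda <= 0, inf, or NaN) fall back to the
// generic kernel at the end of this function.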
void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional<Generator> gen) {
  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());

  Tensor self = iter.tensor(0);
  if (lambda > 0 && !std::isinf(lambda) && !std::isnan(lambda)) {
    CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
    int64_t seed;
    {
      // See Note [Acquire lock when using random generators]
      std::lock_guard<std::mutex> lock(generator->mutex_);
      if (self.scalar_type() == at::kDouble)
        seed = generator->random64();
      else
        seed = generator->random();
    }
    int64_t n = self.numel();
    bool contig = self.is_contiguous();

    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "exponential_cpu", [&] {
      at::Tensor tmp_tensor;
      constexpr bool is_df = std::is_same<scalar_t, float>::value || std::is_same<scalar_t, double>::value;
      if (is_df && contig) {
        tmp_tensor = self;
      } else if (std::is_same<scalar_t, double>::value) {
        tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kDouble));
      } else {
        tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kFloat));
      }

      scalar_t *self_ptr = self.data_ptr<scalar_t>();
      using tmp_scalar_t = typename std::conditional_t<std::is_same<scalar_t, double>::value, double, float>;
      tmp_scalar_t *sample_ptr = tmp_tensor.data_ptr<tmp_scalar_t>();

      // Intel MKL vRngExponential variate originally does not exclude 0.
      // However, to align with pytorch exponential variate definition which excludes 0,
      // we shift the MKL vRngExponential distribution location by adding a very small constant, eps.
      // If X ~ Exp(lambda), then E(X) = 1/lambda, and V(X) = 1/lambda**2.
      // If Y = X + eps, where eps ~= 0, then E(Y) = (1/lambda) + eps, and V(Y) = 1/lambda**2.
      // If eps is very small, the two distributions are indistinguishable, and are almost identical.
      // The detail of location-shifted MKL vRngExponential is as follows.
      // PDF:         f(x) = lambda * exp( -lambda * (x - eps) )
      // CDF:         F(x) = 1 - exp( -lambda * (x - eps) )
      // Mean:        E[X+eps] = (1/lambda) + eps
      // Variance:    V[X+eps] = 1/lambda**2
      auto eps = std::numeric_limits<tmp_scalar_t>::min();

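      // MKL's exponential RNG takes a displacement and a scale factor;
      // the calls below pass eps as the displacement and 1/lambda as the scale.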
      auto sample = [&](int64_t begin, int64_t end) {
        int64_t len = end - begin;
        if (len > 0) {
          VSLStreamStatePtr stream;
          if constexpr (std::is_same<scalar_t, double>::value) {
            vslNewStream(&stream, VSL_BRNG_MCG31, seed);
            vslSkipAheadStream(stream, begin);
            vdRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len,
              (double *)(sample_ptr + begin), eps, 1./lambda);
            vslDeleteStream(&stream);
          } else {
            vslNewStream(&stream, VSL_BRNG_MCG31, seed);
            vslSkipAheadStream(stream, begin);
            vsRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len,
              (float *) (sample_ptr + begin), eps, 1./lambda);
            vslDeleteStream(&stream);
          }
          // vectorized copy if using buffer and contiguous
          if (!is_df && contig) {
            scalar_t *self_seg = self_ptr + begin;
            tmp_scalar_t *tmp_seg = sample_ptr + begin;
            at::vec::convert<tmp_scalar_t, scalar_t>(tmp_seg, self_seg, len);
          }
        }
      };

      parallel_for(0, n, /* grain_size= */ 800, sample);

      // copy_ if using buffer and non contiguous
      if (!contig) {
        self.copy_(tmp_tensor);
      }
    });
  } else {
    // lambda is non-positive, infinite, or NaN: fall back to the default implementation
    exponential_kernel_default(iter, lambda, gen);
  }
}
#endif

static void geometric_kernel(TensorIteratorBase& iter, double p, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::geometric_kernel(iter, p, generator);
}

static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::log_normal_kernel(iter, mean, std, generator);
}

void uniform_kernel(TensorIteratorBase& iter, double from, double to, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::uniform_kernel(iter, from, to, generator);
}

void normal_kernel(const TensorBase &self, double mean, double std, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::normal_kernel(self, mean, std, generator);
}

static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::random_from_to_kernel(iter, range, base, generator);
}

static void random_kernel(TensorIteratorBase& iter, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::random_kernel(iter, generator);
}

// This is a special kernel that handles a single specific case:
// from (inclusive) = std::numeric_limits<int64_t>::lowest()
// to (exclusive) = None (= std::numeric_limits<int64_t>::max() + 1)
static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional<Generator> gen) {
  CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
  templates::cpu::random_full_64_bits_range_kernel(iter, generator);
}

} // namespace (anonymous)

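// Hook the CPU kernels above into the device-generic dispatch stubs.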
REGISTER_DISPATCH(bernoulli_tensor_stub, &bernoulli_tensor_kernel);
REGISTER_DISPATCH(bernoulli_scalar_stub, &bernoulli_scalar_kernel);
REGISTER_DISPATCH(cauchy_stub, &cauchy_kernel);
REGISTER_DISPATCH(exponential_stub, &exponential_kernel);
REGISTER_DISPATCH(geometric_stub, &geometric_kernel);
REGISTER_DISPATCH(log_normal_stub, &log_normal_kernel);
REGISTER_DISPATCH(normal_stub, &normal_kernel);
REGISTER_DISPATCH(uniform_stub, &uniform_kernel);
REGISTER_DISPATCH(random_from_to_stub, &random_from_to_kernel);
REGISTER_DISPATCH(random_full_64_bits_range_stub, &random_full_64_bits_range_kernel);
REGISTER_DISPATCH(random_stub, &random_kernel);

} // namespace at::native