// aten/src/ATen/native/mkldnn/Utils.h
#pragma once

#include <ATen/Config.h>
#include <ATen/core/List.h>
#include <ATen/core/Tensor.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>
#include <c10/util/strides.h>
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
#include <functional>
#include <map>
#include <optional>
#include <vector>

#if AT_MKLDNN_ENABLED()
#include <ideep/tensor.hpp>
#endif // AT_MKLDNN_ENABLED()

namespace at { namespace native {

std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
    const Tensor& input,
    IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
    double eps, bool inplace = false);

std::vector<int64_t> pool_output_sizes(
    IntArrayRef input_size,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding_l,
    IntArrayRef padding_r,
    IntArrayRef dilation,
    bool ceil_mode);

void check_mkldnn_binary_fusion_inputs(
    const Tensor& input,
    const Tensor& other,
    const Tensor& weight,
    const Tensor& bias);

inline std::vector<int64_t> padding_r(
    IntArrayRef padding, IntArrayRef output_padding)
{
  // ConvTranspose padding adjustment
  //
  // PyTorch uses padding/output_padding:
  //   osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
  //
  // MKLDNN uses padding_l/padding_r:
  //   osize = (isize - 1) * stride - padding_l - padding_r + dilation * (kernel_size - 1) + 1
  //
  // So: padding_l = padding, padding_r = padding - output_padding
  //
  auto dim = padding.size();
  std::vector<int64_t> pad_r(dim);
  for (const auto d : c10::irange(dim)) {
    pad_r[d] = padding[d] - output_padding[d];
  }
  return pad_r;
}
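
// Illustrative example of the conversion above (values chosen arbitrarily):
// with stride = 2, kernel_size = 3, dilation = 1, padding = {1} and
// output_padding = {1}, PyTorch gives
//   osize = (isize - 1) * 2 - 2 + 2 + 1 + 1 = 2 * isize,
// while oneDNN with padding_l = {1} and padding_r = padding - output_padding = {0} gives
//   osize = (isize - 1) * 2 - 1 - 0 + 2 + 1 = 2 * isize,
// i.e. both formulas produce the same output size.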

// Make sure the input has default contiguous strides if it is a contiguous tensor,
// for better oneDNN performance. For example, a tensor of size = [1, 1280] with
// stride = [0, 1] still reports is_contiguous() == true (strides of size-1
// dimensions are ignored by the contiguity check), so we convert it to
// size = [1, 1280], stride = [1280, 1] before calling oneDNN.
inline Tensor may_convert_to_default_contiguous_strides(const Tensor& input) {
  auto input_size = input.sizes().vec();
  auto input_stride = input.strides().vec();
  auto input_default_contiguous_strides = c10::contiguous_strides(input_size);
  if (input.is_contiguous() && input_stride != c10::IntArrayRef(input_default_contiguous_strides)) {
    return input.as_strided(input_size, input_default_contiguous_strides);
  }
  return input;
}
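
// A typical (hypothetical) call site would normalize the strides before handing
// the tensor to oneDNN, e.g.
//   Tensor x = may_convert_to_default_contiguous_strides(input);
// and then build the oneDNN/ideep descriptor from `x` instead of `input`.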

#if AT_MKLDNN_ENABLED()

using AttrFunction = std::function<ideep::attr_t(
    torch::List<std::optional<at::Scalar>>,
    std::optional<c10::string_view>)>;

const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map();

const std::map<c10::string_view, ideep::algorithm>& fusion_unary_alg_map();

const std::map<c10::string_view, ideep::algorithm>& fusion_binary_alg_map();
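
// Sketch of how these maps are typically consumed (attr_name, scalars and
// algorithm are placeholder caller-provided values; the concrete entries are
// registered in the corresponding .cpp):
//   auto it = fusion_unary_attr_map().find(attr_name);
//   if (it != fusion_unary_attr_map().end()) {
//     ideep::attr_t op_attr = it->second(scalars, algorithm);
//   }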

#endif // AT_MKLDNN_ENABLED()
} // namespace native

#if defined(__aarch64__)
inline bool mkldnn_bf16_device_check_arm() {
  return cpuinfo_initialize() && cpuinfo_has_arm_bf16();
}

inline bool is_arm_neoverse() {
  return (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 &&
          (cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1 ||
           cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v2 ||
           cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n1 ||
           cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n2));
}
#else
constexpr bool mkldnn_bf16_device_check_arm() {
  return false;
}

constexpr bool is_arm_neoverse() {
  return false;
}
#endif

#if AT_MKLDNN_ENABLED()
inline bool mkldnn_bf16_device_check() {
#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
  // Use ideep to check bf16 on X64 as cpuinfo has no avx_ne_convert check.
  return ideep::has_bf16_type_support();
#else
  return mkldnn_bf16_device_check_arm();
#endif
}

inline bool mkldnn_fp16_device_check() {
#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
  return ideep::has_fp16_type_support();
#else
  return false;
#endif
}

#else
inline bool mkldnn_bf16_device_check() {
  return false;
}
inline bool mkldnn_fp16_device_check() {
  return false;
}
#endif
inline void mkldnn_check_low_precision(ScalarType input_t, std::string name) {
  if (input_t == ScalarType::BFloat16) {
    TORCH_CHECK(
        mkldnn_bf16_device_check(),
        name,
        ": bf16 path needs the cpu to support avx_ne_convert, or avx512bw, avx512vl and avx512dq");
  } else if (input_t == ScalarType::Half) {
    TORCH_CHECK(
        mkldnn_fp16_device_check(),
        name,
        ": fp16 path needs the cpu to support avx_ne_convert or avx512_fp16");
  }
}
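
// Typical (illustrative) usage from an op implementation; the op name string is
// only an example:
//   mkldnn_check_low_precision(input.scalar_type(), "mkldnn_convolution");
// The check is a no-op for dtypes other than BFloat16 and Half.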

} // namespace at