#pragma once

#include <ATen/Config.h>
#include <ATen/core/List.h>
#include <ATen/core/Tensor.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>
#include <c10/util/strides.h>
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
#include <vector>

#if AT_MKLDNN_ENABLED()
#include <ideep/tensor.hpp>
#endif // AT_MKLDNN_ENABLED()

namespace at { namespace native {

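// LayerNorm specialization used when normalizing over the last dimension with
// fp32 weight and bias.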
std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
    const Tensor& input,
    IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
    double eps, bool inplace = false);

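// Computes the output spatial sizes of a pooling op from the input size,
// kernel size, stride, asymmetric (left/right) padding, dilation and ceil_mode.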
std::vector<int64_t> pool_output_sizes(
    IntArrayRef input_size,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding_l,
    IntArrayRef padding_r,
    IntArrayRef dilation,
    bool ceil_mode);

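// Checks dtype/device/layout constraints on the inputs of a fused mkldnn
// binary op (e.g. conv + binary) before dispatching to oneDNN.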
void check_mkldnn_binary_fusion_inputs(
    const Tensor& input,
    const Tensor& other,
    const Tensor& weight,
    const Tensor& bias);

inline std::vector<int64_t> padding_r(
    IntArrayRef padding, IntArrayRef output_padding)
{
  // ConvTranspose padding adjustment
  //
  // PyTorch uses padding/output_padding:
  //   osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
  //
  // MKLDNN uses padding_l/padding_r:
  //   osize = (isize - 1) * stride - padding_l - padding_r + dilation * (kernel_size - 1) + 1
  //
  // So: padding_l = padding, padding_r = padding - output_padding
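  //
  // For example, with isize = 4, stride = 2, kernel_size = 3, dilation = 1,
  // padding = 1 and output_padding = 1 (illustrative numbers):
  //   PyTorch: (4 - 1) * 2 - 2 * 1 + 1 * (3 - 1) + 1 + 1 = 8
  //   MKLDNN (padding_l = 1, padding_r = 0): (4 - 1) * 2 - 1 - 0 + 1 * (3 - 1) + 1 = 8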
  //
  auto dim = padding.size();
  std::vector<int64_t> pad_r(dim);
  for (const auto d : c10::irange(dim)) {
    pad_r[d] = padding[d] - output_padding[d];
  }
  return pad_r;
}

// Make sure the input has default contiguous strides when it is a contiguous
// tensor, for better performance. For example, a tensor of size = [1, 1280],
// stride = [0, 1] is converted to size = [1, 1280], stride = [1280, 1]
// before calling oneDNN.
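// Such strides typically come from broadcasting: e.g. expanding a 1-D tensor
// of size [1280] to size [1, 1280] yields stride = [0, 1], which is still
// reported as contiguous because size-1 dimensions are ignored by the
// contiguity check.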
inline Tensor may_convert_to_default_contiguous_strides(const Tensor& input) {
  auto input_size = input.sizes().vec();
  auto input_stride = input.strides().vec();
  auto input_default_contiguous_strides = c10::contiguous_strides(input_size);
  if (input.is_contiguous() && input_stride != c10::IntArrayRef(input_default_contiguous_strides)) {
    return input.as_strided(input_size, input_default_contiguous_strides);
  }
  return input;
}

#if AT_MKLDNN_ENABLED()

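// An AttrFunction builds an ideep post-op attribute from the optional scalar
// arguments and optional algorithm string supplied by a fused op. The maps
// below translate fusion op names (unary eltwise ops such as "relu", binary
// ops such as "add") into the corresponding ideep attributes/algorithms.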
using AttrFunction = std::function<ideep::attr_t(
    torch::List<std::optional<at::Scalar>>,
    std::optional<c10::string_view>)>;

const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map();

const std::map<c10::string_view, ideep::algorithm>& fusion_unary_alg_map();

const std::map<c10::string_view, ideep::algorithm>& fusion_binary_alg_map();

#endif // AT_MKLDNN_ENABLED()
} // namespace native

#if defined(__aarch64__)
inline bool mkldnn_bf16_device_check_arm() {
  return cpuinfo_initialize() && cpuinfo_has_arm_bf16();
}

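// True only when cpuinfo reports a single micro-architecture and it is one of
// the Neoverse V1/V2/N1/N2 cores; callers can use this to gate
// Neoverse-specific heuristics.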
inline bool is_arm_neoverse() {
  return (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 &&
          (cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1 ||
           cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v2 ||
           cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n1 ||
           cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n2));
}
#else
constexpr bool mkldnn_bf16_device_check_arm() {
  return false;
}

constexpr bool is_arm_neoverse() {
  return false;
}
#endif

#if AT_MKLDNN_ENABLED()
inline bool mkldnn_bf16_device_check() {
#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
  // Use ideep to check bf16 on X64 as cpuinfo has no avx_ne_convert check.
  return ideep::has_bf16_type_support();
#else
  return mkldnn_bf16_device_check_arm();
#endif
}

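// fp16 is only wired up through oneDNN on x86_64; on other architectures this
// returns false and callers fall back to a non-fp16 path.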
inline bool mkldnn_fp16_device_check() {
#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
  return ideep::has_fp16_type_support();
#else
  return false;
#endif
}

#else
inline bool mkldnn_bf16_device_check() {
  return false;
}
inline bool mkldnn_fp16_device_check() {
  return false;
}
#endif

inline void mkldnn_check_low_precision(ScalarType input_t, std::string name) {
  if (input_t == ScalarType::BFloat16) {
    TORCH_CHECK(
        mkldnn_bf16_device_check(),
        name,
        ": bf16 path needs the cpu to support avx_ne_convert, or avx512bw, avx512vl and avx512dq");
  } else if (input_t == ScalarType::Half) {
    TORCH_CHECK(
        mkldnn_fp16_device_check(),
        name,
        ": fp16 path needs the cpu to support avx_ne_convert or avx512_fp16");
  }
}
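
// Illustrative call site (operator name is hypothetical):
//   mkldnn_check_low_precision(input.scalar_type(), "mkldnn_convolution");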

} // namespace at