xref: /aosp_15_r20/external/pytorch/caffe2/perfkernels/common.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 // !!!! PLEASE READ !!!!
2 // Minimize (transitively) included headers from _avx*.cc because some of the
3 // functions defined in those headers get compiled with platform-dependent
4 // compiler options and can be reused by other translation units, causing
5 // illegal-instruction run-time errors.
6 
7 // Common utilities for writing performance kernels and easy dispatching of
8 // different backends.
9 /*
10 The general workflow shall be as follows, say we want to
11 implement a functionality called void foo(int a, float b).
12 
13 In foo.h, do:
14    void foo(int a, float b);
15 
16 In foo_avx512.cc, do:
17    void foo__avx512(int a, float b) {
18      [actual avx512 implementation]
19    }
20 
21 In foo_avx2.cc, do:
22    void foo__avx2(int a, float b) {
23      [actual avx2 implementation]
24    }
25 
26 In foo_avx.cc, do:
27    void foo__avx(int a, float b) {
28      [actual avx implementation]
29    }
30 
31 In foo.cc, do:
32    // The base implementation should *always* be provided.
33    void foo__base(int a, float b) {
34      [base, possibly slow implementation]
35    }
36    decltype(foo__base) foo__avx512;
37    decltype(foo__base) foo__avx2;
38    decltype(foo__base) foo__avx;
39    void foo(int a, float b) {
40      // You should always order things by their preference, faster
41      // implementations earlier in the function.
42      AVX512_DO(foo, a, b);
43      AVX2_DO(foo, a, b);
44      AVX_DO(foo, a, b);
45      BASE_DO(foo, a, b);
46    }
47 
48 */
49 // Details: this functionality basically covers the cases for both build time
50 // and run time architecture support.
51 //
52 // During build time:
53 //    The build system should provide flags CAFFE2_PERF_WITH_AVX512,
54 //    CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX that correspond to the
55 //    __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
56 //    compiler provides. Note that we do not use the compiler flags but rely on
57 //    the build system flags, because the common files (like foo.cc above) will
58 //    always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__
59 //    and __AVX__.
60 // During run time:
61 //    We use cpuinfo to identify CPU support and run the proper functions.
62 
63 #pragma once
64 
65 #if defined(CAFFE2_PERF_WITH_AVX512) || defined(CAFFE2_PERF_WITH_AVX2) \
66      || defined(CAFFE2_PERF_WITH_AVX)
67 #include <cpuinfo.h>
68 #endif
69 
70 // DO macros: these should be used in your entry function, similar to foo()
71 // above, that routes implementations based on CPU capability.
72 
// BASE_DO(funcname, args...): unconditionally returns funcname__base(args...).
// The expansion already ends with a semicolon. Use it as the last statement of
// the entry function so it serves as the fallback once every faster backend
// macro has declined.
#define BASE_DO(funcname, ...) \
  return funcname##__base(__VA_ARGS__);
74 
// AVX512_DO(funcname, args...)
//   Without CAFFE2_PERF_WITH_AVX512 the macro expands to nothing.
//   With it, the macro expands to a braced block that returns
//   funcname__avx512(args...) when cpuinfo_initialize() succeeds and cpuinfo
//   reports AVX512F, AVX512DQ and AVX512VL; otherwise control falls through
//   to the statement that follows. The result of the probe is kept in a
//   block-scope `static const bool`, so it is computed only once per
//   expansion site.
#if !defined(CAFFE2_PERF_WITH_AVX512)
#define AVX512_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)                                    \
  {                                                                 \
    static const bool caffe2PerfUseAvx512 =                         \
        cpuinfo_initialize() && cpuinfo_has_x86_avx512f() &&        \
        cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl();   \
    if (caffe2PerfUseAvx512) {                                      \
      return funcname##__avx512(__VA_ARGS__);                       \
    }                                                               \
  }
#endif // CAFFE2_PERF_WITH_AVX512
88 
// AVX2_DO(funcname, args...) and AVX2_FMA_DO(funcname, args...)
//   Without CAFFE2_PERF_WITH_AVX2 both macros expand to nothing.
//   With it, each expands to a braced block:
//     AVX2_DO      returns funcname__avx2(args...) when cpuinfo_initialize()
//                  succeeds and cpuinfo reports AVX2;
//     AVX2_FMA_DO  returns funcname__avx2_fma(args...) when, in addition,
//                  cpuinfo reports FMA3.
//   If the check fails, control falls through to the following statement.
//   Each probe result lives in a block-scope `static const bool`, so it is
//   computed only once per expansion site.
#if !defined(CAFFE2_PERF_WITH_AVX2)
#define AVX2_DO(funcname, ...)
#define AVX2_FMA_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)                                      \
  {                                                                 \
    static const bool caffe2PerfUseAvx2 =                           \
        cpuinfo_initialize() && cpuinfo_has_x86_avx2();             \
    if (caffe2PerfUseAvx2) {                                        \
      return funcname##__avx2(__VA_ARGS__);                         \
    }                                                               \
  }
#define AVX2_FMA_DO(funcname, ...)                                  \
  {                                                                 \
    static const bool caffe2PerfUseAvx2Fma =                        \
        cpuinfo_initialize() && cpuinfo_has_x86_avx2() &&           \
        cpuinfo_has_x86_fma3();                                     \
    if (caffe2PerfUseAvx2Fma) {                                     \
      return funcname##__avx2_fma(__VA_ARGS__);                     \
    }                                                               \
  }
#endif // CAFFE2_PERF_WITH_AVX2
109 
// AVX_DO(funcname, args...) and AVX_F16C_DO(funcname, args...)
//   Without CAFFE2_PERF_WITH_AVX both macros expand to nothing.
//   With it, each expands to a braced block:
//     AVX_DO       returns funcname__avx(args...) when cpuinfo_initialize()
//                  succeeds and cpuinfo reports AVX;
//     AVX_F16C_DO  returns funcname__avx_f16c(args...) when, in addition,
//                  cpuinfo reports F16C.
//   If the check fails, control falls through to the following statement.
//   Each probe result lives in a block-scope `static const bool`, so it is
//   computed only once per expansion site.
#if !defined(CAFFE2_PERF_WITH_AVX)
#define AVX_DO(funcname, ...)
#define AVX_F16C_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)                                       \
  {                                                                 \
    static const bool caffe2PerfUseAvx =                            \
        cpuinfo_initialize() && cpuinfo_has_x86_avx();              \
    if (caffe2PerfUseAvx) {                                         \
      return funcname##__avx(__VA_ARGS__);                          \
    }                                                               \
  }
#define AVX_F16C_DO(funcname, ...)                                  \
  {                                                                 \
    static const bool caffe2PerfUseAvxF16c =                        \
        cpuinfo_initialize() && cpuinfo_has_x86_avx() &&            \
        cpuinfo_has_x86_f16c();                                     \
    if (caffe2PerfUseAvxF16c) {                                     \
      return funcname##__avx_f16c(__VA_ARGS__);                     \
    }                                                               \
  }
#endif // CAFFE2_PERF_WITH_AVX
130