// !!!! PLEASE READ !!!!
// Minimize (transitively) included headers from _avx*.cc because some of the
// functions defined in the headers compiled with platform dependent compiler
// options can be reused by other translation units, generating illegal
// instruction run-time errors.

// Common utilities for writing performance kernels and easy dispatching of
// different backends.
/*
The general workflow shall be as follows, say we want to
implement a functionality called void foo(int a, float b).

In foo.h, do:
   void foo(int a, float b);

In foo_avx512.cc, do:
   void foo__avx512(int a, float b) {
     [actual avx512 implementation]
   }

In foo_avx2.cc, do:
   void foo__avx2(int a, float b) {
     [actual avx2 implementation]
   }

In foo_avx.cc, do:
   void foo__avx(int a, float b) {
     [actual avx implementation]
   }

In foo.cc, do:
   // The base implementation should *always* be provided.
   void foo__base(int a, float b) {
     [base, possibly slow implementation]
   }
   decltype(foo__base) foo__avx512;
   decltype(foo__base) foo__avx2;
   decltype(foo__base) foo__avx;
   void foo(int a, float b) {
     // You should always order things by their preference, faster
     // implementations earlier in the function.
     AVX512_DO(foo, a, b);
     AVX2_DO(foo, a, b);
     AVX_DO(foo, a, b);
     BASE_DO(foo, a, b);
   }

*/
// Details: this functionality basically covers the cases for both build time
// and run time architecture support.
//
// During build time:
//    The build system should provide flags CAFFE2_PERF_WITH_AVX512,
//    CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX that correspond to the
//    __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
//    compiler provides.
//    Note that we do not use the compiler flags but rely on
//    the build system flags, because the common files (like foo.cc above) will
//    always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__
//    and __AVX__.
// During run time:
//    we use cpuinfo to identify cpu support and run the proper functions.

#pragma once

#if defined(CAFFE2_PERF_WITH_AVX512) || defined(CAFFE2_PERF_WITH_AVX2) \
    || defined(CAFFE2_PERF_WITH_AVX)
#include <cpuinfo.h>
#endif

// DO macros: these should be used in your entry function, similar to foo()
// above, that routes implementations based on CPU capability.
//
// Each <ISA>_DO(funcname, args...) macro expands to a braced block that
// returns funcname##__<suffix>(args...) from the enclosing function when its
// run-time check passes, and otherwise falls through to the next statement.
// When the matching CAFFE2_PERF_WITH_* flag is not defined the macro expands
// to nothing, so the specialised kernel symbol is never referenced.
//
//   Macro        Build flag                Kernel suffix   cpuinfo checks
//   AVX512_DO    CAFFE2_PERF_WITH_AVX512   __avx512        avx512f, avx512dq,
//                                                          avx512vl
//   AVX2_DO      CAFFE2_PERF_WITH_AVX2     __avx2          avx2
//   AVX2_FMA_DO  CAFFE2_PERF_WITH_AVX2     __avx2_fma      avx2, fma3
//   AVX_DO       CAFFE2_PERF_WITH_AVX      __avx           avx
//   AVX_F16C_DO  CAFFE2_PERF_WITH_AVX      __avx_f16c      avx, f16c
//
// Every check is additionally and-ed with cpuinfo_initialize(), which is
// evaluated first.
//
// The outcome of the check is kept in a block-scope `static const bool`, so
// the cpuinfo queries are evaluated once per expansion site instead of on
// every call. That local is named caffe2_perfkernel_dispatch_ok_ on purpose:
// with a short name, a caller argument spelled the same way would be captured
// by the macro's own variable instead of referring to the caller's.

// Unconditionally returns funcname##__base(args...). The expansion already
// ends with a semicolon, so writing `BASE_DO(foo, a, b);` merely leaves an
// empty statement behind.
#define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__);

#ifdef CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)                                   \
  {                                                                \
    static const bool caffe2_perfkernel_dispatch_ok_ =             \
        cpuinfo_initialize() && cpuinfo_has_x86_avx512f() &&       \
        cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl();  \
    if (caffe2_perfkernel_dispatch_ok_) {                          \
      return funcname##__avx512(__VA_ARGS__);                      \
    }                                                              \
  }
#else // CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX512

#ifdef CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)                                     \
  {                                                                \
    static const bool caffe2_perfkernel_dispatch_ok_ =             \
        cpuinfo_initialize() && cpuinfo_has_x86_avx2();            \
    if (caffe2_perfkernel_dispatch_ok_) {                          \
      return funcname##__avx2(__VA_ARGS__);                        \
    }                                                              \
  }
#define AVX2_FMA_DO(funcname, ...)                                 \
  {                                                                \
    static const bool caffe2_perfkernel_dispatch_ok_ =             \
        cpuinfo_initialize() && cpuinfo_has_x86_avx2() &&          \
        cpuinfo_has_x86_fma3();                                    \
    if (caffe2_perfkernel_dispatch_ok_) {                          \
      return funcname##__avx2_fma(__VA_ARGS__);                    \
    }                                                              \
  }
#else // CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)
#define AVX2_FMA_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX2

#ifdef CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)                                      \
  {                                                                \
    static const bool caffe2_perfkernel_dispatch_ok_ =             \
        cpuinfo_initialize() && cpuinfo_has_x86_avx();             \
    if (caffe2_perfkernel_dispatch_ok_) {                          \
      return funcname##__avx(__VA_ARGS__);                         \
    }                                                              \
  }
#define AVX_F16C_DO(funcname, ...)                                 \
  {                                                                \
    static const bool caffe2_perfkernel_dispatch_ok_ =             \
        cpuinfo_initialize() && cpuinfo_has_x86_avx() &&           \
        cpuinfo_has_x86_f16c();                                    \
    if (caffe2_perfkernel_dispatch_ok_) {                          \
      return funcname##__avx_f16c(__VA_ARGS__);                    \
    }                                                              \
  }
#else // CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)
#define AVX_F16C_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX