Lines Matching full:benchmark
18 #include <benchmark/benchmark.h>
38 static void GEMMBenchmark(benchmark::State& state, in GEMMBenchmark()
42 benchmark::utils::IsaCheckFunction isa_check = nullptr) in GEMMBenchmark()
52 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr); in GEMMBenchmark()
53 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr); in GEMMBenchmark()
69 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), in GEMMBenchmark()
89 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float)); in GEMMBenchmark()
104 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); in GEMMBenchmark()
109 state.counters["FLOPS"] = benchmark::Counter( in GEMMBenchmark()
110 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); in GEMMBenchmark()
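The GEMMBenchmark hits above outline the shared measurement harness: strides are rounded up to the micro-kernel tile (benchmark::utils::RoundUp), enough weight copies are allocated to overflow the last-level cache (DivideRoundUp over GetMaxCacheSize), the A matrix is prefetched into L1 before each kernel call, and CPU-frequency and FLOPS counters are attached to the State. The sketch below shows how those pieces typically fit together in a Google Benchmark function; the naive triple loop, the 8-wide tile, and the Args shape are illustrative stand-ins for the real micro-kernel and shape arguments, not code from this file.

#include <benchmark/benchmark.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

static void GemmHarnessSketch(benchmark::State& state) {
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // Round N and K up to the micro-kernel tile, as benchmark::utils::RoundUp
  // does in the real harness; nr = 8 and kr = 1 are illustrative tile sizes.
  const size_t nr = 8, kr = 1;
  const size_t nc_stride = (nc + nr - 1) / nr * nr;
  const size_t kc_stride = (kc + kr - 1) / kr * kr;

  std::mt19937 rng(42);
  std::uniform_real_distribution<float> dist;
  std::vector<float> a(mc * kc);
  std::vector<float> w(nc_stride * kc_stride + nc_stride);  // packed weights + bias
  std::vector<float> c(mc * nc);
  std::generate(a.begin(), a.end(), [&] { return dist(rng); });
  std::generate(w.begin(), w.end(), [&] { return dist(rng); });

  for (auto _ : state) {
    // The real harness additionally rotates through enough weight copies to
    // spill the last-level cache and calls PrefetchToL1 on A before each
    // iteration; a naive triple loop stands in for the micro-kernel here.
    for (size_t m = 0; m < mc; m++) {
      for (size_t n = 0; n < nc; n++) {
        float acc = 0.0f;
        for (size_t k = 0; k < kc; k++) {
          acc += a[m * kc + k] * w[n * kc_stride + k];
        }
        c[m * nc + n] = acc;
      }
    }
    benchmark::DoNotOptimize(c.data());
  }

  // One multiply and one add per multiply-accumulate: 2 * M * N * K FLOPs per
  // iteration, reported as a rate, matching the "FLOPS" counter above.
  state.counters["FLOPS"] = benchmark::Counter(
      uint64_t(state.iterations()) * 2 * mc * nc * kc,
      benchmark::Counter::kIsRate);
}
BENCHMARK(GemmHarnessSketch)->Args({256, 256, 256});
BENCHMARK_MAIN();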
113 static void PPMM1PBenchmark(benchmark::State& state, in PPMM1PBenchmark()
118 benchmark::utils::IsaCheckFunction isa_check = nullptr) in PPMM1PBenchmark()
128 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr); in PPMM1PBenchmark()
146 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), in PPMM1PBenchmark()
166 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float)); in PPMM1PBenchmark()
182 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); in PPMM1PBenchmark()
187 state.counters["FLOPS"] = benchmark::Counter( in PPMM1PBenchmark()
188 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); in PPMM1PBenchmark()
191 static void PPMM2PBenchmark(benchmark::State& state, in PPMM2PBenchmark()
196 benchmark::utils::IsaCheckFunction isa_check = nullptr) in PPMM2PBenchmark()
206 const size_t mc_stride = benchmark::utils::RoundUp(mc, mr); in PPMM2PBenchmark()
207 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr); in PPMM2PBenchmark()
225 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), in PPMM2PBenchmark()
245 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float)); in PPMM2PBenchmark()
264 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); in PPMM2PBenchmark()
269 state.counters["FLOPS"] = benchmark::Counter( in PPMM2PBenchmark()
270 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); in PPMM2PBenchmark()
274 static void RuyBenchmark(benchmark::State& state, uint32_t threads) in RuyBenchmark()
285 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), in RuyBenchmark()
297 // Note: context must be static to avoid the cost of re-creating it for each benchmark. in RuyBenchmark()
313 …// Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recordi… in RuyBenchmark()
335 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float)); in RuyBenchmark()
346 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); in RuyBenchmark()
351 state.counters["FLOPS"] = benchmark::Counter( in RuyBenchmark()
352 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); in RuyBenchmark()
355 static void ruy_st(benchmark::State& state, const char* net) in ruy_st()
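The RuyBenchmark hits keep the Ruy context in a function-local static (so its thread pool is built once, per the comment at line 297) and, per the truncated comment at line 313, spend roughly 500 milliseconds of untimed work before the first measured run so that one-time setup does not pollute the recorded iterations. Below is a generic sketch of that pre-warm pattern; RunGemmOnce() is a placeholder for the actual Ruy multiplication, not Ruy's API, and the sketch is meant to be linked against benchmark_main or paired with BENCHMARK_MAIN().

#include <benchmark/benchmark.h>

#include <chrono>

// Placeholder for the multiplication under test (the real code calls into Ruy).
static void RunGemmOnce() {
  benchmark::ClobberMemory();
}

static void RuyStyleBenchmark(benchmark::State& state) {
  // One-time warm-up shared by all registrations: run the workload untimed for
  // ~500 ms so thread pools and per-thread scratch buffers exist before the
  // first measured iteration.
  static bool warmed_up = false;
  if (!warmed_up) {
    const auto start = std::chrono::steady_clock::now();
    while (std::chrono::steady_clock::now() - start <
           std::chrono::milliseconds(500)) {
      RunGemmOnce();
    }
    warmed_up = true;
  }

  for (auto _ : state) {
    RunGemmOnce();  // only these iterations contribute to the reported time
  }
}
BENCHMARK(RuyStyleBenchmark);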
362 static void GEMMBenchmark(benchmark::State& state, in GEMMBenchmark()
366 benchmark::utils::IsaCheckFunction isa_check = nullptr) in GEMMBenchmark()
376 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr); in GEMMBenchmark()
377 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr); in GEMMBenchmark()
393 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), in GEMMBenchmark()
426 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float)); in GEMMBenchmark()
443 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); in GEMMBenchmark()
448 state.counters["FLOPS"] = benchmark::Counter( in GEMMBenchmark()
449 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); in GEMMBenchmark()
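Every harness above attaches the same pair of counters: the CPU frequency read via benchmark::utils::GetCurrentCpuFrequency(), and a "FLOPS" counter credited with 2 * mc * nc * kc operations per iteration (one multiply and one add per multiply-accumulate). As a worked example, at mc = nc = kc = 1024 each iteration counts 2 * 1024^3 ≈ 2.15 GFLOP, and the kIsRate flag makes Google Benchmark divide the accumulated total by the benchmark's elapsed time, so the reported value reads out as effective FLOP/s.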
454 static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) { in f32_gemm_1x8__aarch64_neonfma_ld64()
458 static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_1x12__aarch64_neonfma_cortex_a53()
462 static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_1x8__aarch64_neonfma_cortex_a53()
466 static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a53()

470 static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_1x8__aarch64_neonfma_cortex_a75()
474 static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75()
478 static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_4x12__aarch64_neonfma_cortex_a53()
482 static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_cortex_a53()
486 static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53()
490 static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_cortex_a55()
494 static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_cortex_a75()
498 static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75()
502 static void f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_4x2__aarch64_neonfma_cortex_a75()
506 static void f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75()
510 static void f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x2__aarch64_neonfma_ld64()
514 static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_ld64()
518 static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch64_neonfma_ld128()
522 static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_5x8__aarch64_neonfma_cortex_a75()
526 static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75()
530 static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_ld64()
534 static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_ld128()
538 static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_cortex_a53()
542 static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53()
546 static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_cortex_a55()
550 static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_cortex_a73()
554 static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_cortex_a75()
558 static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75()
562 static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_1x8__neonfma_lane_ld64()
566 static void f32_gemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x2__neonfma_lane_ld64()
570 static void f32_gemm_6x2__neonfma_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_6x2__neonfma_lane_ld64()
574 static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x8__neonfma_lane_ld64()
578 static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) { in f32_gemm_4x8__neonfma_lane_ld128()
582 static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_5x8__neonfma_lane_ld64()
586 static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_6x8__neonfma_lane_ld64()
590 static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) { in f32_gemm_6x8__neonfma_lane_ld128()
633 static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) { in BENCHMARK_GEMM()
635 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckVFP); in BENCHMARK_GEMM()
638 static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_ld64()
640 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_ld64()
642 static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_cortex_a7()
644 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_cortex_a7()
646 static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_cortex_a53()
648 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_cortex_a53()
650 static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_prfm_cortex_a53()
652 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_prfm_cortex_a53()
654 static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_cortex_a55()
656 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_cortex_a55()
658 static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_cortex_a75()
660 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_cortex_a75()
662 static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) { in f32_gemm_4x8__aarch32_neon_prfm_cortex_a75()
664 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__aarch32_neon_prfm_cortex_a75()
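Every wrapper in this file forwards an ISA predicate such as benchmark::utils::CheckVFP, CheckNEON, CheckNEONFMA, CheckAVX, CheckFMA3, or CheckAVX512F into the shared harness, so a binary built with every kernel can still run on a CPU that supports only some of them. A minimal sketch of that gate follows; the contract of the check (it flags the State as skipped and returns false when the extension is missing) is an assumption, since the corresponding lines are not in this listing.

#include <benchmark/benchmark.h>

// Assumed contract: the check reports a skip on the State and returns false
// when the required ISA extension is unavailable on the current CPU.
using IsaCheckFunction = bool (*)(benchmark::State&);

static void GemmWithIsaGate(benchmark::State& state,
                            IsaCheckFunction isa_check = nullptr) {
  if (isa_check != nullptr && !isa_check(state)) {
    return;  // shows up as a skipped benchmark instead of faulting on SIGILL
  }
  for (auto _ : state) {
    // ... run the kernel, as in the harness sketch above ...
  }
}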
678 static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) { in BENCHMARK_GEMM()
680 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in BENCHMARK_GEMM()
682 static void f32_gemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x2__neon_lane_ld64()
684 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x2__neon_lane_ld64()
686 static void f32_gemm_6x2__neon_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_6x2__neon_lane_ld64()
688 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_6x2__neon_lane_ld64()
690 static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x8__neon_lane_ld64()
692 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__neon_lane_ld64()
694 static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) { in f32_gemm_4x8__neon_lane_ld128()
696 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8__neon_lane_ld128()
698 static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_5x8__neon_lane_ld64()
700 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_5x8__neon_lane_ld64()
702 static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) { in f32_gemm_6x8__neon_lane_ld64()
704 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_6x8__neon_lane_ld64()
706 static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) { in f32_gemm_6x8__neon_lane_ld128()
708 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_6x8__neon_lane_ld128()
710 static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) { in f32_gemm_1x8__neonfma_dup_ld64()
712 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_1x8__neonfma_dup_ld64()
714 static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) { in f32_gemm_4x8__neonfma_dup_ld64()
716 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_4x8__neonfma_dup_ld64()
718 static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) { in f32_gemm_4x8__neonfma_dup_ld128()
720 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_4x8__neonfma_dup_ld128()
722 static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) { in f32_gemm_6x8__neonfma_dup_ld64()
724 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_6x8__neonfma_dup_ld64()
726 static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) { in f32_gemm_6x8__neonfma_dup_ld128()
728 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_6x8__neonfma_dup_ld128()
730 static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) { in f32_gemm_1x8s4__neon()
732 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_1x8s4__neon()
734 static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) { in f32_gemm_1x8s4__neonfma()
736 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_1x8s4__neonfma()
738 static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__neon()
740 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_4x8s4__neon()
742 static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__neonfma()
744 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_4x8s4__neonfma()
746 static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) { in f32_gemm_6x8s4__neon()
748 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_6x8s4__neon()
750 static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) { in f32_gemm_6x8s4__neonfma()
752 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_6x8s4__neonfma()
754 static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) { in f32_gemm_8x8s4__neon()
756 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in f32_gemm_8x8s4__neon()
758 static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) { in f32_gemm_8x8s4__neonfma()
760 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_gemm_8x8s4__neonfma()
762 static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) { in f32_ppmm_4x8_unipass__neonfma()
764 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_ppmm_4x8_unipass__neonfma()
766 static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) { in f32_ppmm_4x8_twopass__neonfma()
768 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA); in f32_ppmm_4x8_twopass__neonfma()
802 static void jit_f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) in BENCHMARK_GEMM()
805 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in BENCHMARK_GEMM()
807 static void jit_f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch32_neon_cortex_a7()
810 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch32_neon_cortex_a7()
812 static void jit_f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch32_neon_cortex_a53()
815 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch32_neon_cortex_a53()
817 static void jit_f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch32_neon_cortex_a55()
820 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch32_neon_cortex_a55()
822 static void jit_f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch32_neon_cortex_a75()
825 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch32_neon_cortex_a75()
827 static void jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75()
830 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75()
842 static void jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) in BENCHMARK_GEMM()
845 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in BENCHMARK_GEMM()
847 static void jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) in jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75()
850 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75()
852 static void jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75()
855 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75()
857 static void jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) in jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75()
860 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75()
862 static void jit_f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) in jit_f32_gemm_6x8__aarch64_neonfma_ld128()
865 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON); in jit_f32_gemm_6x8__aarch64_neonfma_ld128()
874 static void name(benchmark::State &state, const char *net) { \
879 benchmark::utils::CheckNEON); \
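Each kernel gets a thin wrapper, spelled out by hand or, as in the macro at lines 874-879, stamped out by a helper macro, that simply forwards the kernel pointer, tile sizes, parameter-init function, and ISA check into the shared GEMMBenchmark, and is then registered per network via BENCHMARK_GEMM. The fragment below sketches that pattern using Google Benchmark's stock BENCHMARK_CAPTURE; the kernel symbol, the mr/nr/kr/sr values, and ShapeArgsExample are made-up placeholders, the middle GEMMBenchmark arguments are not visible in this listing (their order here is an assumption), and the real BENCHMARK_GEMM expansion is not shown.

// Illustrative wrapper: forwards one micro-kernel (hypothetical name) plus its
// tile sizes and ISA check into the shared GEMMBenchmark harness above.
static void f32_gemm_example(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_example,
                /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
                xnn_init_f32_minmax_scalar_params,
                benchmark::utils::CheckNEON);
}

// Registration: one entry per network, with shapes supplied by an Apply()
// callback (ShapeArgsExample stands in for the real shape generators).
BENCHMARK_CAPTURE(f32_gemm_example, mobilenet, "mobilenet")
    ->Apply(ShapeArgsExample)
    ->UseRealTime();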
893 static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { in f32_gemm_1x16__avx512f_broadcast()
895 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F); in f32_gemm_1x16__avx512f_broadcast()
897 static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) { in f32_gemm_4x16__avx512f_broadcast()
899 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F); in f32_gemm_4x16__avx512f_broadcast()
901 static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) { in f32_gemm_5x16__avx512f_broadcast()
903 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F); in f32_gemm_5x16__avx512f_broadcast()
905 static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) { in f32_gemm_6x16__avx512f_broadcast()
907 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F); in f32_gemm_6x16__avx512f_broadcast()
909 static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) { in f32_gemm_7x16__avx512f_broadcast()
911 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F); in f32_gemm_7x16__avx512f_broadcast()
913 static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) { in f32_gemm_8x16__avx512f_broadcast()
915 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F); in f32_gemm_8x16__avx512f_broadcast()
918 static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_1x8__fma3_broadcast()
920 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_1x8__fma3_broadcast()
922 static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_4x8__fma3_broadcast()
924 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_4x8__fma3_broadcast()
926 static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_5x8__fma3_broadcast()
928 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_5x8__fma3_broadcast()
930 static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_6x8__fma3_broadcast()
932 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_6x8__fma3_broadcast()
934 static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_7x8__fma3_broadcast()
936 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_7x8__fma3_broadcast()
938 static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_8x8__fma3_broadcast()
940 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_8x8__fma3_broadcast()
942 static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_1x16__fma3_broadcast()
944 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_1x16__fma3_broadcast()
946 static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_3x16__fma3_broadcast()
948 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_3x16__fma3_broadcast()
950 static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_4x16__fma3_broadcast()
952 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_4x16__fma3_broadcast()
954 static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_5x16__fma3_broadcast()
956 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_5x16__fma3_broadcast()
959 static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_1x16s4__fma3_broadcast()
961 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_1x16s4__fma3_broadcast()
963 static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_3x16s4__fma3_broadcast()
965 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_3x16s4__fma3_broadcast()
967 static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_4x16s4__fma3_broadcast()
969 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_4x16s4__fma3_broadcast()
971 static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) { in f32_gemm_5x16s4__fma3_broadcast()
973 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3); in f32_gemm_5x16s4__fma3_broadcast()
976 static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_1x8__avx_broadcast()
978 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_1x8__avx_broadcast()
980 static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_4x8__avx_broadcast()
982 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_4x8__avx_broadcast()
984 static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_5x8__avx_broadcast()
986 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_5x8__avx_broadcast()
988 static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_6x8__avx_broadcast()
990 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_6x8__avx_broadcast()
992 static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_7x8__avx_broadcast()
994 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_7x8__avx_broadcast()
996 static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_1x16__avx_broadcast()
998 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_1x16__avx_broadcast()
1000 static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_3x16__avx_broadcast()
1002 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_3x16__avx_broadcast()
1004 static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_4x16__avx_broadcast()
1006 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_4x16__avx_broadcast()
1008 static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) { in f32_gemm_5x16__avx_broadcast()
1010 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX); in f32_gemm_5x16__avx_broadcast()
1013 static void f32_gemm_1x8__sse2_dup(benchmark::State& state, const char* net) { in f32_gemm_1x8__sse2_dup()
1017 static void f32_gemm_3x8__sse2_dup(benchmark::State& state, const char* net) { in f32_gemm_3x8__sse2_dup()
1021 static void f32_gemm_4x8__sse2_dup(benchmark::State& state, const char* net) { in f32_gemm_4x8__sse2_dup()
1025 static void f32_gemm_5x8__sse2_dup(benchmark::State& state, const char* net) { in f32_gemm_5x8__sse2_dup()
1030 static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) { in f32_gemm_1x8__sse_load1()
1034 static void f32_gemm_3x8__sse_load1(benchmark::State& state, const char* net) { in f32_gemm_3x8__sse_load1()
1038 static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) { in f32_gemm_4x8__sse_load1()
1042 static void f32_gemm_5x8__sse_load1(benchmark::State& state, const char* net) { in f32_gemm_5x8__sse_load1()
1047 static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) { in f32_gemm_1x8__sse_dup()
1051 static void f32_gemm_3x8__sse_dup(benchmark::State& state, const char* net) { in f32_gemm_3x8__sse_dup()
1055 static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) { in f32_gemm_4x8__sse_dup()
1059 static void f32_gemm_5x8__sse_dup(benchmark::State& state, const char* net) { in f32_gemm_5x8__sse_dup()
1064 static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) { in f32_gemm_1x8s4__sse()
1068 static void f32_gemm_3x8s4__sse(benchmark::State& state, const char* net) { in f32_gemm_3x8s4__sse()
1072 static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__sse()
1076 static void f32_gemm_5x8s4__sse(benchmark::State& state, const char* net) { in f32_gemm_5x8s4__sse()
1081 static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) { in f32_ppmm_4x8_unipass__sse()
1085 static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) { in f32_ppmm_4x8_twopass__sse()
1149 static void f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) { in BENCHMARK_GEMM()
1153 static void f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmrelaxedsimd_loadsplat()
1157 static void f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmrelaxedsimd_loadsplat()
1161 static void f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmrelaxedsimd_loadsplat()
1165 static void f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat()
1169 static void f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat()
1173 static void f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat()
1177 static void f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat()
1181 static void f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) { in f32_gemm_3x8__wasmrelaxedsimd_splat()
1185 static void f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmrelaxedsimd_splat()
1189 static void f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmrelaxedsimd_splat()
1193 static void f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmrelaxedsimd_splat()
1197 static void f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) { in f32_gemm_3x8__wasmrelaxedsimd_fma_splat()
1201 static void f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmrelaxedsimd_fma_splat()
1205 static void f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmrelaxedsimd_fma_splat()
1209 static void f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmrelaxedsimd_fma_splat()
1213 static void f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) { in f32_gemm_3x8s4__wasmrelaxedsimd()
1217 static void f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__wasmrelaxedsimd()
1221 static void f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) { in f32_gemm_5x8s4__wasmrelaxedsimd()
1225 static void f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) { in f32_gemm_6x8s4__wasmrelaxedsimd()
1229 static void f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) { in f32_gemm_3x8s4__wasmrelaxedsimd_fma()
1233 static void f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__wasmrelaxedsimd_fma()
1237 static void f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) { in f32_gemm_5x8s4__wasmrelaxedsimd_fma()
1241 static void f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) { in f32_gemm_6x8s4__wasmrelaxedsimd_fma()
1279 static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) { in BENCHMARK_GEMM()
1283 static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmsimd_arm_loadsplat()
1287 static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmsimd_arm_loadsplat()
1291 static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmsimd_arm_loadsplat()
1295 static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_3x8__wasmsimd_x86_loadsplat()
1299 static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmsimd_x86_loadsplat()
1303 static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmsimd_x86_loadsplat()
1307 static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmsimd_x86_loadsplat()
1311 static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) { in f32_gemm_3x8__wasmsimd_arm_splat()
1315 static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmsimd_arm_splat()
1319 static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmsimd_arm_splat()
1323 static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmsimd_arm_splat()
1327 static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) { in f32_gemm_3x8__wasmsimd_x86_splat()
1331 static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) { in f32_gemm_4x8__wasmsimd_x86_splat()
1335 static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) { in f32_gemm_5x8__wasmsimd_x86_splat()
1339 static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) { in f32_gemm_6x8__wasmsimd_x86_splat()
1343 static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) { in f32_gemm_3x8s4__wasmsimd_arm()
1347 static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__wasmsimd_arm()
1351 static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) { in f32_gemm_5x8s4__wasmsimd_arm()
1355 static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) { in f32_gemm_6x8s4__wasmsimd_arm()
1359 static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) { in f32_gemm_3x8s4__wasmsimd_x86()
1363 static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) { in f32_gemm_4x8s4__wasmsimd_x86()
1367 static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) { in f32_gemm_5x8s4__wasmsimd_x86()
1371 static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) { in f32_gemm_6x8s4__wasmsimd_x86()
1376 static void f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State& state, const char* net) { in f32_ppmm_4x8_unipass__wasmsimd_arm_splat()
1380 static void f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State& state, const char* net) { in f32_ppmm_4x8_unipass__wasmsimd_x86_splat()
1385 static void f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State& state, const char* net) { in f32_ppmm_4x8_twopass__wasmsimd_arm_splat()
1389 static void f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State& state, const char* net) { in f32_ppmm_4x8_twopass__wasmsimd_x86_splat()
1432 static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) { in BENCHMARK_GEMM()
1436 static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) { in f32_gemm_2x4__scalar()
1440 static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) { in f32_gemm_4x4__scalar()
1445 static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_2x4_unipass__scalar()
1449 static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_4x2_unipass__scalar()
1453 static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_4x4_unipass__scalar()
1457 static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_3x3_unipass__scalar()
1462 static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_2x4_twopass__scalar()
1466 static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_4x2_twopass__scalar()
1470 static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_4x4_twopass__scalar()
1474 static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) { in f32_ppmm_3x3_twopass__scalar()
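All of these registrations go through Google Benchmark, so a built benchmark binary can select any subset at run time with the standard --benchmark_filter flag, for example --benchmark_filter=f32_gemm_6x8__aarch64_neonfma_cortex_a75 to time a single micro-kernel across whatever shapes it was registered with; wrappers whose ISA check fails on the host machine return without timing anything, as in the gate sketch above.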