1 /* Copyright 2019 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "ruy/block_map.h"
17
18 #include <cstddef>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <limits>
22 #include <vector>
23
24 #include "ruy/cpu_cache_params.h"
25 #include "ruy/gtest_wrapper.h"
26 #include "ruy/platform.h"
27 #include "ruy/side_pair.h"
28
29 namespace ruy {
30 namespace {
31
32 #if RUY_PLATFORM_NEON_64
33
34 // Unless otherwise specified, these tests have been tuned on ARM Cortex-A55.
MakeBlockMapTuningTest(int rows,int cols,int depth,int kernel_rows,int kernel_cols,int lhs_scalar_size,int rhs_scalar_size,int tentative_thread_count,int expected_num_blocks_base_log2,int expected_rectangularness_log2)35 void MakeBlockMapTuningTest(int rows, int cols, int depth, int kernel_rows,
36 int kernel_cols, int lhs_scalar_size,
37 int rhs_scalar_size, int tentative_thread_count,
38 int expected_num_blocks_base_log2,
39 int expected_rectangularness_log2) {
40 // Plausible Cortex-A55 cache sizes.
41 CpuCacheParams cpu_cache_params;
42 cpu_cache_params.local_cache_size = 128 * 1024;
43 cpu_cache_params.last_level_cache_size = 1024 * 1024;
44 BlockMap block_map;
45 MakeBlockMap(rows, cols, depth, kernel_rows, kernel_cols, lhs_scalar_size,
46 rhs_scalar_size, tentative_thread_count, cpu_cache_params,
47 &block_map);
48 EXPECT_EQ(block_map.num_blocks_base_log2, expected_num_blocks_base_log2);
49 EXPECT_EQ(std::min(block_map.rectangularness_log2[Side::kLhs],
50 block_map.rectangularness_log2[Side::kRhs]),
51 0);
52 EXPECT_EQ(std::max(block_map.rectangularness_log2[Side::kLhs],
53 block_map.rectangularness_log2[Side::kRhs]),
54 expected_rectangularness_log2);
55 }
56
TEST(BlockMapTest,MakeBlockMapTuningTest8bitCubicShapesOneThreadNeonDotprod)57 TEST(BlockMapTest, MakeBlockMapTuningTest8bitCubicShapesOneThreadNeonDotprod) {
58 MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 1,
59 /* expected_num_blocks_base_log2 */ 0,
60 /* expected_rectangularness_log2 */ 0);
61 MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 1,
62 /* expected_num_blocks_base_log2 */ 0,
63 /* expected_rectangularness_log2 */ 0);
64 MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 1,
65 /* expected_num_blocks_base_log2 */ 0,
66 /* expected_rectangularness_log2 */ 0);
67 MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 1,
68 /* expected_num_blocks_base_log2 */ 0,
69 /* expected_rectangularness_log2 */ 0);
70 MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
71 /* tentative_thread_count */ 1,
72 /* expected_num_blocks_base_log2 */ 0,
73 /* expected_rectangularness_log2 */ 0);
74 MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
75 /* tentative_thread_count */ 1,
76 /* expected_num_blocks_base_log2 */ 0,
77 /* expected_rectangularness_log2 */ 0);
78 MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
79 /* tentative_thread_count */ 1,
80 /* expected_num_blocks_base_log2 */ 1,
81 /* expected_rectangularness_log2 */ 0);
82 MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
83 /* tentative_thread_count */ 1,
84 /* expected_num_blocks_base_log2 */ 1,
85 /* expected_rectangularness_log2 */ 0);
86 }
87
TEST(BlockMapTest,MakeBlockMapTuningTest8bitCubicShapesFourThreadsNeonDotprod)88 TEST(BlockMapTest,
89 MakeBlockMapTuningTest8bitCubicShapesFourThreadsNeonDotprod) {
90 MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 4,
91 /* expected_num_blocks_base_log2 */ 1,
92 /* expected_rectangularness_log2 */ 0);
93 MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 4,
94 /* expected_num_blocks_base_log2 */ 1,
95 /* expected_rectangularness_log2 */ 0);
96 MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 4,
97 /* expected_num_blocks_base_log2 */ 1,
98 /* expected_rectangularness_log2 */ 0);
99 MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 4,
100 /* expected_num_blocks_base_log2 */ 1,
101 /* expected_rectangularness_log2 */ 0);
102 MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
103 /* tentative_thread_count */ 4,
104 /* expected_num_blocks_base_log2 */ 1,
105 /* expected_rectangularness_log2 */ 0);
106 MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
107 /* tentative_thread_count */ 4,
108 /* expected_num_blocks_base_log2 */ 1,
109 /* expected_rectangularness_log2 */ 0);
110 MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
111 /* tentative_thread_count */ 4,
112 /* expected_num_blocks_base_log2 */ 2,
113 /* expected_rectangularness_log2 */ 0);
114 MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
115 /* tentative_thread_count */ 4,
116 /* expected_num_blocks_base_log2 */ 2,
117 /* expected_rectangularness_log2 */ 0);
118 }
119
TEST(BlockMapTest,MakeBlockMapTuningTest32bit)120 TEST(BlockMapTest, MakeBlockMapTuningTest32bit) {
121 MakeBlockMapTuningTest(256, 256, 256, 8, 8, 4, 4,
122 /* tentative_thread_count */ 4,
123 /* expected_num_blocks_base_log2 */ 3,
124 /* expected_rectangularness_log2 */ 0);
125 MakeBlockMapTuningTest(4096, 4096, 4096, 8, 8, 4, 4,
126 /* tentative_thread_count */ 4,
127 /* expected_num_blocks_base_log2 */ 7,
128 /* expected_rectangularness_log2 */ 0);
129 }
130
TEST(BlockMapTest,MakeBlockMapTuningTestRectangular)131 TEST(BlockMapTest, MakeBlockMapTuningTestRectangular) {
132 MakeBlockMapTuningTest(256, 16, 256, 8, 8, 1, 1,
133 /* tentative_thread_count */ 1,
134 /* expected_num_blocks_base_log2 */ 0,
135 /* expected_rectangularness_log2 */ 3);
136 MakeBlockMapTuningTest(24, 2400, 256, 8, 8, 1, 1,
137 /* tentative_thread_count */ 1,
138 /* expected_num_blocks_base_log2 */ 0,
139 /* expected_rectangularness_log2 */ 6);
140 }
141
142 #endif
143
L1Distance(const SidePair<int> & a,const SidePair<int> & b)144 int L1Distance(const SidePair<int>& a, const SidePair<int>& b) {
145 return std::abs(a[Side::kLhs] - b[Side::kLhs]) +
146 std::abs(a[Side::kRhs] - b[Side::kRhs]);
147 }
148
GetBlockByIndexSquareTest(int num_blocks_base_log2,BlockMapTraversalOrder traversal_order)149 void GetBlockByIndexSquareTest(int num_blocks_base_log2,
150 BlockMapTraversalOrder traversal_order) {
151 // Arbitrary, does not affect this test. 3 is just a typical value.
152 constexpr int kKernelSizeLog2 = 3;
153
154 const int size_log2 = num_blocks_base_log2 + kKernelSizeLog2;
155 BlockMap block_map;
156 block_map.thread_count = 1;
157 block_map.traversal_order = traversal_order;
158 block_map.num_blocks_base_log2 = num_blocks_base_log2;
159 for (Side side : {Side::kLhs, Side::kRhs}) {
160 block_map.dims[side] = 1 << size_log2;
161 block_map.rectangularness_log2[side] = 0;
162 block_map.kernel_dims[side] = 1 << kKernelSizeLog2;
163 block_map.small_block_dims[side] = block_map.kernel_dims[side];
164 block_map.large_blocks[side] = 0;
165 }
166
167 const int num_blocks_per_side = 1 << num_blocks_base_log2;
168 const int num_blocks = num_blocks_per_side * num_blocks_per_side;
169 EXPECT_EQ(num_blocks, NumBlocks(block_map));
170
171 // Perform a full traversal of all blocks, as if computing a whole matrix
172 // multiplication.
173 //
174 // Used to record how many times each block was hit by the traversal.
175 std::vector<int> block_hit_counts(num_blocks);
176 // Here we guard an assumption that all traversal orders start at (0, 0).
177 SidePair<int> previous_block_coords(0, 0);
178 // Sum of L1 norm of the coordinate change at every step of the traversal.
179 std::int64_t total_l1_distance = 0;
180 // Number of jumps i.e. traversal steps with a L1 norm greater than 1.
181 int discontinuity_count = 0;
182 for (int block_index = 0; block_index < num_blocks; block_index++) {
183 SidePair<int> block_coords;
184 GetBlockByIndex(block_map, block_index, &block_coords);
185 ++block_hit_counts[block_coords[Side::kLhs] +
186 num_blocks_per_side * block_coords[Side::kRhs]];
187 int distance = L1Distance(block_coords, previous_block_coords);
188 total_l1_distance += distance;
189 discontinuity_count += (distance > 1);
190 previous_block_coords = block_coords;
191 }
192
193 // Verify that each block was traversed exactly once.
194 for (int l = 0; l < num_blocks_per_side; l++) {
195 for (int r = 0; r < num_blocks_per_side; r++) {
196 EXPECT_EQ(block_hit_counts[l + num_blocks_per_side * r], 1);
197 }
198 }
199
200 // Verify that the discontinuity_count and total_l1_distance are as expected
201 // for the given traversal_order.
202 switch (traversal_order) {
203 case BlockMapTraversalOrder::kFractalHilbert:
204 // No discontinuity at all with this space-filling continuous curve!
205 EXPECT_EQ(discontinuity_count, 0);
206 // Therefore, total_l1_distance has to be the number of blocks minus one.
207 EXPECT_EQ(total_l1_distance, num_blocks - 1);
208 break;
209 case BlockMapTraversalOrder::kLinear:
210 EXPECT_EQ(discontinuity_count, num_blocks_per_side - 1);
211 EXPECT_EQ(total_l1_distance,
212 2 * num_blocks_per_side * (num_blocks_per_side - 1));
213 break;
214 case BlockMapTraversalOrder::kFractalZ:
215 EXPECT_EQ(discontinuity_count, num_blocks > 1 ? (num_blocks / 2 - 1) : 0);
216 EXPECT_EQ(total_l1_distance,
217 2 * num_blocks_per_side * (num_blocks_per_side - 1));
218 break;
219 case BlockMapTraversalOrder::kFractalU: {
220 if (num_blocks_base_log2 == 0) {
221 EXPECT_EQ(discontinuity_count, 0);
222 EXPECT_EQ(total_l1_distance, 0);
223 } else {
224 int expected_discontinuity_count = 0;
225 int expected_total_l1_distance = 3;
226 for (int i = 2; i <= num_blocks_base_log2; i++) {
227 expected_discontinuity_count = 4 * expected_discontinuity_count + 2;
228 expected_total_l1_distance =
229 4 * expected_total_l1_distance + (1 << (i + 1)) - 1;
230 }
231 EXPECT_EQ(discontinuity_count, expected_discontinuity_count);
232 EXPECT_EQ(total_l1_distance, expected_total_l1_distance);
233 }
234 break;
235 }
236 default:
237 abort();
238 }
239 }
240
TEST(BlockMapTest,GetBlockByIndexSquare)241 TEST(BlockMapTest, GetBlockByIndexSquare) {
242 for (int num_blocks_base_log2 = 0; num_blocks_base_log2 <= 10;
243 num_blocks_base_log2++) {
244 for (BlockMapTraversalOrder traversal_order :
245 {BlockMapTraversalOrder::kLinear, BlockMapTraversalOrder::kFractalZ,
246 BlockMapTraversalOrder::kFractalU,
247 BlockMapTraversalOrder::kFractalHilbert}) {
248 GetBlockByIndexSquareTest(num_blocks_base_log2, traversal_order);
249 }
250 }
251 }
252
253 } // namespace
254 } // namespace ruy
255
main(int argc,char ** argv)256 int main(int argc, char **argv) {
257 ::testing::InitGoogleTest(&argc, argv);
258 return RUN_ALL_TESTS();
259 }
260