// xref: /aosp_15_r20/external/swiftshader/tests/VulkanBenchmarks/ComputeBenchmarks.cpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
// Copyright 2021 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
#include "Util.hpp"
#include "VulkanTester.hpp"

#include "benchmark/benchmark.h"

#include <cmath>
#include <cstddef>
#include <cstring>
#include <sstream>
#include <vector>
23 
24 // C++ reference implementation for single-threaded 'compute' operations.
25 template<typename Init, typename Func>
CppCompute(benchmark::State & state,Init init,Func op)26 void CppCompute(benchmark::State &state, Init init, Func op)
27 {
28 	int64_t numElements = state.range(0);
29 	float *bufferIn = (float *)malloc(numElements * sizeof(float));
30 	float *bufferOut = (float *)malloc(numElements * sizeof(float));
31 
32 	for(int64_t i = 0; i < numElements; i++)
33 	{
34 		bufferIn[i] = init(i);
35 	}
36 
37 	for(auto _ : state)
38 	{
39 		for(int64_t i = 0; i < numElements; i++)
40 		{
41 			bufferOut[i] = op(bufferIn[i]);
42 		}
43 	}
44 
45 	free(bufferIn);
46 	free(bufferOut);
47 }
48 
// Element initializer that yields 0.0f for every index.
float zero(int64_t i)
{
	(void)i;  // The index does not influence the result.
	return 0.0f;
}
53 
// Element initializer that yields 1.0f for every index.
float one(int64_t i)
{
	(void)i;  // The index does not influence the result.
	return 1.0f;
}
58 
__anon1e3422c60102(float x) 59 BENCHMARK_CAPTURE(CppCompute, mov, zero, [](float x) { return x; })->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
60 BENCHMARK_CAPTURE(CppCompute, sqrt, one, sqrtf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
61 BENCHMARK_CAPTURE(CppCompute, sin, zero, sinf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
62 BENCHMARK_CAPTURE(CppCompute, cos, zero, cosf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
63 BENCHMARK_CAPTURE(CppCompute, exp, zero, expf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
64 BENCHMARK_CAPTURE(CppCompute, log, one, logf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
65 
66 class ComputeBenchmark
67 {
68 protected:
ComputeBenchmark()69 	ComputeBenchmark()
70 	{
71 		tester.initialize();
72 	}
73 
74 	VulkanTester tester;
75 };
76 
77 // Base class for compute benchmarks that read from an input buffer and write to an
78 // output buffer of the same length.
79 class BufferToBufferComputeBenchmark : public ComputeBenchmark
80 {
81 public:
BufferToBufferComputeBenchmark(const benchmark::State & state)82 	BufferToBufferComputeBenchmark(const benchmark::State &state)
83 	    : state(state)
84 	{
85 		device = tester.getDevice();
86 	}
87 
~BufferToBufferComputeBenchmark()88 	virtual ~BufferToBufferComputeBenchmark()
89 	{
90 		device.destroyCommandPool(commandPool);
91 		device.destroyDescriptorPool(descriptorPool);
92 		device.destroyPipeline(pipeline);
93 		device.destroyDescriptorSetLayout(descriptorSetLayout);
94 		device.destroyBuffer(bufferIn);
95 		device.destroyBuffer(bufferOut);
96 		device.freeMemory(deviceMemory);
97 	}
98 
99 	void run();
100 
101 protected:
102 	void initialize(const std::string &glslShader);
103 
104 	uint32_t localSizeX = 128;
105 	uint32_t localSizeY = 1;
106 	uint32_t localSizeZ = 1;
107 
108 private:
109 	const benchmark::State &state;
110 
111 	// Weak references
112 	vk::Device device;
113 	vk::Queue queue;
114 	vk::CommandBuffer commandBuffer;
115 
116 	// Owned resources
117 	vk::CommandPool commandPool;
118 	vk::DescriptorPool descriptorPool;
119 	vk::Pipeline pipeline;
120 	vk::DescriptorSetLayout descriptorSetLayout;
121 	vk::DeviceMemory deviceMemory;
122 	vk::Buffer bufferIn;
123 	vk::Buffer bufferOut;
124 };
125 
initialize(const std::string & glslShader)126 void BufferToBufferComputeBenchmark::initialize(const std::string &glslShader)
127 {
128 	auto code = Util::compileGLSLtoSPIRV(glslShader.c_str(), EShLanguage::EShLangCompute);
129 
130 	auto &device = tester.getDevice();
131 	auto &physicalDevice = tester.getPhysicalDevice();
132 	queue = device.getQueue(0, 0);  // TODO: Don't assume this queue can do compute.
133 
134 	size_t numElements = state.range(0);
135 	size_t inOffset = 0;
136 	size_t outOffset = numElements;
137 	size_t buffersTotalElements = 2 * numElements;
138 	size_t buffersSize = sizeof(uint32_t) * buffersTotalElements;
139 
140 	// TODO: vk::MemoryRequirements memoryRequirements = device.getBufferMemoryRequirements(buffer);
141 	vk::MemoryAllocateInfo allocateInfo;
142 	allocateInfo.allocationSize = buffersSize;  // TODO: memoryRequirements.size
143 	allocateInfo.memoryTypeIndex = 0;           // TODO: memoryRequirements.memoryTypeBits
144 	deviceMemory = device.allocateMemory(allocateInfo);
145 
146 	uint32_t *buffers = (uint32_t *)device.mapMemory(deviceMemory, 0, buffersSize);
147 	memset(buffers, 0, buffersSize);
148 
149 	for(size_t i = 0; i < numElements; i++)
150 	{
151 		buffers[inOffset + i] = (uint32_t)i;
152 	}
153 
154 	device.unmapMemory(deviceMemory);
155 	buffers = nullptr;
156 
157 	vk::BufferCreateInfo bufferCreateInfo({}, sizeof(uint32_t) * numElements, vk::BufferUsageFlagBits::eStorageBuffer);
158 	bufferIn = device.createBuffer(bufferCreateInfo);
159 	device.bindBufferMemory(bufferIn, deviceMemory, sizeof(uint32_t) * inOffset);
160 
161 	bufferOut = device.createBuffer(bufferCreateInfo);
162 	device.bindBufferMemory(bufferOut, deviceMemory, sizeof(uint32_t) * outOffset);
163 
164 	vk::ShaderModuleCreateInfo moduleCreateInfo;
165 	moduleCreateInfo.codeSize = code.size() * sizeof(uint32_t);
166 	moduleCreateInfo.pCode = (uint32_t *)code.data();
167 	vk::ShaderModule shaderModule = device.createShaderModule(moduleCreateInfo);
168 
169 	vk::DescriptorSetLayoutBinding in;
170 	in.binding = 0;
171 	in.descriptorCount = 1;
172 	in.descriptorType = vk::DescriptorType::eStorageBuffer;
173 	in.stageFlags = vk::ShaderStageFlagBits::eCompute;
174 
175 	vk::DescriptorSetLayoutBinding out;
176 	out.binding = 1;
177 	out.descriptorCount = 1;
178 	out.descriptorType = vk::DescriptorType::eStorageBuffer;
179 	out.stageFlags = vk::ShaderStageFlagBits::eCompute;
180 
181 	std::vector<vk::DescriptorSetLayoutBinding> setLayoutBindings = { in, out };
182 	vk::DescriptorSetLayoutCreateInfo layoutInfo;
183 	layoutInfo.bindingCount = static_cast<uint32_t>(setLayoutBindings.size());
184 	layoutInfo.pBindings = setLayoutBindings.data();
185 	descriptorSetLayout = device.createDescriptorSetLayout(layoutInfo);
186 
187 	vk::PipelineLayoutCreateInfo pipelineLayoutCreateInfo;
188 	pipelineLayoutCreateInfo.setLayoutCount = 1;
189 	pipelineLayoutCreateInfo.pSetLayouts = &descriptorSetLayout;
190 	vk::PipelineLayout pipelineLayout = device.createPipelineLayout(pipelineLayoutCreateInfo);
191 
192 	vk::ComputePipelineCreateInfo computePipelineCreateInfo;
193 	computePipelineCreateInfo.layout = pipelineLayout;
194 	computePipelineCreateInfo.stage.stage = vk::ShaderStageFlagBits::eCompute;
195 	computePipelineCreateInfo.stage.module = shaderModule;
196 	computePipelineCreateInfo.stage.pName = "main";
197 	pipeline = device.createComputePipeline({}, computePipelineCreateInfo).value;
198 
199 	// "A shader module can be destroyed while pipelines created using its shaders are still in use."
200 	device.destroyShaderModule(shaderModule);
201 
202 	std::array<vk::DescriptorPoolSize, 1> poolSizes = {};
203 	poolSizes[0].type = vk::DescriptorType::eStorageBuffer;
204 	poolSizes[0].descriptorCount = 2;
205 	vk::DescriptorPoolCreateInfo descriptorPoolCreateInfo;
206 	descriptorPoolCreateInfo.maxSets = 1;
207 	descriptorPoolCreateInfo.poolSizeCount = static_cast<uint32_t>(poolSizes.size());
208 	descriptorPoolCreateInfo.pPoolSizes = poolSizes.data();
209 
210 	descriptorPool = device.createDescriptorPool(descriptorPoolCreateInfo);
211 
212 	vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo;
213 	descriptorSetAllocateInfo.descriptorPool = descriptorPool;
214 	descriptorSetAllocateInfo.descriptorSetCount = 1;
215 	descriptorSetAllocateInfo.pSetLayouts = &descriptorSetLayout;
216 	auto descriptorSets = device.allocateDescriptorSets(descriptorSetAllocateInfo);
217 
218 	vk::DescriptorBufferInfo inBufferInfo;
219 	inBufferInfo.buffer = bufferIn;
220 	inBufferInfo.offset = 0;
221 	inBufferInfo.range = VK_WHOLE_SIZE;
222 
223 	vk::DescriptorBufferInfo outBufferInfo;
224 	outBufferInfo.buffer = bufferOut;
225 	outBufferInfo.offset = 0;
226 	outBufferInfo.range = VK_WHOLE_SIZE;
227 
228 	std::array<vk::WriteDescriptorSet, 2> descriptorWrites = {};
229 
230 	descriptorWrites[0].dstSet = descriptorSets[0];
231 	descriptorWrites[0].dstBinding = 0;
232 	descriptorWrites[0].dstArrayElement = 0;
233 	descriptorWrites[0].descriptorType = vk::DescriptorType::eStorageBuffer;
234 	descriptorWrites[0].descriptorCount = 1;
235 	descriptorWrites[0].pBufferInfo = &inBufferInfo;
236 
237 	descriptorWrites[1].dstSet = descriptorSets[0];
238 	descriptorWrites[1].dstBinding = 1;
239 	descriptorWrites[1].dstArrayElement = 0;
240 	descriptorWrites[1].descriptorType = vk::DescriptorType::eStorageBuffer;
241 	descriptorWrites[1].descriptorCount = 1;
242 	descriptorWrites[1].pBufferInfo = &outBufferInfo;
243 
244 	device.updateDescriptorSets(static_cast<uint32_t>(descriptorWrites.size()), descriptorWrites.data(), 0, nullptr);
245 
246 	vk::CommandPoolCreateInfo commandPoolCreateInfo;
247 	commandPoolCreateInfo.queueFamilyIndex = 0;  // TODO: Don't assume queue family 0 can do compute.
248 	commandPoolCreateInfo.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
249 	commandPool = device.createCommandPool(commandPoolCreateInfo);
250 
251 	vk::CommandBufferAllocateInfo commandBufferAllocateInfo;
252 	commandBufferAllocateInfo.commandPool = commandPool;
253 	commandBufferAllocateInfo.commandBufferCount = 1;
254 	commandBufferAllocateInfo.level = vk::CommandBufferLevel::ePrimary;
255 	auto commandBuffers = device.allocateCommandBuffers(commandBufferAllocateInfo);
256 
257 	// Record the command buffer
258 	commandBuffer = commandBuffers[0];
259 
260 	vk::CommandBufferBeginInfo commandBufferBeginInfo;
261 	commandBuffer.begin(commandBufferBeginInfo);
262 
263 	commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
264 	commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0, 1, &descriptorSets[0], 0, nullptr);
265 
266 	commandBuffer.dispatch((uint32_t)(numElements / localSizeX), 1, 1);
267 
268 	commandBuffer.end();
269 
270 	// Destroy objects we don't have to hold on to after command buffer recording.
271 	// "A VkPipelineLayout object must not be destroyed while any command buffer that uses it is in the recording state."
272 	device.destroyPipelineLayout(pipelineLayout);
273 }
274 
run()275 void BufferToBufferComputeBenchmark::run()
276 {
277 	vk::SubmitInfo submitInfo;
278 	submitInfo.commandBufferCount = 1;
279 	submitInfo.pCommandBuffers = &commandBuffer;
280 	queue.submit(submitInfo);
281 	queue.waitIdle();
282 }
283 
284 // Performs an operation `op` on each element.
285 class ComputeOp : public BufferToBufferComputeBenchmark
286 {
287 public:
ComputeOp(const benchmark::State & state,const char * op,const char * precision)288 	ComputeOp(const benchmark::State &state, const char *op, const char *precision)
289 	    : BufferToBufferComputeBenchmark(state)
290 	{
291 		std::stringstream src;
292 		src << R"(#version 450
293 			layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
294 			layout(binding = 0, std430) buffer InBuffer
295 			{
296 				float Data[];
297 			} In;
298 			layout(binding = 1, std430) buffer OutBuffer
299 			{
300 				float Data[];
301 			} Out;
302 			void main()
303 			{
304 				)"
305 		    << precision << R"( float x = In.Data[gl_GlobalInvocationID.x];
306 				Out.Data[gl_GlobalInvocationID.x] = )"
307 		    << op << R"( (x);
308 			})";
309 
310 		initialize(src.str());
311 	}
312 };
313 
Compute(benchmark::State & state,const char * op,const char * precision="highp")314 static void Compute(benchmark::State &state, const char *op, const char *precision = "highp")
315 {
316 	ComputeOp benchmark(state, op, precision);
317 
318 	// Execute once to have the Reactor routine generated.
319 	benchmark.run();
320 
321 	for(auto _ : state)
322 	{
323 		benchmark.run();
324 	}
325 }
326 
327 BENCHMARK_CAPTURE(Compute, mov, "")->RangeMultiplier(2)->Range(128, 4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
328 
329 BENCHMARK_CAPTURE(Compute, sqrt_highp, "sqrt", "highp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
330 BENCHMARK_CAPTURE(Compute, sin_highp, "sin", "highp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
331 BENCHMARK_CAPTURE(Compute, cos_highp, "cos", "highp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
332 BENCHMARK_CAPTURE(Compute, exp_highp, "exp", "highp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
333 BENCHMARK_CAPTURE(Compute, log_highp, "log", "highp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
334 
335 BENCHMARK_CAPTURE(Compute, sqrt_mediump, "sqrt", "mediump")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
336 BENCHMARK_CAPTURE(Compute, sin_mediump, "sin", "mediump")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
337 BENCHMARK_CAPTURE(Compute, cos_mediump, "cos", "mediump")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
338 BENCHMARK_CAPTURE(Compute, exp_mediump, "exp", "mediump")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
339 BENCHMARK_CAPTURE(Compute, log_mediump, "log", "mediump")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();