/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <[email protected]>
 *
 */

#include "igt.h"
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <inttypes.h>
#include <pthread.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include "drm.h"

#define OBJECT_SIZE (1024*1024)
#define CHUNK_SIZE 32

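/* XY_SRC_COPY_BLT: copy a small rectangle from the source object into the
 * destination object using the blitter. On gen8+ the command carries 64-bit
 * addresses, so its dword length is extended by two (see the "+= 2" on the
 * command header where the batches are written below).
 */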
#define COPY_BLT_CMD		(2<<29|0x53<<22|0x6)
#define BLT_WRITE_ALPHA		(1<<21)
#define BLT_WRITE_RGB		(1<<20)
#define BLT_WRITE_ARGB (BLT_WRITE_ALPHA | BLT_WRITE_RGB)

#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12)

IGT_TEST_DESCRIPTION("Test of streaming writes into active GPU sources");

#define SRC 0
#define DST 1
#define BATCH 2

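/* Fixed slots in the execbuf object array; the macros below alias the
 * handle and presumed offset of the source and destination objects.
 */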
#define src exec[SRC].handle
#define src_offset exec[SRC].offset
#define dst exec[DST].handle
#define dst_offset exec[DST].offset

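/*
 * test_streaming: create a 1MiB source and destination, map the source with
 * the requested mode (0: snooped CPU, 1: GTT, 2: WC) and stream CHUNK_SIZE
 * writes into it while the blitter copies each chunk into the destination.
 * With "sync" set, every CPU write is preceded by a set-domain call so that
 * it never races an in-flight copy; otherwise the writes rely on the
 * coherency of the mapping. The destination is read back through a CPU
 * mapping and verified after every pass.
 */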
static void test_streaming(int fd, int mode, int sync)
{
	const int has_64bit_reloc = intel_gen(intel_get_drm_devid(fd)) >= 8;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 exec[3];
	struct drm_i915_gem_relocation_entry reloc[128];
	uint32_t tmp[] = { MI_BATCH_BUFFER_END };
	uint64_t __src_offset, __dst_offset;
	uint32_t *s, *d;
	uint32_t offset;
	struct {
		uint32_t handle;
		uint64_t offset;
	} *batch;
	int i, n;

	memset(exec, 0, sizeof(exec));
	exec[SRC].handle = gem_create(fd, OBJECT_SIZE);
	exec[DST].handle = gem_create(fd, OBJECT_SIZE);

	switch (mode) {
	case 0: /* cpu/snoop */
		gem_set_caching(fd, src, I915_CACHING_CACHED);
		s = gem_mmap__cpu(fd, src, 0, OBJECT_SIZE,
				  PROT_READ | PROT_WRITE);
		break;
	case 1: /* gtt */
		s = gem_mmap__gtt(fd, src, OBJECT_SIZE,
				  PROT_READ | PROT_WRITE);
		break;
	case 2: /* wc */
		s = gem_mmap__wc(fd, src, 0, OBJECT_SIZE,
				 PROT_READ | PROT_WRITE);
		break;
	}
	*s = 0; /* fault the object into the mappable range first (for GTT) */

	d = gem_mmap__cpu(fd, dst, 0, OBJECT_SIZE, PROT_READ);

	gem_write(fd, dst, 0, tmp, sizeof(tmp));
	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(exec);
	execbuf.buffer_count = 2;
	execbuf.flags = LOCAL_I915_EXEC_HANDLE_LUT;
	if (__gem_execbuf(fd, &execbuf)) {
		execbuf.flags = 0;
		igt_require(__gem_execbuf(fd, &execbuf) == 0);
	}
	/* We assume that the active objects are fixed to avoid relocations */
	__src_offset = src_offset;
	__dst_offset = dst_offset;

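	/* Two relocations per copy command: the destination address at dword 4
	 * and the source address at dword 7 (one dword later when 64-bit
	 * addresses are in use). 64 commands fit into each 4KiB batch page.
	 */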
	memset(reloc, 0, sizeof(reloc));
	for (i = 0; i < 64; i++) {
		reloc[2*i+0].offset = 64*i + 4 * sizeof(uint32_t);
		reloc[2*i+0].delta = 0;
		reloc[2*i+0].target_handle = execbuf.flags & LOCAL_I915_EXEC_HANDLE_LUT ? DST : dst;
		reloc[2*i+0].presumed_offset = dst_offset;
		reloc[2*i+0].read_domains = I915_GEM_DOMAIN_RENDER;
		reloc[2*i+0].write_domain = I915_GEM_DOMAIN_RENDER;

		reloc[2*i+1].offset = 64*i + 7 * sizeof(uint32_t);
		if (has_64bit_reloc)
			reloc[2*i+1].offset += sizeof(uint32_t);
		reloc[2*i+1].delta = 0;
		reloc[2*i+1].target_handle = execbuf.flags & LOCAL_I915_EXEC_HANDLE_LUT ? SRC : src;
		reloc[2*i+1].presumed_offset = src_offset;
		reloc[2*i+1].read_domains = I915_GEM_DOMAIN_RENDER;
		reloc[2*i+1].write_domain = 0;
	}
	gem_execbuf(fd, &execbuf);
	igt_assert_eq_u64(__src_offset, src_offset);
	igt_assert_eq_u64(__dst_offset, dst_offset);

	exec[DST].flags = EXEC_OBJECT_WRITE;
	exec[BATCH].relocation_count = 2;
	execbuf.buffer_count = 3;
	execbuf.flags |= I915_EXEC_NO_RELOC;
	if (gem_has_blt(fd))
		execbuf.flags |= I915_EXEC_BLT;

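	/* Build one 4KiB batch object per 64 chunks; each copy command
	 * occupies 16 dwords (64 bytes), so batch_start_offset can later
	 * select a single command out of the page.
	 */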
	batch = malloc(sizeof(*batch) * (OBJECT_SIZE / CHUNK_SIZE / 64));
	for (i = n = 0; i < OBJECT_SIZE / CHUNK_SIZE / 64; i++) {
		uint32_t *base;

		batch[i].handle = gem_create(fd, 4096);
		batch[i].offset = 0;

		base = gem_mmap__cpu(fd, batch[i].handle, 0, 4096, PROT_WRITE);
		gem_set_domain(fd, batch[i].handle,
				I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);

		for (int j = 0; j < 64; j++) {
			unsigned x = (n * CHUNK_SIZE) % 4096 >> 2;
			unsigned y = (n * CHUNK_SIZE) / 4096;
			uint32_t *b = base + 16 * j;
			int k = 0;

			b[k] = COPY_BLT_CMD | BLT_WRITE_ARGB;
			if (has_64bit_reloc)
				b[k] += 2;
			k++;
			b[k++] = 0xcc << 16 | 1 << 25 | 1 << 24 | 4096;
			b[k++] = (y << 16) | x;
			b[k++] = ((y+1) << 16) | (x + (CHUNK_SIZE >> 2));
			b[k++] = dst_offset;
			if (has_64bit_reloc)
				b[k++] = dst_offset >> 32;
			b[k++] = (y << 16) | x;
			b[k++] = 4096;
			b[k++] = src_offset;
			if (has_64bit_reloc)
				b[k++] = src_offset >> 32;
			b[k++] = MI_BATCH_BUFFER_END;

			n++;
		}

		munmap(base, 4096);
	}

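	/* 256 passes: stream fresh data into the source, blit it into the
	 * destination one chunk at a time, then verify the result through
	 * the CPU mapping of the destination.
	 */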
	for (int pass = 0; pass < 256; pass++) {
		int domain = mode ? I915_GEM_DOMAIN_GTT : I915_GEM_DOMAIN_CPU;
		gem_set_domain(fd, src, domain, domain);

		if (pass == 0) {
			for (i = 0; i < OBJECT_SIZE/4; i++)
				s[i] = i;
		}

		/* Now copy from the src to the dst in 32-byte chunks */
		for (offset = 0; offset < OBJECT_SIZE; offset += CHUNK_SIZE) {
			int b;

			if (pass) {
				if (sync)
					gem_set_domain(fd, src, domain, domain);
				for (i = 0; i < CHUNK_SIZE/4; i++)
					s[offset/4 + i] = (OBJECT_SIZE*pass + offset)/4 + i;
			}

			igt_assert(exec[DST].flags & EXEC_OBJECT_WRITE);

			b = offset / CHUNK_SIZE / 64;
			n = offset / CHUNK_SIZE % 64;
			exec[BATCH].relocs_ptr = to_user_pointer((reloc + 2*n));
			exec[BATCH].handle = batch[b].handle;
			exec[BATCH].offset = batch[b].offset;
			execbuf.batch_start_offset = 64*n;

			gem_execbuf(fd, &execbuf);
			igt_assert_eq_u64(__src_offset, src_offset);
			igt_assert_eq_u64(__dst_offset, dst_offset);

			batch[b].offset = exec[BATCH].offset;
		}

		gem_set_domain(fd, dst, I915_GEM_DOMAIN_CPU, 0);
		for (offset = 0; offset < OBJECT_SIZE/4; offset++)
			igt_assert_eq(pass*OBJECT_SIZE/4 + offset, d[offset]);
	}

	for (i = 0; i < OBJECT_SIZE / CHUNK_SIZE / 64; i++)
		gem_close(fd, batch[i].handle);
	free(batch);

	munmap(s, OBJECT_SIZE);
	gem_close(fd, src);
	munmap(d, OBJECT_SIZE);
	gem_close(fd, dst);
}

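/*
 * test_batch: stream the copy commands themselves. The batch buffer is
 * mapped with the requested mode (0: snooped CPU, 1: GTT, 2: WC) and each
 * blit is rewritten through that mapping immediately before it is executed,
 * while earlier blits from the same buffer may still be busy on the GPU.
 * With "reverse" set, commands are packed from the end of the buffer
 * towards the start.
 */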
static void test_batch(int fd, int mode, int reverse)
{
	const int has_64bit_reloc = intel_gen(intel_get_drm_devid(fd)) >= 8;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 exec[3];
	struct drm_i915_gem_relocation_entry reloc[2];
	uint32_t tmp[] = { MI_BATCH_BUFFER_END };
	uint64_t __src_offset, __dst_offset;
	bool need_64b_start_offset = true;
	uint64_t batch_size;
	uint32_t *s, *d;
	uint32_t *base;
	uint32_t offset;

	memset(exec, 0, sizeof(exec));
	exec[DST].handle = gem_create(fd, OBJECT_SIZE);
	exec[SRC].handle = gem_create(fd, OBJECT_SIZE);

	s = gem_mmap__wc(fd, src, 0, OBJECT_SIZE, PROT_READ | PROT_WRITE);

	d = gem_mmap__cpu(fd, dst, 0, OBJECT_SIZE, PROT_READ);

	memset(reloc, 0, sizeof(reloc));
	reloc[0].offset = 4 * sizeof(uint32_t);
	reloc[0].delta = 0;
	reloc[0].target_handle = execbuf.flags & LOCAL_I915_EXEC_HANDLE_LUT ? DST : dst;
	reloc[0].presumed_offset = dst_offset;
	reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
	reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;

	reloc[1].offset = 7 * sizeof(uint32_t);
	if (has_64bit_reloc)
		reloc[1].offset += sizeof(uint32_t);
	reloc[1].delta = 0;
	reloc[1].target_handle = execbuf.flags & LOCAL_I915_EXEC_HANDLE_LUT ? SRC : src;
	reloc[1].presumed_offset = src_offset;
	reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
	reloc[1].write_domain = 0;

	batch_size = ALIGN(OBJECT_SIZE / CHUNK_SIZE * 128, 4096);
	exec[BATCH].relocs_ptr = to_user_pointer(reloc);
	exec[BATCH].relocation_count = 2;
	exec[BATCH].handle = gem_create(fd, batch_size);

	switch (mode) {
	case 0: /* cpu/snoop */
		igt_require(gem_has_llc(fd));
		base = gem_mmap__cpu(fd, exec[BATCH].handle, 0, batch_size,
				     PROT_READ | PROT_WRITE);
		break;
	case 1: /* gtt */
		base = gem_mmap__gtt(fd, exec[BATCH].handle, batch_size,
				     PROT_READ | PROT_WRITE);
		break;
	case 2: /* wc */
		base = gem_mmap__wc(fd, exec[BATCH].handle, 0, batch_size,
				    PROT_READ | PROT_WRITE);
		break;
	}
	*base = 0; /* fault the object into the mappable range first */

	gem_write(fd, exec[BATCH].handle, 0, tmp, sizeof(tmp));
	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(exec);
	execbuf.buffer_count = 3;
	execbuf.flags = LOCAL_I915_EXEC_HANDLE_LUT;
	if (gem_has_blt(fd))
		execbuf.flags |= I915_EXEC_BLT;
	if (__gem_execbuf(fd, &execbuf)) {
		execbuf.flags &= ~LOCAL_I915_EXEC_HANDLE_LUT;
		gem_execbuf(fd, &execbuf);
	}
	execbuf.flags |= I915_EXEC_NO_RELOC;
	exec[DST].flags = EXEC_OBJECT_WRITE;
	/* We assume that the active objects are fixed to avoid relocations */
	exec[BATCH].relocation_count = 0;
	__src_offset = src_offset;
	__dst_offset = dst_offset;

	offset = mode ? I915_GEM_DOMAIN_GTT : I915_GEM_DOMAIN_CPU;
	gem_set_domain(fd, exec[BATCH].handle, offset, offset);
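	/* Every pass rewrites all of the copy commands through the streaming
	 * mapping just before they are executed, without waiting for the
	 * previously submitted copies to complete.
	 */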
	for (int pass = 0; pass < 256; pass++) {
		gem_set_domain(fd, src, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
		for (offset = 0; offset < OBJECT_SIZE/4; offset++)
			s[offset] = OBJECT_SIZE*pass/4 + offset;

		/* Now copy from the src to the dst in 32-byte chunks */
		for (offset = 0; offset < OBJECT_SIZE / CHUNK_SIZE; offset++) {
			unsigned x = (offset * CHUNK_SIZE) % 4096 >> 2;
			unsigned y = (offset * CHUNK_SIZE) / 4096;
			int k;

			execbuf.batch_start_offset = 128 * offset;
			if (!need_64b_start_offset)
				execbuf.batch_start_offset += 8 * (pass & 7);
			igt_assert(execbuf.batch_start_offset <= batch_size - 64);
			if (reverse)
				execbuf.batch_start_offset = batch_size - execbuf.batch_start_offset - 64;
			igt_assert(execbuf.batch_start_offset <= batch_size - 64);
			k = execbuf.batch_start_offset / 4;

			base[k] = COPY_BLT_CMD | BLT_WRITE_ARGB;
			if (has_64bit_reloc)
				base[k] += 2;
			k++;
			base[k++] = 0xcc << 16 | 1 << 25 | 1 << 24 | 4096;
			base[k++] = (y << 16) | x;
			base[k++] = ((y+1) << 16) | (x + (CHUNK_SIZE >> 2));
			base[k++] = dst_offset;
			if (has_64bit_reloc)
				base[k++] = dst_offset >> 32;
			base[k++] = (y << 16) | x;
			base[k++] = 4096;
			base[k++] = src_offset;
			if (has_64bit_reloc)
				base[k++] = src_offset >> 32;
			base[k++] = MI_BATCH_BUFFER_END;

			igt_assert(exec[DST].flags & EXEC_OBJECT_WRITE);
			gem_execbuf(fd, &execbuf);
			igt_assert_eq_u64(__src_offset, src_offset);
			igt_assert_eq_u64(__dst_offset, dst_offset);
		}

		gem_set_domain(fd, dst, I915_GEM_DOMAIN_CPU, 0);
		for (offset = 0; offset < OBJECT_SIZE/4; offset++)
			igt_assert_eq(pass*OBJECT_SIZE/4 + offset, d[offset]);
	}

	munmap(base, batch_size);
	gem_close(fd, exec[BATCH].handle);

	munmap(s, OBJECT_SIZE);
	gem_close(fd, src);
	munmap(d, OBJECT_SIZE);
	gem_close(fd, dst);
}

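/* Subtest enumeration: streaming writes through cpu/gtt/wc source mappings,
 * with and without an explicit sync before each write, plus batch-buffer
 * streaming in forward and reverse order for each mapping mode.
 */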
igt_main
{
	int fd, sync;

	igt_fixture {
		fd = drm_open_driver(DRIVER_INTEL);
		igt_require_gem(fd);
	}

	for (sync = 2; sync--; ) {
		igt_subtest_f("cpu%s", sync ? "-sync":"")
			test_streaming(fd, 0, sync);
		igt_subtest_f("gtt%s", sync ? "-sync":"")
			test_streaming(fd, 1, sync);
		igt_subtest_f("wc%s", sync ? "-sync":"")
			test_streaming(fd, 2, sync);
	}

	igt_subtest("batch-cpu")
		test_batch(fd, 0, 0);
	igt_subtest("batch-gtt")
		test_batch(fd, 1, 0);
	igt_subtest("batch-wc")
		test_batch(fd, 2, 0);
	igt_subtest("batch-reverse-cpu")
		test_batch(fd, 0, 1);
	igt_subtest("batch-reverse-gtt")
		test_batch(fd, 1, 1);
	igt_subtest("batch-reverse-wc")
		test_batch(fd, 2, 1);

	igt_fixture
		close(fd);
}