1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <limits.h>
27 #include <assert.h>
28 #include <sys/mman.h>
29
30 #include "anv_private.h"
31
32 #include "common/intel_aux_map.h"
33 #include "util/anon_file.h"
34 #include "util/futex.h"
35
36 #ifdef HAVE_VALGRIND
37 #define VG_NOACCESS_READ(__ptr) ({ \
38 VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \
39 __typeof(*(__ptr)) __val = *(__ptr); \
40 VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\
41 __val; \
42 })
43 #define VG_NOACCESS_WRITE(__ptr, __val) ({ \
44 VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr))); \
45 *(__ptr) = (__val); \
46 VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr))); \
47 })
48 #else
49 #define VG_NOACCESS_READ(__ptr) (*(__ptr))
50 #define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
51 #endif
52
53 #ifndef MAP_POPULATE
54 #define MAP_POPULATE 0
55 #endif
56
57 /* Design goals:
58 *
59 * - Lock free (except when resizing underlying bos)
60 *
61 * - Constant time allocation with typically only one atomic
62 *
63 * - Multiple allocation sizes without fragmentation
64 *
65 * - Can grow while keeping addresses and offset of contents stable
66 *
67 * - All allocations within one bo so we can point one of the
68 * STATE_BASE_ADDRESS pointers at it.
69 *
70 * The overall design is a two-level allocator: top level is a fixed size, big
71 * block (8k) allocator, which operates out of a bo. Allocation is done by
72 * either pulling a block from the free list or growing the used range of the
73 * bo. Growing the range may run out of space in the bo which we then need to
74 * grow. Growing the bo is tricky in a multi-threaded, lockless environment:
75 * we need to keep all pointers and contents in the old map valid. GEM bos in
76 * general can't grow, but we use a trick: we create a memfd and use ftruncate
77 * to grow it as necessary. We mmap the new size and then create a gem bo for
78 * it using the new gem userptr ioctl. Without heavy-handed locking around
79 * our allocation fast-path, there isn't really a way to munmap the old mmap,
80 * so we just keep it around until garbage collection time. While the block
81 * allocator is lockless for normal operations, we block other threads trying
82 * to allocate while we're growing the map. It shouldn't happen often, and
83 * growing is fast anyway.
84 *
85 * At the next level we can use various sub-allocators. The state pool is a
86 * pool of smaller, fixed size objects, which operates much like the block
87 * pool. It uses a free list for freeing objects, but when it runs out of
88 * space it just allocates a new block from the block pool. This allocator is
89 * intended for longer lived state objects such as SURFACE_STATE and most
90 * other persistent state objects in the API. We may need to track more info
91  * with these objects and a pointer back to the CPU object (e.g. VkImage).  In
92 * those cases we just allocate a slightly bigger object and put the extra
93 * state after the GPU state object.
94 *
95  * The state stream allocator works similarly to how the i965 DRI driver streams
96 * all its state. Even with Vulkan, we need to emit transient state (whether
97 * surface state base or dynamic state base), and for that we can just get a
98 * block and fill it up. These cases are local to a command buffer and the
99 * sub-allocator need not be thread safe. The streaming allocator gets a new
100 * block when it runs out of space and chains them together so they can be
101 * easily freed.
102 */
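/* Illustrative usage sketch, assuming hypothetical parameter values and a
 * device-assigned base_address (none of this is required by the API): a
 * caller builds a state pool on top of the block pool and then allocates
 * and frees fixed-size states from it.
 *
 *    struct anv_state_pool pool;
 *    struct anv_state_pool_params params = {
 *       .name         = "example state pool",
 *       .base_address = base_address,     // hypothetical, device-chosen VMA
 *       .start_offset = 0,
 *       .block_size   = 4096,
 *       .max_size     = 2 * 1024 * 1024,
 *    };
 *    if (anv_state_pool_init(&pool, device, &params) == VK_SUCCESS) {
 *       struct anv_state s = anv_state_pool_alloc(&pool, 64, 64);
 *       // CPU writes go through s.map; the GPU sees the data at s.offset
 *       // relative to the pool's base address.
 *       anv_state_pool_free(&pool, s);
 *       anv_state_pool_finish(&pool);
 *    }
 */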
103
104 /* The state table can never hold UINT32_MAX entries, so we use that value
105  * to indicate the free list is empty. */
106 #define EMPTY UINT32_MAX
107
108 /* On FreeBSD PAGE_SIZE is already defined in
109 * /usr/include/machine/param.h that is indirectly
110 * included here.
111 */
112 #ifndef PAGE_SIZE
113 #define PAGE_SIZE 4096
114 #endif
115
116 struct anv_state_table_cleanup {
117 void *map;
118 size_t size;
119 };
120
121 #define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0})
122 #define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry))
123
124 static VkResult
125 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size);
126
127 VkResult
128 anv_state_table_init(struct anv_state_table *table,
129 struct anv_device *device,
130 uint32_t initial_entries)
131 {
132 VkResult result;
133
134 table->device = device;
135
136 /* Just make it 2GB up-front. The Linux kernel won't actually back it
137 * with pages until we either map and fault on one of them or we use
138 * userptr and send a chunk of it off to the GPU.
139 */
140 table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
141 if (table->fd == -1)
142 return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
143
144 if (!u_vector_init(&table->cleanups, 8,
145 sizeof(struct anv_state_table_cleanup))) {
146 result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
147 goto fail_fd;
148 }
149
150 table->state.next = 0;
151 table->state.end = 0;
152 table->size = 0;
153
154 uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
155 result = anv_state_table_expand_range(table, initial_size);
156 if (result != VK_SUCCESS)
157 goto fail_cleanups;
158
159 return VK_SUCCESS;
160
161 fail_cleanups:
162 u_vector_finish(&table->cleanups);
163 fail_fd:
164 close(table->fd);
165
166 return result;
167 }
168
169 static VkResult
170 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
171 {
172 void *map;
173 struct anv_state_table_cleanup *cleanup;
174
175 /* Assert that we only ever grow the pool */
176 assert(size >= table->state.end);
177
178 /* Make sure that we don't go outside the bounds of the memfd */
179 if (size > BLOCK_POOL_MEMFD_SIZE)
180 return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
181
182 cleanup = u_vector_add(&table->cleanups);
183 if (!cleanup)
184 return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
185
186 *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
187
188 /* Just leak the old map until we destroy the pool. We can't munmap it
189 * without races or imposing locking on the block allocate fast path. On
190  * the whole, the leaked maps add up to less than the size of the
191 * current map. MAP_POPULATE seems like the right thing to do, but we
192 * should try to get some numbers.
193 */
194 map = mmap(NULL, size, PROT_READ | PROT_WRITE,
195 MAP_SHARED | MAP_POPULATE, table->fd, 0);
196 if (map == MAP_FAILED) {
197 return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
198 "mmap failed: %m");
199 }
200
201 cleanup->map = map;
202 cleanup->size = size;
203
204 table->map = map;
205 table->size = size;
206
207 return VK_SUCCESS;
208 }
209
210 static VkResult
211 anv_state_table_grow(struct anv_state_table *table)
212 {
213 VkResult result = VK_SUCCESS;
214
215 uint32_t used = align(table->state.next * ANV_STATE_ENTRY_SIZE,
216 PAGE_SIZE);
217 uint32_t old_size = table->size;
218
219 /* The block pool is always initialized to a nonzero size and this function
220 * is always called after initialization.
221 */
222 assert(old_size > 0);
223
224 uint32_t required = MAX2(used, old_size);
225 if (used * 2 <= required) {
226    /* If we're in this case then this isn't the first allocation and we
227 * already have enough space on both sides to hold double what we
228 * have allocated. There's nothing for us to do.
229 */
230 goto done;
231 }
232
233 uint32_t size = old_size * 2;
234 while (size < required)
235 size *= 2;
236
237 assert(size > table->size);
238
239 result = anv_state_table_expand_range(table, size);
240
241 done:
242 return result;
243 }
244
245 void
246 anv_state_table_finish(struct anv_state_table *table)
247 {
248 struct anv_state_table_cleanup *cleanup;
249
250 u_vector_foreach(cleanup, &table->cleanups) {
251 if (cleanup->map)
252 munmap(cleanup->map, cleanup->size);
253 }
254
255 u_vector_finish(&table->cleanups);
256
257 close(table->fd);
258 }
259
260 VkResult
261 anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
262 uint32_t count)
263 {
264 struct anv_block_state state, old, new;
265 VkResult result;
266
267 assert(idx);
268
269 while(1) {
270 state.u64 = __sync_fetch_and_add(&table->state.u64, count);
271 if (state.next + count <= state.end) {
272 assert(table->map);
273 struct anv_free_entry *entry = &table->map[state.next];
274 for (int i = 0; i < count; i++) {
275 entry[i].state.idx = state.next + i;
276 }
277 *idx = state.next;
278 return VK_SUCCESS;
279 } else if (state.next <= state.end) {
280 /* We allocated the first block outside the pool so we have to grow
281           * the pool.  pool_state->next acts as a mutex: threads that try to
282 * allocate now will get block indexes above the current limit and
283 * hit futex_wait below.
284 */
285 new.next = state.next + count;
286 do {
287 result = anv_state_table_grow(table);
288 if (result != VK_SUCCESS)
289 return result;
290 new.end = table->size / ANV_STATE_ENTRY_SIZE;
291 } while (new.end < new.next);
292
293 old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
294 if (old.next != state.next)
295 futex_wake(&table->state.end, INT32_MAX);
296 } else {
297 futex_wait(&table->state.end, state.end, NULL);
298 continue;
299 }
300 }
301 }
302
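/* anv_free_list_push() and anv_free_list_pop() below implement a lock-free
 * LIFO of state-table entries.  Both the head index (offset) and a
 * monotonically increasing count live in one 64-bit word, and every update
 * goes through a 64-bit compare-and-swap that also bumps the count; a stale
 * head value therefore fails the CAS, which is the usual mitigation for the
 * ABA problem in lock-free stacks.
 */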
303 void
304 anv_free_list_push(union anv_free_list *list,
305 struct anv_state_table *table,
306 uint32_t first, uint32_t count)
307 {
308 union anv_free_list current, old, new;
309 uint32_t last = first;
310
311 for (uint32_t i = 1; i < count; i++, last++)
312 table->map[last].next = last + 1;
313
314 old.u64 = list->u64;
315 do {
316 current = old;
317 table->map[last].next = current.offset;
318 new.offset = first;
319 new.count = current.count + 1;
320 old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
321 } while (old.u64 != current.u64);
322 }
323
324 struct anv_state *
325 anv_free_list_pop(union anv_free_list *list,
326 struct anv_state_table *table)
327 {
328 union anv_free_list current, new, old;
329
330 current.u64 = list->u64;
331 while (current.offset != EMPTY) {
332 __sync_synchronize();
333 new.offset = table->map[current.offset].next;
334 new.count = current.count + 1;
335 old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
336 if (old.u64 == current.u64) {
337 struct anv_free_entry *entry = &table->map[current.offset];
338 return &entry->state;
339 }
340 current = old;
341 }
342
343 return NULL;
344 }
345
346 static VkResult
347 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size);
348
349 VkResult
350 anv_block_pool_init(struct anv_block_pool *pool,
351 struct anv_device *device,
352 const char *name,
353 uint64_t start_address,
354 uint32_t initial_size,
355 uint32_t max_size)
356 {
357 VkResult result;
358
359 /* Make sure VMA addresses are aligned for the block pool */
360 assert(anv_is_aligned(start_address, device->info->mem_alignment));
361 assert(anv_is_aligned(initial_size, device->info->mem_alignment));
362 assert(max_size > 0);
363 assert(max_size > initial_size);
364
365 pool->name = name;
366 pool->device = device;
367 pool->nbos = 0;
368 pool->size = 0;
369 pool->start_address = intel_canonical_address(start_address);
370 pool->max_size = max_size;
371
372 pool->bo = NULL;
373
374 pool->state.next = 0;
375 pool->state.end = 0;
376
377 pool->bo_alloc_flags =
378 ANV_BO_ALLOC_FIXED_ADDRESS |
379 ANV_BO_ALLOC_MAPPED |
380 ANV_BO_ALLOC_HOST_CACHED_COHERENT |
381 ANV_BO_ALLOC_CAPTURE |
382 ANV_BO_ALLOC_INTERNAL;
383
384 result = anv_block_pool_expand_range(pool, initial_size);
385 if (result != VK_SUCCESS)
386 return result;
387
388 /* Make the entire pool available in the front of the pool. If back
389 * allocation needs to use this space, the "ends" will be re-arranged.
390 */
391 pool->state.end = pool->size;
392
393 return VK_SUCCESS;
394 }
395
396 void
397 anv_block_pool_finish(struct anv_block_pool *pool)
398 {
399 anv_block_pool_foreach_bo(bo, pool) {
400 assert(bo->refcount == 1);
401 anv_device_release_bo(pool->device, bo);
402 }
403 }
404
405 static VkResult
406 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size)
407 {
408 /* Assert that we only ever grow the pool */
409 assert(size >= pool->state.end);
410
411 /* For state pool BOs we have to be a bit careful about where we place them
412 * in the GTT. There are two documented workarounds for state base address
413 * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
414 * which state that those two base addresses do not support 48-bit
415 * addresses and need to be placed in the bottom 32-bit range.
416 * Unfortunately, this is not quite accurate.
417 *
418 * The real problem is that we always set the size of our state pools in
419 * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
420  * likely significantly smaller.  We do this because we do not know at the
421 * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
422 * the pool during command buffer building so we don't actually have a
423 * valid final size. If the address + size, as seen by STATE_BASE_ADDRESS
424 * overflows 48 bits, the GPU appears to treat all accesses to the buffer
425 * as being out of bounds and returns zero. For dynamic state, this
426 * usually just leads to rendering corruptions, but shaders that are all
427 * zero hang the GPU immediately.
428 *
429  * The easiest solution is to do exactly what the bogus workarounds say to
430 * do: restrict these buffers to 32-bit addresses. We could also pin the
431 * BO to some particular location of our choosing, but that's significantly
432 * more work than just not setting a flag. So, we explicitly DO NOT set
433 * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
434 * hard work for us. When using softpin, we're in control and the fixed
435 * addresses we choose are fine for base addresses.
436 */
437
438 uint32_t new_bo_size = size - pool->size;
439 struct anv_bo *new_bo = NULL;
440 VkResult result = anv_device_alloc_bo(pool->device,
441 pool->name,
442 new_bo_size,
443 pool->bo_alloc_flags,
444 intel_48b_address(pool->start_address + pool->size),
445 &new_bo);
446 if (result != VK_SUCCESS)
447 return result;
448
449 pool->bos[pool->nbos++] = new_bo;
450
451 /* This pointer will always point to the first BO in the list */
452 pool->bo = pool->bos[0];
453
454 assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
455 pool->size = size;
456
457 return VK_SUCCESS;
458 }
459
460 /** Returns current memory map of the block pool.
461 *
462 * The returned pointer points to the map for the memory at the specified
463 * offset. The offset parameter is relative to the "center" of the block pool
464 * rather than the start of the block pool BO map.
465 */
466 void*
467 anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
468 {
469 struct anv_bo *bo = NULL;
470 int32_t bo_offset = 0;
471 anv_block_pool_foreach_bo(iter_bo, pool) {
472 if (offset < bo_offset + iter_bo->size) {
473 bo = iter_bo;
474 break;
475 }
476 bo_offset += iter_bo->size;
477 }
478 assert(bo != NULL);
479 assert(offset >= bo_offset);
480 assert((offset - bo_offset) + size <= bo->size);
481
482 return bo->map + (offset - bo_offset);
483 }
484
485 /** Grows and re-centers the block pool.
486 *
487 * We grow the block pool in one or both directions in such a way that the
488 * following conditions are met:
489 *
490 * 1) The size of the entire pool is always a power of two.
491 *
492  *  2) The pool only ever grows.  Neither end can get
493  *     shortened.
494 *
495 * 3) At the end of the allocation, we have about twice as much space
496 * allocated for each end as we have used. This way the pool doesn't
497 * grow too far in one direction or the other.
498 *
499 * 4) We have enough space allocated for at least one more block in
500 * whichever side `state` points to.
501 *
502 * 5) The center of the pool is always aligned to both the block_size of
503 * the pool and a 4K CPU page.
504 */
505 static uint32_t
506 anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
507 uint32_t contiguous_size)
508 {
509 VkResult result = VK_SUCCESS;
510
511 pthread_mutex_lock(&pool->device->mutex);
512
513 assert(state == &pool->state);
514
515 /* Gather a little usage information on the pool. Since we may have
516 * threads waiting in queue to get some storage while we resize, it's
517 * actually possible that total_used will be larger than old_size. In
518 * particular, block_pool_alloc() increments state->next prior to
519 * calling block_pool_grow, so this ensures that we get enough space for
520     * whichever side tries to grow the pool.
521 *
522 * We align to a page size because it makes it easier to do our
523     * calculations later in such a way that we stay page-aligned.
524 */
525 uint32_t total_used = align(pool->state.next, PAGE_SIZE);
526
527 uint32_t old_size = pool->size;
528
529 /* The block pool is always initialized to a nonzero size and this function
530 * is always called after initialization.
531 */
532 assert(old_size > 0);
533
534 /* total_used may actually be smaller than the actual requirement because
535     * it is based on the next pointer, which is updated prior to calling
536 * this function.
537 */
538 uint32_t required = MAX2(total_used, old_size);
539
540 /* With softpin, the pool is made up of a bunch of buffers with separate
541 * maps. Make sure we have enough contiguous space that we can get a
542 * properly contiguous map for the next chunk.
543 */
544 required = MAX2(required, old_size + contiguous_size);
545
546 if (required > pool->max_size) {
547 result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
548 } else if (total_used * 2 > required) {
549 uint32_t size = old_size * 2;
550 while (size < required)
551 size *= 2;
552
553 size = MIN2(size, pool->max_size);
554 assert(size > pool->size);
555
556 result = anv_block_pool_expand_range(pool, size);
557 }
558
559 pthread_mutex_unlock(&pool->device->mutex);
560
561 if (result != VK_SUCCESS)
562 return 0;
563
564 /* Return the appropriate new size. This function never actually
565 * updates state->next. Instead, we let the caller do that because it
566 * needs to do so in order to maintain its concurrency model.
567 */
568 return pool->size;
569 }
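
/* Worked example for anv_block_pool_grow() above, with assumed numbers (not
 * taken from the driver): old_size = 64 KiB, state.next = 60 KiB and
 * contiguous_size = 8 KiB.  Then total_used = 60 KiB (already page aligned),
 * required = MAX2(MAX2(60 KiB, 64 KiB), 64 KiB + 8 KiB) = 72 KiB, and since
 * total_used * 2 = 120 KiB > 72 KiB we keep doubling old_size until it covers
 * the requirement: size = 128 KiB (clamped to max_size).  The caller then
 * publishes the new end itself using the returned pool->size.
 */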
570
571 static VkResult
572 anv_block_pool_alloc_new(struct anv_block_pool *pool,
573 struct anv_block_state *pool_state,
574 uint32_t block_size,
575 int64_t *offset,
576 uint32_t *padding)
577 {
578 struct anv_block_state state, old, new;
579
580 /* Most allocations won't generate any padding */
581 if (padding)
582 *padding = 0;
583
584 while (1) {
585 state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
586 if (state.next + block_size > pool->max_size) {
587 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
588 } else if (state.next + block_size <= state.end) {
589 *offset = state.next;
590 return VK_SUCCESS;
591 } else if (state.next <= state.end) {
592 if (state.next < state.end) {
593 /* We need to grow the block pool, but still have some leftover
594 * space that can't be used by that particular allocation. So we
595 * add that as a "padding", and return it.
596 */
597 uint32_t leftover = state.end - state.next;
598
599 /* If there is some leftover space in the pool, the caller must
600 * deal with it.
601 */
602 assert(leftover == 0 || padding);
603 if (padding)
604 *padding = leftover;
605 state.next += leftover;
606 }
607
608 /* We allocated the first block outside the pool so we have to grow
609           * the pool.  pool_state->next acts as a mutex: threads that try to
610 * allocate now will get block indexes above the current limit and
611 * hit futex_wait below.
612 */
613 new.next = state.next + block_size;
614 do {
615 new.end = anv_block_pool_grow(pool, pool_state, block_size);
616 if (pool->size > 0 && new.end == 0) {
617 futex_wake(&pool_state->end, INT32_MAX);
618 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
619 }
620 } while (new.end < new.next);
621
622 old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
623 if (old.next != state.next)
624 futex_wake(&pool_state->end, INT32_MAX);
625 *offset = state.next;
626 return VK_SUCCESS;
627 } else {
628 futex_wait(&pool_state->end, state.end, NULL);
629 continue;
630 }
631 }
632 }
633
634 VkResult
635 anv_block_pool_alloc(struct anv_block_pool *pool,
636 uint32_t block_size,
637 int64_t *offset, uint32_t *padding)
638 {
639 return anv_block_pool_alloc_new(pool, &pool->state, block_size, offset, padding);
640 }
641
642 VkResult
643 anv_state_pool_init(struct anv_state_pool *pool,
644 struct anv_device *device,
645 const struct anv_state_pool_params *params)
646 {
647 uint32_t initial_size = MAX2(params->block_size * 16,
648 device->info->mem_alignment);
649
650 VkResult result = anv_block_pool_init(&pool->block_pool, device,
651 params->name,
652 params->base_address + params->start_offset,
653 initial_size,
654 params->max_size);
655 if (result != VK_SUCCESS)
656 return result;
657
658 pool->start_offset = params->start_offset;
659
660 result = anv_state_table_init(&pool->table, device, 64);
661 if (result != VK_SUCCESS) {
662 anv_block_pool_finish(&pool->block_pool);
663 return result;
664 }
665
666 assert(util_is_power_of_two_or_zero(params->block_size));
667 pool->block_size = params->block_size;
668 for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
669 pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
670 pool->buckets[i].block.next = 0;
671 pool->buckets[i].block.end = 0;
672 }
673 VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
674
675 return VK_SUCCESS;
676 }
677
678 void
679 anv_state_pool_finish(struct anv_state_pool *pool)
680 {
681 VG(VALGRIND_DESTROY_MEMPOOL(pool));
682 anv_state_table_finish(&pool->table);
683 anv_block_pool_finish(&pool->block_pool);
684 }
685
686 static VkResult
687 anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
688 struct anv_block_pool *block_pool,
689 uint32_t state_size,
690 uint32_t block_size,
691 int64_t *offset,
692 uint32_t *padding)
693 {
694 struct anv_block_state block, old, new;
695
696 /* We don't always use anv_block_pool_alloc(), which would set *padding to
697 * zero for us. So if we have a pointer to padding, we must zero it out
698 * ourselves here, to make sure we always return some sensible value.
699 */
700 if (padding)
701 *padding = 0;
702
703 /* If our state is large, we don't need any sub-allocation from a block.
704 * Instead, we just grab whole (potentially large) blocks.
705 */
706 if (state_size >= block_size)
707 return anv_block_pool_alloc(block_pool, state_size, offset, padding);
708
709 restart:
710 block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
711
712 if (block.next < block.end) {
713 *offset = block.next;
714 return VK_SUCCESS;
715 } else if (block.next == block.end) {
716 VkResult result = anv_block_pool_alloc(block_pool, block_size,
717 offset, padding);
718 if (result != VK_SUCCESS)
719 return result;
720 new.next = *offset + state_size;
721 new.end = *offset + block_size;
722 old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
723 if (old.next != block.next)
724 futex_wake(&pool->block.end, INT32_MAX);
725 return result;
726 } else {
727 futex_wait(&pool->block.end, block.end, NULL);
728 goto restart;
729 }
730 }
731
732 static uint32_t
733 anv_state_pool_get_bucket(uint32_t size)
734 {
735 unsigned size_log2 = util_logbase2_ceil(size);
736 assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
737 if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
738 size_log2 = ANV_MIN_STATE_SIZE_LOG2;
739 return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
740 }
741
742 static uint32_t
743 anv_state_pool_get_bucket_size(uint32_t bucket)
744 {
745 uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
746 return 1 << size_log2;
747 }
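
/* Bucket math example (assuming ANV_MIN_STATE_SIZE_LOG2 == 6, i.e. a 64-byte
 * minimum, which matches the alignment note at the top of this file):
 *
 *    anv_state_pool_get_bucket(48)  -> ceil(log2(48))  = 6 -> bucket 0 (64 B)
 *    anv_state_pool_get_bucket(200) -> ceil(log2(200)) = 8 -> bucket 2 (256 B)
 *
 * anv_state_pool_get_bucket_size() inverts the mapping: bucket 2 -> 1 << 8.
 */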
748
749 /** Helper to push a chunk into the state table.
750 *
751  * It creates 'count' entries in the state table and updates their sizes,
752 * offsets and maps, also pushing them as "free" states.
753 */
754 static void
755 anv_state_pool_return_blocks(struct anv_state_pool *pool,
756 uint32_t chunk_offset, uint32_t count,
757 uint32_t block_size)
758 {
759 /* Disallow returning 0 chunks */
760 assert(count != 0);
761
762 /* Make sure we always return chunks aligned to the block_size */
763 assert(chunk_offset % block_size == 0);
764
765 uint32_t st_idx;
766 UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
767 assert(result == VK_SUCCESS);
768 for (int i = 0; i < count; i++) {
769 /* update states that were added back to the state table */
770 struct anv_state *state_i = anv_state_table_get(&pool->table,
771 st_idx + i);
772 state_i->alloc_size = block_size;
773 state_i->offset = pool->start_offset + chunk_offset + block_size * i;
774 state_i->map = anv_block_pool_map(&pool->block_pool,
775 state_i->offset,
776 state_i->alloc_size);
777 }
778
779 uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
780
781 if (block_bucket >= ARRAY_SIZE(pool->buckets))
782 return;
783
784 anv_free_list_push(&pool->buckets[block_bucket].free_list,
785 &pool->table, st_idx, count);
786 }
787
788 /** Returns a chunk of memory back to the state pool.
789 *
790 * Do a two-level split. If chunk_size is bigger than divisor
791 * (pool->block_size), we return as many divisor sized blocks as we can, from
792 * the end of the chunk.
793 *
794  * The remainder is then split into smaller blocks (starting at small_size if
795 * it is non-zero), with larger blocks always being taken from the end of the
796 * chunk.
797 */
798 static void
799 anv_state_pool_return_chunk(struct anv_state_pool *pool,
800 uint32_t chunk_offset, uint32_t chunk_size,
801 uint32_t small_size)
802 {
803 uint32_t divisor = pool->block_size;
804 uint32_t nblocks = chunk_size / divisor;
805 uint32_t rest = chunk_size - nblocks * divisor;
806
807 if (nblocks > 0) {
808 /* First return divisor aligned and sized chunks. We start returning
809 * larger blocks from the end of the chunk, since they should already be
810 * aligned to divisor. Also anv_state_pool_return_blocks() only accepts
811 * aligned chunks.
812 */
813 uint32_t offset = chunk_offset + rest;
814 anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
815 }
816
817 chunk_size = rest;
818 divisor /= 2;
819
820 if (small_size > 0 && small_size < divisor)
821 divisor = small_size;
822
823 uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;
824
825 /* Just as before, return larger divisor aligned blocks from the end of the
826 * chunk first.
827 */
828 while (chunk_size > 0 && divisor >= min_size) {
829 nblocks = chunk_size / divisor;
830 rest = chunk_size - nblocks * divisor;
831 if (nblocks > 0) {
832 anv_state_pool_return_blocks(pool, chunk_offset + rest,
833 nblocks, divisor);
834 chunk_size = rest;
835 }
836 divisor /= 2;
837 }
838 }
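
/* Worked example for anv_state_pool_return_chunk() above, with assumed
 * numbers: block_size = 4096 and a 64-byte state was just carved out of an
 * 8192-byte chunk that started at offset 0, so the call is
 * anv_state_pool_return_chunk(pool, 64, 8128, 64).  The first pass returns
 * one 4096-byte block at offset 4096 (the divisor-aligned tail), and the
 * second pass returns the remaining 4032 bytes as 63 blocks of 64 bytes
 * starting at offset 64, since small_size caps the divisor at 64.
 */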
839
840 static struct anv_state
841 anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
842 uint32_t size, uint32_t align)
843 {
844 uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));
845
846 if (bucket >= ARRAY_SIZE(pool->buckets))
847 return ANV_STATE_NULL;
848
849 struct anv_state *state;
850 uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
851 int64_t offset;
852
853 /* Try free list first. */
854 state = anv_free_list_pop(&pool->buckets[bucket].free_list,
855 &pool->table);
856 if (state) {
857 assert(state->offset >= pool->start_offset);
858 goto done;
859 }
860
861 /* Try to grab a chunk from some larger bucket and split it up */
862 for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
863 state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
864 if (state) {
865 unsigned chunk_size = anv_state_pool_get_bucket_size(b);
866 int32_t chunk_offset = state->offset;
867
868          /* First let's update the state we got to its new size. offset and map
869 * remain the same.
870 */
871 state->alloc_size = alloc_size;
872
873 /* Now return the unused part of the chunk back to the pool as free
874 * blocks
875 *
876 * There are a couple of options as to what we do with it:
877 *
878 * 1) We could fully split the chunk into state.alloc_size sized
879 * pieces. However, this would mean that allocating a 16B
880 * state could potentially split a 2MB chunk into 512K smaller
881 * chunks. This would lead to unnecessary fragmentation.
882 *
883 * 2) The classic "buddy allocator" method would have us split the
884 * chunk in half and return one half. Then we would split the
885 * remaining half in half and return one half, and repeat as
886 * needed until we get down to the size we want. However, if
887 * you are allocating a bunch of the same size state (which is
888 * the common case), this means that every other allocation has
889 * to go up a level and every fourth goes up two levels, etc.
890 * This is not nearly as efficient as it could be if we did a
891 * little more work up-front.
892 *
893 * 3) Split the difference between (1) and (2) by doing a
894 * two-level split. If it's bigger than some fixed block_size,
895 * we split it into block_size sized chunks and return all but
896 * one of them. Then we split what remains into
897 * state.alloc_size sized chunks and return them.
898 *
899 * We choose something close to option (3), which is implemented with
900 * anv_state_pool_return_chunk(). That is done by returning the
901 * remaining of the chunk, with alloc_size as a hint of the size that
902 * we want the smaller chunk split into.
903 */
904 anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
905 chunk_size - alloc_size, alloc_size);
906 goto done;
907 }
908 }
909
910 uint32_t padding;
911 VkResult result =
912 anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
913 &pool->block_pool,
914 alloc_size,
915 pool->block_size,
916 &offset,
917 &padding);
918 if (result != VK_SUCCESS)
919 return ANV_STATE_NULL;
920
921 /* Every time we allocate a new state, add it to the state pool */
922 uint32_t idx = 0;
923 result = anv_state_table_add(&pool->table, &idx, 1);
924 assert(result == VK_SUCCESS);
925
926 state = anv_state_table_get(&pool->table, idx);
927 state->offset = pool->start_offset + offset;
928 state->alloc_size = alloc_size;
929 state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
930
931 if (padding > 0) {
932 uint32_t return_offset = offset - padding;
933 anv_state_pool_return_chunk(pool, return_offset, padding, 0);
934 }
935
936 done:
937 return *state;
938 }
939
940 struct anv_state
941 anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
942 {
943 if (size == 0)
944 return ANV_STATE_NULL;
945
946 struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
947 VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
948 return state;
949 }
950
951 static void
952 anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
953 {
954 assert(util_is_power_of_two_or_zero(state.alloc_size));
955 unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
956
957 assert(state.offset >= pool->start_offset);
958
959 if (bucket >= ARRAY_SIZE(pool->buckets))
960 return;
961
962 anv_free_list_push(&pool->buckets[bucket].free_list,
963 &pool->table, state.idx, 1);
964 }
965
966 void
967 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
968 {
969 if (state.alloc_size == 0)
970 return;
971
972 VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
973 anv_state_pool_free_no_vg(pool, state);
974 }
975
976 struct anv_state_stream_block {
977 struct anv_state block;
978
979 /* The next block */
980 struct anv_state_stream_block *next;
981
982 #ifdef HAVE_VALGRIND
983 /* A pointer to the first user-allocated thing in this block. This is
984 * what valgrind sees as the start of the block.
985 */
986 void *_vg_ptr;
987 #endif
988 };
989
990 /* The state stream allocator is a one-shot, single threaded allocator for
991 * variable sized blocks. We use it for allocating dynamic state.
992 */
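/* Illustrative sketch of the streaming pattern, assuming a state pool named
 * dynamic_state_pool and a 16 KiB block size (both are assumptions chosen
 * for the example):
 *
 *    struct anv_state_stream stream;
 *    anv_state_stream_init(&stream, &device->dynamic_state_pool, 16 * 1024);
 *    struct anv_state s = anv_state_stream_alloc(&stream, 256, 64);
 *    // fill in s.map; blocks are chained internally as they fill up
 *    anv_state_stream_finish(&stream);   // returns every block to the pool
 */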
993 void
994 anv_state_stream_init(struct anv_state_stream *stream,
995 struct anv_state_pool *state_pool,
996 uint32_t block_size)
997 {
998 stream->state_pool = state_pool;
999 stream->block_size = block_size;
1000
1001 stream->block = ANV_STATE_NULL;
1002
1003 /* Ensure that next + whatever > block_size. This way the first call to
1004 * state_stream_alloc fetches a new block.
1005 */
1006 stream->next = block_size;
1007
1008 stream->total_size = 0;
1009 util_dynarray_init(&stream->all_blocks, NULL);
1010
1011 VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
1012 }
1013
1014 void
1015 anv_state_stream_finish(struct anv_state_stream *stream)
1016 {
1017 util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) {
1018 VG(VALGRIND_MEMPOOL_FREE(stream, block->map));
1019 VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size));
1020 anv_state_pool_free_no_vg(stream->state_pool, *block);
1021 }
1022 util_dynarray_fini(&stream->all_blocks);
1023
1024 VG(VALGRIND_DESTROY_MEMPOOL(stream));
1025 }
1026
1027 struct anv_state
1028 anv_state_stream_alloc(struct anv_state_stream *stream,
1029 uint32_t size, uint32_t alignment)
1030 {
1031 if (size == 0)
1032 return ANV_STATE_NULL;
1033
1034 assert(alignment <= PAGE_SIZE);
1035
1036 uint32_t offset = align(stream->next, alignment);
1037 if (offset + size > stream->block.alloc_size) {
1038 uint32_t block_size = stream->block_size;
1039 if (block_size < size)
1040 block_size = util_next_power_of_two(size);
1041
1042 stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
1043 block_size, PAGE_SIZE);
1044 if (stream->block.alloc_size == 0)
1045 return ANV_STATE_NULL;
1046
1047 util_dynarray_append(&stream->all_blocks,
1048 struct anv_state, stream->block);
1049 VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
1050
1051 /* Reset back to the start */
1052 stream->next = offset = 0;
1053 assert(offset + size <= stream->block.alloc_size);
1054 stream->total_size += block_size;
1055 }
1056 const bool new_block = stream->next == 0;
1057
1058 struct anv_state state = stream->block;
1059 state.offset += offset;
1060 state.alloc_size = size;
1061 state.map += offset;
1062
1063 stream->next = offset + size;
1064
1065 if (new_block) {
1066 assert(state.map == stream->block.map);
1067 VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size));
1068 } else {
1069 /* This only updates the mempool. The newly allocated chunk is still
1070 * marked as NOACCESS. */
1071 VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map,
1072 stream->next));
1073 /* Mark the newly allocated chunk as undefined */
1074 VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size));
1075 }
1076
1077 return state;
1078 }
1079
1080 void
1081 anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool,
1082 struct anv_state_pool *parent,
1083 uint32_t count, uint32_t size, uint32_t alignment)
1084 {
1085 pool->pool = parent;
1086 pool->reserved_blocks = ANV_FREE_LIST_EMPTY;
1087 pool->count = count;
1088
1089 for (unsigned i = 0; i < count; i++) {
1090 struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment);
1091 anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1092 }
1093 }
1094
1095 void
1096 anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool)
1097 {
1098 struct anv_state *state;
1099
1100 while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) {
1101 anv_state_pool_free(pool->pool, *state);
1102 pool->count--;
1103 }
1104 assert(pool->count == 0);
1105 }
1106
1107 struct anv_state
1108 anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool)
1109 {
1110 return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table);
1111 }
1112
1113 void
1114 anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
1115 struct anv_state state)
1116 {
1117 anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1118 }
1119
1120 VkResult
1121 anv_state_reserved_array_pool_init(struct anv_state_reserved_array_pool *pool,
1122 struct anv_state_pool *parent,
1123 uint32_t count, uint32_t size, uint32_t alignment)
1124 {
1125 struct anv_device *device = parent->block_pool.device;
1126
1127 pool->pool = parent;
1128 pool->count = count;
1129 pool->size = size;
1130 pool->stride = align(size, alignment);
1131 pool->states = vk_zalloc(&device->vk.alloc,
1132 sizeof(BITSET_WORD) * BITSET_WORDS(pool->count), 8,
1133 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1134 if (pool->states == NULL)
1135 return vk_error(&device->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
1136
1137 BITSET_SET_RANGE(pool->states, 0, pool->count - 1);
1138 simple_mtx_init(&pool->mutex, mtx_plain);
1139
1140 pool->state = anv_state_pool_alloc(pool->pool, pool->stride * count, alignment);
1141 if (pool->state.alloc_size == 0) {
1142 vk_free(&device->vk.alloc, pool->states);
1143 return vk_error(&device->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1144 }
1145
1146 return VK_SUCCESS;
1147 }
1148
1149 void
1150 anv_state_reserved_array_pool_finish(struct anv_state_reserved_array_pool *pool)
1151 {
1152 anv_state_pool_free(pool->pool, pool->state);
1153 vk_free(&pool->pool->block_pool.device->vk.alloc, pool->states);
1154 simple_mtx_destroy(&pool->mutex);
1155 }
1156
1157 struct anv_state
1158 anv_state_reserved_array_pool_alloc(struct anv_state_reserved_array_pool *pool,
1159 bool alloc_back)
1160 {
1161 simple_mtx_lock(&pool->mutex);
1162 int idx = alloc_back ?
1163 __bitset_last_bit(pool->states, BITSET_WORDS(pool->count)) :
1164 __bitset_ffs(pool->states, BITSET_WORDS(pool->count));
1165 if (idx != 0)
1166 BITSET_CLEAR(pool->states, idx - 1);
1167 simple_mtx_unlock(&pool->mutex);
1168
1169 if (idx == 0)
1170 return ANV_STATE_NULL;
1171
1172 idx--;
1173
1174 struct anv_state state = pool->state;
1175 state.offset += idx * pool->stride;
1176 state.map += idx * pool->stride;
1177 state.alloc_size = pool->size;
1178
1179 return state;
1180 }
1181
1182 struct anv_state
1183 anv_state_reserved_array_pool_alloc_index(struct anv_state_reserved_array_pool *pool,
1184 uint32_t idx)
1185 {
1186 simple_mtx_lock(&pool->mutex);
1187 bool already_allocated = !BITSET_TEST(pool->states, idx);
1188 if (!already_allocated)
1189 BITSET_CLEAR(pool->states, idx);
1190 simple_mtx_unlock(&pool->mutex);
1191
1192 if (already_allocated)
1193 return ANV_STATE_NULL;
1194
1195 struct anv_state state = pool->state;
1196 state.offset += idx * pool->stride;
1197 state.map += idx * pool->stride;
1198 state.alloc_size = pool->size;
1199
1200 return state;
1201 }
1202
1203 uint32_t
1204 anv_state_reserved_array_pool_state_index(struct anv_state_reserved_array_pool *pool,
1205 struct anv_state state)
1206 {
1207 return (state.offset - pool->state.offset) / pool->stride;
1208 }
1209
1210 void
1211 anv_state_reserved_array_pool_free(struct anv_state_reserved_array_pool *pool,
1212 struct anv_state state)
1213 {
1214 unsigned idx = (state.offset - pool->state.offset) / pool->stride;
1215 simple_mtx_lock(&pool->mutex);
1216 BITSET_SET(pool->states, idx);
1217 simple_mtx_unlock(&pool->mutex);
1218 }
1219
1220 void
1221 anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
1222 const char *name, enum anv_bo_alloc_flags alloc_flags)
1223 {
1224 pool->name = name;
1225 pool->device = device;
1226 pool->bo_alloc_flags = alloc_flags;
1227
1228 for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1229 util_sparse_array_free_list_init(&pool->free_list[i],
1230 &device->bo_cache.bo_map, 0,
1231 offsetof(struct anv_bo, free_index));
1232 }
1233
1234 VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
1235 }
1236
1237 void
1238 anv_bo_pool_finish(struct anv_bo_pool *pool)
1239 {
1240 for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1241 while (1) {
1242 struct anv_bo *bo =
1243 util_sparse_array_free_list_pop_elem(&pool->free_list[i]);
1244 if (bo == NULL)
1245 break;
1246
1247 /* anv_device_release_bo is going to "free" it */
1248 VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1));
1249 anv_device_release_bo(pool->device, bo);
1250 }
1251 }
1252
1253 VG(VALGRIND_DESTROY_MEMPOOL(pool));
1254 }
1255
1256 VkResult
1257 anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
1258 struct anv_bo **bo_out)
1259 {
1260 const unsigned size_log2 = size < 4096 ? 12 : util_logbase2_ceil(size);
1261 const unsigned pow2_size = 1 << size_log2;
1262 const unsigned bucket = size_log2 - 12;
1263 assert(bucket < ARRAY_SIZE(pool->free_list));
1264
1265 struct anv_bo *bo =
1266 util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]);
1267 if (bo != NULL) {
1268 VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1269 *bo_out = bo;
1270 return VK_SUCCESS;
1271 }
1272
1273 VkResult result = anv_device_alloc_bo(pool->device,
1274 pool->name,
1275 pow2_size,
1276 pool->bo_alloc_flags,
1277 0 /* explicit_address */,
1278 &bo);
1279 if (result != VK_SUCCESS)
1280 return result;
1281
1282 /* We want it to look like it came from this pool */
1283 VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
1284 VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1285
1286 *bo_out = bo;
1287
1288 return VK_SUCCESS;
1289 }
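
/* Sizing example for anv_bo_pool_alloc() above (illustrative numbers): a
 * 20000-byte request gives size_log2 = 15, so the BO is rounded up to a
 * 32768-byte power-of-two size and lives in bucket 3; any request below
 * 4096 bytes falls into bucket 0 with a 4096-byte BO.
 */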
1290
1291 void
1292 anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
1293 {
1294 VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
1295
1296 assert(util_is_power_of_two_or_zero(bo->size));
1297 const unsigned size_log2 = util_logbase2_ceil(bo->size);
1298 const unsigned bucket = size_log2 - 12;
1299 assert(bucket < ARRAY_SIZE(pool->free_list));
1300
1301 assert(util_sparse_array_get(&pool->device->bo_cache.bo_map,
1302 bo->gem_handle) == bo);
1303 util_sparse_array_free_list_push(&pool->free_list[bucket],
1304 &bo->gem_handle, 1);
1305 }
1306
1307 // Scratch pool
1308
1309 void
1310 anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool,
1311 bool protected)
1312 {
1313 memset(pool, 0, sizeof(*pool));
1314 pool->alloc_flags = ANV_BO_ALLOC_INTERNAL |
1315 (protected ? ANV_BO_ALLOC_PROTECTED : 0) |
1316 (device->info->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0);
1317 }
1318
1319 void
1320 anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
1321 {
1322 for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) {
1323 for (unsigned i = 0; i < 16; i++) {
1324 if (pool->bos[i][s] != NULL)
1325 anv_device_release_bo(device, pool->bos[i][s]);
1326 }
1327 }
1328
1329 for (unsigned i = 0; i < 16; i++) {
1330 if (pool->surf_states[i].map != NULL) {
1331 anv_state_pool_free(&device->scratch_surface_state_pool,
1332 pool->surf_states[i]);
1333 }
1334 }
1335 }
1336
1337 struct anv_bo *
1338 anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
1339 gl_shader_stage stage, unsigned per_thread_scratch)
1340 {
1341 if (per_thread_scratch == 0)
1342 return NULL;
1343
1344 unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1345 assert(scratch_size_log2 < 16);
1346
1347 assert(stage < ARRAY_SIZE(pool->bos));
1348
1349 const struct intel_device_info *devinfo = device->info;
1350
1351 /* On GFX version 12.5, scratch access changed to a surface-based model.
1352 * Instead of each shader type having its own layout based on IDs passed
1353 * from the relevant fixed-function unit, all scratch access is based on
1354 * thread IDs like it always has been for compute.
1355 */
1356 if (devinfo->verx10 >= 125)
1357 stage = MESA_SHADER_COMPUTE;
1358
1359 struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);
1360
1361 if (bo != NULL)
1362 return bo;
1363
1364 assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
1365 uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
1366
1367 /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
1368 * are still relative to the general state base address. When we emit
1369 * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
1370 * to the maximum (1 page under 4GB). This allows us to just place the
1371 * scratch buffers anywhere we wish in the bottom 32 bits of address space
1372 * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
1373 * However, in order to do so, we need to ensure that the kernel does not
1374 * place the scratch BO above the 32-bit boundary.
1375 *
1376 * NOTE: Technically, it can't go "anywhere" because the top page is off
1377 * limits. However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
1378 * kernel allocates space using
1379 *
1380 * end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
1381 *
1382 * so nothing will ever touch the top page.
1383 */
1384 VkResult result = anv_device_alloc_bo(device, "scratch", size,
1385 pool->alloc_flags,
1386 0 /* explicit_address */,
1387 &bo);
1388 if (result != VK_SUCCESS)
1389 return NULL; /* TODO */
1390
1391 struct anv_bo *current_bo =
1392 p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo);
1393 if (current_bo) {
1394 anv_device_release_bo(device, bo);
1395 return current_bo;
1396 } else {
1397 return bo;
1398 }
1399 }
1400
1401 uint32_t
1402 anv_scratch_pool_get_surf(struct anv_device *device,
1403 struct anv_scratch_pool *pool,
1404 unsigned per_thread_scratch)
1405 {
1406 assert(device->info->verx10 >= 125);
1407
1408 if (per_thread_scratch == 0)
1409 return 0;
1410
1411 unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1412 assert(scratch_size_log2 < 16);
1413
1414 uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
1415 if (surf > 0)
1416 return surf;
1417
1418 struct anv_bo *bo =
1419 anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
1420 per_thread_scratch);
1421 struct anv_address addr = { .bo = bo };
1422
1423 struct anv_state state =
1424 anv_state_pool_alloc(&device->scratch_surface_state_pool,
1425 device->isl_dev.ss.size, 64);
1426
1427 isl_surf_usage_flags_t usage =
1428 (pool->alloc_flags & ANV_BO_ALLOC_PROTECTED) ?
1429 ISL_SURF_USAGE_PROTECTED_BIT : 0;
1430
1431 isl_buffer_fill_state(&device->isl_dev, state.map,
1432 .address = anv_address_physical(addr),
1433 .size_B = bo->size,
1434 .mocs = anv_mocs(device, bo, usage),
1435 .format = ISL_FORMAT_RAW,
1436 .swizzle = ISL_SWIZZLE_IDENTITY,
1437 .stride_B = per_thread_scratch,
1438 .is_scratch = true);
1439
1440 uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
1441 0, state.offset);
1442 if (current) {
1443 anv_state_pool_free(&device->scratch_surface_state_pool, state);
1444 return current;
1445 } else {
1446 pool->surf_states[scratch_size_log2] = state;
1447 return state.offset;
1448 }
1449 }
1450
1451 VkResult
1452 anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
1453 {
1454 util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
1455
1456 if (pthread_mutex_init(&cache->mutex, NULL)) {
1457 util_sparse_array_finish(&cache->bo_map);
1458 return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
1459 "pthread_mutex_init failed: %m");
1460 }
1461
1462 return VK_SUCCESS;
1463 }
1464
1465 void
1466 anv_bo_cache_finish(struct anv_bo_cache *cache)
1467 {
1468 util_sparse_array_finish(&cache->bo_map);
1469 pthread_mutex_destroy(&cache->mutex);
1470 }
1471
1472 static void
1473 anv_bo_unmap_close(struct anv_device *device, struct anv_bo *bo)
1474 {
1475 if (bo->map && !bo->from_host_ptr)
1476 anv_device_unmap_bo(device, bo, bo->map, bo->size, false /* replace */);
1477
1478 assert(bo->gem_handle != 0);
1479 device->kmd_backend->gem_close(device, bo);
1480 }
1481
1482 static void
1483 anv_bo_vma_free(struct anv_device *device, struct anv_bo *bo)
1484 {
1485 if (bo->offset != 0 && !(bo->alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS)) {
1486 assert(bo->vma_heap != NULL);
1487 anv_vma_free(device, bo->vma_heap, bo->offset, bo->size);
1488 }
1489 bo->vma_heap = NULL;
1490 }
1491
1492 static void
1493 anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
1494 {
1495 /* Not releasing vma in case unbind fails */
1496 if (device->kmd_backend->vm_unbind_bo(device, bo) == VK_SUCCESS)
1497 anv_bo_vma_free(device, bo);
1498
1499 anv_bo_unmap_close(device, bo);
1500 }
1501
1502 static VkResult
1503 anv_bo_vma_alloc_or_close(struct anv_device *device,
1504 struct anv_bo *bo,
1505 enum anv_bo_alloc_flags alloc_flags,
1506 uint64_t explicit_address)
1507 {
1508 assert(bo->vma_heap == NULL);
1509 assert(explicit_address == intel_48b_address(explicit_address));
1510
1511 uint32_t align = device->physical->info.mem_alignment;
1512
1513 /* If it's big enough to store a tiled resource, we need 64K alignment */
1514 if (bo->size >= 64 * 1024)
1515 align = MAX2(64 * 1024, align);
1516
1517 /* If we're using the AUX map, make sure we follow the required
1518 * alignment.
1519 */
1520 if (alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED)
1521 align = MAX2(intel_aux_map_get_alignment(device->aux_map_ctx), align);
1522
1523    /* Opportunistically align addresses to 2MB when above 1MB. We do this
1524 * because this gives an opportunity for the kernel to use Transparent Huge
1525 * Pages (the 2MB page table layout) for faster memory access.
1526 *
1527 * Only available on ICL+.
1528 */
1529 if (device->info->ver >= 11 && bo->size >= 1 * 1024 * 1024)
1530 align = MAX2(2 * 1024 * 1024, align);
1531
1532 if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
1533 bo->offset = intel_canonical_address(explicit_address);
1534 } else {
1535 bo->offset = anv_vma_alloc(device, bo->size, align, alloc_flags,
1536 explicit_address, &bo->vma_heap);
1537 if (bo->offset == 0) {
1538 anv_bo_unmap_close(device, bo);
1539 return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1540 "failed to allocate virtual address for BO");
1541 }
1542 }
1543
1544 return VK_SUCCESS;
1545 }
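
/* Alignment example for anv_bo_vma_alloc_or_close() above, with assumed
 * inputs: a 3 MiB BO on a Gfx11+ part without ANV_BO_ALLOC_FIXED_ADDRESS or
 * aux-map alignment requirements.  The alignment starts at the device's
 * mem_alignment, is raised to 64 KiB because the BO is big enough to hold a
 * tiled resource, and is then raised to 2 MiB by the transparent-huge-page
 * rule, so the VMA is allocated 2 MiB aligned.
 */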
1546
1547 enum intel_device_info_mmap_mode
1548 anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo)
1549 {
1550 enum anv_bo_alloc_flags alloc_flags = bo->alloc_flags;
1551
1552 if (device->info->has_set_pat_uapi)
1553 return anv_device_get_pat_entry(device, alloc_flags)->mmap;
1554
1555 if (anv_physical_device_has_vram(device->physical)) {
1556 if ((alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) ||
1557 (alloc_flags & ANV_BO_ALLOC_IMPORTED))
1558 return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1559
1560 return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1561 }
1562
1563 /* gfx9 atom */
1564 if (!device->info->has_llc) {
1565       /* The user wants cached and coherent memory, but achieving that without
1566        * LLC on older platforms requires DRM_IOCTL_I915_GEM_SET_CACHING to be
1567        * supported and set.
1568 */
1569 if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
1570 return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1571
1572 return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1573 }
1574
1575 if (alloc_flags & (ANV_BO_ALLOC_SCANOUT | ANV_BO_ALLOC_EXTERNAL))
1576 return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1577
1578 return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1579 }
1580
1581 VkResult
1582 anv_device_alloc_bo(struct anv_device *device,
1583 const char *name,
1584 uint64_t size,
1585 enum anv_bo_alloc_flags alloc_flags,
1586 uint64_t explicit_address,
1587 struct anv_bo **bo_out)
1588 {
   /* A BO that needs CPU access must be HOST_CACHED, HOST_COHERENT or both */
   assert((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0 ||
          (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)));

   /* On platforms with an LLC we can promote all BOs to cached+coherent for free */
   const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
                                                         ANV_BO_ALLOC_EXTERNAL |
                                                         ANV_BO_ALLOC_PROTECTED;
   if (device->info->has_llc && ((alloc_flags & not_allowed_promotion) == 0))
      alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;

   const uint32_t bo_flags =
      device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);

   /* The kernel is going to give us whole pages anyway. */
   size = align64(size, 4096);

   const uint64_t ccs_offset = size;
   if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
      assert(device->info->has_aux_map);
      size += size / INTEL_AUX_MAP_MAIN_SIZE_SCALEDOWN;
      size = align64(size, 4096);
   }
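   /* Worked example for the sizing above (illustrative numbers only): with a
    * main-to-aux scaledown of 256, a 64MB surface grows by 64MB / 256 = 256KB
    * of CCS data, and that data starts at ccs_offset, i.e. right after the
    * original page-aligned surface size.
    */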

   const struct intel_memory_class_instance *regions[2];
   uint32_t nregions = 0;

   /* If the device has VRAM, there are multiple memory regions and we have
    * to choose one of them.
    */
   if (anv_physical_device_has_vram(device->physical)) {
      /* This always tries to put the object in local memory. Here
       * vram_non_mappable & vram_mappable are actually the same region.
       */
      if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
         regions[nregions++] = device->physical->sys.region;
      else
         regions[nregions++] = device->physical->vram_non_mappable.region;

      /* If the buffer is mapped on the host, add the system memory region.
       * This ensures that if the buffer cannot live in mappable local memory,
       * it can be spilled to system memory.
       */
      if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
          ((alloc_flags & ANV_BO_ALLOC_MAPPED) ||
           (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)))
         regions[nregions++] = device->physical->sys.region;
   } else {
      regions[nregions++] = device->physical->sys.region;
   }

   uint64_t actual_size;
   uint32_t gem_handle = device->kmd_backend->gem_create(device, regions,
                                                         nregions, size,
                                                         alloc_flags,
                                                         &actual_size);
   if (gem_handle == 0)
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   struct anv_bo new_bo = {
      .name = name,
      .gem_handle = gem_handle,
      .refcount = 1,
      .offset = -1,
      .size = size,
      .ccs_offset = ccs_offset,
      .actual_size = actual_size,
      .flags = bo_flags,
      .alloc_flags = alloc_flags,
   };

   if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
      VkResult result = anv_device_map_bo(device, &new_bo, 0, size,
                                          NULL, &new_bo.map);
      if (unlikely(result != VK_SUCCESS)) {
         device->kmd_backend->gem_close(device, &new_bo);
         return result;
      }
   }

   VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
                                               alloc_flags,
                                               explicit_address);
   if (result != VK_SUCCESS)
      return result;

   result = device->kmd_backend->vm_bind_bo(device, &new_bo);
   if (result != VK_SUCCESS) {
      anv_bo_vma_free(device, &new_bo);
      anv_bo_unmap_close(device, &new_bo);
      return result;
   }

   assert(new_bo.gem_handle);

   /* We just created this gem_handle, so we know no one else is touching
    * this BO at the moment and we don't need to lock here.
    */
   struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle);
   *bo = new_bo;

   *bo_out = bo;

   ANV_RMV(bo_allocate, device, bo);

   return VK_SUCCESS;
}
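
/* Illustrative sketch (not part of the driver): a typical caller allocating
 * a small CPU-visible BO. The name and size are made up for the example;
 * explicit_address is 0 because no fixed or client-visible address is
 * requested.
 */
static inline VkResult
example_alloc_staging_bo(struct anv_device *device, struct anv_bo **bo_out)
{
   return anv_device_alloc_bo(device, "example-staging", 64 * 1024,
                              ANV_BO_ALLOC_MAPPED |
                              ANV_BO_ALLOC_HOST_COHERENT,
                              0 /* explicit_address */,
                              bo_out);
}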

VkResult
anv_device_map_bo(struct anv_device *device,
                  struct anv_bo *bo,
                  uint64_t offset,
                  size_t size,
                  void *placed_addr,
                  void **map_out)
{
   assert(!bo->from_host_ptr);
   assert(size > 0);

   void *map = device->kmd_backend->gem_mmap(device, bo, offset, size, placed_addr);
   if (unlikely(map == MAP_FAILED))
      return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");

   assert(placed_addr == NULL || map == placed_addr);

   assert(map != NULL);
   VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1));

   if (map_out)
      *map_out = map;

   return VK_SUCCESS;
}

VkResult
anv_device_unmap_bo(struct anv_device *device,
                    struct anv_bo *bo,
                    void *map, size_t map_size,
                    bool replace)
{
   assert(!bo->from_host_ptr);

   if (replace) {
      map = mmap(map, map_size, PROT_NONE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
      if (map == MAP_FAILED) {
         return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
                          "Failed to map over original mapping");
      }
   } else {
      VG(VALGRIND_FREELIKE_BLOCK(map, 0));
      munmap(map, map_size);
   }
   return VK_SUCCESS;
}
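
/* Illustrative sketch (not part of the driver): mapping a whole BO for a CPU
 * fill and then tearing the mapping down again. "example_cpu_clear_bo" is a
 * hypothetical helper; replace = false because we really want the pages
 * unmapped rather than replaced by a PROT_NONE placeholder.
 */
static inline void
example_cpu_clear_bo(struct anv_device *device, struct anv_bo *bo)
{
   void *map;
   if (anv_device_map_bo(device, bo, 0, bo->size, NULL, &map) != VK_SUCCESS)
      return;

   memset(map, 0, bo->size);

   anv_device_unmap_bo(device, bo, map, bo->size, false /* replace */);
}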

VkResult
anv_device_import_bo_from_host_ptr(struct anv_device *device,
                                   void *host_ptr, uint32_t size,
                                   enum anv_bo_alloc_flags alloc_flags,
                                   uint64_t client_address,
                                   struct anv_bo **bo_out)
{
   assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
                           ANV_BO_ALLOC_HOST_CACHED |
                           ANV_BO_ALLOC_HOST_COHERENT |
                           ANV_BO_ALLOC_AUX_CCS |
                           ANV_BO_ALLOC_PROTECTED |
                           ANV_BO_ALLOC_FIXED_ADDRESS)));
   assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);

   struct anv_bo_cache *cache = &device->bo_cache;
   const uint32_t bo_flags =
      device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);

   uint32_t gem_handle = device->kmd_backend->gem_create_userptr(device, host_ptr, size);
   if (!gem_handle)
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);

   pthread_mutex_lock(&cache->mutex);

   struct anv_bo *bo = NULL;
   if (device->info->kmd_type == INTEL_KMD_TYPE_XE) {
      bo = vk_zalloc(&device->vk.alloc, sizeof(*bo), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!bo) {
         pthread_mutex_unlock(&cache->mutex);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }
   } else {
      bo = anv_device_lookup_bo(device, gem_handle);
   }

   if (bo->refcount > 0) {
      /* VK_EXT_external_memory_host doesn't require handling imports of the
       * same pointer twice at the same time, but we don't get in the way. If
       * the kernel gives us the same gem_handle, only succeed if the flags
       * match.
       */
      assert(bo->gem_handle == gem_handle);
      if (bo_flags != bo->flags) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "same host pointer imported two different ways");
      }

      if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
          (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "The same BO was imported with and without buffer "
                          "device address");
      }

      if (client_address && client_address != intel_48b_address(bo->offset)) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "The same BO was imported at two different "
                          "addresses");
      }

      __sync_fetch_and_add(&bo->refcount, 1);
   } else {
      alloc_flags |= ANV_BO_ALLOC_IMPORTED;
      struct anv_bo new_bo = {
         .name = "host-ptr",
         .gem_handle = gem_handle,
         .refcount = 1,
         .offset = -1,
         .size = size,
         .actual_size = size,
         .map = host_ptr,
         .flags = bo_flags,
         .alloc_flags = alloc_flags,
         .from_host_ptr = true,
      };

      VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
                                                  alloc_flags,
                                                  client_address);
      if (result != VK_SUCCESS) {
         pthread_mutex_unlock(&cache->mutex);
         return result;
      }

      result = device->kmd_backend->vm_bind_bo(device, &new_bo);
      if (result != VK_SUCCESS) {
         anv_bo_vma_free(device, &new_bo);
         pthread_mutex_unlock(&cache->mutex);
         return result;
      }

      *bo = new_bo;

      ANV_RMV(bo_allocate, device, bo);
   }

   pthread_mutex_unlock(&cache->mutex);
   *bo_out = bo;

   return VK_SUCCESS;
}
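
/* Illustrative sketch (not part of the driver): importing an
 * application-provided, page-aligned host allocation, as done for
 * VK_EXT_external_memory_host. ANV_BO_ALLOC_EXTERNAL is mandatory (see the
 * assert above); the 64KB size is made up for the example.
 */
static inline VkResult
example_import_host_alloc(struct anv_device *device, void *host_ptr,
                          struct anv_bo **bo_out)
{
   return anv_device_import_bo_from_host_ptr(device, host_ptr, 64 * 1024,
                                             ANV_BO_ALLOC_EXTERNAL,
                                             0 /* client_address */,
                                             bo_out);
}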

VkResult
anv_device_import_bo(struct anv_device *device,
                     int fd,
                     enum anv_bo_alloc_flags alloc_flags,
                     uint64_t client_address,
                     struct anv_bo **bo_out)
{
   assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
                           ANV_BO_ALLOC_HOST_CACHED |
                           ANV_BO_ALLOC_HOST_COHERENT |
                           ANV_BO_ALLOC_FIXED_ADDRESS)));
   assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);

   struct anv_bo_cache *cache = &device->bo_cache;

   pthread_mutex_lock(&cache->mutex);

   uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
   if (!gem_handle) {
      pthread_mutex_unlock(&cache->mutex);
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }

   struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);

   uint32_t bo_flags;
   VkResult result = anv_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
                                                               alloc_flags,
                                                               &bo_flags);
   if (result != VK_SUCCESS) {
      pthread_mutex_unlock(&cache->mutex);
      return result;
   }

   if (bo->refcount > 0) {
      if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
          (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "The same BO was imported with and without buffer "
                          "device address");
      }

      if (client_address && client_address != intel_48b_address(bo->offset)) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "The same BO was imported at two different "
                          "addresses");
      }

      __sync_fetch_and_add(&bo->refcount, 1);
   } else {
      alloc_flags |= ANV_BO_ALLOC_IMPORTED;
      struct anv_bo new_bo = {
         .name = "imported",
         .gem_handle = gem_handle,
         .refcount = 1,
         .offset = -1,
         .alloc_flags = alloc_flags,
      };

      off_t size = lseek(fd, 0, SEEK_END);
      if (size == (off_t)-1) {
         device->kmd_backend->gem_close(device, &new_bo);
         pthread_mutex_unlock(&cache->mutex);
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      }
      new_bo.size = size;
      new_bo.actual_size = size;

      VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
                                                  alloc_flags,
                                                  client_address);
      if (result != VK_SUCCESS) {
         pthread_mutex_unlock(&cache->mutex);
         return result;
      }

      result = device->kmd_backend->vm_bind_bo(device, &new_bo);
      if (result != VK_SUCCESS) {
         anv_bo_vma_free(device, &new_bo);
         pthread_mutex_unlock(&cache->mutex);
         return result;
      }

      *bo = new_bo;

      ANV_RMV(bo_allocate, device, bo);
   }

   bo->flags = bo_flags;

   pthread_mutex_unlock(&cache->mutex);
   *bo_out = bo;

   return VK_SUCCESS;
}
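
/* Illustrative sketch (not part of the driver): importing a dma-buf fd such
 * as one received through VkImportMemoryFdInfoKHR. Error handling and fd
 * ownership rules are left to the real callers; "example_import_dmabuf" is a
 * hypothetical name.
 */
static inline VkResult
example_import_dmabuf(struct anv_device *device, int dmabuf_fd,
                      struct anv_bo **bo_out)
{
   return anv_device_import_bo(device, dmabuf_fd,
                               ANV_BO_ALLOC_EXTERNAL,
                               0 /* client_address */,
                               bo_out);
}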

VkResult
anv_device_export_bo(struct anv_device *device,
                     struct anv_bo *bo, int *fd_out)
{
   assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);

   /* This BO must have been flagged external in order for us to be able
    * to export it. This is done based on external options passed into
    * anv_AllocateMemory.
    */
   assert(anv_bo_is_external(bo));

   int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
   if (fd < 0)
      return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);

   *fd_out = fd;

   return VK_SUCCESS;
}

VkResult
anv_device_get_bo_tiling(struct anv_device *device,
                         struct anv_bo *bo,
                         enum isl_tiling *tiling_out)
{
   int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
   if (i915_tiling < 0) {
      return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                       "failed to get BO tiling: %m");
   }

   *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);

   return VK_SUCCESS;
}

VkResult
anv_device_set_bo_tiling(struct anv_device *device,
                         struct anv_bo *bo,
                         uint32_t row_pitch_B,
                         enum isl_tiling tiling)
{
   int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
                                isl_tiling_to_i915_tiling(tiling));
   if (ret) {
      return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "failed to set BO tiling: %m");
   }

   return VK_SUCCESS;
}

static bool
atomic_dec_not_one(uint32_t *counter)
{
   uint32_t old, val;

   val = *counter;
   while (1) {
      if (val == 1)
         return false;

      old = __sync_val_compare_and_swap(counter, val, val - 1);
      if (old == val)
         return true;

      val = old;
   }
}
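
/* Illustrative sketch (not part of the driver): the same "decrement, but
 * never from one down to zero" loop written with the __atomic builtins,
 * which make the compare-and-swap retry explicit. "example_dec_not_one" is
 * a hypothetical name.
 */
static inline bool
example_dec_not_one(uint32_t *counter)
{
   uint32_t val = __atomic_load_n(counter, __ATOMIC_RELAXED);

   while (val != 1) {
      /* On failure, the builtin reloads *counter into val and we retry
       * against the fresh value.
       */
      if (__atomic_compare_exchange_n(counter, &val, val - 1,
                                      false /* weak */,
                                      __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
         return true;
   }

   return false;
}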

void
anv_device_release_bo(struct anv_device *device,
                      struct anv_bo *bo)
{
   struct anv_bo_cache *cache = &device->bo_cache;
   const bool bo_is_xe_userptr = device->info->kmd_type == INTEL_KMD_TYPE_XE &&
                                 bo->from_host_ptr;
   assert(bo_is_xe_userptr ||
          anv_device_lookup_bo(device, bo->gem_handle) == bo);

   /* Try to decrement the counter but don't go below one. If this succeeds
    * then the refcount has been decremented and we are not the last
    * reference.
    */
   if (atomic_dec_not_one(&bo->refcount))
      return;

   ANV_RMV(bo_destroy, device, bo);

   pthread_mutex_lock(&cache->mutex);

   /* We are probably the last reference since our attempt to decrement above
    * failed. However, we can't actually know until we are inside the mutex.
    * Otherwise, someone could import the BO between the decrement and our
    * taking the mutex.
    */
   if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
      /* Turns out we're not the last reference. Unlock and bail. */
      pthread_mutex_unlock(&cache->mutex);
      return;
   }
   assert(bo->refcount == 0);

   /* Memset the BO just in case. The refcount being zero should be enough to
    * prevent anyone from assuming the data is valid, but it's safer to stomp
    * it to zero anyway. We explicitly do this *before* we actually close the
    * GEM handle to ensure that if anyone allocates something and gets the
    * same GEM handle, the memset has already happened and won't stomp all
    * over any data they may write in this BO.
    */
   struct anv_bo old_bo = *bo;

   if (bo_is_xe_userptr)
      vk_free(&device->vk.alloc, bo);
   else
      memset(bo, 0, sizeof(*bo));

   anv_bo_finish(device, &old_bo);

   /* Don't unlock until we've actually closed the BO. The whole point of
    * the BO cache is to ensure that we correctly handle races with creating
    * and releasing GEM handles and we don't want to let someone import the BO
    * again between mutex unlock and closing the GEM handle.
    */
   pthread_mutex_unlock(&cache->mutex);
}
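
/* Illustrative sketch (not part of the driver): the lifecycle the BO cache
 * is built around: allocate, use, release. A concurrent import of the same
 * GEM handle in between would just bump the refcount, and only the final
 * anv_device_release_bo() actually closes the handle. Name and size are made
 * up for the example.
 */
static inline void
example_bo_lifecycle(struct anv_device *device)
{
   struct anv_bo *bo;
   if (anv_device_alloc_bo(device, "example-lifecycle", 4096,
                           ANV_BO_ALLOC_MAPPED |
                           ANV_BO_ALLOC_HOST_COHERENT,
                           0 /* explicit_address */, &bo) != VK_SUCCESS)
      return;

   /* ... CPU work against bo->map, GPU work against bo->offset ... */

   anv_device_release_bo(device, bo);
}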