xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/anv_allocator.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <limits.h>
27 #include <assert.h>
28 #include <sys/mman.h>
29 
30 #include "anv_private.h"
31 
32 #include "common/intel_aux_map.h"
33 #include "util/anon_file.h"
34 #include "util/futex.h"
35 
36 #ifdef HAVE_VALGRIND
37 #define VG_NOACCESS_READ(__ptr) ({                       \
38    VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \
39    __typeof(*(__ptr)) __val = *(__ptr);                  \
40    VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\
41    __val;                                                \
42 })
43 #define VG_NOACCESS_WRITE(__ptr, __val) ({                  \
44    VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr)));  \
45    *(__ptr) = (__val);                                      \
46    VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));   \
47 })
48 #else
49 #define VG_NOACCESS_READ(__ptr) (*(__ptr))
50 #define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
51 #endif
52 
53 #ifndef MAP_POPULATE
54 #define MAP_POPULATE 0
55 #endif
56 
57 /* Design goals:
58  *
59  *  - Lock free (except when resizing underlying bos)
60  *
61  *  - Constant time allocation with typically only one atomic
62  *
63  *  - Multiple allocation sizes without fragmentation
64  *
65  *  - Can grow while keeping addresses and offsets of contents stable
66  *
67  *  - All allocations within one bo so we can point one of the
68  *    STATE_BASE_ADDRESS pointers at it.
69  *
70  * The overall design is a two-level allocator: top level is a fixed size, big
71  * block (8k) allocator, which operates out of a bo.  Allocation is done by
72  * either pulling a block from the free list or growing the used range of the
73  * bo.  Growing the range may run out of space in the bo which we then need to
74  * grow.  Growing the bo is tricky in a multi-threaded, lockless environment:
75  * we need to keep all pointers and contents in the old map valid.  GEM bos in
76  * general can't grow, but we use a trick: we create a memfd and use ftruncate
77  * to grow it as necessary.  We mmap the new size and then create a gem bo for
78  * it using the new gem userptr ioctl.  Without heavy-handed locking around
79  * our allocation fast-path, there isn't really a way to munmap the old mmap,
80  * so we just keep it around until garbage collection time.  While the block
81  * allocator is lockless for normal operations, we block other threads trying
82  * to allocate while we're growing the map.  It shouldn't happen often, and
83  * growing is fast anyway.
84  *
85  * At the next level we can use various sub-allocators.  The state pool is a
86  * pool of smaller, fixed size objects, which operates much like the block
87  * pool.  It uses a free list for freeing objects, but when it runs out of
88  * space it just allocates a new block from the block pool.  This allocator is
89  * intended for longer lived state objects such as SURFACE_STATE and most
90  * other persistent state objects in the API.  We may need to track more info
91  * with these object and a pointer back to the CPU object (eg VkImage).  In
92  * those cases we just allocate a slightly bigger object and put the extra
93  * state after the GPU state object.
94  *
95  * The state stream allocator works similarly to how the i965 DRI driver streams
96  * all its state.  Even with Vulkan, we need to emit transient state (whether
97  * surface state base or dynamic state base), and for that we can just get a
98  * block and fill it up.  These cases are local to a command buffer and the
99  * sub-allocator need not be thread safe.  The streaming allocator gets a new
100  * block when it runs out of space and chains them together so they can be
101  * easily freed.
102  */
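
/* As a rough illustration of how the layers described above fit together,
 * here is a minimal usage sketch (not compiled; the pool parameters and the
 * base address are made-up values, and "device" is assumed to be an already
 * initialized anv_device):
 */
#if 0
static void
example_allocator_usage(struct anv_device *device)
{
   /* A state pool sub-allocates fixed-size states out of a block pool. */
   struct anv_state_pool pool;
   struct anv_state_pool_params params = {
      .name         = "example pool",
      .base_address = 0x0000100000000000ULL, /* made-up, suitably aligned */
      .start_offset = 0,
      .block_size   = 4096,
      .max_size     = 1024 * 1024,
   };
   if (anv_state_pool_init(&pool, device, &params) != VK_SUCCESS)
      return;

   /* Longer-lived state, e.g. a surface state. */
   struct anv_state surf = anv_state_pool_alloc(&pool, 64, 64);

   /* Transient state streamed for a single command buffer. */
   struct anv_state_stream stream;
   anv_state_stream_init(&stream, &pool, 4096);
   struct anv_state dyn = anv_state_stream_alloc(&stream, 32, 32);
   (void)dyn;

   anv_state_stream_finish(&stream);
   anv_state_pool_free(&pool, surf);
   anv_state_pool_finish(&pool);
}
#endif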
103 
104 /* Allocations are always at least 64 byte aligned, so UINT32_MAX can never
105  * be a valid offset.  We use it to indicate the free list is empty. */
106 #define EMPTY UINT32_MAX
107 
108 /* On FreeBSD PAGE_SIZE is already defined in
109  * /usr/include/machine/param.h that is indirectly
110  * included here.
111  */
112 #ifndef PAGE_SIZE
113 #define PAGE_SIZE 4096
114 #endif
115 
116 struct anv_state_table_cleanup {
117    void *map;
118    size_t size;
119 };
120 
121 #define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0})
122 #define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry))
123 
124 static VkResult
125 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size);
126 
127 VkResult
128 anv_state_table_init(struct anv_state_table *table,
129                     struct anv_device *device,
130                     uint32_t initial_entries)
131 {
132    VkResult result;
133 
134    table->device = device;
135 
136    /* Just make it 2GB up-front.  The Linux kernel won't actually back it
137     * with pages until we either map and fault on one of them or we use
138     * userptr and send a chunk of it off to the GPU.
139     */
140    table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
141    if (table->fd == -1)
142       return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
143 
144    if (!u_vector_init(&table->cleanups, 8,
145                       sizeof(struct anv_state_table_cleanup))) {
146       result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
147       goto fail_fd;
148    }
149 
150    table->state.next = 0;
151    table->state.end = 0;
152    table->size = 0;
153 
154    uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
155    result = anv_state_table_expand_range(table, initial_size);
156    if (result != VK_SUCCESS)
157       goto fail_cleanups;
158 
159    return VK_SUCCESS;
160 
161  fail_cleanups:
162    u_vector_finish(&table->cleanups);
163  fail_fd:
164    close(table->fd);
165 
166    return result;
167 }
168 
169 static VkResult
170 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
171 {
172    void *map;
173    struct anv_state_table_cleanup *cleanup;
174 
175    /* Assert that we only ever grow the table */
176    assert(size >= table->state.end);
177 
178    /* Make sure that we don't go outside the bounds of the memfd */
179    if (size > BLOCK_POOL_MEMFD_SIZE)
180       return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
181 
182    cleanup = u_vector_add(&table->cleanups);
183    if (!cleanup)
184       return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
185 
186    *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
187 
188    /* Just leak the old map until we destroy the pool.  We can't munmap it
189     * without races or imposing locking on the block allocate fast path. On
190     * the whole, the leaked maps add up to less than the size of the
191     * current map.  MAP_POPULATE seems like the right thing to do, but we
192     * should try to get some numbers.
193     */
194    map = mmap(NULL, size, PROT_READ | PROT_WRITE,
195               MAP_SHARED | MAP_POPULATE, table->fd, 0);
196    if (map == MAP_FAILED) {
197       return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
198                        "mmap failed: %m");
199    }
200 
201    cleanup->map = map;
202    cleanup->size = size;
203 
204    table->map = map;
205    table->size = size;
206 
207    return VK_SUCCESS;
208 }
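
/* A minimal sketch of the grow-by-remapping trick used above (illustrative
 * only, not compiled; the fd is assumed to come from os_create_anonymous_file()
 * with its maximum size set up front, so no ftruncate() is needed):
 */
#if 0
static void *
example_grow_mapping(int fd, size_t new_size)
{
   void *new_map = mmap(NULL, new_size, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_POPULATE, fd, 0);
   if (new_map == MAP_FAILED)
      return NULL;

   /* The previous, smaller mapping is deliberately NOT munmap'd here: other
    * threads may still hold pointers into it.  It is only torn down when the
    * table is destroyed (see anv_state_table_finish()).
    */
   return new_map;
}
#endif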
209 
210 static VkResult
211 anv_state_table_grow(struct anv_state_table *table)
212 {
213    VkResult result = VK_SUCCESS;
214 
215    uint32_t used = align(table->state.next * ANV_STATE_ENTRY_SIZE,
216                          PAGE_SIZE);
217    uint32_t old_size = table->size;
218 
219    /* The state table is always initialized to a nonzero size and this function
220     * is always called after initialization.
221     */
222    assert(old_size > 0);
223 
224    uint32_t required = MAX2(used, old_size);
225    if (used * 2 <= required) {
226       /* If we're in this case then this isn't the first allocation and we
227        * already have enough space on both sides to hold double what we
228        * have allocated.  There's nothing for us to do.
229        */
230       goto done;
231    }
232 
233    uint32_t size = old_size * 2;
234    while (size < required)
235       size *= 2;
236 
237    assert(size > table->size);
238 
239    result = anv_state_table_expand_range(table, size);
240 
241  done:
242    return result;
243 }
244 
245 void
246 anv_state_table_finish(struct anv_state_table *table)
247 {
248    struct anv_state_table_cleanup *cleanup;
249 
250    u_vector_foreach(cleanup, &table->cleanups) {
251       if (cleanup->map)
252          munmap(cleanup->map, cleanup->size);
253    }
254 
255    u_vector_finish(&table->cleanups);
256 
257    close(table->fd);
258 }
259 
260 VkResult
261 anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
262                     uint32_t count)
263 {
264    struct anv_block_state state, old, new;
265    VkResult result;
266 
267    assert(idx);
268 
269    while(1) {
270       state.u64 = __sync_fetch_and_add(&table->state.u64, count);
271       if (state.next + count <= state.end) {
272          assert(table->map);
273          struct anv_free_entry *entry = &table->map[state.next];
274          for (int i = 0; i < count; i++) {
275             entry[i].state.idx = state.next + i;
276          }
277          *idx = state.next;
278          return VK_SUCCESS;
279       } else if (state.next <= state.end) {
280          /* We allocated the first block outside the pool so we have to grow
281           * the pool.  pool_state->next acts as a mutex: threads who try to
282           * allocate now will get block indexes above the current limit and
283           * hit futex_wait below.
284           */
285          new.next = state.next + count;
286          do {
287             result = anv_state_table_grow(table);
288             if (result != VK_SUCCESS)
289                return result;
290             new.end = table->size / ANV_STATE_ENTRY_SIZE;
291          } while (new.end < new.next);
292 
293          old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
294          if (old.next != state.next)
295             futex_wake(&table->state.end, INT32_MAX);
296       } else {
297          futex_wait(&table->state.end, state.end, NULL);
298          continue;
299       }
300    }
301 }
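
/* A worked example of the lock-free fast path above (made-up numbers):
 * suppose table->state is { .next = 96, .end = 128 } and count = 16.  The
 * atomic fetch-and-add bumps next to 112 and hands this thread the old value,
 * so state.next + count = 112 <= state.end = 128 and the thread owns entries
 * [96, 112) without taking any lock.  A thread whose addition crosses the end
 * either grows the table (if it was the first one past the boundary) or
 * futex-waits until the grower publishes a new end.
 */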
302 
303 void
304 anv_free_list_push(union anv_free_list *list,
305                    struct anv_state_table *table,
306                    uint32_t first, uint32_t count)
307 {
308    union anv_free_list current, old, new;
309    uint32_t last = first;
310 
311    for (uint32_t i = 1; i < count; i++, last++)
312       table->map[last].next = last + 1;
313 
314    old.u64 = list->u64;
315    do {
316       current = old;
317       table->map[last].next = current.offset;
318       new.offset = first;
319       new.count = current.count + 1;
320       old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
321    } while (old.u64 != current.u64);
322 }
323 
324 struct anv_state *
325 anv_free_list_pop(union anv_free_list *list,
326                   struct anv_state_table *table)
327 {
328    union anv_free_list current, new, old;
329 
330    current.u64 = list->u64;
331    while (current.offset != EMPTY) {
332       __sync_synchronize();
333       new.offset = table->map[current.offset].next;
334       new.count = current.count + 1;
335       old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
336       if (old.u64 == current.u64) {
337          struct anv_free_entry *entry = &table->map[current.offset];
338          return &entry->state;
339       }
340       current = old;
341    }
342 
343    return NULL;
344 }
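
/* A minimal sketch of how states are recycled through a free list (not
 * compiled; "table" is assumed to be an initialized state table and "idx" an
 * index previously handed out by anv_state_table_add()):
 */
#if 0
static void
example_free_list_usage(struct anv_state_table *table, uint32_t idx)
{
   union anv_free_list list = ANV_FREE_LIST_EMPTY;

   /* Return one table entry to the list... */
   anv_free_list_push(&list, table, idx, 1);

   /* ...and hand it out again later.  The 32-bit count packed next to the
    * offset in the 64-bit list head is what keeps the compare-and-swap in
    * anv_free_list_pop() safe against ABA.
    */
   struct anv_state *state = anv_free_list_pop(&list, table);
   (void)state;
}
#endif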
345 
346 static VkResult
347 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size);
348 
349 VkResult
350 anv_block_pool_init(struct anv_block_pool *pool,
351                     struct anv_device *device,
352                     const char *name,
353                     uint64_t start_address,
354                     uint32_t initial_size,
355                     uint32_t max_size)
356 {
357    VkResult result;
358 
359    /* Make sure VMA addresses are aligned for the block pool */
360    assert(anv_is_aligned(start_address, device->info->mem_alignment));
361    assert(anv_is_aligned(initial_size, device->info->mem_alignment));
362    assert(max_size > 0);
363    assert(max_size > initial_size);
364 
365    pool->name = name;
366    pool->device = device;
367    pool->nbos = 0;
368    pool->size = 0;
369    pool->start_address = intel_canonical_address(start_address);
370    pool->max_size = max_size;
371 
372    pool->bo = NULL;
373 
374    pool->state.next = 0;
375    pool->state.end = 0;
376 
377    pool->bo_alloc_flags =
378       ANV_BO_ALLOC_FIXED_ADDRESS |
379       ANV_BO_ALLOC_MAPPED |
380       ANV_BO_ALLOC_HOST_CACHED_COHERENT |
381       ANV_BO_ALLOC_CAPTURE |
382       ANV_BO_ALLOC_INTERNAL;
383 
384    result = anv_block_pool_expand_range(pool, initial_size);
385    if (result != VK_SUCCESS)
386       return result;
387 
388    /* Make the entire pool available in the front of the pool.  If back
389     * allocation needs to use this space, the "ends" will be re-arranged.
390     */
391    pool->state.end = pool->size;
392 
393    return VK_SUCCESS;
394 }
395 
396 void
397 anv_block_pool_finish(struct anv_block_pool *pool)
398 {
399    anv_block_pool_foreach_bo(bo, pool) {
400       assert(bo->refcount == 1);
401       anv_device_release_bo(pool->device, bo);
402    }
403 }
404 
405 static VkResult
406 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size)
407 {
408    /* Assert that we only ever grow the pool */
409    assert(size >= pool->state.end);
410 
411    /* For state pool BOs we have to be a bit careful about where we place them
412     * in the GTT.  There are two documented workarounds for state base address
413     * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
414     * which state that those two base addresses do not support 48-bit
415     * addresses and need to be placed in the bottom 32-bit range.
416     * Unfortunately, this is not quite accurate.
417     *
418     * The real problem is that we always set the size of our state pools in
419     * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
420     * likely significantly smaller.  We do this because we do not know at the
421     * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
422     * the pool during command buffer building so we don't actually have a
423     * valid final size.  If the address + size, as seen by STATE_BASE_ADDRESS
424     * overflows 48 bits, the GPU appears to treat all accesses to the buffer
425     * as being out of bounds and returns zero.  For dynamic state, this
426     * usually just leads to rendering corruptions, but shaders that are all
427     * zero hang the GPU immediately.
428     *
429     * The easiest solution is to do exactly what the bogus workarounds say to
430     * do: restrict these buffers to 32-bit addresses.  We could also pin the
431     * BO to some particular location of our choosing, but that's significantly
432     * more work than just not setting a flag.  So, we explicitly DO NOT set
433     * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
434     * hard work for us.  When using softpin, we're in control and the fixed
435     * addresses we choose are fine for base addresses.
436     */
437 
438    uint32_t new_bo_size = size - pool->size;
439    struct anv_bo *new_bo = NULL;
440    VkResult result = anv_device_alloc_bo(pool->device,
441                                          pool->name,
442                                          new_bo_size,
443                                          pool->bo_alloc_flags,
444                                          intel_48b_address(pool->start_address + pool->size),
445                                          &new_bo);
446    if (result != VK_SUCCESS)
447       return result;
448 
449    pool->bos[pool->nbos++] = new_bo;
450 
451    /* This pointer will always point to the first BO in the list */
452    pool->bo = pool->bos[0];
453 
454    assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
455    pool->size = size;
456 
457    return VK_SUCCESS;
458 }
459 
460 /** Returns current memory map of the block pool.
461  *
462  * The returned pointer points to the map for the memory at the specified
463  * offset. The offset parameter is relative to the "center" of the block pool
464  * rather than the start of the block pool BO map.
465  */
466 void*
467 anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
468 {
469    struct anv_bo *bo = NULL;
470    int32_t bo_offset = 0;
471    anv_block_pool_foreach_bo(iter_bo, pool) {
472       if (offset < bo_offset + iter_bo->size) {
473          bo = iter_bo;
474          break;
475       }
476       bo_offset += iter_bo->size;
477    }
478    assert(bo != NULL);
479    assert(offset >= bo_offset);
480    assert((offset - bo_offset) + size <= bo->size);
481 
482    return bo->map + (offset - bo_offset);
483 }
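
/* A minimal usage sketch (not compiled; the 8 KiB block size is arbitrary):
 * allocate a block and write to it through the CPU map returned by
 * anv_block_pool_map().
 */
#if 0
static void
example_block_pool_write(struct anv_block_pool *pool)
{
   int64_t offset;
   uint32_t padding;
   if (anv_block_pool_alloc(pool, 8192, &offset, &padding) != VK_SUCCESS)
      return;

   /* The returned CPU pointer is valid for the whole 8192-byte block. */
   uint32_t *map = anv_block_pool_map(pool, offset, 8192);
   map[0] = 0;
}
#endif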
484 
485 /** Grows and re-centers the block pool.
486  *
487  * We grow the block pool in one or both directions in such a way that the
488  * following conditions are met:
489  *
490  *  1) The size of the entire pool is always a power of two.
491  *
492  *  2) The pool only ever grows; neither end can get
493  *     shortened.
494  *
495  *  3) At the end of the allocation, we have about twice as much space
496  *     allocated for each end as we have used.  This way the pool doesn't
497  *     grow too far in one direction or the other.
498  *
499  *  4) We have enough space allocated for at least one more block in
500  *     whichever side `state` points to.
501  *
502  *  5) The center of the pool is always aligned to both the block_size of
503  *     the pool and a 4K CPU page.
504  */
505 static uint32_t
506 anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
507                     uint32_t contiguous_size)
508 {
509    VkResult result = VK_SUCCESS;
510 
511    pthread_mutex_lock(&pool->device->mutex);
512 
513    assert(state == &pool->state);
514 
515    /* Gather a little usage information on the pool.  Since we may have
516     * threads waiting in queue to get some storage while we resize, it's
517     * actually possible that total_used will be larger than old_size.  In
518     * particular, block_pool_alloc() increments state->next prior to
519     * calling block_pool_grow, so this ensures that we get enough space for
520     * whichever side tries to grow the pool.
521     *
522     * We align to a page size because it makes it easier to do our
523     * calculations later in such a way that we stay page-aligned.
524     */
525    uint32_t total_used = align(pool->state.next, PAGE_SIZE);
526 
527    uint32_t old_size = pool->size;
528 
529    /* The block pool is always initialized to a nonzero size and this function
530     * is always called after initialization.
531     */
532    assert(old_size > 0);
533 
534    /* total_used may actually be smaller than the actual requirement because
535     * it is based on the next pointer, which is updated prior to calling
536     * this function.
537     */
538    uint32_t required = MAX2(total_used, old_size);
539 
540    /* With softpin, the pool is made up of a bunch of buffers with separate
541     * maps.  Make sure we have enough contiguous space that we can get a
542     * properly contiguous map for the next chunk.
543     */
544    required = MAX2(required, old_size + contiguous_size);
545 
546    if (required > pool->max_size) {
547       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
548    } else if (total_used * 2 > required) {
549       uint32_t size = old_size * 2;
550       while (size < required)
551          size *= 2;
552 
553       size = MIN2(size, pool->max_size);
554       assert(size > pool->size);
555 
556       result = anv_block_pool_expand_range(pool, size);
557    }
558 
559    pthread_mutex_unlock(&pool->device->mutex);
560 
561    if (result != VK_SUCCESS)
562       return 0;
563 
564    /* Return the appropriate new size.  This function never actually
565     * updates state->next.  Instead, we let the caller do that because it
566     * needs to do so in order to maintain its concurrency model.
567     */
568    return pool->size;
569 }
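
/* A worked example of the sizing logic above (made-up numbers): with
 * old_size = 1 MiB, total_used = 640 KiB and contiguous_size = 8 KiB, we get
 * required = MAX2(MAX2(640 KiB, 1 MiB), 1 MiB + 8 KiB) = 1 MiB + 8 KiB.
 * Since total_used * 2 = 1.25 MiB > required, the pool grows: size starts at
 * old_size * 2 = 2 MiB, which already satisfies required, so the new pool
 * size is 2 MiB (clamped to max_size if necessary).
 */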
570 
571 static VkResult
572 anv_block_pool_alloc_new(struct anv_block_pool *pool,
573                          struct anv_block_state *pool_state,
574                          uint32_t block_size,
575                          int64_t *offset,
576                          uint32_t *padding)
577 {
578    struct anv_block_state state, old, new;
579 
580    /* Most allocations won't generate any padding */
581    if (padding)
582       *padding = 0;
583 
584    while (1) {
585       state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
586       if (state.next + block_size > pool->max_size) {
587          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
588       } else if (state.next + block_size <= state.end) {
589          *offset =  state.next;
590          return VK_SUCCESS;
591       } else if (state.next <= state.end) {
592          if (state.next < state.end) {
593             /* We need to grow the block pool, but still have some leftover
594              * space that can't be used by that particular allocation. So we
595              * add that as a "padding", and return it.
596              */
597             uint32_t leftover = state.end - state.next;
598 
599             /* If there is some leftover space in the pool, the caller must
600              * deal with it.
601              */
602             assert(leftover == 0 || padding);
603             if (padding)
604                *padding = leftover;
605             state.next += leftover;
606          }
607 
608          /* We allocated the first block outside the pool so we have to grow
609           * the pool.  pool_state->next acts as a mutex: threads who try to
610           * allocate now will get block indexes above the current limit and
611           * hit futex_wait below.
612           */
613          new.next = state.next + block_size;
614          do {
615             new.end = anv_block_pool_grow(pool, pool_state, block_size);
616             if (pool->size > 0 && new.end == 0) {
617                futex_wake(&pool_state->end, INT32_MAX);
618                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
619             }
620          } while (new.end < new.next);
621 
622          old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
623          if (old.next != state.next)
624             futex_wake(&pool_state->end, INT32_MAX);
625          *offset = state.next;
626          return VK_SUCCESS;
627       } else {
628          futex_wait(&pool_state->end, state.end, NULL);
629          continue;
630       }
631    }
632 }
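
/* A worked example of the padding path above (made-up numbers): with
 * state = { .next = 126976, .end = 131072 } and block_size = 8192, the new
 * block does not fit (126976 + 8192 > 131072) but next <= end, so the 4096
 * leftover bytes at [126976, 131072) are reported via *padding, next is
 * advanced to 131072, the pool is grown until end >= 139264, and the
 * allocation itself is returned at offset 131072.  The caller is expected to
 * hand the 4096-byte padding back to the state pool as smaller free blocks.
 */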
633 
634 VkResult
635 anv_block_pool_alloc(struct anv_block_pool *pool,
636                      uint32_t block_size,
637                      int64_t *offset, uint32_t *padding)
638 {
639    return anv_block_pool_alloc_new(pool, &pool->state, block_size, offset, padding);
640 }
641 
642 VkResult
643 anv_state_pool_init(struct anv_state_pool *pool,
644                     struct anv_device *device,
645                     const struct anv_state_pool_params *params)
646 {
647    uint32_t initial_size = MAX2(params->block_size * 16,
648                                 device->info->mem_alignment);
649 
650    VkResult result = anv_block_pool_init(&pool->block_pool, device,
651                                          params->name,
652                                          params->base_address + params->start_offset,
653                                          initial_size,
654                                          params->max_size);
655    if (result != VK_SUCCESS)
656       return result;
657 
658    pool->start_offset = params->start_offset;
659 
660    result = anv_state_table_init(&pool->table, device, 64);
661    if (result != VK_SUCCESS) {
662       anv_block_pool_finish(&pool->block_pool);
663       return result;
664    }
665 
666    assert(util_is_power_of_two_or_zero(params->block_size));
667    pool->block_size = params->block_size;
668    for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
669       pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
670       pool->buckets[i].block.next = 0;
671       pool->buckets[i].block.end = 0;
672    }
673    VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
674 
675    return VK_SUCCESS;
676 }
677 
678 void
679 anv_state_pool_finish(struct anv_state_pool *pool)
680 {
681    VG(VALGRIND_DESTROY_MEMPOOL(pool));
682    anv_state_table_finish(&pool->table);
683    anv_block_pool_finish(&pool->block_pool);
684 }
685 
686 static VkResult
687 anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
688                                     struct anv_block_pool *block_pool,
689                                     uint32_t state_size,
690                                     uint32_t block_size,
691                                     int64_t *offset,
692                                     uint32_t *padding)
693 {
694    struct anv_block_state block, old, new;
695 
696    /* We don't always use anv_block_pool_alloc(), which would set *padding to
697     * zero for us. So if we have a pointer to padding, we must zero it out
698     * ourselves here, to make sure we always return some sensible value.
699     */
700    if (padding)
701       *padding = 0;
702 
703    /* If our state is large, we don't need any sub-allocation from a block.
704     * Instead, we just grab whole (potentially large) blocks.
705     */
706    if (state_size >= block_size)
707       return anv_block_pool_alloc(block_pool, state_size, offset, padding);
708 
709  restart:
710    block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
711 
712    if (block.next < block.end) {
713       *offset = block.next;
714       return VK_SUCCESS;
715    } else if (block.next == block.end) {
716       VkResult result = anv_block_pool_alloc(block_pool, block_size,
717                                              offset, padding);
718       if (result != VK_SUCCESS)
719          return result;
720       new.next = *offset + state_size;
721       new.end = *offset + block_size;
722       old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
723       if (old.next != block.next)
724          futex_wake(&pool->block.end, INT32_MAX);
725       return result;
726    } else {
727       futex_wait(&pool->block.end, block.end, NULL);
728       goto restart;
729    }
730 }
731 
732 static uint32_t
733 anv_state_pool_get_bucket(uint32_t size)
734 {
735    unsigned size_log2 = util_logbase2_ceil(size);
736    assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
737    if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
738       size_log2 = ANV_MIN_STATE_SIZE_LOG2;
739    return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
740 }
741 
742 static uint32_t
743 anv_state_pool_get_bucket_size(uint32_t bucket)
744 {
745    uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
746    return 1 << size_log2;
747 }
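
/* For example, if ANV_MIN_STATE_SIZE_LOG2 were 6 (a 64-byte minimum), a
 * 48-byte request would land in bucket 0 (64 bytes) and a 200-byte request in
 * bucket 2 (256 bytes), since sizes are rounded up to the next power of two.
 */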
748 
749 /** Helper to push a chunk into the state table.
750  *
751  * It adds 'count' entries to the state table and updates their sizes,
752  * offsets and maps, also pushing them as "free" states.
753  */
754 static void
755 anv_state_pool_return_blocks(struct anv_state_pool *pool,
756                              uint32_t chunk_offset, uint32_t count,
757                              uint32_t block_size)
758 {
759    /* Disallow returning 0 chunks */
760    assert(count != 0);
761 
762    /* Make sure we always return chunks aligned to the block_size */
763    assert(chunk_offset % block_size == 0);
764 
765    uint32_t st_idx;
766    UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
767    assert(result == VK_SUCCESS);
768    for (int i = 0; i < count; i++) {
769       /* update states that were added back to the state table */
770       struct anv_state *state_i = anv_state_table_get(&pool->table,
771                                                       st_idx + i);
772       state_i->alloc_size = block_size;
773       state_i->offset = pool->start_offset + chunk_offset + block_size * i;
774       state_i->map = anv_block_pool_map(&pool->block_pool,
775                                         state_i->offset,
776                                         state_i->alloc_size);
777    }
778 
779    uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
780 
781    if (block_bucket >= ARRAY_SIZE(pool->buckets))
782       return;
783 
784    anv_free_list_push(&pool->buckets[block_bucket].free_list,
785                       &pool->table, st_idx, count);
786 }
787 
788 /** Returns a chunk of memory back to the state pool.
789  *
790  * Do a two-level split. If chunk_size is bigger than divisor
791  * (pool->block_size), we return as many divisor sized blocks as we can, from
792  * the end of the chunk.
793  *
794  * The remainder is then split into smaller blocks (starting at small_size if
795  * it is non-zero), with larger blocks always being taken from the end of the
796  * chunk.
797  */
798 static void
799 anv_state_pool_return_chunk(struct anv_state_pool *pool,
800                             uint32_t chunk_offset, uint32_t chunk_size,
801                             uint32_t small_size)
802 {
803    uint32_t divisor = pool->block_size;
804    uint32_t nblocks = chunk_size / divisor;
805    uint32_t rest = chunk_size - nblocks * divisor;
806 
807    if (nblocks > 0) {
808       /* First return divisor aligned and sized chunks. We start returning
809        * larger blocks from the end of the chunk, since they should already be
810        * aligned to divisor. Also anv_state_pool_return_blocks() only accepts
811        * aligned chunks.
812        */
813       uint32_t offset = chunk_offset + rest;
814       anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
815    }
816 
817    chunk_size = rest;
818    divisor /= 2;
819 
820    if (small_size > 0 && small_size < divisor)
821       divisor = small_size;
822 
823    uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;
824 
825    /* Just as before, return larger divisor aligned blocks from the end of the
826     * chunk first.
827     */
828    while (chunk_size > 0 && divisor >= min_size) {
829       nblocks = chunk_size / divisor;
830       rest = chunk_size - nblocks * divisor;
831       if (nblocks > 0) {
832          anv_state_pool_return_blocks(pool, chunk_offset + rest,
833                                       nblocks, divisor);
834          chunk_size = rest;
835       }
836       divisor /= 2;
837    }
838 }
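
/* A worked example (made-up numbers): with pool->block_size = 8192,
 * chunk_offset = 6144, chunk_size = 10240 (so the chunk covers
 * [6144, 16384)) and small_size = 64, the first pass returns one 8192-byte
 * block at offset 8192 (taken from the end of the chunk, where it is
 * block-size aligned).  The remaining 2048 bytes at [6144, 8192) are then
 * split with divisor = small_size = 64 into thirty-two 64-byte blocks,
 * assuming 64 is not below the pool's minimum state size.
 */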
839 
840 static struct anv_state
841 anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
842                            uint32_t size, uint32_t align)
843 {
844    uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));
845 
846    if (bucket >= ARRAY_SIZE(pool->buckets))
847       return ANV_STATE_NULL;
848 
849    struct anv_state *state;
850    uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
851    int64_t offset;
852 
853    /* Try free list first. */
854    state = anv_free_list_pop(&pool->buckets[bucket].free_list,
855                              &pool->table);
856    if (state) {
857       assert(state->offset >= pool->start_offset);
858       goto done;
859    }
860 
861    /* Try to grab a chunk from some larger bucket and split it up */
862    for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
863       state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
864       if (state) {
865          unsigned chunk_size = anv_state_pool_get_bucket_size(b);
866          int32_t chunk_offset = state->offset;
867 
868          /* First let's update the state we got to its new size.  Offset and map
869           * remain the same.
870           */
871          state->alloc_size = alloc_size;
872 
873          /* Now return the unused part of the chunk back to the pool as free
874           * blocks
875           *
876           * There are a couple of options as to what we do with it:
877           *
878           *    1) We could fully split the chunk into state.alloc_size sized
879           *       pieces.  However, this would mean that allocating a 16B
880           *       state could potentially split a 2MB chunk into 512K smaller
881           *       chunks.  This would lead to unnecessary fragmentation.
882           *
883           *    2) The classic "buddy allocator" method would have us split the
884           *       chunk in half and return one half.  Then we would split the
885           *       remaining half in half and return one half, and repeat as
886           *       needed until we get down to the size we want.  However, if
887           *       you are allocating a bunch of the same size state (which is
888           *       the common case), this means that every other allocation has
889           *       to go up a level and every fourth goes up two levels, etc.
890           *       This is not nearly as efficient as it could be if we did a
891           *       little more work up-front.
892           *
893           *    3) Split the difference between (1) and (2) by doing a
894           *       two-level split.  If it's bigger than some fixed block_size,
895           *       we split it into block_size sized chunks and return all but
896           *       one of them.  Then we split what remains into
897           *       state.alloc_size sized chunks and return them.
898           *
899           * We choose something close to option (3), which is implemented with
900           * anv_state_pool_return_chunk(). That is done by returning the
901           * remainder of the chunk, with alloc_size as a hint of the size that
902           * we want the smaller chunk split into.
903           */
904          anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
905                                      chunk_size - alloc_size, alloc_size);
906          goto done;
907       }
908    }
909 
910    uint32_t padding;
911    VkResult result =
912       anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
913                                           &pool->block_pool,
914                                           alloc_size,
915                                           pool->block_size,
916                                           &offset,
917                                           &padding);
918    if (result != VK_SUCCESS)
919       return ANV_STATE_NULL;
920 
921    /* Every time we allocate a new state, add it to the state pool */
922    uint32_t idx = 0;
923    result = anv_state_table_add(&pool->table, &idx, 1);
924    assert(result == VK_SUCCESS);
925 
926    state = anv_state_table_get(&pool->table, idx);
927    state->offset = pool->start_offset + offset;
928    state->alloc_size = alloc_size;
929    state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
930 
931    if (padding > 0) {
932       uint32_t return_offset = offset - padding;
933       anv_state_pool_return_chunk(pool, return_offset, padding, 0);
934    }
935 
936 done:
937    return *state;
938 }
939 
940 struct anv_state
941 anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
942 {
943    if (size == 0)
944       return ANV_STATE_NULL;
945 
946    struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
947    VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
948    return state;
949 }
950 
951 static void
952 anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
953 {
954    assert(util_is_power_of_two_or_zero(state.alloc_size));
955    unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
956 
957    assert(state.offset >= pool->start_offset);
958 
959    if (bucket >= ARRAY_SIZE(pool->buckets))
960       return;
961 
962    anv_free_list_push(&pool->buckets[bucket].free_list,
963                       &pool->table, state.idx, 1);
964 }
965 
966 void
967 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
968 {
969    if (state.alloc_size == 0)
970       return;
971 
972    VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
973    anv_state_pool_free_no_vg(pool, state);
974 }
975 
976 struct anv_state_stream_block {
977    struct anv_state block;
978 
979    /* The next block */
980    struct anv_state_stream_block *next;
981 
982 #ifdef HAVE_VALGRIND
983    /* A pointer to the first user-allocated thing in this block.  This is
984     * what valgrind sees as the start of the block.
985     */
986    void *_vg_ptr;
987 #endif
988 };
989 
990 /* The state stream allocator is a one-shot, single threaded allocator for
991  * variable sized blocks.  We use it for allocating dynamic state.
992  */
993 void
994 anv_state_stream_init(struct anv_state_stream *stream,
995                       struct anv_state_pool *state_pool,
996                       uint32_t block_size)
997 {
998    stream->state_pool = state_pool;
999    stream->block_size = block_size;
1000 
1001    stream->block = ANV_STATE_NULL;
1002 
1003    /* Ensure that next + whatever > block_size.  This way the first call to
1004     * state_stream_alloc fetches a new block.
1005     */
1006    stream->next = block_size;
1007 
1008    stream->total_size = 0;
1009    util_dynarray_init(&stream->all_blocks, NULL);
1010 
1011    VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
1012 }
1013 
1014 void
1015 anv_state_stream_finish(struct anv_state_stream *stream)
1016 {
1017    util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) {
1018       VG(VALGRIND_MEMPOOL_FREE(stream, block->map));
1019       VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size));
1020       anv_state_pool_free_no_vg(stream->state_pool, *block);
1021    }
1022    util_dynarray_fini(&stream->all_blocks);
1023 
1024    VG(VALGRIND_DESTROY_MEMPOOL(stream));
1025 }
1026 
1027 struct anv_state
1028 anv_state_stream_alloc(struct anv_state_stream *stream,
1029                        uint32_t size, uint32_t alignment)
1030 {
1031    if (size == 0)
1032       return ANV_STATE_NULL;
1033 
1034    assert(alignment <= PAGE_SIZE);
1035 
1036    uint32_t offset = align(stream->next, alignment);
1037    if (offset + size > stream->block.alloc_size) {
1038       uint32_t block_size = stream->block_size;
1039       if (block_size < size)
1040          block_size = util_next_power_of_two(size);
1041 
1042       stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
1043                                                  block_size, PAGE_SIZE);
1044       if (stream->block.alloc_size == 0)
1045          return ANV_STATE_NULL;
1046 
1047       util_dynarray_append(&stream->all_blocks,
1048                            struct anv_state, stream->block);
1049       VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
1050 
1051       /* Reset back to the start */
1052       stream->next = offset = 0;
1053       assert(offset + size <= stream->block.alloc_size);
1054       stream->total_size += block_size;
1055    }
1056    const bool new_block = stream->next == 0;
1057 
1058    struct anv_state state = stream->block;
1059    state.offset += offset;
1060    state.alloc_size = size;
1061    state.map += offset;
1062 
1063    stream->next = offset + size;
1064 
1065    if (new_block) {
1066       assert(state.map == stream->block.map);
1067       VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size));
1068    } else {
1069       /* This only updates the mempool.  The newly allocated chunk is still
1070        * marked as NOACCESS. */
1071       VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map,
1072                                  stream->next));
1073       /* Mark the newly allocated chunk as undefined */
1074       VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size));
1075    }
1076 
1077    return state;
1078 }
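
/* A minimal usage sketch (not compiled; the 8 KiB block size is arbitrary and
 * "pool" is assumed to be a dynamic-state pool that outlives the stream):
 */
#if 0
static void
example_state_stream_usage(struct anv_state_pool *pool)
{
   struct anv_state_stream stream;
   anv_state_stream_init(&stream, pool, 8192);

   /* Each alloc sub-allocates from the current block and only grabs a fresh
    * block from the pool when the current one runs out of space.
    */
   struct anv_state a = anv_state_stream_alloc(&stream, 256, 64);
   struct anv_state b = anv_state_stream_alloc(&stream, 1024, 64);
   (void)a; (void)b;

   /* Freeing is one-shot: all blocks go back to the pool at once. */
   anv_state_stream_finish(&stream);
}
#endif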
1079 
1080 void
1081 anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool,
1082                              struct anv_state_pool *parent,
1083                              uint32_t count, uint32_t size, uint32_t alignment)
1084 {
1085    pool->pool = parent;
1086    pool->reserved_blocks = ANV_FREE_LIST_EMPTY;
1087    pool->count = count;
1088 
1089    for (unsigned i = 0; i < count; i++) {
1090       struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment);
1091       anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1092    }
1093 }
1094 
1095 void
1096 anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool)
1097 {
1098    struct anv_state *state;
1099 
1100    while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) {
1101       anv_state_pool_free(pool->pool, *state);
1102       pool->count--;
1103    }
1104    assert(pool->count == 0);
1105 }
1106 
1107 struct anv_state
1108 anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool)
1109 {
1110    return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table);
1111 }
1112 
1113 void
1114 anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
1115                              struct anv_state state)
1116 {
1117    anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1118 }
1119 
1120 VkResult
1121 anv_state_reserved_array_pool_init(struct anv_state_reserved_array_pool *pool,
1122                                    struct anv_state_pool *parent,
1123                                    uint32_t count, uint32_t size, uint32_t alignment)
1124 {
1125    struct anv_device *device = parent->block_pool.device;
1126 
1127    pool->pool = parent;
1128    pool->count = count;
1129    pool->size = size;
1130    pool->stride = align(size, alignment);
1131    pool->states = vk_zalloc(&device->vk.alloc,
1132                             sizeof(BITSET_WORD) * BITSET_WORDS(pool->count), 8,
1133                             VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1134    if (pool->states == NULL)
1135       return vk_error(&device->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
1136 
1137    BITSET_SET_RANGE(pool->states, 0, pool->count - 1);
1138    simple_mtx_init(&pool->mutex, mtx_plain);
1139 
1140    pool->state = anv_state_pool_alloc(pool->pool, pool->stride * count, alignment);
1141    if (pool->state.alloc_size == 0) {
1142       vk_free(&device->vk.alloc, pool->states);
1143       return vk_error(&device->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1144    }
1145 
1146    return VK_SUCCESS;
1147 }
1148 
1149 void
1150 anv_state_reserved_array_pool_finish(struct anv_state_reserved_array_pool *pool)
1151 {
1152    anv_state_pool_free(pool->pool, pool->state);
1153    vk_free(&pool->pool->block_pool.device->vk.alloc, pool->states);
1154    simple_mtx_destroy(&pool->mutex);
1155 }
1156 
1157 struct anv_state
1158 anv_state_reserved_array_pool_alloc(struct anv_state_reserved_array_pool *pool,
1159                                     bool alloc_back)
1160 {
1161    simple_mtx_lock(&pool->mutex);
1162    int idx = alloc_back ?
1163       __bitset_last_bit(pool->states, BITSET_WORDS(pool->count)) :
1164       __bitset_ffs(pool->states, BITSET_WORDS(pool->count));
1165    if (idx != 0)
1166       BITSET_CLEAR(pool->states, idx - 1);
1167    simple_mtx_unlock(&pool->mutex);
1168 
1169    if (idx == 0)
1170       return ANV_STATE_NULL;
1171 
1172    idx--;
1173 
1174    struct anv_state state = pool->state;
1175    state.offset += idx * pool->stride;
1176    state.map += idx * pool->stride;
1177    state.alloc_size = pool->size;
1178 
1179    return state;
1180 }
1181 
1182 struct anv_state
1183 anv_state_reserved_array_pool_alloc_index(struct anv_state_reserved_array_pool *pool,
1184                                           uint32_t idx)
1185 {
1186    simple_mtx_lock(&pool->mutex);
1187    bool already_allocated = !BITSET_TEST(pool->states, idx);
1188    if (!already_allocated)
1189       BITSET_CLEAR(pool->states, idx);
1190    simple_mtx_unlock(&pool->mutex);
1191 
1192    if (already_allocated)
1193       return ANV_STATE_NULL;
1194 
1195    struct anv_state state = pool->state;
1196    state.offset += idx * pool->stride;
1197    state.map += idx * pool->stride;
1198    state.alloc_size = pool->size;
1199 
1200    return state;
1201 }
1202 
1203 uint32_t
1204 anv_state_reserved_array_pool_state_index(struct anv_state_reserved_array_pool *pool,
1205                                           struct anv_state state)
1206 {
1207    return (state.offset - pool->state.offset) / pool->stride;
1208 }
1209 
1210 void
1211 anv_state_reserved_array_pool_free(struct anv_state_reserved_array_pool *pool,
1212                                   struct anv_state state)
1213 {
1214    unsigned idx = (state.offset - pool->state.offset) / pool->stride;
1215    simple_mtx_lock(&pool->mutex);
1216    BITSET_SET(pool->states, idx);
1217    simple_mtx_unlock(&pool->mutex);
1218  }
1219 
1220 void
1221 anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
1222                  const char *name, enum anv_bo_alloc_flags alloc_flags)
1223 {
1224    pool->name = name;
1225    pool->device = device;
1226    pool->bo_alloc_flags = alloc_flags;
1227 
1228    for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1229       util_sparse_array_free_list_init(&pool->free_list[i],
1230                                        &device->bo_cache.bo_map, 0,
1231                                        offsetof(struct anv_bo, free_index));
1232    }
1233 
1234    VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
1235 }
1236 
1237 void
1238 anv_bo_pool_finish(struct anv_bo_pool *pool)
1239 {
1240    for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1241       while (1) {
1242          struct anv_bo *bo =
1243             util_sparse_array_free_list_pop_elem(&pool->free_list[i]);
1244          if (bo == NULL)
1245             break;
1246 
1247          /* anv_device_release_bo is going to "free" it */
1248          VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1));
1249          anv_device_release_bo(pool->device, bo);
1250       }
1251    }
1252 
1253    VG(VALGRIND_DESTROY_MEMPOOL(pool));
1254 }
1255 
1256 VkResult
1257 anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
1258                   struct anv_bo **bo_out)
1259 {
1260    const unsigned size_log2 = size < 4096 ? 12 : util_logbase2_ceil(size);
1261    const unsigned pow2_size = 1 << size_log2;
1262    const unsigned bucket = size_log2 - 12;
1263    assert(bucket < ARRAY_SIZE(pool->free_list));
1264 
1265    struct anv_bo *bo =
1266       util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]);
1267    if (bo != NULL) {
1268       VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1269       *bo_out = bo;
1270       return VK_SUCCESS;
1271    }
1272 
1273    VkResult result = anv_device_alloc_bo(pool->device,
1274                                          pool->name,
1275                                          pow2_size,
1276                                          pool->bo_alloc_flags,
1277                                          0 /* explicit_address */,
1278                                          &bo);
1279    if (result != VK_SUCCESS)
1280       return result;
1281 
1282    /* We want it to look like it came from this pool */
1283    VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
1284    VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1285 
1286    *bo_out = bo;
1287 
1288    return VK_SUCCESS;
1289 }
1290 
1291 void
1292 anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
1293 {
1294    VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
1295 
1296    assert(util_is_power_of_two_or_zero(bo->size));
1297    const unsigned size_log2 = util_logbase2_ceil(bo->size);
1298    const unsigned bucket = size_log2 - 12;
1299    assert(bucket < ARRAY_SIZE(pool->free_list));
1300 
1301    assert(util_sparse_array_get(&pool->device->bo_cache.bo_map,
1302                                 bo->gem_handle) == bo);
1303    util_sparse_array_free_list_push(&pool->free_list[bucket],
1304                                     &bo->gem_handle, 1);
1305 }
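
/* A minimal usage sketch (not compiled; the 12 KiB size is arbitrary):
 * allocating a BO from the pool and returning it to its size bucket.
 */
#if 0
static void
example_bo_pool_usage(struct anv_bo_pool *pool)
{
   struct anv_bo *bo;
   if (anv_bo_pool_alloc(pool, 12288, &bo) != VK_SUCCESS)
      return;

   /* The BO was rounded up to the next power of two (16384 here), so the
    * free-list bucket on free is determined purely by bo->size.
    */
   anv_bo_pool_free(pool, bo);
}
#endif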
1306 
1307 // Scratch pool
1308 
1309 void
1310 anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool,
1311                       bool protected)
1312 {
1313    memset(pool, 0, sizeof(*pool));
1314    pool->alloc_flags = ANV_BO_ALLOC_INTERNAL |
1315       (protected ? ANV_BO_ALLOC_PROTECTED : 0) |
1316       (device->info->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0);
1317 }
1318 
1319 void
1320 anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
1321 {
1322    for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) {
1323       for (unsigned i = 0; i < 16; i++) {
1324          if (pool->bos[i][s] != NULL)
1325             anv_device_release_bo(device, pool->bos[i][s]);
1326       }
1327    }
1328 
1329    for (unsigned i = 0; i < 16; i++) {
1330       if (pool->surf_states[i].map != NULL) {
1331          anv_state_pool_free(&device->scratch_surface_state_pool,
1332                              pool->surf_states[i]);
1333       }
1334    }
1335 }
1336 
1337 struct anv_bo *
1338 anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
1339                        gl_shader_stage stage, unsigned per_thread_scratch)
1340 {
1341    if (per_thread_scratch == 0)
1342       return NULL;
1343 
1344    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1345    assert(scratch_size_log2 < 16);
1346 
1347    assert(stage < ARRAY_SIZE(pool->bos));
1348 
1349    const struct intel_device_info *devinfo = device->info;
1350 
1351    /* On GFX version 12.5, scratch access changed to a surface-based model.
1352     * Instead of each shader type having its own layout based on IDs passed
1353     * from the relevant fixed-function unit, all scratch access is based on
1354     * thread IDs like it always has been for compute.
1355     */
1356    if (devinfo->verx10 >= 125)
1357       stage = MESA_SHADER_COMPUTE;
1358 
1359    struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);
1360 
1361    if (bo != NULL)
1362       return bo;
1363 
1364    assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
1365    uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
1366 
1367    /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
1368     * are still relative to the general state base address.  When we emit
1369     * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
1370     * to the maximum (1 page under 4GB).  This allows us to just place the
1371     * scratch buffers anywhere we wish in the bottom 32 bits of address space
1372     * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
1373     * However, in order to do so, we need to ensure that the kernel does not
1374     * place the scratch BO above the 32-bit boundary.
1375     *
1376     * NOTE: Technically, it can't go "anywhere" because the top page is off
1377     * limits.  However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
1378     * kernel allocates space using
1379     *
1380     *    end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
1381     *
1382     * so nothing will ever touch the top page.
1383     */
1384    VkResult result = anv_device_alloc_bo(device, "scratch", size,
1385                                          pool->alloc_flags,
1386                                          0 /* explicit_address */,
1387                                          &bo);
1388    if (result != VK_SUCCESS)
1389       return NULL; /* TODO */
1390 
1391    struct anv_bo *current_bo =
1392       p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo);
1393    if (current_bo) {
1394       anv_device_release_bo(device, bo);
1395       return current_bo;
1396    } else {
1397       return bo;
1398    }
1399 }
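
/* A minimal usage sketch (hypothetical caller, assuming the device's scratch
 * pool lives in device->scratch_pool as elsewhere in anv):
 *
 *    struct anv_bo *scratch =
 *       anv_scratch_pool_alloc(device, &device->scratch_pool,
 *                              MESA_SHADER_FRAGMENT, 4096);
 *
 * Racing callers may both allocate a BO for the same slot, but the cmpxchg
 * above publishes only one of them and the loser releases its copy, so
 * callers can use the returned BO without any further locking.
 */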
1400 
1401 uint32_t
1402 anv_scratch_pool_get_surf(struct anv_device *device,
1403                           struct anv_scratch_pool *pool,
1404                           unsigned per_thread_scratch)
1405 {
1406    assert(device->info->verx10 >= 125);
1407 
1408    if (per_thread_scratch == 0)
1409       return 0;
1410 
1411    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1412    assert(scratch_size_log2 < 16);
1413 
1414    uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
1415    if (surf > 0)
1416       return surf;
1417 
1418    struct anv_bo *bo =
1419       anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
1420                              per_thread_scratch);
1421    struct anv_address addr = { .bo = bo };
1422 
1423    struct anv_state state =
1424       anv_state_pool_alloc(&device->scratch_surface_state_pool,
1425                            device->isl_dev.ss.size, 64);
1426 
1427    isl_surf_usage_flags_t usage =
1428       (pool->alloc_flags & ANV_BO_ALLOC_PROTECTED) ?
1429       ISL_SURF_USAGE_PROTECTED_BIT : 0;
1430 
1431    isl_buffer_fill_state(&device->isl_dev, state.map,
1432                          .address = anv_address_physical(addr),
1433                          .size_B = bo->size,
1434                          .mocs = anv_mocs(device, bo, usage),
1435                          .format = ISL_FORMAT_RAW,
1436                          .swizzle = ISL_SWIZZLE_IDENTITY,
1437                          .stride_B = per_thread_scratch,
1438                          .is_scratch = true);
1439 
1440    uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
1441                                        0, state.offset);
1442    if (current) {
1443       anv_state_pool_free(&device->scratch_surface_state_pool, state);
1444       return current;
1445    } else {
1446       pool->surf_states[scratch_size_log2] = state;
1447       return state.offset;
1448    }
1449 }
1450 
1451 VkResult
1452 anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
1453 {
1454    util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
1455 
1456    if (pthread_mutex_init(&cache->mutex, NULL)) {
1457       util_sparse_array_finish(&cache->bo_map);
1458       return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
1459                        "pthread_mutex_init failed: %m");
1460    }
1461 
1462    return VK_SUCCESS;
1463 }
1464 
1465 void
1466 anv_bo_cache_finish(struct anv_bo_cache *cache)
1467 {
1468    util_sparse_array_finish(&cache->bo_map);
1469    pthread_mutex_destroy(&cache->mutex);
1470 }
1471 
1472 static void
1473 anv_bo_unmap_close(struct anv_device *device, struct anv_bo *bo)
1474 {
1475    if (bo->map && !bo->from_host_ptr)
1476       anv_device_unmap_bo(device, bo, bo->map, bo->size, false /* replace */);
1477 
1478    assert(bo->gem_handle != 0);
1479    device->kmd_backend->gem_close(device, bo);
1480 }
1481 
1482 static void
1483 anv_bo_vma_free(struct anv_device *device, struct anv_bo *bo)
1484 {
1485    if (bo->offset != 0 && !(bo->alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS)) {
1486       assert(bo->vma_heap != NULL);
1487       anv_vma_free(device, bo->vma_heap, bo->offset, bo->size);
1488    }
1489    bo->vma_heap = NULL;
1490 }
1491 
1492 static void
1493 anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
1494 {
1495    /* Don't release the VMA if the unbind fails. */
1496    if (device->kmd_backend->vm_unbind_bo(device, bo) == VK_SUCCESS)
1497       anv_bo_vma_free(device, bo);
1498 
1499    anv_bo_unmap_close(device, bo);
1500 }
1501 
1502 static VkResult
1503 anv_bo_vma_alloc_or_close(struct anv_device *device,
1504                           struct anv_bo *bo,
1505                           enum anv_bo_alloc_flags alloc_flags,
1506                           uint64_t explicit_address)
1507 {
1508    assert(bo->vma_heap == NULL);
1509    assert(explicit_address == intel_48b_address(explicit_address));
1510 
1511    uint32_t align = device->physical->info.mem_alignment;
1512 
1513    /* If it's big enough to store a tiled resource, we need 64K alignment */
1514    if (bo->size >= 64 * 1024)
1515       align = MAX2(64 * 1024, align);
1516 
1517    /* If we're using the AUX map, make sure we follow the required
1518     * alignment.
1519     */
1520    if (alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED)
1521       align = MAX2(intel_aux_map_get_alignment(device->aux_map_ctx), align);
1522 
1523    /* Opportunistically align addresses to 2MB when above 1MB. We do this
1524     * because it gives the kernel an opportunity to use Transparent Huge
1525     * Pages (the 2MB page table layout) for faster memory access.
1526     *
1527     * Only available on ICL+.
1528     */
1529    if (device->info->ver >= 11 && bo->size >= 1 * 1024 * 1024)
1530       align = MAX2(2 * 1024 * 1024, align);
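
   /* To illustrate how the rules above combine (assuming the common 4KB
    * device mem_alignment):
    *
    *    size =  16KB            -> align =  4KB  (base alignment only)
    *    size = 128KB            -> align = 64KB  (tiled-resource rule)
    *    size =   4MB on Gen11+  -> align =  2MB  (THP rule wins)
    */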
1531 
1532    if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
1533       bo->offset = intel_canonical_address(explicit_address);
1534    } else {
1535       bo->offset = anv_vma_alloc(device, bo->size, align, alloc_flags,
1536                                  explicit_address, &bo->vma_heap);
1537       if (bo->offset == 0) {
1538          anv_bo_unmap_close(device, bo);
1539          return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1540                           "failed to allocate virtual address for BO");
1541       }
1542    }
1543 
1544    return VK_SUCCESS;
1545 }
1546 
1547 enum intel_device_info_mmap_mode
1548 anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo)
1549 {
1550    enum anv_bo_alloc_flags alloc_flags = bo->alloc_flags;
1551 
1552    if (device->info->has_set_pat_uapi)
1553       return anv_device_get_pat_entry(device, alloc_flags)->mmap;
1554 
1555    if (anv_physical_device_has_vram(device->physical)) {
1556       if ((alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) ||
1557           (alloc_flags & ANV_BO_ALLOC_IMPORTED))
1558          return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1559 
1560       return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1561    }
1562 
1563    /* gfx9 atom */
1564    if (!device->info->has_llc) {
1565       /* The user wants cached and coherent memory, but achieving that without
1566        * LLC on older platforms requires DRM_IOCTL_I915_GEM_SET_CACHING to be
1567        * supported and set.
1568        */
1569       if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
1570          return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1571 
1572       return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1573    }
1574 
1575    if (alloc_flags & (ANV_BO_ALLOC_SCANOUT | ANV_BO_ALLOC_EXTERNAL))
1576       return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1577 
1578    return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1579 }
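
/* Two concrete examples of the legacy (non-PAT) paths above: on a non-LLC
 * gfx9 part, a BO allocated with ANV_BO_ALLOC_MAPPED |
 * ANV_BO_ALLOC_HOST_CACHED is mapped WB, while on an LLC part a BO with
 * ANV_BO_ALLOC_SCANOUT set is mapped WC (the consumer of a scanout/external
 * buffer may not snoop the CPU caches).
 */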
1580 
1581 VkResult
1582 anv_device_alloc_bo(struct anv_device *device,
1583                     const char *name,
1584                     uint64_t size,
1585                     enum anv_bo_alloc_flags alloc_flags,
1586                     uint64_t explicit_address,
1587                     struct anv_bo **bo_out)
1588 {
1589    /* A BO that needs CPU access must be HOST_CACHED, HOST_COHERENT, or both. */
1590    assert((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0 ||
1591           (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)));
1592 
1593    /* On platforms with LLC we can promote all BOs to cached+coherent for free. */
1594    const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
1595                                                          ANV_BO_ALLOC_EXTERNAL |
1596                                                          ANV_BO_ALLOC_PROTECTED;
1597    if (device->info->has_llc && ((alloc_flags & not_allowed_promotion) == 0))
1598       alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
1599 
1600    const uint32_t bo_flags =
1601          device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
1602 
1603    /* The kernel is going to give us whole pages anyway. */
1604    size = align64(size, 4096);
1605 
1606    const uint64_t ccs_offset = size;
1607    if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
1608       assert(device->info->has_aux_map);
1609       size += size / INTEL_AUX_MAP_MAIN_SIZE_SCALEDOWN;
1610       size = align64(size, 4096);
1611    }
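
   /* For example, assuming the usual 256:1 main-to-CCS ratio
    * (INTEL_AUX_MAP_MAIN_SIZE_SCALEDOWN == 256), a 1MB allocation becomes
    *
    *    ccs_offset = 1MB
    *    size       = 1MB + 1MB / 256 = 1MB + 4KB
    *
    * so the CCS data lives in the tail pages of the same BO.
    */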
1612 
1613    const struct intel_memory_class_instance *regions[2];
1614    uint32_t nregions = 0;
1615 
1616    /* If we have VRAM, we have multiple memory regions and should choose
1617     * one of them.
1618     */
1619    if (anv_physical_device_has_vram(device->physical)) {
1620       /* This always tries to put the object in local memory. Here
1621        * vram_non_mappable & vram_mappable are actually the same region.
1622        */
1623       if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
1624          regions[nregions++] = device->physical->sys.region;
1625       else
1626          regions[nregions++] = device->physical->vram_non_mappable.region;
1627 
1628       /* If the buffer is mapped on the host, add the system memory region.
1629        * This ensures that if the buffer cannot live in mappable local memory,
1630        * it can be spilled to system memory.
1631        */
1632       if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
1633           ((alloc_flags & ANV_BO_ALLOC_MAPPED) ||
1634            (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)))
1635          regions[nregions++] = device->physical->sys.region;
1636    } else {
1637       regions[nregions++] = device->physical->sys.region;
1638    }
1639 
1640    uint64_t actual_size;
1641    uint32_t gem_handle = device->kmd_backend->gem_create(device, regions,
1642                                                          nregions, size,
1643                                                          alloc_flags,
1644                                                          &actual_size);
1645    if (gem_handle == 0)
1646       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1647 
1648    struct anv_bo new_bo = {
1649       .name = name,
1650       .gem_handle = gem_handle,
1651       .refcount = 1,
1652       .offset = -1,
1653       .size = size,
1654       .ccs_offset = ccs_offset,
1655       .actual_size = actual_size,
1656       .flags = bo_flags,
1657       .alloc_flags = alloc_flags,
1658    };
1659 
1660    if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
1661       VkResult result = anv_device_map_bo(device, &new_bo, 0, size,
1662                                           NULL, &new_bo.map);
1663       if (unlikely(result != VK_SUCCESS)) {
1664          device->kmd_backend->gem_close(device, &new_bo);
1665          return result;
1666       }
1667    }
1668 
1669    VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1670                                                alloc_flags,
1671                                                explicit_address);
1672    if (result != VK_SUCCESS)
1673       return result;
1674 
1675    result = device->kmd_backend->vm_bind_bo(device, &new_bo);
1676    if (result != VK_SUCCESS) {
1677       anv_bo_vma_free(device, &new_bo);
1678       anv_bo_unmap_close(device, &new_bo);
1679       return result;
1680    }
1681 
1682    assert(new_bo.gem_handle);
1683 
1684    /* Since we just got this gem_handle from gem_create above, we know no one
1685     * else is touching this BO at the moment so we don't need to lock here.
1686     */
1687    struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle);
1688    *bo = new_bo;
1689 
1690    *bo_out = bo;
1691 
1692    ANV_RMV(bo_allocate, device, bo);
1693 
1694    return VK_SUCCESS;
1695 }
1696 
1697 VkResult
1698 anv_device_map_bo(struct anv_device *device,
1699                   struct anv_bo *bo,
1700                   uint64_t offset,
1701                   size_t size,
1702                   void *placed_addr,
1703                   void **map_out)
1704 {
1705    assert(!bo->from_host_ptr);
1706    assert(size > 0);
1707 
1708    void *map = device->kmd_backend->gem_mmap(device, bo, offset, size, placed_addr);
1709    if (unlikely(map == MAP_FAILED))
1710       return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
1711 
1712    assert(placed_addr == NULL || map == placed_addr);
1713 
1714    assert(map != NULL);
1715    VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1));
1716 
1717    if (map_out)
1718       *map_out = map;
1719 
1720    return VK_SUCCESS;
1721 }
1722 
1723 VkResult
1724 anv_device_unmap_bo(struct anv_device *device,
1725                     struct anv_bo *bo,
1726                     void *map, size_t map_size,
1727                     bool replace)
1728 {
1729    assert(!bo->from_host_ptr);
1730 
1731    if (replace) {
1732       map = mmap(map, map_size, PROT_NONE,
1733                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
1734       if (map == MAP_FAILED) {
1735          return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
1736                           "Failed to map over original mapping");
1737       }
1738    } else {
1739       VG(VALGRIND_FREELIKE_BLOCK(map, 0));
1740       munmap(map, map_size);
1741    }
1742    return VK_SUCCESS;
1743 }
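
/* A minimal usage sketch (hypothetical caller, error handling elided): map a
 * whole BO for a CPU write, then drop the mapping again.
 *
 *    void *map;
 *    if (anv_device_map_bo(device, bo, 0, bo->size, NULL, &map) == VK_SUCCESS) {
 *       memset(map, 0, bo->size);
 *       anv_device_unmap_bo(device, bo, map, bo->size, false);
 *    }
 *
 * Passing replace=true instead keeps the virtual address range reserved by
 * mapping PROT_NONE anonymous memory over the old mapping rather than
 * unmapping it, which is useful when something else must later be mapped at
 * the same address (e.g. placed memory maps).
 */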
1744 
1745 VkResult
1746 anv_device_import_bo_from_host_ptr(struct anv_device *device,
1747                                    void *host_ptr, uint32_t size,
1748                                    enum anv_bo_alloc_flags alloc_flags,
1749                                    uint64_t client_address,
1750                                    struct anv_bo **bo_out)
1751 {
1752    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1753                            ANV_BO_ALLOC_HOST_CACHED |
1754                            ANV_BO_ALLOC_HOST_COHERENT |
1755                            ANV_BO_ALLOC_AUX_CCS |
1756                            ANV_BO_ALLOC_PROTECTED |
1757                            ANV_BO_ALLOC_FIXED_ADDRESS)));
1758    assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
1759 
1760    struct anv_bo_cache *cache = &device->bo_cache;
1761    const uint32_t bo_flags =
1762          device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
1763 
1764    uint32_t gem_handle = device->kmd_backend->gem_create_userptr(device, host_ptr, size);
1765    if (!gem_handle)
1766       return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1767 
1768    pthread_mutex_lock(&cache->mutex);
1769 
1770    struct anv_bo *bo = NULL;
1771    if (device->info->kmd_type == INTEL_KMD_TYPE_XE) {
1772       bo = vk_zalloc(&device->vk.alloc, sizeof(*bo), 8,
1773                      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1774       if (!bo) {
1775          pthread_mutex_unlock(&cache->mutex);
1776          return VK_ERROR_OUT_OF_HOST_MEMORY;
1777       }
1778    } else {
1779       bo = anv_device_lookup_bo(device, gem_handle);
1780    }
1781 
1782    if (bo->refcount > 0) {
1783       /* VK_EXT_external_memory_host doesn't require handling importing the
1784        * same pointer twice at the same time, but we don't get in the way.  If
1785        * the kernel gives us the same gem_handle, we only succeed if the flags match.
1786        */
1787       assert(bo->gem_handle == gem_handle);
1788       if (bo_flags != bo->flags) {
1789          pthread_mutex_unlock(&cache->mutex);
1790          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1791                           "same host pointer imported two different ways");
1792       }
1793 
1794       if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
1795           (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
1796          pthread_mutex_unlock(&cache->mutex);
1797          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1798                           "The same BO was imported with and without buffer "
1799                           "device address");
1800       }
1801 
1802       if (client_address && client_address != intel_48b_address(bo->offset)) {
1803          pthread_mutex_unlock(&cache->mutex);
1804          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1805                           "The same BO was imported at two different "
1806                           "addresses");
1807       }
1808 
1809       __sync_fetch_and_add(&bo->refcount, 1);
1810    } else {
1811       alloc_flags |= ANV_BO_ALLOC_IMPORTED;
1812       struct anv_bo new_bo = {
1813          .name = "host-ptr",
1814          .gem_handle = gem_handle,
1815          .refcount = 1,
1816          .offset = -1,
1817          .size = size,
1818          .actual_size = size,
1819          .map = host_ptr,
1820          .flags = bo_flags,
1821          .alloc_flags = alloc_flags,
1822          .from_host_ptr = true,
1823       };
1824 
1825       VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1826                                                   alloc_flags,
1827                                                   client_address);
1828       if (result != VK_SUCCESS) {
1829          pthread_mutex_unlock(&cache->mutex);
1830          return result;
1831       }
1832 
1833       result = device->kmd_backend->vm_bind_bo(device, &new_bo);
1834       if (result != VK_SUCCESS) {
1835          anv_bo_vma_free(device, &new_bo);
1836          pthread_mutex_unlock(&cache->mutex);
1837          return result;
1838       }
1839 
1840       *bo = new_bo;
1841 
1842       ANV_RMV(bo_allocate, device, bo);
1843    }
1844 
1845    pthread_mutex_unlock(&cache->mutex);
1846    *bo_out = bo;
1847 
1848    return VK_SUCCESS;
1849 }
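
/* A minimal usage sketch (hypothetical caller, error handling elided) for
 * VK_EXT_external_memory_host style imports; host_ptr and size are assumed
 * to be page aligned, which the userptr ioctl requires:
 *
 *    struct anv_bo *bo;
 *    VkResult res =
 *       anv_device_import_bo_from_host_ptr(device, host_ptr, size,
 *                                          ANV_BO_ALLOC_EXTERNAL,
 *                                          0, &bo);
 *
 * Passing 0 as client_address lets the allocator pick any virtual address.
 * On the i915 path, importing the same pointer again simply bumps the
 * refcount on the BO the cache already tracks, as handled above.
 */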
1850 
1851 VkResult
1852 anv_device_import_bo(struct anv_device *device,
1853                      int fd,
1854                      enum anv_bo_alloc_flags alloc_flags,
1855                      uint64_t client_address,
1856                      struct anv_bo **bo_out)
1857 {
1858    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1859                            ANV_BO_ALLOC_HOST_CACHED |
1860                            ANV_BO_ALLOC_HOST_COHERENT |
1861                            ANV_BO_ALLOC_FIXED_ADDRESS)));
1862    assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
1863 
1864    struct anv_bo_cache *cache = &device->bo_cache;
1865 
1866    pthread_mutex_lock(&cache->mutex);
1867 
1868    uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
1869    if (!gem_handle) {
1870       pthread_mutex_unlock(&cache->mutex);
1871       return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1872    }
1873 
1874    struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
1875 
1876    uint32_t bo_flags;
1877    VkResult result = anv_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
1878                                                                alloc_flags,
1879                                                                &bo_flags);
1880    if (result != VK_SUCCESS) {
1881       pthread_mutex_unlock(&cache->mutex);
1882       return result;
1883    }
1884 
1885    if (bo->refcount > 0) {
1886       if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
1887           (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
1888          pthread_mutex_unlock(&cache->mutex);
1889          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1890                           "The same BO was imported with and without buffer "
1891                           "device address");
1892       }
1893 
1894       if (client_address && client_address != intel_48b_address(bo->offset)) {
1895          pthread_mutex_unlock(&cache->mutex);
1896          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1897                           "The same BO was imported at two different "
1898                           "addresses");
1899       }
1900 
1901       __sync_fetch_and_add(&bo->refcount, 1);
1902    } else {
1903       alloc_flags |= ANV_BO_ALLOC_IMPORTED;
1904       struct anv_bo new_bo = {
1905          .name = "imported",
1906          .gem_handle = gem_handle,
1907          .refcount = 1,
1908          .offset = -1,
1909          .alloc_flags = alloc_flags,
1910       };
1911 
1912       off_t size = lseek(fd, 0, SEEK_END);
1913       if (size == (off_t)-1) {
1914          device->kmd_backend->gem_close(device, &new_bo);
1915          pthread_mutex_unlock(&cache->mutex);
1916          return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1917       }
1918       new_bo.size = size;
1919       new_bo.actual_size = size;
1920 
1921       VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1922                                                   alloc_flags,
1923                                                   client_address);
1924       if (result != VK_SUCCESS) {
1925          pthread_mutex_unlock(&cache->mutex);
1926          return result;
1927       }
1928 
1929       result = device->kmd_backend->vm_bind_bo(device, &new_bo);
1930       if (result != VK_SUCCESS) {
1931          anv_bo_vma_free(device, &new_bo);
1932          pthread_mutex_unlock(&cache->mutex);
1933          return result;
1934       }
1935 
1936       *bo = new_bo;
1937 
1938       ANV_RMV(bo_allocate, device, bo);
1939    }
1940 
1941    bo->flags = bo_flags;
1942 
1943    pthread_mutex_unlock(&cache->mutex);
1944    *bo_out = bo;
1945 
1946    return VK_SUCCESS;
1947 }
1948 
1949 VkResult
1950 anv_device_export_bo(struct anv_device *device,
1951                      struct anv_bo *bo, int *fd_out)
1952 {
1953    assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
1954 
1955    /* This BO must have been flagged external in order for us to be able
1956     * to export it.  This is done based on external options passed into
1957     * anv_AllocateMemory.
1958     */
1959    assert(anv_bo_is_external(bo));
1960 
1961    int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
1962    if (fd < 0)
1963       return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);
1964 
1965    *fd_out = fd;
1966 
1967    return VK_SUCCESS;
1968 }
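
/* A typical round trip (sketch, error handling elided), assuming bo was
 * allocated with ANV_BO_ALLOC_EXTERNAL: exporting a BO as a dma-buf fd and
 * re-importing it on the same device yields the same struct anv_bo, since
 * the kernel hands back the existing gem_handle and the import path above
 * then only bumps the refcount:
 *
 *    int fd;
 *    anv_device_export_bo(device, bo, &fd);
 *    struct anv_bo *bo2;
 *    anv_device_import_bo(device, fd, ANV_BO_ALLOC_EXTERNAL, 0, &bo2);
 *    assert(bo2 == bo);
 *    close(fd);
 */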
1969 
1970 VkResult
1971 anv_device_get_bo_tiling(struct anv_device *device,
1972                          struct anv_bo *bo,
1973                          enum isl_tiling *tiling_out)
1974 {
1975    int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
1976    if (i915_tiling < 0) {
1977       return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1978                        "failed to get BO tiling: %m");
1979    }
1980 
1981    *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);
1982 
1983    return VK_SUCCESS;
1984 }
1985 
1986 VkResult
1987 anv_device_set_bo_tiling(struct anv_device *device,
1988                          struct anv_bo *bo,
1989                          uint32_t row_pitch_B,
1990                          enum isl_tiling tiling)
1991 {
1992    int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
1993                                 isl_tiling_to_i915_tiling(tiling));
1994    if (ret) {
1995       return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1996                        "failed to set BO tiling: %m");
1997    }
1998 
1999    return VK_SUCCESS;
2000 }
2001 
2002 static bool
2003 atomic_dec_not_one(uint32_t *counter)
2004 {
2005    uint32_t old, val;
2006 
2007    val = *counter;
2008    while (1) {
2009       if (val == 1)
2010          return false;
2011 
2012       old = __sync_val_compare_and_swap(counter, val, val - 1);
2013       if (old == val)
2014          return true;
2015 
2016       val = old;
2017    }
2018 }
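
/* For example, if *counter == 3 and two threads race: both read 3, one CAS
 * (3 -> 2) succeeds and returns true, the other sees old == 2, retries with
 * val = 2 and then succeeds (2 -> 1).  A thread that reads 1 returns false
 * without touching the counter, which is what lets anv_device_release_bo
 * below take the cache mutex before the last reference is dropped.
 */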
2019 
2020 void
2021 anv_device_release_bo(struct anv_device *device,
2022                       struct anv_bo *bo)
2023 {
2024    struct anv_bo_cache *cache = &device->bo_cache;
2025    const bool bo_is_xe_userptr = device->info->kmd_type == INTEL_KMD_TYPE_XE &&
2026                                  bo->from_host_ptr;
2027    assert(bo_is_xe_userptr ||
2028           anv_device_lookup_bo(device, bo->gem_handle) == bo);
2029 
2030    /* Try to decrement the counter but don't go below one.  If this succeeds
2031     * then the refcount has been decremented and we are not the last
2032     * reference.
2033     */
2034    if (atomic_dec_not_one(&bo->refcount))
2035       return;
2036 
2037    ANV_RMV(bo_destroy, device, bo);
2038 
2039    pthread_mutex_lock(&cache->mutex);
2040 
2041    /* We are probably the last reference since our attempt to decrement above
2042     * failed.  However, we can't actually know until we are inside the mutex.
2043     * Otherwise, someone could import the BO between the decrement and our
2044     * taking the mutex.
2045     */
2046    if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
2047       /* Turns out we're not the last reference.  Unlock and bail. */
2048       pthread_mutex_unlock(&cache->mutex);
2049       return;
2050    }
2051    assert(bo->refcount == 0);
2052 
2053    /* Memset the BO just in case.  The refcount being zero should be enough to
2054     * prevent someone from assuming the data is valid, but it's safer to
2055     * stomp it to zero anyway.  We explicitly do this *before* we actually
2056     * close the GEM handle to ensure that if anyone allocates something and
2057     * gets the same GEM handle, the memset has already happened and won't
2058     * stomp all over any data they may write in this BO.
2059     */
2060    struct anv_bo old_bo = *bo;
2061 
2062    if (bo_is_xe_userptr)
2063       vk_free(&device->vk.alloc, bo);
2064    else
2065       memset(bo, 0, sizeof(*bo));
2066 
2067    anv_bo_finish(device, &old_bo);
2068 
2069    /* Don't unlock until we've actually closed the BO.  The whole point of
2070     * the BO cache is to ensure that we correctly handle races with creating
2071     * and releasing GEM handles and we don't want to let someone import the BO
2072     * again between mutex unlock and closing the GEM handle.
2073     */
2074    pthread_mutex_unlock(&cache->mutex);
2075 }
2076