// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "xnnpack/cache.h"

#include <assert.h>  // For assert.
#include <stddef.h>  // For size_t.
#include <stdint.h>  // For uint32_t.
#include <string.h>  // For memcmp and memset.

#include "xnnpack.h"
#include "xnnpack/allocator.h"
#include "xnnpack/log.h"
#include "xnnpack/math.h"
#include "xnnpack/mutex.h"

#define XNN_CACHE_HASH_SEED 7
#define XNN_CACHE_INITIAL_BUCKETS 32
#define XNN_CACHE_MAX_LOAD 0.75
// Max load factor is 0.75 (3/4); the table grows when
// num_entries / num_buckets > 3 / 4.
#define XNN_CACHE_MAX_LOAD_ENTRIES_MULTIPLIER 4
#define XNN_CACHE_MAX_LOAD_BUCKETS_MULTIPLIER 3

#define XNN_CACHE_GROWTH_FACTOR 2

// MurmurHash3 implementation, copied from smhasher, with minor modifications
// in style and main loop.
static inline uint32_t fmix32(uint32_t h)
{
  h ^= h >> 16;
  h *= UINT32_C(0x85EBCA6B);
  h ^= h >> 13;
  h *= UINT32_C(0xC2B2AE35);
  h ^= h >> 16;

  return h;
}

static uint32_t murmur_hash3(const void* key, size_t len, uint32_t seed)
{
  const uint8_t* data = (const uint8_t*) key;
  uint32_t h1 = seed;

  const uint32_t c1 = UINT32_C(0xCC9E2D51);
  const uint32_t c2 = UINT32_C(0x1B873593);

  const uint32_t* blocks = (const uint32_t*) data;
  for (; len >= sizeof(uint32_t); len -= sizeof(uint32_t)) {
    uint32_t k1 = *blocks++;

    k1 *= c1;
    k1 = math_rotl_u32(k1, 15);
    k1 *= c2;

    h1 ^= k1;
    h1 = math_rotl_u32(h1, 13);
    h1 = h1 * 5 + UINT32_C(0xE6546B64);
  }

  const uint8_t* tail = (const uint8_t*) blocks;

  uint32_t k1 = 0;

  switch (len & 3) {
    case 3:
      k1 ^= tail[2] << 16;
      // Fall through.
    case 2:
      k1 ^= tail[1] << 8;
      // Fall through.
    case 1:
      k1 ^= tail[0];
      k1 *= c1;
      k1 = math_rotl_u32(k1, 15);
      k1 *= c2;
      h1 ^= k1;
  }

  h1 ^= len;

  return fmix32(h1);
}
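
// Keys (the raw bytes of generated code or packed weights) are hashed with
// murmur_hash3 and mapped to a bucket by masking, since the bucket count is
// always a power of two. Illustrative sketch of the scheme used by lookup()
// and insert() below, with collisions resolved by linear probing:
//
//   const uint32_t hash = murmur_hash3(ptr, size, XNN_CACHE_HASH_SEED);
//   size_t idx = hash & (cache->num_buckets - 1);  // home bucket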

#ifndef NDEBUG
// This function is only used by an assert, so do not include it in non-debug
// builds.
static inline size_t cache_size(struct xnn_cache* cache) {
  switch (cache->type) {
    case xnn_cache_type_code:
      return cache->code.size;
    case xnn_cache_type_weights:
      return cache->weights.size;
    default:
      XNN_UNREACHABLE;
  }
  return SIZE_MAX;
}
#endif

static inline void* cache_start(struct xnn_cache* cache) {
  switch (cache->type) {
    case xnn_cache_type_code:
      return cache->code.start;
    case xnn_cache_type_weights:
      return cache->weights.start;
    default:
      XNN_UNREACHABLE;
  }
  return NULL;
}

enum xnn_status xnn_init_cache_with_size(struct xnn_cache* cache, size_t num_buckets, enum xnn_cache_type cache_type)
{
  memset(cache, 0, sizeof(struct xnn_cache));
  cache->buckets = (struct xnn_cache_bucket*) xnn_allocate_zero_memory(num_buckets * sizeof(struct xnn_cache_bucket));
  if (cache->buckets == NULL) {
    xnn_log_error("failed to allocate memory for cache buckets");
    return xnn_status_out_of_memory;
  }

  cache->type = cache_type;
  cache->num_buckets = num_buckets;
  return xnn_status_success;
}

enum xnn_status xnn_init_code_cache_with_size(struct xnn_code_cache* cache, size_t num_buckets)
{
  memset(cache, 0, sizeof(struct xnn_code_cache));
  enum xnn_status status = xnn_status_success;
  status = xnn_init_cache_with_size(&cache->cache, num_buckets, xnn_cache_type_code);
  if (status != xnn_status_success) {
    goto error;
  }

  status = xnn_allocate_code_memory(&cache->cache.code, XNN_DEFAULT_CODE_BUFFER_SIZE);
  if (status != xnn_status_success) {
    goto error;
  }

  return xnn_status_success;

error:
  xnn_release_code_cache(cache);
  return status;
}

enum xnn_status xnn_init_code_cache(struct xnn_code_cache* cache)
{
  return xnn_init_code_cache_with_size(cache, XNN_CACHE_INITIAL_BUCKETS);
}

static bool cache_buckets_grow(struct xnn_cache* cache)
{
  const size_t new_num_buckets = cache->num_buckets * XNN_CACHE_GROWTH_FACTOR;
  assert(is_po2(new_num_buckets));
  struct xnn_cache tmp_cache;
  if (xnn_init_cache_with_size(&tmp_cache, new_num_buckets, cache->type) != xnn_status_success) {
    // Failed to allocate the larger bucket array; keep the existing table.
    return false;
  }

  for (size_t i = 0; i < cache->num_buckets; i++) {
    struct xnn_cache_bucket b = cache->buckets[i];
    if (b.size == 0) {
      continue;
    }

    // Find the first empty slot by linear probing to insert. No need to check
    // hashes since we are not looking up anything, just moving things around
    // into a bigger hash table.
    const size_t mask = tmp_cache.num_buckets - 1;
    size_t idx = b.hash & mask;
    while (tmp_cache.buckets[idx].size != 0) {
      idx = (idx + 1) & mask;
    }
    tmp_cache.buckets[idx].hash = b.hash;
    tmp_cache.buckets[idx].size = b.size;
    tmp_cache.buckets[idx].offset = b.offset;
  }

  xnn_release_memory(cache->buckets);

  cache->buckets = tmp_cache.buckets;
  cache->num_buckets = tmp_cache.num_buckets;
  return true;
}

static inline bool bytes_equal(struct xnn_cache* cache, void* ptr, size_t size, size_t offset)
{
  return memcmp(ptr, (void*) ((uintptr_t) cache_start(cache) + offset), size) == 0;
}

static bool lookup(struct xnn_cache* cache, void* ptr, size_t size, uint32_t hash, size_t* index)
{
  assert(is_po2(cache->num_buckets));
  const size_t mask = cache->num_buckets - 1;
  size_t idx = hash & mask;
  const struct xnn_cache_bucket* buckets = cache->buckets;

  // Linear probing.
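  // Entries are never removed from the table, so the probe can stop at the
  // first empty bucket: if the key were present, it would have been placed at
  // or before that slot.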
  while (buckets[idx].size != 0 &&
         !(buckets[idx].hash == hash &&
           size == buckets[idx].size &&
           bytes_equal(cache, ptr, buckets[idx].size, buckets[idx].offset))) {
    idx = (idx + 1) & mask;
  }
  *index = idx;

  return buckets[idx].size != 0;
}

static bool insert(struct xnn_cache* cache, void* ptr, size_t size)
{
  const uint32_t hash = murmur_hash3(ptr, size, /*seed=*/XNN_CACHE_HASH_SEED);
  size_t idx;
  const bool found = lookup(cache, ptr, size, hash, &idx);
  if (found) {
    return false;
  }

  // Ensure we have enough buckets to keep under our load limit.
  if (cache->num_entries * XNN_CACHE_MAX_LOAD_ENTRIES_MULTIPLIER >
      cache->num_buckets * XNN_CACHE_MAX_LOAD_BUCKETS_MULTIPLIER) {
    if (!cache_buckets_grow(cache)) {
      // Can't grow the hash table anymore.
      xnn_log_error("failed to grow cache buckets");
      return false;
    }
    xnn_log_debug("successfully grew cache buckets");
    // If the cache grew, idx is stale, since it was computed with the old
    // cache's num_buckets. Look the key up again in the grown table.
    const bool found_in_grown_cache = lookup(cache, ptr, size, hash, &idx);
    assert(!found_in_grown_cache);
    (void) found_in_grown_cache;  // Silence unused variable warnings.
  }

  // Check that ptr points into the cache's buffer.
  assert((uintptr_t) ptr >= (uintptr_t) cache_start(cache));
  if (cache->type == xnn_cache_type_code) {
    assert((uintptr_t) ptr < (uintptr_t) cache_start(cache) + cache_size(cache));
  }

  const size_t offset = (uintptr_t) ptr - (uintptr_t) cache_start(cache);

  // Insert the entry.
  cache->buckets[idx].size = size;
  cache->buckets[idx].hash = hash;
  cache->buckets[idx].offset = offset;
  cache->num_entries++;
  return true;
}

// Checks whether the data at ptr (e.g. a generated microkernel or packed
// weights) is already in the cache; returns its offset if found,
// XNN_CACHE_NOT_FOUND otherwise.
static size_t lookup_cache(struct xnn_cache* cache, void* ptr, size_t size)
{
  const uint32_t hash = murmur_hash3(ptr, size, /*seed=*/XNN_CACHE_HASH_SEED);
  size_t bucket_idx;
  if (lookup(cache, ptr, size, hash, &bucket_idx)) {
    cache->hits++;
    return cache->buckets[bucket_idx].offset;
  } else {
    cache->misses++;
    return XNN_CACHE_NOT_FOUND;
  }
}

size_t xnn_get_or_insert_cache(struct xnn_cache* cache, void* ptr, size_t size)
{
  const size_t found_offset = lookup_cache(cache, ptr, size);
  if (found_offset != XNN_CACHE_NOT_FOUND) {
    if (cache->type == xnn_cache_type_code) {
      // Found in the cache, rewind the buffer because code generators update
      // the buffer size.
      cache->code.size -= size;
    }
    return found_offset;
  }

  if (cache->type == xnn_cache_type_weights) {
    // Cache miss. Weights packing functions don't update the buffer size, so
    // update it here.
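    // (For the code cache, the code generator has already advanced code.size,
    // which is why the hit path above rewinds it instead.)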
    cache->weights.size += size;
  }

  const size_t offset = (uintptr_t) ptr - (uintptr_t) cache_start(cache);
  if (!insert(cache, ptr, size)) {
    return XNN_CACHE_NOT_FOUND;
  }
  return offset;
}

size_t xnn_get_or_insert_code_cache(struct xnn_code_cache* cache, void* ptr, size_t size)
{
  return xnn_get_or_insert_cache(&cache->cache, ptr, size);
}

enum xnn_status xnn_release_code_cache(struct xnn_code_cache* cache)
{
  if XNN_LIKELY(cache != NULL) {
    assert(cache->cache.type == xnn_cache_type_code);
    xnn_release_code_memory(&cache->cache.code);
    xnn_release_memory(cache->cache.buckets);
  }
  return xnn_status_success;
}

enum xnn_status xnn_internal_init_weights_cache(
  struct xnn_weights_cache* cache,
  size_t num_buckets,
  size_t buffer_size)
{
  memset(cache, 0, sizeof(struct xnn_weights_cache));

  enum xnn_status status = xnn_status_success;
  status = xnn_init_cache_with_size(&cache->cache, num_buckets, xnn_cache_type_weights);
  if (status != xnn_status_success) {
    goto error;
  }

  status = xnn_allocate_weights_memory(&cache->cache.weights, buffer_size);
  if (status != xnn_status_success) {
    goto error;
  }

  status = xnn_mutex_init(&cache->mutex);
  if (status != xnn_status_success) {
    goto error;
  }

  return xnn_status_success;

error:
  xnn_release_weights_cache(cache);
  return status;
}

enum xnn_status xnn_init_weights_cache_with_size(struct xnn_weights_cache* cache, size_t size)
{
  return xnn_internal_init_weights_cache(cache, XNN_CACHE_INITIAL_BUCKETS, size);
}

enum xnn_status xnn_init_weights_cache(struct xnn_weights_cache* cache)
{
  return xnn_init_weights_cache_with_size(cache, XNN_DEFAULT_WEIGHTS_BUFFER_SIZE);
}

enum xnn_status xnn_finalize_weights_cache(
  struct xnn_weights_cache* cache,
  enum xnn_weights_cache_finalization_kind finalization_kind)
{
  switch (cache->finalization_state) {
    case xnn_cache_state_hard_finalized:
    case xnn_cache_state_soft_finalized:
      xnn_log_error("failed to finalize an already finalized weights cache");
      return xnn_status_invalid_state;
    case xnn_cache_state_not_finalized: {
      enum xnn_status status;
      enum xnn_cache_state finalized_state;

      if (finalization_kind == xnn_weights_cache_finalization_kind_hard) {
        xnn_log_debug("hard finalizing weights cache");
        status = xnn_finalize_weights_memory(&cache->cache.weights);
        // Also release the memory used by the hash table (but not the weights
        // memory).
        xnn_release_memory(cache->cache.buckets);
        cache->cache.buckets = NULL;
        finalized_state = xnn_cache_state_hard_finalized;
      } else {
        xnn_log_debug("soft finalizing weights cache");
        assert(finalization_kind == xnn_weights_cache_finalization_kind_soft);
        // Finalize the weights cache by reserving sufficient space for the
        // insertion of the largest cached weights. This ensures that we have
        // space to write packed weights to check for cache hits without
        // growing and moving the memory. This has some memory overhead, which
        // can be as large as the size of the largest cached weights, rounded
        // up to page size.
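        // Illustrative example: if the largest packed weights inserted so far
        // were 1 MiB, soft finalization keeps roughly an extra 1 MiB (rounded
        // up to page size) of writable space at the end of the buffer for
        // future cache checks.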
        status = xnn_reserve_weights_memory(&cache->cache.weights, cache->max_weights_size);
        finalized_state = xnn_cache_state_soft_finalized;
      }

      if (status != xnn_status_success) {
        xnn_log_error("failed to finalize weights cache memory");
        return xnn_status_invalid_state;
      }

      cache->finalization_state = finalized_state;
      return xnn_status_success;
    }
  }
}

enum xnn_status xnn_release_weights_cache(struct xnn_weights_cache* cache)
{
  if XNN_LIKELY(cache != NULL) {
    assert(cache->cache.type == xnn_cache_type_weights);
    xnn_release_weights_memory(&cache->cache.weights);
    if (cache->cache.buckets != NULL) {
      xnn_release_memory(cache->cache.buckets);
    }
    const enum xnn_status status = xnn_mutex_destroy(&cache->mutex);
    if (status != xnn_status_success) {
      return status;
    }
  }
  return xnn_status_success;
}

static inline bool cache_has_space(struct xnn_weights_cache* cache, size_t n)
{
  const struct xnn_weights_buffer buf = cache->cache.weights;
  return buf.size + n <= buf.capacity;
}

void* xnn_reserve_space_in_weights_cache(struct xnn_weights_cache* cache, size_t n)
{
  switch (cache->finalization_state) {
    case xnn_cache_state_hard_finalized:
      xnn_log_error("cannot reserve additional space in a finalized compact weights cache");
      return NULL;
    case xnn_cache_state_soft_finalized:
      if (!cache_has_space(cache, n)) {
        xnn_log_error("cannot reserve additional space in a finalized weights cache");
        return NULL;
      }
      // Even if the cache is finalized and has space for `n` bytes, we still
      // want to lock the mutex, because multiple writers can attempt to write
      // to this space.
      break;
    default:
      break;
  }

  enum xnn_status status = xnn_mutex_lock(&cache->mutex);
  if (status != xnn_status_success) {
    return NULL;
  }

  struct xnn_weights_buffer* buffer = &cache->cache.weights;
  status = xnn_reserve_weights_memory(buffer, n);
  if (status != xnn_status_success) {
    xnn_mutex_unlock(&cache->mutex);
    return NULL;
  }

  return (void*) ((uintptr_t) buffer->start + buffer->size);
}

size_t xnn_get_or_insert_weights_cache(struct xnn_weights_cache* cache, void* ptr, size_t size)
{
  size_t offset = XNN_CACHE_NOT_FOUND;
  switch (cache->finalization_state) {
    case xnn_cache_state_hard_finalized: {
      xnn_log_error("cannot insert into a finalized compact weights cache");
      return XNN_CACHE_NOT_FOUND;
    }
    case xnn_cache_state_soft_finalized: {
      // Inserting into a finalized weights cache is okay as long as:
      // 1. there is sufficient space in the memory (to write the incoming packed weights), or
      // 2. the incoming packed weights are already in the cache.
      if (!cache_has_space(cache, size)) {
        xnn_log_error("insufficient extra space in finalized weights cache buffer");
        return XNN_CACHE_NOT_FOUND;
      }

      // We need to release the mutex from this point onwards, because
      // xnn_reserve_space_in_weights_cache would have returned non-NULL (which
      // means that it locked the mutex).
      const size_t found_offset = lookup_cache(&cache->cache, ptr, size);
      if (found_offset == XNN_CACHE_NOT_FOUND) {
        xnn_log_error("packed weights not found in finalized weights cache");
      }

      offset = found_offset;
      break;
    }
    case xnn_cache_state_not_finalized: {
      offset = xnn_get_or_insert_cache(&cache->cache, ptr, size);
      if (offset != XNN_CACHE_NOT_FOUND) {
        // Found or inserted packed weights; update the largest size seen so
        // far. This will be used when finalizing the weights cache, to ensure
        // that there is extra space at the end for future cache checks.
        cache->max_weights_size = max(size, cache->max_weights_size);
      }
      break;
    }
  }

  // The mutex was locked in xnn_reserve_space_in_weights_cache when it
  // returned non-NULL, i.e. when the cache is not finalized, or when it is
  // xnn_cache_state_soft_finalized and has sufficient space, so unlock it here.
  const enum xnn_status status = xnn_mutex_unlock(&cache->mutex);
  (void) status;
  assert(status == xnn_status_success);
  return offset;
}

bool xnn_weights_cache_is_finalized(struct xnn_weights_cache* cache)
{
  return cache->finalization_state != xnn_cache_state_not_finalized;
}
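
// Typical weights-cache usage, shown here only as an illustrative sketch
// (error handling omitted; `packed_size` and the packing step stand in for
// what operator creation code does with its packed weights):
//
//   struct xnn_weights_cache cache;
//   xnn_init_weights_cache(&cache);
//   // Per operator: reserve space, pack weights into it, then look the bytes
//   // up so that duplicate weights reuse the earlier offset.
//   void* reserved = xnn_reserve_space_in_weights_cache(&cache, packed_size);
//   // ... pack weights into `reserved` ...
//   const size_t offset = xnn_get_or_insert_weights_cache(&cache, reserved, packed_size);
//   // Once every operator has been created:
//   xnn_finalize_weights_cache(&cache, xnn_weights_cache_finalization_kind_hard);
//   // ...
//   xnn_release_weights_cache(&cache);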