#pragma once #include #include #include #include #include /* * CPUCachingAllocator: * DISCLAIMER: * This is subject to change (beta) and only supported on mobile builds. * If code snippet such as in 'Usage pattern' is used outside of mobile * build you will not observe the intended behavior. * See below for more information. * Why? * It has been observed that some mobile platforms, such as pixel 3, return * memory aggressively to the system. This results in page faults in some * cases and ends up hurting performance. This caching allocator aims to address * that. Furthermore it also allows users to specify their own allocator by * implementing allocate/free virtual interfaces. What are the cons? There are * some cons that were observed where use of caching allocator led to worse * performance on some platforms. Reason being that the caching mechanism used * by this allocator left us worse off compared to the corresponding platform's * tuned memory allocator. In that case it seemed better to not use this * allocator. Note there are some ideas to fix this in the works. * * Usage: * Usage pattern: * Instantiate and own the caching allocator. * std::unique_ptr caching_allocator = * std::make_unique(); * Use caching allocator with a scoped guard at inference time. * { * WithCPUCachingAllocatorGuard(caching_allocator.get()); * ... model.forward(...); * } */ namespace c10 { class C10_API CPUCachingAllocator { /* * What it does: * Caches all the allocations carried out by this allocator. * Cache key is the size of the allocation. * If requested size is found in the cache returns the cached pointer. * What it does not do: * No speculative allocation for any future allocations. */ private: inline void* allocate_and_cache(const size_t bytes); void free_cached(); protected: // Invariants. // 1. If memory is ever allocated via this allocator then // the pointer will exist in allocation_map_, unless the allocator // returned the memory to OS via free_cached. // 1.1. Therefore even when the said memory is "freed" via this // allocator (and thus cached), it will continue to stay // in allocation_map_. Furthermore it will also exist in // available_map_. Thus an allocated memory pointer can be in both // allocation_map_ and available_map_ simultaneously. // 2. Memory pointer maybe removed from allocation_map_, when it // is freed outside of the scope of this allocator, but was allocated // by this allocator. // 3. Available map only contains that memory which was allocated // by this allocator and subsequently freed by this allocator. // As a result of above invariants, allocated memory ptr cannot be in // available_map_ unless it is in allocation_map_ as well. ska::flat_hash_map> available_map_; static ska::flat_hash_map allocation_map_; // Since allocation_map, which is a global instance, is mutated/read via // all public APIs we need a global mutex. static std::mutex mutex_; public: static void record_free(void* ptr); virtual ~CPUCachingAllocator(); // Checks the cache to see if allocation of size bytes can be found. // If so return cached memory, else // allocates memory, records it for caching and returns. virtual void* allocate(const size_t bytes); // Checks if the memory being freed is was marked for allocation by // an earlier call to allocate. If so cache the allocation. // Otherwise free. virtual void free(void* ptr); }; CPUCachingAllocator* GetDefaultCPUCachingAllocator(); bool ThreadLocalCachingAllocatorEnabled(); CPUCachingAllocator* GetThreadLocalCachingAllocator(); class C10_API WithCPUCachingAllocatorGuard { public: WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator); ~WithCPUCachingAllocatorGuard(); private: CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr}; }; } // namespace c10