/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "absl/types/variant.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace tfrt {
namespace gpu {

class GpuContextCache;

}  // namespace gpu
}  // namespace tfrt

namespace xla {
namespace gpu {

// Returns whether GpuExecutable runs on TFRT/JitRt.
bool IsJitRtExecutableEnabled(const HloModuleConfig& config);

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
class GpuExecutable : public Executable {
 public:
  struct JitRtExecutable;

  // Serialized MLIR module prepared for JitRt compilation.
  struct JitRtProgram {
    explicit JitRtProgram(std::string entry_point, std::string module,
                          std::vector<int64_t> buffer_sizes,
                          DebugOptions debug_options)
        : entry_point(std::move(entry_point)),
          module(std::move(module)),
          buffer_sizes(std::move(buffer_sizes)),
          debug_options(std::move(debug_options)) {}

    std::string entry_point;
    std::string module;
    std::vector<int64_t> buffer_sizes;
    DebugOptions debug_options;
  };

  typedef std::unique_ptr<const ThunkSequence> OwnedThunkSequence;
  typedef std::unique_ptr<JitRtProgram> OwnedJitRtProgram;

  struct ConstantInfo {
    std::string symbol_name;
    std::vector<uint8_t> content;
    int allocation_index = -1;
  };

  struct OutputInfo {
    // Corresponding allocation index.
    int allocation_index;

    // Output is passed-through from a parameter.
    bool passthrough = false;

    // Whether this output is hinted to alias a parameter (BufferAllocation*
    // would indicate the aliased parameter), and what kind of alias it is.
    std::optional<HloInputOutputAliasConfig::Alias> alias_config;
  };
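
  // Illustrative example (hypothetical values, not taken from a real module):
  // for an entry computation returning a tuple (f32[16], f32[16]) whose second
  // element is passed through from parameter 0, the ShapeIndex -> OutputInfo
  // map built by GetOutputInfo() (declared at the bottom of this file) might
  // contain entries along the lines of
  //
  //   {0} -> {allocation_index: 2, passthrough: false}
  //   {1} -> {allocation_index: 0, passthrough: true}
  //
  // where each allocation_index refers to an entry in the executable's
  // BufferAllocation list (Params::allocations / GetAllocations()).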

  struct Params {
    std::string asm_text;
    std::vector<uint8_t> binary;
    GpuVersion gpu_version;
    // The GpuExecutable will either execute Thunks or a JitRt-compiled native
    // function, depending on which is supplied.
    std::variant<OwnedThunkSequence, OwnedJitRtProgram> executable;
    xla::EntryFunctionAttributes entry_func_attrs;
    std::vector<ConstantInfo> constants;
    absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
    std::string module_name;
    xla::Shape output_shape;
    std::vector<BufferAllocation> allocations;
    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment = nullptr;

    // A callable that dumps out a debug string upon device OOM. It's not the
    // string itself, as the string can be huge and increase peak host memory
    // usage for the common (non-OOM) case.
    std::function<std::string()> verbose_buffer_assignment_string_dumper = [] {
      return std::string();
    };

    std::unique_ptr<HloModule> debug_module = nullptr;
  };

  // TODO(hanbinyoon): Once BEF replaces Thunks, hide this method as an
  // implementation detail of GpuExecutable.
  // Analyzes the entry function to construct buffer allocations and other
  // output information. Optionally, `buffer_param_offset` indicates the
  // position of buffer parameters in the entry function - in the tfrt_gpu
  // dialect, buffer arguments start from the third parameter (after
  // tfrt::Chain and GpuStream).
  static Status SetUpMlirAllocation(
      mlir::func::FuncOp func, llvm::ArrayRef<int64_t> buffer_sizes,
      std::vector<BufferAllocation>* allocations,
      absl::flat_hash_map<ShapeIndex, OutputInfo>* output_info,
      Shape* output_shape, int buffer_param_offset = 0);

  // Returns an Executable that is loaded from an object file (XLA program
  // compiled to a native function using the JitRt stack).
  static StatusOr<std::unique_ptr<Executable>> LoadFromObjFile(
      std::shared_ptr<HloModule> hlo_module, absl::string_view obj_file,
      absl::string_view mlir_module,
      xla::EntryFunctionAttributes entry_func_attrs, DebugOptions debug_options,
      GpuVersion gpu_version, stream_executor::StreamExecutor* executor);

  // Constructor to use when loading a GpuExecutable from an object file (a
  // native function compiled for JitRt). Omits setting class members that
  // aren't used in JitRt execution mode.
  GpuExecutable(std::shared_ptr<HloModule> hlo_module, GpuVersion gpu_version,
                xla::EntryFunctionAttributes entry_func_attrs,
                absl::string_view module_name, Shape xla_output_shape,
                std::vector<BufferAllocation> allocations,
                absl::flat_hash_map<ShapeIndex, OutputInfo> output_info,
                JitRtExecutable* jitrt_executable);

  static StatusOr<std::unique_ptr<GpuExecutable>> Create(Params params);
  ~GpuExecutable() override;
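
  // Example usage of Create() (a minimal sketch; the PTX/cubin, thunk
  // sequence, shapes and buffer assignment below are placeholder names assumed
  // to have been produced by the GPU compiler pipeline):
  //
  //   GpuExecutable::Params params;
  //   params.asm_text = ptx;                           // PTX text on CUDA
  //   params.binary = cubin;                           // may be empty
  //   params.gpu_version = gpu_version;
  //   params.executable = std::move(owned_thunk_sequence);
  //   params.output_shape = root_shape;
  //   params.allocations = std::move(allocations);
  //   params.output_info = std::move(output_info);
  //   TF_ASSIGN_OR_RETURN(std::unique_ptr<GpuExecutable> executable,
  //                       GpuExecutable::Create(std::move(params)));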

  int64_t SizeOfGeneratedCodeInBytes() const override;

  // This should be called after set_ir_module_string.
  const std::string& ir_module_string() const { return ir_module_string_; }

  // This should be called before ExecuteOnStream.
  void set_ir_module_string(const std::string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }

  // Returns the compiled code for the computation. The compiled code is PTX
  // on CUDA; on ROCm it is an unused empty string.
  const std::string& text() const { return text_; }

  // Returns the binary stored in this GpuExecutable. The binary is a cubin on
  // CUDA and an HSA code object on ROCm. It may be empty, in which case
  // compilation is left up to the GPU driver.
  const std::vector<uint8_t>& binary() const { return binary_; }

  // ExecuteAsyncOnStream will fail if the compute capability of the stream
  // doesn't match the compute capability passed to this object's constructor.
  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  using VariantArguments = std::variant<absl::Span<const ShapedBuffer* const>,
                                        absl::Span<ExecutionInput>>;
  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamImpl(
      const ServiceExecutableRunOptions* run_options,
      VariantArguments arguments);
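
  // Example usage (a minimal sketch; `run_options` and `arguments` are assumed
  // to have been set up by the caller, e.g. by the XLA service):
  //
  //   TF_ASSIGN_OR_RETURN(
  //       ExecutionOutput result,
  //       executable->ExecuteAsyncOnStream(&run_options, std::move(arguments),
  //                                        /*hlo_execution_profile=*/nullptr));
  //
  // Despite the name, the call may still block the host before returning,
  // depending on the run options (see ExecuteThunksOrJitRt below).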

  absl::Span<const BufferAllocation> GetAllocations() const {
    return allocations_;
  }

  const std::vector<ConstantInfo>& constants() const { return constants_; }

 private:
  // Use GpuExecutable::Create() to create an instance.
  explicit GpuExecutable(Params params);

  // If `block_host_until_done` is false, execution will not block the host
  // until the kernels have completed. This is used as an optimization for
  // clients, such as TensorFlow, that use a single stream of execution for
  // computations, and allow host-side deallocation from the allocator before
  // GPU execution completes.
  Status ExecuteThunksOrJitRt(const ServiceExecutableRunOptions* run_options,
                              const BufferAllocations& buffer_allocations,
                              bool block_host_until_done);

  using BufferAllocToDeviceMemoryMap =
      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;

  // Loads the PTX or CUBIN for this executable and initializes all
  // constants that haven't already been initialized by the CUDA driver. Loaded
  // modules are owned by this executable.
  //
  // Returns a map from buffer allocation indices to device memory pointers
  // (only for allocations that contain constants).
  //
  // The returned map is cached. If the above process has already been run for
  // the given stream, it is skipped and the cached map is immediately returned
  // instead.
  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
      stream_executor::Stream* stream);

  // Checks that the GPU targeted by this executable is compatible with the
  // device behind `run_options`: the check uses AMD's ISA version on ROCm, or
  // NVIDIA's major.minor compute capability on CUDA, depending on the
  // hardware.
  Status CheckCompatibilityWithServiceExecutableRunOptions(
      const ServiceExecutableRunOptions* run_options);

  StatusOr<BufferAllocations> GenerateBufferAllocations(
      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
      se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal);

  StatusOr<se::DeviceMemoryBase> BufferForAllocation(
      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
      const BufferAllocation& allocation,
      se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
      int64_t arg_idx);

  // The LLVM IR, in string format, of the unoptimized module generated for
  // this GpuExecutable. We save a string instead of an llvm::Module* because
  // leaving llvm::Module* in a singleton can cause the heap checker to emit
  // false positives.
  //
  // This string should be modified only before ExecuteOnStream.
  std::string ir_module_string_;

  // The compiled code for the computation.
  const std::string text_;

  // The GPU machine code for the computation, targeting the GPU version in
  // gpu_version_.
  //
  // May be empty, in which case we leave compilation up to the GPU driver.
  const std::vector<uint8_t> binary_;

  // The GPU version, used for the compute-compatibility check.
  GpuVersion gpu_version_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
  OwnedThunkSequence thunks_;

  xla::EntryFunctionAttributes entry_func_attrs_;

  std::string module_name_;

  xla::Shape output_shape_;

  // Describes the buffer allocations for this executable; used at runtime to
  // allocate memory for every output/temp buffer.
  const std::vector<BufferAllocation> allocations_;

  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;
  std::function<std::string()> verbose_buffer_assignment_string_dumper_;

  absl::Mutex module_handle_mutex_;
  // Cache of module handles. Required to keep loaded modules alive until this
  // executable is destroyed.
  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
      module_handles_ ABSL_GUARDED_BY(module_handle_mutex_);
  // Cache of constant buffer allocation maps used by `ResolveConstantGlobals`.
  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
      module_globals_ ABSL_GUARDED_BY(module_handle_mutex_);

  std::vector<ConstantInfo> constants_;
  const absl::flat_hash_map<ShapeIndex, OutputInfo> output_info_;
  // Retains shared ownership of on-device constants that are managed by XLA
  // and potentially shared with other executables.
  std::vector<std::shared_ptr<se::DeviceMemoryBase>> shared_constants_;

  // The JitRt executable, if JitRt mode is enabled; owned by this
  // GpuExecutable.
  JitRtExecutable* jitrt_executable_ = nullptr;

  GpuExecutable(const GpuExecutable&) = delete;
  GpuExecutable& operator=(const GpuExecutable&) = delete;
};

// Returns the ShapeIndex -> OutputInfo map for the entry computation's
// outputs, derived from the given buffer assignment.
StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment);

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_