/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "absl/types/variant.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace tfrt {
namespace gpu {

class GpuContextCache;

}  // namespace gpu
}  // namespace tfrt

namespace xla {
namespace gpu {

// Returns whether GpuExecutable runs on TFRT/JitRt.
bool IsJitRtExecutableEnabled(const HloModuleConfig& config);

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
class GpuExecutable : public Executable {
 public:
  struct JitRtExecutable;

  // Serialized MLIR module prepared for JitRt compilation.
  struct JitRtProgram {
    explicit JitRtProgram(std::string entry_point, std::string module,
                          std::vector<int64_t> buffer_sizes,
                          DebugOptions debug_options)
        : entry_point(std::move(entry_point)),
          module(std::move(module)),
          buffer_sizes(std::move(buffer_sizes)),
          debug_options(std::move(debug_options)) {}

    std::string entry_point;
    std::string module;
    std::vector<int64_t> buffer_sizes;
    DebugOptions debug_options;
  };

  typedef std::unique_ptr<const ThunkSequence> OwnedThunkSequence;
  typedef std::unique_ptr<JitRtProgram> OwnedJitRtProgram;

  struct ConstantInfo {
    std::string symbol_name;
    std::vector<uint8_t> content;
    int allocation_index = -1;
  };

  struct OutputInfo {
    // Corresponding allocation index.
    int allocation_index;

    // Output is passed through from a parameter.
    bool passthrough = false;

    // Whether this output is hinted to alias a parameter (BufferAllocation*
    // would indicate the aliased parameter), and what kind of alias it is.
    std::optional<HloInputOutputAliasConfig::Alias> alias_config;
  };

  struct Params {
    std::string asm_text;
    std::vector<uint8_t> binary;
    GpuVersion gpu_version;
    // The GpuExecutable will either execute Thunks or a JitRt-compiled native
    // function, depending on which is supplied.
    std::variant<OwnedThunkSequence, OwnedJitRtProgram> executable;
    xla::EntryFunctionAttributes entry_func_attrs;
    std::vector<ConstantInfo> constants;
    absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
    std::string module_name;
    xla::Shape output_shape;
    std::vector<BufferAllocation> allocations;
    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment = nullptr;

    // A callable that dumps out a debug string upon device OOM. It's not the
    // string itself, as the string can be huge and increase peak host memory
    // usage for the common (non-OOM) case.
    std::function<std::string()> verbose_buffer_assignment_string_dumper = [] {
      return std::string();
    };

    std::unique_ptr<HloModule> debug_module = nullptr;
  };

  // TODO(hanbinyoon): Once BEF replaces Thunks, hide this method as an
  // implementation detail of GpuExecutable.
  // Analyzes the entry function to construct buffer allocations and other
  // output information. Optionally, use buffer_param_offset to indicate the
  // position of buffer parameters in the entry function; in the tfrt_gpu
  // dialect, buffer arguments start at the third parameter (after tfrt::Chain
  // and GpuStream).
  static Status SetUpMlirAllocation(
      mlir::func::FuncOp func, llvm::ArrayRef<int64_t> buffer_sizes,
      std::vector<BufferAllocation>* allocations,
      absl::flat_hash_map<ShapeIndex, OutputInfo>* output_info,
      Shape* output_shape, int buffer_param_offset = 0);

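  // A minimal call sketch (illustrative only, not part of the API contract):
  // `entry_func` and `buffer_sizes` are assumed to come from the lowered MLIR
  // module, and the offset of 2 is an assumption reflecting the tfrt::Chain
  // and GpuStream parameters mentioned above.
  //
  //   std::vector<BufferAllocation> allocations;
  //   absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
  //   Shape output_shape;
  //   TF_RETURN_IF_ERROR(GpuExecutable::SetUpMlirAllocation(
  //       entry_func, buffer_sizes, &allocations, &output_info, &output_shape,
  //       /*buffer_param_offset=*/2));
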
  // Returns an Executable that is loaded from an object file (XLA program
  // compiled to a native function using the JitRt stack).
  static StatusOr<std::unique_ptr<Executable>> LoadFromObjFile(
      std::shared_ptr<HloModule> hlo_module, absl::string_view obj_file,
      absl::string_view mlir_module,
      xla::EntryFunctionAttributes entry_func_attrs, DebugOptions debug_options,
      GpuVersion gpu_version, stream_executor::StreamExecutor* executor);

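  // A minimal call sketch (an assumption-laden illustration): `obj_file` and
  // `mlir_module` are assumed to hold the output of a prior ahead-of-time
  // compilation, and `executor` to describe the target device.
  //
  //   TF_ASSIGN_OR_RETURN(
  //       std::unique_ptr<Executable> loaded,
  //       GpuExecutable::LoadFromObjFile(hlo_module, obj_file, mlir_module,
  //                                      entry_func_attrs, debug_options,
  //                                      gpu_version, executor));
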
  // Constructor to use when loading a GpuExecutable from an object file (native
  // function compiled for JitRt). Omits setting class members that aren't used
  // in JitRt execution mode.
  GpuExecutable(std::shared_ptr<HloModule> hlo_module, GpuVersion gpu_version,
                xla::EntryFunctionAttributes entry_func_attrs,
                absl::string_view module_name, Shape xla_output_shape,
                std::vector<BufferAllocation> allocations,
                absl::flat_hash_map<ShapeIndex, OutputInfo> output_info,
                JitRtExecutable* jitrt_executable);

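  // A minimal construction sketch (illustrative only; `ptx`, `cubin`, `thunks`,
  // and `allocations` are assumed to have been produced by the GPU compiler
  // and by buffer assignment, and the remaining Params fields are filled in
  // analogously):
  //
  //   GpuExecutable::Params params;
  //   params.asm_text = std::move(ptx);
  //   params.binary = std::move(cubin);
  //   params.gpu_version = gpu_version;
  //   params.executable = std::move(thunks);  // an OwnedThunkSequence
  //   params.allocations = std::move(allocations);
  //   TF_ASSIGN_OR_RETURN(std::unique_ptr<GpuExecutable> executable,
  //                       GpuExecutable::Create(std::move(params)));
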
  static StatusOr<std::unique_ptr<GpuExecutable>> Create(Params params);
  ~GpuExecutable() override;

  int64_t SizeOfGeneratedCodeInBytes() const override;

  // This should be called after set_ir_module_string.
  const std::string& ir_module_string() const { return ir_module_string_; }

  // This should be called before ExecuteOnStream.
  void set_ir_module_string(const std::string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }

  // Returns the compiled code for the computation. The compiled code is PTX
  // on CUDA and an unused empty string on ROCm.
  const std::string& text() const { return text_; }

  // Returns the binary stored in this GpuExecutable. The binary is a cubin on
  // CUDA and an HSA code object on ROCm. It may be empty, in which case
  // compilation is left up to the GPU driver.
  const std::vector<uint8_t>& binary() const { return binary_; }

  // ExecuteAsyncOnStream will fail if the compute capability of the stream
  // doesn't match the compute capability passed to this object's constructor.
  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

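  // A minimal invocation sketch (a hedged illustration; `run_options` and
  // `args` are assumed to be prepared by the caller, e.g. the XLA service
  // layer, and the profile pointer may be null):
  //
  //   std::vector<ExecutionInput> args = ...;
  //   TF_ASSIGN_OR_RETURN(
  //       ExecutionOutput result,
  //       executable->ExecuteAsyncOnStream(&run_options, std::move(args),
  //                                        /*hlo_execution_profile=*/nullptr));
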
  using VariantArguments = std::variant<absl::Span<const ShapedBuffer* const>,
                                        absl::Span<ExecutionInput>>;
  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamImpl(
      const ServiceExecutableRunOptions* run_options,
      VariantArguments arguments);

  absl::Span<const BufferAllocation> GetAllocations() const {
    return allocations_;
  }

  const std::vector<ConstantInfo>& constants() const { return constants_; }

 private:
  // Use GpuExecutable::Create() to create an instance.
  explicit GpuExecutable(Params params);

  // If `block_host_until_done` is false, execution will not block the host
  // until the kernels have completed. This is used as an optimization for
  // clients, such as TensorFlow, that use a single stream of execution for
  // computations and allow host-side deallocation from the allocator before
  // GPU execution completes.
  Status ExecuteThunksOrJitRt(const ServiceExecutableRunOptions* run_options,
                              const BufferAllocations& buffer_allocations,
                              bool block_host_until_done);

  using BufferAllocToDeviceMemoryMap =
      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;

  // Loads the PTX or CUBIN for this executable and initializes all
  // constants that haven't already been initialized by the CUDA driver. Loaded
  // modules are owned by this executable.
  //
  // Returns a map from buffer allocation indices to device memory pointers
  // (only for allocations that contain constants).
  //
  // The returned map is cached. If the above process has already been run for
  // the given stream, it is skipped and the cached map is immediately returned
  // instead.
  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
      stream_executor::Stream* stream);

  // Checks compatibility against either AMD's ISA version or Nvidia's major
  // and minor compute capability version, depending on the hardware.
  Status CheckCompatibilityWithServiceExecutableRunOptions(
      const ServiceExecutableRunOptions* run_options);

  StatusOr<BufferAllocations> GenerateBufferAllocations(
      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
      se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal);

  StatusOr<se::DeviceMemoryBase> BufferForAllocation(
      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
      const BufferAllocation& allocation,
      se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
      int64_t arg_idx);

  // The LLVM IR, in string format, of the unoptimized module generated for
  // this GpuExecutable. We save a string instead of an llvm::Module* because
  // leaving llvm::Module* in a singleton can cause the heap checker to emit
  // false positives.
  //
  // This string should be modified only before ExecuteOnStream.
  std::string ir_module_string_;

  // The compiled code for the computation.
  const std::string text_;

  // The GPU machine code for the computation, targeting GPUs at
  // compute_capability_.
  //
  // May be empty, in which case we leave compilation up to the GPU driver.
  const std::vector<uint8_t> binary_;

  // The GPU version for compute compatibility check.
  GpuVersion gpu_version_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
  OwnedThunkSequence thunks_;

  xla::EntryFunctionAttributes entry_func_attrs_;

  std::string module_name_;

  xla::Shape output_shape_;

  // Owns the buffer data at runtime. It provides the information needed to
  // allocate memory for every output/temp buffer.
  const std::vector<BufferAllocation> allocations_;

  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;
  std::function<std::string()> verbose_buffer_assignment_string_dumper_;

  absl::Mutex module_handle_mutex_;
  // Cache of module handles. Required to keep loaded modules alive until this
  // executable is destroyed.
  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
      module_handles_ ABSL_GUARDED_BY(module_handle_mutex_);
  // Cache of constant buffer allocation maps used by `ResolveConstantGlobals`.
  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
      module_globals_ ABSL_GUARDED_BY(module_handle_mutex_);

  std::vector<ConstantInfo> constants_;
  const absl::flat_hash_map<ShapeIndex, OutputInfo> output_info_;
  // Retains shared ownership of on-device constants that are managed by XLA and
  // potentially shared with other executables.
  std::vector<std::shared_ptr<se::DeviceMemoryBase>> shared_constants_;

  // The JitRt executable, owned by this GpuExecutable; non-null only when
  // JitRt mode is on.
  JitRtExecutable* jitrt_executable_ = nullptr;

  GpuExecutable(const GpuExecutable&) = delete;
  GpuExecutable& operator=(const GpuExecutable&) = delete;
};

StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment);

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_