/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h"  // from @llvm-project
#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/mlir_hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/statusor.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {

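// Defining GEN_PASS_CLASSES before including the generated file emits the
// pass base classes, in particular GpuKernelToBlobPassBase used below.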
#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"

class GpuKernelToBlobPass
    : public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
 public:
  GpuKernelToBlobPass(StringRef blob_annotation,
                      llvm::ArrayRef<std::string> architectures, bool print_ptx,
                      bool print_llvmir, bool enable_ftz) {
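    // An empty annotation keeps the default attribute name defined by the
    // pass options.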
    if (!blob_annotation.empty()) blob_annotation_ = blob_annotation.str();
    architectures_ = architectures;
    print_ptx_ = print_ptx;
    print_llvmir_ = print_llvmir;
    enable_ftz_ = enable_ftz;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp gpu_module = getOperation();
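    // Compile the GPU module and attach the resulting device blob to it as a
    // string attribute.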
    auto blob_or = GetGpuBinaryBlob(gpu_module);
    if (blob_or.ok()) {
      const auto& blob = blob_or.ValueOrDie();
      std::string blob_string(blob.begin(), blob.end());
      gpu_module->setAttr(blob_annotation_,
                          StringAttr::get(&getContext(), blob_string));
      return;
    }
    // Forward the error by attaching the message to the gpu module.
    gpu_module.emitError(blob_or.status().error_message());
    return signalPassFailure();
  }

  tensorflow::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
      gpu::GPUModuleOp gpu_module) {
    if (architectures_.empty()) {
      return tensorflow::errors::Internal(
          "Expected at least one GPU architecture.");
    }

    // Lower to LLVM module.
    llvm::LLVMContext llvmContext;
    auto llvmModule = translateModuleToLLVMIR(gpu_module, llvmContext);
    if (!llvmModule) {
      return tensorflow::errors::Internal(
          "Could not translate MLIR module to LLVM IR");
    }
    llvmModule->setModuleIdentifier(gpu_module.getName());

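    // The compilation path below depends on which GPU platform this compiler
    // was built for (ROCm or CUDA).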
#if TENSORFLOW_USE_ROCM
    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    config.set_debug_options(options);

    using AmdGpuHsaco = std::vector<tensorflow::uint8>;
    std::vector<tensorflow::se::HsacoImage> images;
    images.reserve(architectures_.size());
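    // Compile the module once per requested architecture; each result becomes
    // one HSACO image in the bundle below.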
    for (const std::string& arch_str : architectures_) {
      // Parse ROCm architecture.
      absl::string_view consumable_arch(arch_str);
      if (!absl::ConsumePrefix(&consumable_arch, "gfx")) {
        return tensorflow::errors::Internal(
            "Could not parse ROCm architecture prefix (expected gfx)");
      }
      std::string libdevice_dir = tensorflow::RocdlRoot();
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco(
          llvm_module_copy.get(),
          tensorflow::se::RocmComputeCapability{arch_str}, config,
          libdevice_dir);
      if (!hsaco_or.ok()) {
        return tensorflow::errors::Internal("Failure when generating HSACO");
      }
      auto hsaco = hsaco_or.ValueOrDie();
      images.push_back({arch_str, std::move(hsaco)});
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle HSACO images into a single fatbin.
    if (images.size() == 1) return images.front().bytes;
    return tensorflow::se::BundleGpuAsm(images, tensorflow::RocmRoot());

#elif GOOGLE_CUDA
    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    // Make sure we use full precision division operations.
    (*options.mutable_xla_backend_extra_options())["-nvptx-prec-divf32"] = "2";
    // Disable tail sinking as it interferes with load/store vectorization. If
    // we have common tails, that is intentional.
    (*options.mutable_xla_backend_extra_options())["-simplifycfg-sink-common"] =
        "false";
    config.set_debug_options(options);

    llvmModule->setDataLayout(xla::gpu::nvptx::DataLayout());
    llvmModule->setTargetTriple(xla::gpu::nvptx::TargetTriple());

    // Compile and collect requested cubin and PTX images.
    std::vector<tensorflow::se::CubinOrPTXImage> images;
    TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
    auto gpu_asm_opts =
        xla::gpu::PtxOptsFromDebugOptions(config.debug_options());
    for (const std::string& arch_str : architectures_) {
      TF_ASSIGN_OR_RETURN(auto arch_pair, ParseCudaArch(arch_str));
      bool is_compute_profile = arch_pair.first;
      int arch = arch_pair.second;
      int cc_major = arch / 10;
      int cc_minor = arch % 10;
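      // For example, "sm_75" yields compute capability major 7, minor 5.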

      // Generate PTX code.
      // Module may be changed by CompileToPtx.
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
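      // Let the NVPTX backend fuse floating-point multiply-add operations.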
      auto enable_fusion = [](llvm::TargetMachine* target) {
        target->Options.AllowFPOpFusion =
            llvm::FPOpFusion::FPOpFusionMode::Fast;
      };
      TF_ASSIGN_OR_RETURN(
          std::string ptx,
          xla::gpu::nvptx::CompileToPtx(
              llvm_module_copy.get(),
              tensorflow::se::CudaComputeCapability{cc_major, cc_minor}, config,
              libdevice_dir, enable_fusion));
      if (print_ptx_) {
        llvm::dbgs() << "Generated PTX code for module '"
                     << gpu_module.getName() << "' on architecture sm_" << arch
                     << ":\n";
        llvm::dbgs() << ptx << "\n";
      }

      // Compile the PTX code with ptxas if requested and possible; otherwise,
      // fall back to a compute image.
      if (!is_compute_profile) {
        auto gpu_asm = tensorflow::se::CompileGpuAsm(cc_major, cc_minor,
                                                     ptx.c_str(), gpu_asm_opts);
        if (gpu_asm.ok()) {
          images.push_back(
              {absl::StrCat("sm_", arch), std::move(gpu_asm.ValueOrDie())});
        } else {
#ifdef PLATFORM_GOOGLE
          // Require compilation with ptxas.
          return gpu_asm;
#else
          // Fall back to compilation by the driver in OSS builds.
          LOG(WARNING) << "Failed to compile generated PTX with ptxas. Falling "
                          "back to compilation by driver.";
          is_compute_profile = true;
#endif
        }
      }
      if (is_compute_profile) {
        std::vector<uint8_t> ptx_bytes;
        ptx_bytes.reserve(ptx.size() + 1);
        std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
        ptx_bytes.push_back('\0');
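        // Store the NUL-terminated PTX text; compute images are compiled by
        // the driver at load time.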
        images.push_back(
            {absl::StrCat("compute_", arch), std::move(ptx_bytes)});
      }
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle cubin and PTX images into a single fatbin if needed.
    if (images.size() == 1) return images.front().bytes;
    return tensorflow::se::BundleGpuAsm(images, gpu_asm_opts);

#else
    return tensorflow::errors::Internal(
        "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined."
        " Did you specify either --config=rocm or --config=cuda ?");
#endif
  }

 private:
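  // Parses a CUDA architecture string such as "sm_75" or "compute_80" into a
  // (is_compute_profile, architecture) pair, e.g. "sm_75" -> {false, 75}.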
  tensorflow::StatusOr<std::pair<bool, int>> ParseCudaArch(
      const std::string& arch_str) {
    absl::string_view consumable_arch(arch_str);
    bool is_compute_profile;
    if (absl::ConsumePrefix(&consumable_arch, "compute_")) {
      is_compute_profile = true;
    } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) {
      is_compute_profile = false;
    } else {
      return tensorflow::errors::Internal(
          "Could not parse cuda architecture prefix (expected sm_ or "
          "compute_)");
    }
    int arch;
    if (!absl::SimpleAtoi(consumable_arch, &arch)) {
      return tensorflow::errors::Internal(
          "Could not parse cuda architecture number");
    }
    return std::pair<bool, int>(is_compute_profile, arch);
  }

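  // Searches the candidate CUDA roots for the nvvm/libdevice directory that
  // holds the libdevice bitcode needed for NVPTX compilation.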
  tensorflow::StatusOr<std::string> GetLibdeviceDir(
      const xla::HloModuleConfig& hlo_module_config) {
    for (const std::string& cuda_root : tensorflow::CandidateCudaRoots(
             hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) {
      std::string libdevice_dir =
          tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
      VLOG(2) << "Looking for libdevice at " << libdevice_dir;
      if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
        VLOG(2) << "Found libdevice dir " << libdevice_dir;
        return libdevice_dir;
      }
    }
    return tensorflow::errors::Internal(
        "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
  }

  bool enable_ftz_;
};

}  // namespace

std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
    StringRef blob_annotation, ArrayRef<std::string> architectures,
    bool print_ptx, bool print_llvmir, bool enable_ftz) {
  return std::make_unique<GpuKernelToBlobPass>(
      blob_annotation, architectures, print_ptx, print_llvmir, enable_ftz);
}

}  // namespace transforms
}  // namespace kernel_gen
}  // namespace mlir