/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h"  // from @llvm-project
#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/mlir_hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/statusor.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {

#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"

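// Lowers the body of a gpu.module to a GPU binary blob (cubin/PTX for CUDA
// builds, HSACO for ROCm builds) and attaches it to the module as a string
// attribute whose name is given by `blob_annotation_`.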
class GpuKernelToBlobPass
    : public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
 public:
  GpuKernelToBlobPass(StringRef blob_annotation,
                      llvm::ArrayRef<std::string> architectures, bool print_ptx,
                      bool print_llvmir, bool enable_ftz) {
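    // Keep the default annotation name from the generated pass options unless
    // an explicit one was provided.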
    if (!blob_annotation.empty()) blob_annotation_ = blob_annotation.str();
    architectures_ = architectures;
    print_ptx_ = print_ptx;
    print_llvmir_ = print_llvmir;
    enable_ftz_ = enable_ftz;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp gpu_module = getOperation();
    auto blob_or = GetGpuBinaryBlob(gpu_module);
    if (blob_or.ok()) {
      const auto& blob = blob_or.ValueOrDie();
      std::string blob_string(blob.begin(), blob.end());
      gpu_module->setAttr(blob_annotation_,
                          StringAttr::get(&getContext(), blob_string));
      return;
    }
    // Forward the error by attaching the message to the gpu module.
    gpu_module.emitError(blob_or.status().error_message());
    return signalPassFailure();
  }

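  // Translates the GPU module to LLVM IR and compiles it for each requested
  // architecture, bundling multiple per-architecture images into a fatbin.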
  tensorflow::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
      gpu::GPUModuleOp gpu_module) {
    if (architectures_.empty()) {
      return tensorflow::errors::Internal(
          "Expected at least one GPU architecture.");
    }

    // Lower to LLVM module.
    llvm::LLVMContext llvmContext;
    auto llvmModule = translateModuleToLLVMIR(gpu_module, llvmContext);
    if (!llvmModule) {
      return tensorflow::errors::Internal(
          "Could not translate MLIR module to LLVM IR");
    }
    llvmModule->setModuleIdentifier(gpu_module.getName());

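    // The rest of the lowering is backend specific: HSACO for ROCm builds,
    // cubin/PTX for CUDA builds.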
#if TENSORFLOW_USE_ROCM
    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    config.set_debug_options(options);

    using AmdGpuHsaco = std::vector<tensorflow::uint8>;
    std::vector<tensorflow::se::HsacoImage> images;
    images.reserve(architectures_.size());
    for (const std::string& arch_str : architectures_) {
      // Parse ROCm architecture.
      absl::string_view consumable_arch(arch_str);
      if (!absl::ConsumePrefix(&consumable_arch, "gfx")) {
        return tensorflow::errors::Internal(
            "Could not parse ROCm architecture prefix (expected gfx)");
      }
      std::string libdevice_dir = tensorflow::RocdlRoot();
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco(
          llvm_module_copy.get(),
          tensorflow::se::RocmComputeCapability{arch_str}, config,
          libdevice_dir);
      if (!hsaco_or.ok()) {
        return tensorflow::errors::Internal("Failure when generating HSACO");
      }
      auto hsaco = hsaco_or.ValueOrDie();
      images.push_back({arch_str, std::move(hsaco)});
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle HSACO images into a single fatbin.
    if (images.size() == 1) return images.front().bytes;
    return tensorflow::se::BundleGpuAsm(images, tensorflow::RocmRoot());

#elif GOOGLE_CUDA
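    // For CUDA, generate PTX per architecture, assemble it with ptxas where
    // possible, and otherwise embed the PTX itself as a compute profile.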
    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    // Make sure we use full-precision division operations.
    (*options.mutable_xla_backend_extra_options())["-nvptx-prec-divf32"] = "2";
    // Disable tail sinking as it interferes with load/store vectorization. If
    // there are common tails, they are intentional.
    (*options.mutable_xla_backend_extra_options())["-simplifycfg-sink-common"] =
        "false";
    config.set_debug_options(options);

    llvmModule->setDataLayout(xla::gpu::nvptx::DataLayout());
    llvmModule->setTargetTriple(xla::gpu::nvptx::TargetTriple());

    // Compile and collect the requested cubin and PTX images.
    std::vector<tensorflow::se::CubinOrPTXImage> images;
    TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
    auto gpu_asm_opts =
        xla::gpu::PtxOptsFromDebugOptions(config.debug_options());
    for (const std::string& arch_str : architectures_) {
      TF_ASSIGN_OR_RETURN(auto arch_pair, ParseCudaArch(arch_str));
      bool is_compute_profile = arch_pair.first;
      int arch = arch_pair.second;
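      // The architecture number encodes the compute capability, e.g. 75 means
      // compute capability 7.5.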
      int cc_major = arch / 10;
      int cc_minor = arch % 10;

      // Generate PTX code.
      // The module may be changed by CompileToPtx.
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      auto enable_fusion = [](llvm::TargetMachine* target) {
        target->Options.AllowFPOpFusion =
            llvm::FPOpFusion::FPOpFusionMode::Fast;
      };
      TF_ASSIGN_OR_RETURN(
          std::string ptx,
          xla::gpu::nvptx::CompileToPtx(
              llvm_module_copy.get(),
              tensorflow::se::CudaComputeCapability{cc_major, cc_minor}, config,
              libdevice_dir, enable_fusion));
      if (print_ptx_) {
        llvm::dbgs() << "Generated PTX code for module '"
                     << gpu_module.getName() << "' on architecture sm_" << arch
                     << ":\n";
        llvm::dbgs() << ptx << "\n";
      }

      // Compile the PTX code with ptxas if requested and possible; otherwise
      // fall back to a compute image.
      if (!is_compute_profile) {
        auto gpu_asm = tensorflow::se::CompileGpuAsm(cc_major, cc_minor,
                                                     ptx.c_str(), gpu_asm_opts);
        if (gpu_asm.ok()) {
          images.push_back(
              {absl::StrCat("sm_", arch), std::move(gpu_asm.ValueOrDie())});
        } else {
#ifdef PLATFORM_GOOGLE
          // Require compilation with ptxas.
          return gpu_asm;
#else
          // Fall back to compilation by the driver in OSS builds.
          LOG(WARNING) << "Failed to compile generated PTX with ptxas. Falling "
                          "back to compilation by driver.";
          is_compute_profile = true;
#endif
        }
      }
      if (is_compute_profile) {
        std::vector<uint8_t> ptx_bytes;
        ptx_bytes.reserve(ptx.size() + 1);
        std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
        ptx_bytes.push_back('\0');
        images.push_back(
            {absl::StrCat("compute_", arch), std::move(ptx_bytes)});
      }
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle cubin and PTX images into a single fatbin if needed.
    if (images.size() == 1) return images.front().bytes;
    return tensorflow::se::BundleGpuAsm(images, gpu_asm_opts);

#else
    return tensorflow::errors::Internal(
        "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA is defined."
        " Did you specify either --config=rocm or --config=cuda?");
#endif
  }

 private:
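  // Parses a CUDA architecture string such as "sm_75" or "compute_75" into a
  // (is_compute_profile, architecture number) pair.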
  tensorflow::StatusOr<std::pair<bool, int>> ParseCudaArch(
      const std::string& arch_str) {
    absl::string_view consumable_arch(arch_str);
    bool is_compute_profile;
    if (absl::ConsumePrefix(&consumable_arch, "compute_")) {
      is_compute_profile = true;
    } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) {
      is_compute_profile = false;
    } else {
      return tensorflow::errors::Internal(
          "Could not parse CUDA architecture prefix (expected sm_ or "
          "compute_)");
    }
    int arch;
    if (!absl::SimpleAtoi(consumable_arch, &arch)) {
      return tensorflow::errors::Internal(
          "Could not parse CUDA architecture number");
    }
    return std::pair<bool, int>(is_compute_profile, arch);
  }

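  // Looks for the libdevice bitcode directory (<cuda root>/nvvm/libdevice)
  // under each candidate CUDA root.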
  tensorflow::StatusOr<std::string> GetLibdeviceDir(
      const xla::HloModuleConfig& hlo_module_config) {
    for (const std::string& cuda_root : tensorflow::CandidateCudaRoots(
             hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) {
      std::string libdevice_dir =
          tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
      VLOG(2) << "Looking for libdevice at " << libdevice_dir;
      if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
        VLOG(2) << "Found libdevice dir " << libdevice_dir;
        return libdevice_dir;
      }
    }
    return tensorflow::errors::Internal(
        "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
  }

  bool enable_ftz_;
};

}  // namespace

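// A minimal, hypothetical usage sketch; the attribute name and architecture
// list below are illustrative, not necessarily the pipeline's defaults:
//
//   pm.addNestedPass<gpu::GPUModuleOp>(CreateGpuKernelToBlobPass(
//       /*blob_annotation=*/"gpu.binary", /*architectures=*/{"sm_70"},
//       /*print_ptx=*/false, /*print_llvmir=*/false, /*enable_ftz=*/false));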
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
    StringRef blob_annotation, ArrayRef<std::string> architectures,
    bool print_ptx, bool print_llvmir, bool enable_ftz) {
  return std::make_unique<GpuKernelToBlobPass>(
      blob_annotation, architectures, print_ptx, print_llvmir, enable_ftz);
}

}  // namespace transforms
}  // namespace kernel_gen
}  // namespace mlir