xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.h"
17 
18 #include <string>
19 #include <utility>
20 
21 #include "absl/base/const_init.h"
22 #include "absl/base/thread_annotations.h"
23 #include "absl/cleanup/cleanup.h"
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/container/flat_hash_set.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/strings/string_view.h"
28 #include "absl/synchronization/mutex.h"
29 #include "tensorflow/compiler/xla/stream_executor/gpu/gpu_driver.h"
30 #include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
31 #include "tensorflow/core/platform/cuda_libdevice_path.h"
32 #include "tensorflow/core/platform/env.h"
33 #include "tensorflow/core/platform/errors.h"
34 #include "tensorflow/core/platform/path.h"
35 #include "tensorflow/core/platform/regexp.h"
36 #include "tensorflow/core/platform/subprocess.h"
37 
38 namespace stream_executor {
39 
GetPtxasVersionString(const std::string & binary_path)40 static port::StatusOr<absl::string_view> GetPtxasVersionString(
41     const std::string& binary_path) {
42   static absl::Mutex mu(absl::kConstInit);
43   static auto* seen_binary_paths ABSL_GUARDED_BY(mu) =
44       new absl::flat_hash_map<std::string, std::string>();
45 
46   absl::MutexLock lock(&mu);
47   auto it = seen_binary_paths->find(binary_path);
48   if (it != seen_binary_paths->end()) {
49     // Already checked this binary, nothing to do.
50     return absl::string_view(it->second);
51   }
52 
53   tensorflow::SubProcess binary;
54   binary.SetProgram(binary_path, {binary_path, "--version"});
55   binary.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
56   if (!binary.Start()) {
57     return port::InternalError(
58         absl::StrFormat("Couldn't invoke %s --version", binary_path));
59   }
60 
61   std::string out;
62   int exit_code = binary.Communicate(/*stdin_input=*/nullptr, &out,
63                                      /*stderr_output=*/nullptr);
64   if (exit_code != 0) {
65     return port::InternalError(absl::StrFormat(
66         "Running %s --version returned %d", binary_path, exit_code));
67   }
68   auto emplace_it = seen_binary_paths->emplace(binary_path, std::move(out));
69   return absl::string_view(emplace_it.first->second);
70 }
71 
72 // Prints a warning if the ptxas at ptxas_path has known bugs.
73 //
74 // Only prints a warning the first time it's called for a particular value of
75 // ptxas_path.
76 //
77 // Locks on entry.
WarnIfBadPtxasVersion(const std::string & ptxas_path)78 static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
79   port::StatusOr<absl::string_view> ptxas_version =
80       GetPtxasVersionString(ptxas_path);
81   if (!ptxas_version.ok()) {
82     LOG(WARNING) << "Couldn't get ptxas version string: "
83                  << ptxas_version.status();
84     return;
85   }
86 
87   int64_t vmaj, vmin, vdot;
88   std::string vmaj_str, vmin_str, vdot_str;
89   if (!RE2::PartialMatch(ptxas_version.ValueOrDie(),
90                          R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, &vmin_str,
91                          &vdot_str) ||
92       !absl::SimpleAtoi(vmaj_str, &vmaj) ||
93       !absl::SimpleAtoi(vmin_str, &vmin) ||
94       !absl::SimpleAtoi(vdot_str, &vdot)) {
95     LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
96                  << " --version:\n"
97                  << ptxas_version.ValueOrDie();
98     return;
99   }
100 
101   // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
102   // PTX 6.0.  An older ptxas will just fail to compile any of our code.
103   //
104   // ptxas versions before the version that shipped with CUDA 11.1 are known to
105   // miscompile XLA code.
106   if (vmaj < 9) {
107     LOG(ERROR)
108         << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
109            "prefers >= 11.1).  Compilation of XLA kernels below will likely "
110            "fail.\n\nYou may not need to update CUDA; cherry-picking the ptxas "
111            "binary is often sufficient.";
112   } else if (std::make_tuple(vmaj, vmin) < std::make_tuple(11, 1)) {
113     LOG(WARNING)
114         << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
115         << vdot
116         << ", which is older than 11.1. ptxas before 11.1 is known to "
117            "miscompile XLA code, leading to incorrect results or "
118            "invalid-address errors.\n\nYou may not need to update to CUDA "
119            "11.1; cherry-picking the ptxas binary is often sufficient.";
120   }
121 }
122 
CompileGpuAsmOrGetCached(int device_ordinal,const char * ptx,GpuAsmOpts compilation_options)123 port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
124     int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
125   using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
126   using PtxCompilerResult = port::StatusOr<std::vector<uint8>>;
127   static absl::Mutex ptx_cache_mutex(absl::kConstInit);
128   static auto& ptx_cache ABSL_GUARDED_BY(ptx_cache_mutex) =
129       *new absl::flat_hash_map<PtxCacheKey, PtxCompilerResult>();
130 
131   absl::MutexLock lock(&ptx_cache_mutex);
132   PtxCacheKey cache_key{device_ordinal, std::string(ptx),
133                         compilation_options.ToTuple()};
134   auto it = ptx_cache.find(cache_key);
135   if (it == ptx_cache.end()) {
136     PtxCompilerResult compiled =
137         CompileGpuAsm(device_ordinal, ptx, compilation_options);
138     it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
139   }
140 
141   CHECK(it != ptx_cache.end());
142 
143   // Failed compilation attempts are cached.
144   // Use separate status check and ValueOrDie invocation on ptx_cache
145   // entry to avoid value moving introduced by TF_ASSIGN_OR_RETURN.
146 
147   if (TF_PREDICT_FALSE(!it->second.ok())) {
148     return it->second.status();
149   }
150 
151   const std::vector<uint8>& compiled = it->second.ValueOrDie();
152   return absl::MakeSpan(compiled);
153 }
154 
CompileGpuAsm(int device_ordinal,const char * ptx_contents,GpuAsmOpts options)155 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
156                                                  const char* ptx_contents,
157                                                  GpuAsmOpts options) {
158   gpu::GpuDeviceHandle handle;
159   TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
160   int cc_major;
161   int cc_minor;
162   TF_RETURN_IF_ERROR(
163       gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
164   return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
165 }
166 
FindCudaExecutable(const std::string binary_name,const std::string preferred_cuda_dir)167 static std::string FindCudaExecutable(const std::string binary_name,
168                                       const std::string preferred_cuda_dir) {
169   static absl::Mutex mu(absl::kConstInit);
170   static auto* seen_binary_paths ABSL_GUARDED_BY(mu) =
171       new absl::flat_hash_map<std::pair<std::string, std::string>,
172                               std::string>();
173 
174 #if defined(PLATFORM_WINDOWS)
175   const std::string binary_filename = binary_name + ".exe";
176 #else
177   const std::string& binary_filename = binary_name;
178 #endif
179 
180   auto cache_key = std::make_pair(binary_name, preferred_cuda_dir);
181 
182   absl::MutexLock lock(&mu);
183   auto it = seen_binary_paths->find(cache_key);
184   if (it != seen_binary_paths->end()) {
185     return it->second;
186   }
187 
188   // Try searching in the default PATH first if applicable.
189   if (tensorflow::PreferPtxasFromPath() &&
190       GetPtxasVersionString(binary_filename).ok()) {
191     VLOG(2) << "Using " << binary_filename;
192     seen_binary_paths->emplace(std::move(cache_key), binary_filename);
193     return binary_filename;
194   }
195 
196   // Search in cuda root candidates.
197   auto env = tensorflow::Env::Default();
198   std::string binary_path;
199   for (const std::string& cuda_root :
200        tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
201     binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
202     VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
203     if (env->FileExists(binary_path).ok() &&
204         GetPtxasVersionString(binary_path).ok()) {
205       break;
206     }
207   }
208   if (!env->FileExists(binary_path).ok()) {
209     // Give up and just rely on subprocess invocation to find the correct
210     // binary. This won't work, in all probability, given we already tried that
211     // above, but it's the best we can do.
212     VLOG(2) << "Unable to find " << binary_name;
213     binary_path = binary_filename;
214   }
215   VLOG(2) << "Using " << binary_filename << " at " << binary_path;
216   seen_binary_paths->emplace(std::move(cache_key), binary_path);
217   return binary_path;
218 }
219 
LogPtxasTooOld(const std::string & ptxas_path,int cc_major,int cc_minor)220 static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
221                            int cc_minor) {
222   using AlreadyLoggedSetTy =
223       absl::flat_hash_set<std::tuple<std::string, int, int>>;
224 
225   static absl::Mutex* mutex = new absl::Mutex;
226   static AlreadyLoggedSetTy* already_logged = new AlreadyLoggedSetTy;
227 
228   absl::MutexLock lock(mutex);
229 
230   if (already_logged->insert(std::make_tuple(ptxas_path, cc_major, cc_minor))
231           .second) {
232     LOG(WARNING) << "Falling back to the CUDA driver for PTX compilation; "
233                     "ptxas does not support CC "
234                  << cc_major << "." << cc_minor;
235     LOG(WARNING) << "Used ptxas at " << ptxas_path;
236   }
237 }
238 
AppendArgsFromOptions(GpuAsmOpts options,std::vector<std::string> & args)239 static void AppendArgsFromOptions(GpuAsmOpts options,
240                                   std::vector<std::string>& args) {
241   if (options.disable_gpuasm_optimizations) {
242     args.push_back("-O0");
243   }
244   args.insert(args.end(), options.extra_flags.begin(),
245               options.extra_flags.end());
246 }
247 
CompileGpuAsm(int cc_major,int cc_minor,const char * ptx_contents,GpuAsmOpts options)248 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
249                                                  const char* ptx_contents,
250                                                  GpuAsmOpts options) {
251   std::string ptxas_path =
252       FindCudaExecutable("ptxas", options.preferred_cuda_dir);
253 
254   WarnIfBadPtxasVersion(ptxas_path);
255 
256   // Write ptx into a temporary file.
257   std::string ptx_path;
258   auto env = tensorflow::Env::Default();
259   if (!env->LocalTempFilename(&ptx_path)) {
260     return port::InternalError("couldn't get temp PTX file name");
261   }
262   TF_RETURN_IF_ERROR(
263       tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
264   VLOG(2) << "ptx written to: " << ptx_path;
265 
266   absl::Cleanup ptx_cleaner = [&ptx_path] {
267     TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
268   };
269 
270   // Invoke ptxas and collect its output.
271   std::string cubin_path;
272   if (!env->LocalTempFilename(&cubin_path)) {
273     return port::InternalError("couldn't get temp CUBIN file name");
274   }
275   absl::Cleanup cubin_cleaner = [&cubin_path] {
276     // CUBIN file may never be created, so the failure to delete it should not
277     // produce TF error.
278     tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
279   };
280   tensorflow::SubProcess ptxas_info_dumper;
281   std::vector<std::string> ptxas_args = {
282       ptxas_path,
283       ptx_path,
284       "-o",
285       cubin_path,
286       absl::StrCat("-arch=sm_", cc_major, cc_minor),
287       "--warn-on-spills"};
288   if (VLOG_IS_ON(2)) {
289     ptxas_args.push_back("-v");
290   }
291   AppendArgsFromOptions(options, ptxas_args);
292   if (VLOG_IS_ON(3)) {
293     VLOG(3) << absl::StrJoin(ptxas_args, " ");
294   }
295 
296   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
297   ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
298                                      tensorflow::ACTION_PIPE);
299   if (!ptxas_info_dumper.Start()) {
300     return port::InternalError("Failed to launch ptxas");
301   }
302   std::string stderr_output;
303   int exit_status = ptxas_info_dumper.Communicate(
304       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
305   if (exit_status != 0) {
306     //  It happens when the ptxas installed is too old for the current GPU.
307     //  Example error message associated with this error code:
308     //      ptxas fatal   : Value 'sm_80' is not defined for option 'gpu-name'
309     // In that case, fallback to the driver for compilation
310     if (absl::StartsWith(stderr_output, "ptxas fatal   : Value '") &&
311         absl::StrContains(stderr_output,
312                           "is not defined for option 'gpu-name'")) {
313       LogPtxasTooOld(ptxas_path, cc_major, cc_minor);
314       return tensorflow::errors::Unimplemented(
315           ptxas_path, " ptxas too old. Falling back to the driver to compile.");
316     }
317 
318     return port::InternalError(
319         absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
320                         exit_status, stderr_output));
321   }
322   // Print the verbose output of ptxas.
323   if (!stderr_output.empty()) {
324     if (absl::StrContains(stderr_output, "warning")) {
325       LOG(INFO) << stderr_output;
326     } else {
327       VLOG(2) << stderr_output;
328     }
329   }
330 
331   // Read in the result of compilation and return it as a byte vector.
332   std::string cubin;
333   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
334                                                   cubin_path, &cubin));
335   std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
336   return cubin_vector;
337 }
338 
BundleGpuAsm(std::vector<CubinOrPTXImage> images,GpuAsmOpts options)339 port::StatusOr<std::vector<uint8>> BundleGpuAsm(
340     std::vector<CubinOrPTXImage> images, GpuAsmOpts options) {
341   std::string fatbinary_path =
342       FindCudaExecutable("fatbinary", options.preferred_cuda_dir);
343 
344   // Write images to temporary files.
345   std::vector<std::string> image_paths;
346   auto env = tensorflow::Env::Default();
347   for (const CubinOrPTXImage& img : images) {
348     std::string img_path;
349     if (!env->LocalTempFilename(&img_path)) {
350       return port::InternalError(
351           "Could not get temporary filenames for images.");
352     }
353     TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
354         env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
355     VLOG(2) << "image written to " << img_path;
356     image_paths.push_back(std::move(img_path));
357   }
358   absl::Cleanup image_files_cleaner = [&image_paths] {
359     for (const auto& path : image_paths) {
360       TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
361     }
362   };
363 
364   // Prepare temorary result file.
365   std::string result_path;
366   if (!env->LocalTempFilename(&result_path)) {
367     return port::InternalError(
368         "Could not get temporary filename for fatbin result.");
369   }
370   absl::Cleanup result_file_cleaner = [&result_path] {
371     // This file may never be created, so the failure to delete it should not
372     // propagate to TF.
373     tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
374   };
375 
376   // Compute the ptxas options that were used to produce the cubins.
377   std::vector<std::string> ptxas_options;
378   AppendArgsFromOptions(options, ptxas_options);
379 
380   // Invoke fatbinary and collect its output.
381   tensorflow::SubProcess fatbinary;
382   std::vector<std::string> fatbinary_args = {
383       fatbinary_path, "--64", "--link", "--compress-all",
384       absl::StrCat("--create=", result_path)};
385   if (!ptxas_options.empty()) {
386     auto command_line = absl::StrJoin(ptxas_options, " ");
387     fatbinary_args.push_back(absl::StrFormat("--cmdline=%s", command_line));
388   }
389   assert(images.size() == image_paths.size());
390   for (int i = 0; i < images.size(); i++) {
391     fatbinary_args.push_back(absl::StrFormat(
392         "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
393   }
394   if (VLOG_IS_ON(3)) {
395     VLOG(3) << absl::StrJoin(fatbinary_args, " ");
396   }
397   fatbinary.SetProgram(fatbinary_path, fatbinary_args);
398   fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
399   if (!fatbinary.Start()) {
400     return port::InternalError("Failed to launch fatbinary.");
401   }
402   std::string stderr_output;
403   int exit_status = fatbinary.Communicate(
404       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
405   if (exit_status != 0) {
406     return port::InternalError(absl::StrFormat(
407         "fatbinary exited with non-zero error code %d, output: %s", exit_status,
408         stderr_output));
409   }
410   if (!stderr_output.empty()) {
411     VLOG(2) << stderr_output;
412   }
413 
414   // Read in the result and return it as a byte vector.
415   std::string result_blob;
416   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
417                                                   result_path, &result_blob));
418   return std::vector<uint8>(result_blob.begin(), result_blob.end());
419 }
420 
findRocmExecutable(const std::string & binary_relative_path,const std::string & rocm_root_dir)421 static std::string findRocmExecutable(const std::string& binary_relative_path,
422                                       const std::string& rocm_root_dir) {
423   auto env = tensorflow::Env::Default();
424   std::string binary_path =
425       tensorflow::io::JoinPath(rocm_root_dir, binary_relative_path);
426   VLOG(2) << "Looking for " << binary_relative_path << " at " << rocm_root_dir;
427   if (!env->FileExists(binary_path).ok()) {
428     binary_path = absl::StrCat("<", binary_path, " - NOT FOUND>");
429   }
430   return binary_path;
431 }
432 
// Bundles the given HSACO images into a single fat binary by invoking ROCm's
// clang-offload-bundler found under `rocm_root_dir`.  Returns the bundle as
// a byte vector.
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir) {
  std::string clang_offload_bundler_path =
      findRocmExecutable("llvm/bin/clang-offload-bundler", rocm_root_dir);

  // Initialise the "--inputs" / "--targets" arguments for the
  // clang-offload-bundler with a dummy file / host target triple...
  // clang-offload-bundler requires 1 and only 1 host target triple
  std::ostringstream inputs_list;
  std::ostringstream targets_list;

  inputs_list << "/dev/null";
  targets_list << "host-x86_64-unknown-linux";

  // Write images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const HsacoImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    // Each image contributes one comma-separated --inputs entry plus a
    // positionally matching --targets entry keyed by its gfx architecture.
    inputs_list << "," << img_path;
    targets_list << ",hip-amdgcn-amd-amdhsa-" << img.gfx_arch;
    image_paths.push_back(std::move(img_path));
  }
  absl::Cleanup image_files_cleaner = [&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  };

  // Prepare temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  absl::Cleanup result_file_cleaner = [&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  };

  // Invoke clang_offload_bundler and collect its output.
  tensorflow::SubProcess clang_offload_bundler;
  std::vector<std::string> clang_offload_bundler_args = {
      clang_offload_bundler_path, absl::StrCat("--inputs=", inputs_list.str()),
      absl::StrCat("--targets=", targets_list.str()), "--type=o",
      absl::StrCat("--outputs=", result_path)};
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(clang_offload_bundler_args, " ");
  }
  clang_offload_bundler.SetProgram(clang_offload_bundler_path,
                                   clang_offload_bundler_args);
  clang_offload_bundler.SetChannelAction(tensorflow::CHAN_STDERR,
                                         tensorflow::ACTION_PIPE);
  if (!clang_offload_bundler.Start()) {
    return port::InternalError("Failed to launch clang_offload_bundler.");
  }
  std::string stderr_output;
  int exit_status = clang_offload_bundler.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "clang_offload_bundler exited with non-zero error code %d, output: %s",
        exit_status, stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}
515 
516 }  // namespace stream_executor
517