1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.h"
17
18 #include <string>
19 #include <utility>
20
21 #include "absl/base/const_init.h"
22 #include "absl/base/thread_annotations.h"
23 #include "absl/cleanup/cleanup.h"
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/container/flat_hash_set.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/strings/string_view.h"
28 #include "absl/synchronization/mutex.h"
29 #include "tensorflow/compiler/xla/stream_executor/gpu/gpu_driver.h"
30 #include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
31 #include "tensorflow/core/platform/cuda_libdevice_path.h"
32 #include "tensorflow/core/platform/env.h"
33 #include "tensorflow/core/platform/errors.h"
34 #include "tensorflow/core/platform/path.h"
35 #include "tensorflow/core/platform/regexp.h"
36 #include "tensorflow/core/platform/subprocess.h"
37
38 namespace stream_executor {
39
GetPtxasVersionString(const std::string & binary_path)40 static port::StatusOr<absl::string_view> GetPtxasVersionString(
41 const std::string& binary_path) {
42 static absl::Mutex mu(absl::kConstInit);
43 static auto* seen_binary_paths ABSL_GUARDED_BY(mu) =
44 new absl::flat_hash_map<std::string, std::string>();
45
46 absl::MutexLock lock(&mu);
47 auto it = seen_binary_paths->find(binary_path);
48 if (it != seen_binary_paths->end()) {
49 // Already checked this binary, nothing to do.
50 return absl::string_view(it->second);
51 }
52
53 tensorflow::SubProcess binary;
54 binary.SetProgram(binary_path, {binary_path, "--version"});
55 binary.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
56 if (!binary.Start()) {
57 return port::InternalError(
58 absl::StrFormat("Couldn't invoke %s --version", binary_path));
59 }
60
61 std::string out;
62 int exit_code = binary.Communicate(/*stdin_input=*/nullptr, &out,
63 /*stderr_output=*/nullptr);
64 if (exit_code != 0) {
65 return port::InternalError(absl::StrFormat(
66 "Running %s --version returned %d", binary_path, exit_code));
67 }
68 auto emplace_it = seen_binary_paths->emplace(binary_path, std::move(out));
69 return absl::string_view(emplace_it.first->second);
70 }
71
72 // Prints a warning if the ptxas at ptxas_path has known bugs.
73 //
74 // Only prints a warning the first time it's called for a particular value of
75 // ptxas_path.
76 //
77 // Locks on entry.
WarnIfBadPtxasVersion(const std::string & ptxas_path)78 static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
79 port::StatusOr<absl::string_view> ptxas_version =
80 GetPtxasVersionString(ptxas_path);
81 if (!ptxas_version.ok()) {
82 LOG(WARNING) << "Couldn't get ptxas version string: "
83 << ptxas_version.status();
84 return;
85 }
86
87 int64_t vmaj, vmin, vdot;
88 std::string vmaj_str, vmin_str, vdot_str;
89 if (!RE2::PartialMatch(ptxas_version.ValueOrDie(),
90 R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, &vmin_str,
91 &vdot_str) ||
92 !absl::SimpleAtoi(vmaj_str, &vmaj) ||
93 !absl::SimpleAtoi(vmin_str, &vmin) ||
94 !absl::SimpleAtoi(vdot_str, &vdot)) {
95 LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
96 << " --version:\n"
97 << ptxas_version.ValueOrDie();
98 return;
99 }
100
101 // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
102 // PTX 6.0. An older ptxas will just fail to compile any of our code.
103 //
104 // ptxas versions before the version that shipped with CUDA 11.1 are known to
105 // miscompile XLA code.
106 if (vmaj < 9) {
107 LOG(ERROR)
108 << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
109 "prefers >= 11.1). Compilation of XLA kernels below will likely "
110 "fail.\n\nYou may not need to update CUDA; cherry-picking the ptxas "
111 "binary is often sufficient.";
112 } else if (std::make_tuple(vmaj, vmin) < std::make_tuple(11, 1)) {
113 LOG(WARNING)
114 << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
115 << vdot
116 << ", which is older than 11.1. ptxas before 11.1 is known to "
117 "miscompile XLA code, leading to incorrect results or "
118 "invalid-address errors.\n\nYou may not need to update to CUDA "
119 "11.1; cherry-picking the ptxas binary is often sufficient.";
120 }
121 }
122
CompileGpuAsmOrGetCached(int device_ordinal,const char * ptx,GpuAsmOpts compilation_options)123 port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
124 int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
125 using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
126 using PtxCompilerResult = port::StatusOr<std::vector<uint8>>;
127 static absl::Mutex ptx_cache_mutex(absl::kConstInit);
128 static auto& ptx_cache ABSL_GUARDED_BY(ptx_cache_mutex) =
129 *new absl::flat_hash_map<PtxCacheKey, PtxCompilerResult>();
130
131 absl::MutexLock lock(&ptx_cache_mutex);
132 PtxCacheKey cache_key{device_ordinal, std::string(ptx),
133 compilation_options.ToTuple()};
134 auto it = ptx_cache.find(cache_key);
135 if (it == ptx_cache.end()) {
136 PtxCompilerResult compiled =
137 CompileGpuAsm(device_ordinal, ptx, compilation_options);
138 it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
139 }
140
141 CHECK(it != ptx_cache.end());
142
143 // Failed compilation attempts are cached.
144 // Use separate status check and ValueOrDie invocation on ptx_cache
145 // entry to avoid value moving introduced by TF_ASSIGN_OR_RETURN.
146
147 if (TF_PREDICT_FALSE(!it->second.ok())) {
148 return it->second.status();
149 }
150
151 const std::vector<uint8>& compiled = it->second.ValueOrDie();
152 return absl::MakeSpan(compiled);
153 }
154
CompileGpuAsm(int device_ordinal,const char * ptx_contents,GpuAsmOpts options)155 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
156 const char* ptx_contents,
157 GpuAsmOpts options) {
158 gpu::GpuDeviceHandle handle;
159 TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
160 int cc_major;
161 int cc_minor;
162 TF_RETURN_IF_ERROR(
163 gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
164 return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
165 }
166
FindCudaExecutable(const std::string binary_name,const std::string preferred_cuda_dir)167 static std::string FindCudaExecutable(const std::string binary_name,
168 const std::string preferred_cuda_dir) {
169 static absl::Mutex mu(absl::kConstInit);
170 static auto* seen_binary_paths ABSL_GUARDED_BY(mu) =
171 new absl::flat_hash_map<std::pair<std::string, std::string>,
172 std::string>();
173
174 #if defined(PLATFORM_WINDOWS)
175 const std::string binary_filename = binary_name + ".exe";
176 #else
177 const std::string& binary_filename = binary_name;
178 #endif
179
180 auto cache_key = std::make_pair(binary_name, preferred_cuda_dir);
181
182 absl::MutexLock lock(&mu);
183 auto it = seen_binary_paths->find(cache_key);
184 if (it != seen_binary_paths->end()) {
185 return it->second;
186 }
187
188 // Try searching in the default PATH first if applicable.
189 if (tensorflow::PreferPtxasFromPath() &&
190 GetPtxasVersionString(binary_filename).ok()) {
191 VLOG(2) << "Using " << binary_filename;
192 seen_binary_paths->emplace(std::move(cache_key), binary_filename);
193 return binary_filename;
194 }
195
196 // Search in cuda root candidates.
197 auto env = tensorflow::Env::Default();
198 std::string binary_path;
199 for (const std::string& cuda_root :
200 tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
201 binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
202 VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
203 if (env->FileExists(binary_path).ok() &&
204 GetPtxasVersionString(binary_path).ok()) {
205 break;
206 }
207 }
208 if (!env->FileExists(binary_path).ok()) {
209 // Give up and just rely on subprocess invocation to find the correct
210 // binary. This won't work, in all probability, given we already tried that
211 // above, but it's the best we can do.
212 VLOG(2) << "Unable to find " << binary_name;
213 binary_path = binary_filename;
214 }
215 VLOG(2) << "Using " << binary_filename << " at " << binary_path;
216 seen_binary_paths->emplace(std::move(cache_key), binary_path);
217 return binary_path;
218 }
219
LogPtxasTooOld(const std::string & ptxas_path,int cc_major,int cc_minor)220 static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
221 int cc_minor) {
222 using AlreadyLoggedSetTy =
223 absl::flat_hash_set<std::tuple<std::string, int, int>>;
224
225 static absl::Mutex* mutex = new absl::Mutex;
226 static AlreadyLoggedSetTy* already_logged = new AlreadyLoggedSetTy;
227
228 absl::MutexLock lock(mutex);
229
230 if (already_logged->insert(std::make_tuple(ptxas_path, cc_major, cc_minor))
231 .second) {
232 LOG(WARNING) << "Falling back to the CUDA driver for PTX compilation; "
233 "ptxas does not support CC "
234 << cc_major << "." << cc_minor;
235 LOG(WARNING) << "Used ptxas at " << ptxas_path;
236 }
237 }
238
AppendArgsFromOptions(GpuAsmOpts options,std::vector<std::string> & args)239 static void AppendArgsFromOptions(GpuAsmOpts options,
240 std::vector<std::string>& args) {
241 if (options.disable_gpuasm_optimizations) {
242 args.push_back("-O0");
243 }
244 args.insert(args.end(), options.extra_flags.begin(),
245 options.extra_flags.end());
246 }
247
// Compiles `ptx_contents` to a cubin for compute capability
// (cc_major, cc_minor) by shelling out to the ptxas binary.
//
// The PTX input and cubin output go through temporary files; both are removed
// on every exit path by the absl::Cleanup guards below. Returns the cubin as
// a byte vector, or an error status (Unimplemented specifically when ptxas is
// too old for the target architecture, so callers can fall back to the
// driver).
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  std::string ptxas_path =
      FindCudaExecutable("ptxas", options.preferred_cuda_dir);

  // Best-effort advisory logging; does not affect compilation.
  WarnIfBadPtxasVersion(ptxas_path);

  // Write ptx into a temporary file.
  std::string ptx_path;
  auto env = tensorflow::Env::Default();
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  absl::Cleanup ptx_cleaner = [&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  };

  // Invoke ptxas and collect its output.
  std::string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  absl::Cleanup cubin_cleaner = [&cubin_path] {
    // CUBIN file may never be created, so the failure to delete it should not
    // produce TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  };
  tensorflow::SubProcess ptxas_info_dumper;
  // e.g. cc (8, 0) yields "-arch=sm_80".
  std::vector<std::string> ptxas_args = {
      ptxas_path,
      ptx_path,
      "-o",
      cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor),
      "--warn-on-spills"};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  AppendArgsFromOptions(options, ptxas_args);
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(ptxas_args, " ");
  }

  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  std::string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    // It happens when the ptxas installed is too old for the current GPU.
    // Example error message associated with this error code:
    // ptxas fatal : Value 'sm_80' is not defined for option 'gpu-name'
    // In that case, fallback to the driver for compilation
    if (absl::StartsWith(stderr_output, "ptxas fatal : Value '") &&
        absl::StrContains(stderr_output,
                          "is not defined for option 'gpu-name'")) {
      LogPtxasTooOld(ptxas_path, cc_major, cc_minor);
      // Unimplemented (not Internal) so callers can detect this case and
      // fall back to driver-side PTX compilation.
      return tensorflow::errors::Unimplemented(
          ptxas_path, " ptxas too old. Falling back to the driver to compile.");
    }

    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }
  // Print the verbose output of ptxas.
  if (!stderr_output.empty()) {
    if (absl::StrContains(stderr_output, "warning")) {
      LOG(INFO) << stderr_output;
    } else {
      VLOG(2) << stderr_output;
    }
  }

  // Read in the result of compilation and return it as a byte vector.
  std::string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}
338
// Bundles the given cubin/PTX images into a single fatbin by shelling out to
// the CUDA `fatbinary` tool. Each image (and the result) goes through a
// temporary file; all temp files are cleaned up via absl::Cleanup. Returns
// the fatbin contents as a byte vector.
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<CubinOrPTXImage> images, GpuAsmOpts options) {
  std::string fatbinary_path =
      FindCudaExecutable("fatbinary", options.preferred_cuda_dir);

  // Write images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const CubinOrPTXImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    image_paths.push_back(std::move(img_path));
  }
  absl::Cleanup image_files_cleaner = [&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  };

  // Prepare temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  absl::Cleanup result_file_cleaner = [&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  };

  // Compute the ptxas options that were used to produce the cubins.
  std::vector<std::string> ptxas_options;
  AppendArgsFromOptions(options, ptxas_options);

  // Invoke fatbinary and collect its output.
  tensorflow::SubProcess fatbinary;
  std::vector<std::string> fatbinary_args = {
      fatbinary_path, "--64", "--link", "--compress-all",
      absl::StrCat("--create=", result_path)};
  if (!ptxas_options.empty()) {
    // Forward the ptxas options so fatbinary can recompile embedded PTX
    // consistently with how the cubins were built.
    auto command_line = absl::StrJoin(ptxas_options, " ");
    fatbinary_args.push_back(absl::StrFormat("--cmdline=%s", command_line));
  }
  assert(images.size() == image_paths.size());
  for (int i = 0; i < images.size(); i++) {
    fatbinary_args.push_back(absl::StrFormat(
        "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
  }
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(fatbinary_args, " ");
  }
  fatbinary.SetProgram(fatbinary_path, fatbinary_args);
  fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
  if (!fatbinary.Start()) {
    return port::InternalError("Failed to launch fatbinary.");
  }
  std::string stderr_output;
  int exit_status = fatbinary.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "fatbinary exited with non-zero error code %d, output: %s", exit_status,
        stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}
420
findRocmExecutable(const std::string & binary_relative_path,const std::string & rocm_root_dir)421 static std::string findRocmExecutable(const std::string& binary_relative_path,
422 const std::string& rocm_root_dir) {
423 auto env = tensorflow::Env::Default();
424 std::string binary_path =
425 tensorflow::io::JoinPath(rocm_root_dir, binary_relative_path);
426 VLOG(2) << "Looking for " << binary_relative_path << " at " << rocm_root_dir;
427 if (!env->FileExists(binary_path).ok()) {
428 binary_path = absl::StrCat("<", binary_path, " - NOT FOUND>");
429 }
430 return binary_path;
431 }
432
// Bundles the given HSACO images into a single object by shelling out to
// ROCm's clang-offload-bundler. Each image (and the result) goes through a
// temporary file; all temp files are cleaned up via absl::Cleanup. Returns
// the bundled object contents as a byte vector.
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir) {
  std::string clang_offload_bundler_path =
      findRocmExecutable("llvm/bin/clang-offload-bundler", rocm_root_dir);

  // Initialise the "--inputs" / "--targets" arguments for the
  // clang-offload-bundler with a dummy file / host target triple...
  // clang-offload-bundler requires 1 and only 1 host target triple
  std::ostringstream inputs_list;
  std::ostringstream targets_list;

  inputs_list << "/dev/null";
  targets_list << "host-x86_64-unknown-linux";

  // Write images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const HsacoImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    // Inputs and targets are parallel comma-separated lists; entry i of each
    // must correspond to the same image.
    inputs_list << "," << img_path;
    targets_list << ",hip-amdgcn-amd-amdhsa-" << img.gfx_arch;
    image_paths.push_back(std::move(img_path));
  }
  absl::Cleanup image_files_cleaner = [&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  };

  // Prepare temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  absl::Cleanup result_file_cleaner = [&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  };

  // Invoke clang_offload_bundler and collect its output.
  tensorflow::SubProcess clang_offload_bundler;
  std::vector<std::string> clang_offload_bundler_args = {
      clang_offload_bundler_path, absl::StrCat("--inputs=", inputs_list.str()),
      absl::StrCat("--targets=", targets_list.str()), "--type=o",
      absl::StrCat("--outputs=", result_path)};
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(clang_offload_bundler_args, " ");
  }
  clang_offload_bundler.SetProgram(clang_offload_bundler_path,
                                   clang_offload_bundler_args);
  clang_offload_bundler.SetChannelAction(tensorflow::CHAN_STDERR,
                                         tensorflow::ACTION_PIPE);
  if (!clang_offload_bundler.Start()) {
    return port::InternalError("Failed to launch clang_offload_bundler.");
  }
  std::string stderr_output;
  int exit_status = clang_offload_bundler.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "clang_offload_bundler exited with non-zero error code %d, output: %s",
        exit_status, stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}
515
516 } // namespace stream_executor
517