1 // Copyright 2020 The TensorFlow Runtime Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_LMHLO_TO_GPU_LMHLO_TO_GPU_BINARY_H_
16 #define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_LMHLO_TO_GPU_LMHLO_TO_GPU_BINARY_H_
17 
18 #include <memory>
19 #include <string>
20 
21 #include "mlir/Pass/Pass.h"
22 #include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
23 #include "tensorflow/compiler/xla/stream_executor/device_description.h"
24 
25 namespace tensorflow {
26 
27 struct GpuBinaryOptions {
DefaultGpuBinaryOptionsGpuBinaryOptions28   static GpuBinaryOptions DefaultGpuBinaryOptions() {
29     GpuBinaryOptions options;
30     options.platform_name = "CUDA";
31 
32     options.gpu_device_info.threads_per_block_limit = 1024;
33     options.gpu_device_info.threads_per_warp = 32;
34     options.gpu_device_info.shared_memory_per_block =
35         49152;  // static shmem limit.
36     // Should be 1024 for sm7.5, 1536 for sm8.6. This results in more blocks
37     // than SMs on those architectures, but doesn't hit any resource limit.
38     options.gpu_device_info.threads_per_core_limit = 2048;
39     // This is higher than any SKU, resulting in more blocks than SMs.
40     options.gpu_device_info.core_count = 128;
41     options.gpu_device_info.block_dim_limit_x = 2147483647;
42     options.gpu_device_info.block_dim_limit_y = 65535;
43     options.gpu_device_info.block_dim_limit_z = 65535;
44 
45     options.cuda_compute_capability = {5, 2};
46     options.rocm_compute_capability =
47         stream_executor::RocmComputeCapability("gfx900");
48     return options;
49   }
50 
51   std::string platform_name;
52   xla::gpu::GpuDeviceInfo gpu_device_info;
53   stream_executor::CudaComputeCapability cuda_compute_capability;
54   stream_executor::RocmComputeCapability rocm_compute_capability{"unknown"};
55 };
56 
// Creates a pass that lowers lmhlo.fusion ops to a gpu.module with a binary
// device code attribute plus a gpu.launch_func.
//
// `options` describes the GPU to compile for; by default a conservative
// generic target from GpuBinaryOptions::DefaultGpuBinaryOptions() is used.
std::unique_ptr<mlir::Pass> createConvertLmhloToGpuBinaryPass(
    GpuBinaryOptions options = GpuBinaryOptions::DefaultGpuBinaryOptions());

// Makes the pass above available by name (presumably via the global MLIR
// pass registry, per the usual register* convention -- confirm in the .cc).
void registerConvertLmhloToGpuBinaryPass();
63 
64 }  // namespace tensorflow
65 
66 #endif  // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_LMHLO_TO_GPU_LMHLO_TO_GPU_BINARY_H_
67