1 // Copyright 2020 The TensorFlow Runtime Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_LMHLO_TO_GPU_LMHLO_TO_GPU_BINARY_H_ 16 #define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_LMHLO_TO_GPU_LMHLO_TO_GPU_BINARY_H_ 17 18 #include <memory> 19 #include <string> 20 21 #include "mlir/Pass/Pass.h" 22 #include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h" 23 #include "tensorflow/compiler/xla/stream_executor/device_description.h" 24 25 namespace tensorflow { 26 27 struct GpuBinaryOptions { DefaultGpuBinaryOptionsGpuBinaryOptions28 static GpuBinaryOptions DefaultGpuBinaryOptions() { 29 GpuBinaryOptions options; 30 options.platform_name = "CUDA"; 31 32 options.gpu_device_info.threads_per_block_limit = 1024; 33 options.gpu_device_info.threads_per_warp = 32; 34 options.gpu_device_info.shared_memory_per_block = 35 49152; // static shmem limit. 36 // Should be 1024 for sm7.5, 1536 for sm8.6. This results in more blocks 37 // than SMs on those architectures, but doesn't hit any resource limit. 38 options.gpu_device_info.threads_per_core_limit = 2048; 39 // This is higher than any SKU, resulting in more blocks than SMs. 40 options.gpu_device_info.core_count = 128; 41 options.gpu_device_info.block_dim_limit_x = 2147483647; 42 options.gpu_device_info.block_dim_limit_y = 65535; 43 options.gpu_device_info.block_dim_limit_z = 65535; 44 45 options.cuda_compute_capability = {5, 2}; 46 options.rocm_compute_capability = 47 stream_executor::RocmComputeCapability("gfx900"); 48 return options; 49 } 50 51 std::string platform_name; 52 xla::gpu::GpuDeviceInfo gpu_device_info; 53 stream_executor::CudaComputeCapability cuda_compute_capability; 54 stream_executor::RocmComputeCapability rocm_compute_capability{"unknown"}; 55 }; 56 57 // Creates a pass that lowers lmhlo.fusion ops to a gpu.module with a binary 58 // device code attribute plus a gpu.launch_func. 59 std::unique_ptr<mlir::Pass> createConvertLmhloToGpuBinaryPass( 60 GpuBinaryOptions options = GpuBinaryOptions::DefaultGpuBinaryOptions()); 61 62 void registerConvertLmhloToGpuBinaryPass(); 63 64 } // namespace tensorflow 65 66 #endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_LMHLO_TO_GPU_LMHLO_TO_GPU_BINARY_H_ 67