xref: /aosp_15_r20/external/tensorflow/third_party/nccl/nccl_configure.bzl (revision b6fb3261f9314811a0f4371741dbb8839866f948)
"""Repository rule for NCCL configuration.

`nccl_configure` depends on the following environment variables:

  * `TF_NCCL_VERSION`: Installed NCCL version or empty to build from source.
  * `NCCL_INSTALL_PATH` (deprecated): The installation path of the NCCL library.
  * `NCCL_HDR_PATH` (deprecated): The installation path of the NCCL header
    files.
  * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is
    `/usr/local/cuda,/usr/`.

"""
13
14load(
15    "//third_party/gpus:cuda_configure.bzl",
16    "enable_cuda",
17    "find_cuda_config",
18)
19load(
20    "//third_party/remote_config:common.bzl",
21    "config_repo_label",
22    "get_cpu_value",
23    "get_host_environ",
24)
25
# Names of the environment variables this repository rule is declared to
# depend on (see `_ENVIRONS`) or reads directly.
_TF_NEED_CUDA = "TF_NEED_CUDA"
_TF_NCCL_VERSION = "TF_NCCL_VERSION"
_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH"
_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
_NCCL_HDR_PATH = "NCCL_HDR_PATH"

# Prefixes of the version `#define` lines. NOTE(review): nothing in the
# visible part of this file references these; presumably kept for header
# parsing done elsewhere -- confirm before removing.
_DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
_DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
_DEFINE_NCCL_PATCH = "#define NCCL_PATCH"
36
# BUILD file written when CUDA is disabled or the host platform is not
# supported: it declares empty `LICENSE` and `nccl` targets so that labels
# pointing into this repository still resolve.
_NCCL_DUMMY_BUILD_CONTENT = """
filegroup(
  name = "LICENSE",
  visibility = ["//visibility:public"],
)

cc_library(
  name = "nccl",
  visibility = ["//visibility:public"],
)
"""
48
# BUILD file written when NCCL is built from source: both targets simply
# forward to the corresponding targets in the `@nccl_archive` repository.
_NCCL_ARCHIVE_BUILD_CONTENT = """
filegroup(
  name = "LICENSE",
  data = ["@nccl_archive//:LICENSE.txt"],
  visibility = ["//visibility:public"],
)

alias(
  name = "nccl",
  actual = "@nccl_archive//:nccl",
  visibility = ["//visibility:public"],
)
"""
62
def _label(file):
    """Returns the `Label` of `file` inside the `//third_party/nccl` package.

    Args:
      file: Name of a file in `//third_party/nccl`.

    Returns:
      A `Label` of the form `//third_party/nccl:<file>`.
    """
    target = "//third_party/nccl:{}".format(file)
    return Label(target)
65
def _create_local_nccl_repository(repository_ctx):
    """Generates the NCCL repository from the locally detected configuration.

    When `TF_NCCL_VERSION` is empty, writes a `BUILD` file that aliases
    `@nccl_archive` and instantiates `build_defs.bzl` from its template with
    the detected CUDA version. Otherwise instantiates `BUILD` from
    `system.BUILD.tpl` using the NCCL settings reported by `find_cuda_config`.

    Args:
      repository_ctx: The repository context of the rule being evaluated.
    """

    # Resolve all labels before doing any real work. Resolving causes the
    # function to be restarted with all previous state being lost. This
    # can easily lead to a O(n^2) runtime in the number of labels.
    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
    find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64"))

    # Only the major component matters below, and only as a "build from
    # source or use the installed library" switch.
    nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "")
    if nccl_version:
        nccl_version = nccl_version.split(".")[0]
    build_from_source = nccl_version == ""

    # Resolve the one template this invocation needs *before* running
    # find_cuda_config. Passing the Label straight to repository_ctx.template()
    # would defer resolution until after the lookups below, so a restart at
    # that point would repeat them.
    if build_from_source:
        template_path = repository_ctx.path(_label("build_defs.bzl.tpl"))
    else:
        template_path = repository_ctx.path(_label("system.BUILD.tpl"))

    # NOTE(review): only the build-from-source branch consumes the CUDA
    # version; the lookup is kept unconditional so that whatever failure it
    # reports for a missing CUDA installation is preserved -- confirm whether
    # that is intended.
    cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
    cuda_version = cuda_config["cuda_version"].split(".")

    if build_from_source:
        # Alias to open source build from @nccl_archive.
        repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)

        repository_ctx.template(
            "build_defs.bzl",
            template_path,
            {"%{cuda_version}": "(%s, %s)" % tuple(cuda_version)},
        )
    else:
        # Create target for locally installed NCCL.
        config = find_cuda_config(repository_ctx, find_cuda_config_path, ["nccl"])
        config_wrap = {
            "%{nccl_version}": config["nccl_version"],
            "%{nccl_header_dir}": config["nccl_include_dir"],
            "%{nccl_library_dir}": config["nccl_library_dir"],
        }
        repository_ctx.template("BUILD", template_path, config_wrap)
98
def _create_remote_nccl_repository(repository_ctx, remote_config_repo):
    """Populates the repository from files of a pre-generated config repo.

    `BUILD` is always instantiated from the remote repository's `:BUILD`.
    `build_defs.bzl` is instantiated from `:build_defs.bzl` only when
    `TF_NCCL_VERSION` is empty.

    Args:
      repository_ctx: The repository context of the rule being evaluated.
      remote_config_repo: Identifier of the remote configuration repository,
        as accepted by `config_repo_label`.
    """
    no_substitutions = {}

    build_file = config_repo_label(remote_config_repo, ":BUILD")
    repository_ctx.template("BUILD", build_file, no_substitutions)

    requested_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "")
    if requested_version != "":
        # Nothing else is copied when a specific NCCL version is requested.
        return

    build_defs_file = config_repo_label(remote_config_repo, ":build_defs.bzl")
    repository_ctx.template("build_defs.bzl", build_defs_file, no_substitutions)
113
def _nccl_autoconf_impl(repository_ctx):
    """Implementation of the `nccl_configure` repository rule.

    Writes a dummy `BUILD` file when CUDA is disabled or the host is neither
    Linux nor FreeBSD. Otherwise delegates to the remote-config variant when
    `TF_NCCL_CONFIG_REPO` is set, and to the local variant when it is not.

    Args:
      repository_ctx: The repository context of the rule being evaluated.
    """
    nccl_supported = (
        enable_cuda(repository_ctx) and
        get_cpu_value(repository_ctx) in ("Linux", "FreeBSD")
    )
    if not nccl_supported:
        # Add a dummy build file to make bazel query happy.
        repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
        return

    remote_config_repo = get_host_environ(repository_ctx, "TF_NCCL_CONFIG_REPO")
    if remote_config_repo != None:
        _create_remote_nccl_repository(repository_ctx, remote_config_repo)
        return

    _create_local_nccl_repository(repository_ctx)
123
# Environment variables passed as `environ` to both repository rules
# defined below.
_ENVIRONS = [
    _CUDA_TOOLKIT_PATH,
    _NCCL_HDR_PATH,
    _NCCL_INSTALL_PATH,
    _TF_NCCL_VERSION,
    _TF_CUDA_COMPUTE_CAPABILITIES,
    _TF_NEED_CUDA,
    "TF_CUDA_PATHS",
]
133
# Remotable variant of the rule: it always runs the local-detection
# implementation and additionally accepts an `environ` string dict attribute.
remote_nccl_configure = repository_rule(
    implementation = _create_local_nccl_repository,
    environ = _ENVIRONS,
    remotable = True,
    attrs = {
        "environ": attr.string_dict(),
    },
)
142
# `_nccl_autoconf_impl` reads TF_NCCL_CONFIG_REPO to choose between the remote
# and the local configuration, so it is declared here as well; otherwise a
# change to that variable would not cause the repository to be re-fetched.
nccl_configure = repository_rule(
    implementation = _nccl_autoconf_impl,
    environ = _ENVIRONS + ["TF_NCCL_CONFIG_REPO"],
)
"""Detects and configures the NCCL configuration.

Add the following to your WORKSPACE file:

```python
nccl_configure(name = "local_config_nccl")
```

Args:
  name: A unique name for this workspace rule.
"""
158