1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include "tensorflow/core/tpu/tpu_initializer_helper.h"

#include <dirent.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cctype>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/tpu/libtftpu.h"
#include "tensorflow/core/tpu/tpu_api_dlsym_set_fn.h"
#include "tensorflow/core/tpu/tpu_ops_c_api.h"
#include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h"

#if !defined(PLATFORM_GOOGLE)
#include "tensorflow/core/platform/cloud/gcs_file_system.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/tpu/tpu_api.h"
#include "tensorflow/stream_executor/tpu/tpu_platform.h"
#elif defined(LIBTPU_STATIC)
#include "tensorflow/core/tpu/tpu_api.h"
#include "tensorflow/stream_executor/tpu/tpu_platform.h"
#endif  // PLATFORM_GOOGLE
50
51 namespace tensorflow {
52 namespace tpu {
53 namespace {
54
// Returns the value of environment variable `name`, or an empty string when
// the variable is unset. (Constructing a std::string from nullptr is
// undefined behavior, hence the explicit guard.)
static std::string GetEnvVar(const char* name) {
  const char* value = getenv(name);
  return value == nullptr ? std::string() : std::string(value);
}
62
GetEnvBool(const char * name,bool defval)63 bool GetEnvBool(const char* name, bool defval) {
64 const char* env = getenv(name);
65 if (env == nullptr) {
66 return defval;
67 }
68 if (std::strcmp(env, "true") == 0) {
69 return true;
70 }
71 if (std::strcmp(env, "false") == 0) {
72 return false;
73 }
74 int int_env;
75 bool has_int = absl::SimpleAtoi(env, &int_env);
76 return has_int && int_env != 0;
77 }
78
79 } // namespace
80
81 // This function gets pid of a process and checks if that process is using tpu.
82 // It is not able to check processes that are owned by another user.
// This function gets pid of a process and checks if that process is using tpu
// (i.e. has /dev/accel0 open). It is not able to check processes that are
// owned by another user: readlink on their /proc entries fails and they are
// treated as not using the TPU.
bool IsTpuUsed(int64_t pid) {
  const std::string tpu_dev_path = "/dev/accel0";
  std::string path = "/proc/" + std::to_string(pid) + "/fd";
  DIR* raw_fd_dir = opendir(path.c_str());
  if (!raw_fd_dir) {
    return false;
  }
  // RAII wrapper so the directory handle is closed on every exit path.
  std::unique_ptr<DIR, int (*)(DIR*)> fd_dir(raw_fd_dir, closedir);
  // Buffer is one byte larger than the target so a longer link target (e.g.
  // "/dev/accel01") cannot be truncated into a false match.
  std::string link(tpu_dev_path.size() + 1, '\0');
  struct dirent* ent;
  while ((ent = readdir(raw_fd_dir))) {
    // Entries in /proc/<pid>/fd are numeric file descriptors.
    if (!isdigit(*ent->d_name)) continue;
    int64_t fd = strtol(ent->d_name, nullptr, 10);
    path = "/proc/" + std::to_string(pid) + "/fd/" + std::to_string(fd);
    // readlink returns -1 on error (e.g. permission denied) and does not
    // NUL-terminate; the previous check `!readlink(...)` let errors fall
    // through to a comparison against a stale buffer. Require the link to be
    // exactly as long as the device path and byte-identical to it.
    ssize_t len = readlink(path.c_str(), &link[0], link.size());
    if (len != static_cast<ssize_t>(tpu_dev_path.size())) continue;
    if (link.compare(0, tpu_dev_path.size(), tpu_dev_path) == 0) {
      return true;
    }
  }
  return false;
}
104
105 // This function iterates through all the processes in /proc and finds out if
106 // any process it was able to check is using the TPU. It does not have
107 // permission to processes owned by another user.
108 // TODO (shahrokhi) use tensorflow/core/platform/filesystem (GetChildren) for
109 // this.
FindLibtpuProcess()110 StatusOr<int64_t> FindLibtpuProcess() {
111 DIR* proc = opendir("/proc");
112
113 if (proc == nullptr) {
114 return errors::Unavailable("was not able to open /proc");
115 }
116 std::unique_ptr<DIR, int (*)(DIR*)> proc_dir(proc, closedir);
117 struct dirent* ent;
118 int64_t pid;
119 while ((ent = readdir(proc))) {
120 if (!isdigit(*ent->d_name)) continue;
121
122 pid = strtol(ent->d_name, nullptr, 10);
123 if (IsTpuUsed(pid)) {
124 return pid;
125 }
126 }
127 return errors::NotFound("did not find which pid uses the libtpu.so");
128 }
129
TryAcquireTpuLock()130 Status TryAcquireTpuLock() {
131 static absl::Mutex* mu = new absl::Mutex();
132 absl::MutexLock l(mu);
133
134 // TODO(skyewm): use `absl::StrCat(getenv(name))` once we build with the
135 // fix for https://github.com/abseil/abseil-cpp/issues/1167.
136 std::string load_library_override;
137 const char* env_value = getenv("TPU_LOAD_LIBRARY");
138 if (env_value != nullptr) {
139 load_library_override = std::string(env_value);
140 }
141
142 if (load_library_override == "1") {
143 return OkStatus();
144 } else if (load_library_override == "0") {
145 return errors::FailedPrecondition("TPU_LOAD_LIBRARY=0, not loading libtpu");
146 }
147
148 // If TPU_CHIPS_PER_PROCESS_BOUNDS doesn't include all chips, we assume
149 // we're using different chips in different processes and thus multiple
150 // libtpu loads are ok.
151 // TODO(skyewm): we could make per-chip lock files and look at
152 // TPU_VISIBLE_DEVICES if we wanted to make this really precise.
153 std::string chips_per_process_bounds =
154 GetEnvVar("TPU_CHIPS_PER_PROCESS_BOUNDS");
155 bool allow_multiple_libtpu_load =
156 GetEnvBool("ALLOW_MULTIPLE_LIBTPU_LOAD", false);
157 // TODO(skyewm): remove this when TPU_CHIPS_PER_HOST_BOUNDS is fully
158 // deprecated
159 if (chips_per_process_bounds.empty()) {
160 chips_per_process_bounds = GetEnvVar("TPU_CHIPS_PER_HOST_BOUNDS");
161 }
162 if ((chips_per_process_bounds.empty() ||
163 chips_per_process_bounds == "2,2,1") &&
164 !allow_multiple_libtpu_load) {
165 int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);
166
167 // This lock is held until the process exits intentionally. The underlying
168 // TPU device will be held on until it quits.
169 if (lockf(fd, F_TLOCK, 0) != 0) {
170 auto pid = FindLibtpuProcess();
171 if (pid.ok()) {
172 return errors::Aborted(absl::StrCat(
173 "libtpu.so is already in use by process with pid ",
174 pid.ValueOrDie(),
175 ". Not attempting to load libtpu.so in this process."));
176 } else {
177 return errors::Aborted(
178 "libtpu.so already in use by another process probably owned by "
179 "another user. Run \"$ sudo lsof -w /dev/accel0\" to figure out "
180 "which process is using the TPU. Not attempting to load "
181 "libtpu.so in this process.");
182 }
183 } else {
184 return OkStatus();
185 }
186 } else {
187 VLOG(1) << "TPU_CHIPS_PER_PROCESS_BOUNDS is not empty or "
188 "ALLOW_MULTIPLE_LIBTPU_LOAD is set to True, "
189 "therefore allowing multiple libtpu.so loads.";
190 return OkStatus();
191 }
192 }
193 #if !defined(PLATFORM_GOOGLE)
194 #include "tensorflow/core/tpu/tpu_library_init_fns.inc"
195
InitializeTpuLibrary(void * library_handle)196 Status InitializeTpuLibrary(void* library_handle) {
197 Status s = InitializeTpuStructFns(library_handle);
198
199 // Retrieve arguments from environment if applicable
200 std::pair<std::vector<std::string>, std::vector<const char*>> args =
201 GetLibTpuInitArguments();
202
203 // TPU platform registration must only be performed after the library is
204 // loaded. We do not want to register a TPU platform in XLA without the
205 // supporting library providing the necessary APIs.
206 if (s.ok()) {
207 void (*initialize_fn)(bool init_library, int num_args, const char** args);
208 initialize_fn = reinterpret_cast<decltype(initialize_fn)>(
209 dlsym(library_handle, "TfTpu_Initialize"));
210 (*initialize_fn)(/*init_library=*/true, args.second.size(),
211 args.second.data());
212
213 RegisterTpuPlatform();
214 }
215
216 return s;
217 }
218
219 namespace {
// Factory returning an owning pointer to a fresh retrying GCS file system,
// type-erased to void*; its address is published via shared memory by
// InitializeCreateGcsFileSystemFnPtr below.
void* CreateGcsFilesystemFn() {
  return new tensorflow::RetryingGcsFileSystem();
}
223
224 // This is a temporary fix for including GCS file system on TPU builds.
225 // Will be removed once b/176954917 is fully resolved with the build fix.
InitializeCreateGcsFileSystemFnPtr()226 void InitializeCreateGcsFileSystemFnPtr() {
227 int fd = shm_open(absl::StrCat("/tmp_tf_gcs_fs_pointer_", getpid()).data(),
228 O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
229 if (fd == -1) {
230 LOG(ERROR) << "Unable to open shared memory for GCS file system creator.";
231 return;
232 }
233
234 if (ftruncate(fd, sizeof(tensorflow::FileSystem*)) == -1) {
235 LOG(ERROR)
236 << "Unable to allocate shared memory for GCS file system creator.";
237 return;
238 }
239
240 void* (**fn)() = reinterpret_cast<void* (**)()>(mmap(
241 NULL, sizeof(void* (*)()), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
242 if (fn == MAP_FAILED) {
243 LOG(ERROR) << "Cannot mmap shared memory for GCS file system creator.";
244 return;
245 }
246
247 *fn = &CreateGcsFilesystemFn;
248
249 munmap(fn, sizeof(void* (*)()));
250 close(fd);
251
252 // Clean up shared memory on a clean exit.
253 atexit([]() {
254 shm_unlink(absl::StrCat("/tmp_tf_gcs_fs_pointer_", getpid()).data());
255 });
256 }
257 } // namespace
FindAndLoadTpuLibrary()258 Status FindAndLoadTpuLibrary() {
259 const char* env_value = getenv("TPU_LIBRARY_PATH");
260 const char* libtpu_path =
261 env_value && strlen(env_value) > 0 ? env_value : "libtpu.so";
262 LOG(INFO) << "Libtpu path is: " << libtpu_path;
263 void* library = dlopen(libtpu_path, RTLD_NOW);
264 if (library) {
265 // We can open the shared library which means we are in a TPU environment.
266 // Try to acquire exclusive access.
267 TF_RETURN_IF_ERROR(TryAcquireTpuLock());
268 TF_RETURN_IF_ERROR(InitializeTpuLibrary(library));
269 }
270
271 InitializeCreateGcsFileSystemFnPtr();
272 return Status::OK();
273 }
274
275 #elif defined(LIBTPU_STATIC)
276
277 #include "tensorflow/core/tpu/tpu_library_init_fns.inc"
278
InitializeTpuLibrary()279 Status InitializeTpuLibrary() {
280 // Retrieve arguments from environment if applicable
281 std::pair<std::vector<std::string>, std::vector<const char*>> args =
282 GetLibTpuInitArguments();
283
284 TfTpu_Initialize(/*init_library*/ true, args.second.size(),
285 args.second.data());
286
287 RegisterTpuPlatform();
288 return Status::OK();
289 }
290
FindAndLoadTpuLibrary()291 Status FindAndLoadTpuLibrary() {
292 // We can open the shared library which means we are in a TPU environment.
293 // Try to acquire exclusive access.
294 TF_RETURN_IF_ERROR(TryAcquireTpuLock());
295 TF_RETURN_IF_ERROR(InitializeTpuLibrary());
296 return Status::OK();
297 }
298
299 #else // PLATFORM_GOOGLE
// In PLATFORM_GOOGLE builds without LIBTPU_STATIC, loading a TPU library
// from a dlopen handle is unsupported; the TPU library must be linked in
// statically instead.
Status InitializeTpuLibrary(void* library_handle) {
  return errors::Unimplemented("You must statically link in a TPU library.");
}
303 #endif // PLATFORM_GOOGLE
304 std::pair<std::vector<std::string>, std::vector<const char*>>
GetLibTpuInitArguments()305 GetLibTpuInitArguments() {
306 // We make copies of the arguments returned by getenv because the memory
307 // returned may be altered or invalidated by further calls to getenv.
308 std::vector<std::string> args;
309 std::vector<const char*> arg_ptrs;
310
311 // Retrieve arguments from environment if applicable.
312 char* env = getenv("LIBTPU_INIT_ARGS");
313 if (env != nullptr) {
314 // TODO(frankchn): Handles quotes properly if necessary.
315 args = absl::StrSplit(env, ' ');
316 }
317
318 arg_ptrs.reserve(args.size());
319 for (int i = 0; i < args.size(); ++i) {
320 arg_ptrs.push_back(args[i].data());
321 }
322
323 return {std::move(args), std::move(arg_ptrs)};
324 }
325
326 } // namespace tpu
327 } // namespace tensorflow
328