xref: /aosp_15_r20/external/tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/tpu/tpu_initializer_helper.h"
17 
#include <dirent.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cctype>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
31 #include "absl/strings/str_cat.h"
32 #include "absl/strings/str_split.h"
33 #include "absl/synchronization/mutex.h"
34 #include "tensorflow/core/platform/errors.h"
35 #include "tensorflow/core/platform/logging.h"
36 #include "tensorflow/core/tpu/libtftpu.h"
37 #include "tensorflow/core/tpu/tpu_api_dlsym_set_fn.h"
38 #include "tensorflow/core/tpu/tpu_ops_c_api.h"
39 #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h"
40 
41 #if !defined(PLATFORM_GOOGLE)
42 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
43 #include "tensorflow/core/platform/env.h"
44 #include "tensorflow/core/tpu/tpu_api.h"
45 #include "tensorflow/stream_executor/tpu/tpu_platform.h"
46 #elif defined(LIBTPU_STATIC)
47 #include "tensorflow/core/tpu/tpu_api.h"
48 #include "tensorflow/stream_executor/tpu/tpu_platform.h"
49 #endif  // PLATFORM_GOOGLE
50 
51 namespace tensorflow {
52 namespace tpu {
53 namespace {
54 
// Returns the value of environment variable `name`, or the empty string when
// it is unset. Constructing a std::string from nullptr is undefined behavior,
// hence the explicit unset check.
static std::string GetEnvVar(const char* name) {
  const char* value = getenv(name);
  return value != nullptr ? std::string(value) : std::string();
}
62 
GetEnvBool(const char * name,bool defval)63 bool GetEnvBool(const char* name, bool defval) {
64   const char* env = getenv(name);
65   if (env == nullptr) {
66     return defval;
67   }
68   if (std::strcmp(env, "true") == 0) {
69     return true;
70   }
71   if (std::strcmp(env, "false") == 0) {
72     return false;
73   }
74   int int_env;
75   bool has_int = absl::SimpleAtoi(env, &int_env);
76   return has_int && int_env != 0;
77 }
78 
79 }  // namespace
80 
81 // This function gets pid of a process and checks if that process is using tpu.
82 // It is not able to check processes that are owned by another user.
IsTpuUsed(int64_t pid)83 bool IsTpuUsed(int64_t pid) {
84   std::string path = absl::StrCat("/proc/", pid, "/fd");
85   DIR* raw_fd_dir = opendir(path.c_str());
86   if (!raw_fd_dir) {
87     return false;
88   }
89   std::unique_ptr<DIR, int (*)(DIR*)> fd_dir(raw_fd_dir, closedir);
90   struct dirent* ent;
91   std::string line;
92   std::string tpu_dev_path = "/dev/accel0";
93   line.resize(tpu_dev_path.size());
94   while ((ent = readdir(raw_fd_dir))) {
95     if (!isdigit(*ent->d_name)) continue;
96     int64_t fd = strtol(ent->d_name, nullptr, 10);
97     path = absl::StrCat("/proc/", pid, "/fd/", fd);
98     if (!readlink(path.c_str(), &line[0], line.size())) continue;
99     if (line != tpu_dev_path) continue;
100     return true;
101   }
102   return false;
103 }
104 
105 // This function iterates through all the processes in /proc and finds out if
106 // any process it was able to check is using the TPU. It does not have
107 // permission to processes owned by another user.
108 // TODO (shahrokhi) use tensorflow/core/platform/filesystem (GetChildren) for
109 // this.
FindLibtpuProcess()110 StatusOr<int64_t> FindLibtpuProcess() {
111   DIR* proc = opendir("/proc");
112 
113   if (proc == nullptr) {
114     return errors::Unavailable("was not able to open /proc");
115   }
116   std::unique_ptr<DIR, int (*)(DIR*)> proc_dir(proc, closedir);
117   struct dirent* ent;
118   int64_t pid;
119   while ((ent = readdir(proc))) {
120     if (!isdigit(*ent->d_name)) continue;
121 
122     pid = strtol(ent->d_name, nullptr, 10);
123     if (IsTpuUsed(pid)) {
124       return pid;
125     }
126   }
127   return errors::NotFound("did not find which pid uses the libtpu.so");
128 }
129 
TryAcquireTpuLock()130 Status TryAcquireTpuLock() {
131   static absl::Mutex* mu = new absl::Mutex();
132   absl::MutexLock l(mu);
133 
134   // TODO(skyewm): use `absl::StrCat(getenv(name))` once we build with the
135   // fix for https://github.com/abseil/abseil-cpp/issues/1167.
136   std::string load_library_override;
137   const char* env_value = getenv("TPU_LOAD_LIBRARY");
138   if (env_value != nullptr) {
139     load_library_override = std::string(env_value);
140   }
141 
142   if (load_library_override == "1") {
143     return OkStatus();
144   } else if (load_library_override == "0") {
145     return errors::FailedPrecondition("TPU_LOAD_LIBRARY=0, not loading libtpu");
146   }
147 
148   // If TPU_CHIPS_PER_PROCESS_BOUNDS doesn't include all chips, we assume
149   // we're using different chips in different processes and thus multiple
150   // libtpu loads are ok.
151   // TODO(skyewm): we could make per-chip lock files and look at
152   // TPU_VISIBLE_DEVICES if we wanted to make this really precise.
153   std::string chips_per_process_bounds =
154       GetEnvVar("TPU_CHIPS_PER_PROCESS_BOUNDS");
155   bool allow_multiple_libtpu_load =
156       GetEnvBool("ALLOW_MULTIPLE_LIBTPU_LOAD", false);
157   // TODO(skyewm): remove this when TPU_CHIPS_PER_HOST_BOUNDS is fully
158   // deprecated
159   if (chips_per_process_bounds.empty()) {
160     chips_per_process_bounds = GetEnvVar("TPU_CHIPS_PER_HOST_BOUNDS");
161   }
162   if ((chips_per_process_bounds.empty() ||
163        chips_per_process_bounds == "2,2,1") &&
164       !allow_multiple_libtpu_load) {
165     int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);
166 
167     // This lock is held until the process exits intentionally. The underlying
168     // TPU device will be held on until it quits.
169     if (lockf(fd, F_TLOCK, 0) != 0) {
170       auto pid = FindLibtpuProcess();
171       if (pid.ok()) {
172         return errors::Aborted(absl::StrCat(
173             "libtpu.so is already in use by process with pid ",
174             pid.ValueOrDie(),
175             ". Not attempting to load libtpu.so in this process."));
176       } else {
177         return errors::Aborted(
178             "libtpu.so already in use by another process probably owned by "
179             "another user. Run \"$ sudo lsof -w /dev/accel0\" to figure out "
180             "which process is using the TPU. Not attempting to load "
181             "libtpu.so in this process.");
182       }
183     } else {
184       return OkStatus();
185     }
186   } else {
187     VLOG(1) << "TPU_CHIPS_PER_PROCESS_BOUNDS is not empty or "
188                "ALLOW_MULTIPLE_LIBTPU_LOAD is set to True, "
189                "therefore allowing multiple libtpu.so loads.";
190     return OkStatus();
191   }
192 }
193 #if !defined(PLATFORM_GOOGLE)
194 #include "tensorflow/core/tpu/tpu_library_init_fns.inc"
195 
InitializeTpuLibrary(void * library_handle)196 Status InitializeTpuLibrary(void* library_handle) {
197   Status s = InitializeTpuStructFns(library_handle);
198 
199   // Retrieve arguments from environment if applicable
200   std::pair<std::vector<std::string>, std::vector<const char*>> args =
201       GetLibTpuInitArguments();
202 
203   // TPU platform registration must only be performed after the library is
204   // loaded. We do not want to register a TPU platform in XLA without the
205   // supporting library providing the necessary APIs.
206   if (s.ok()) {
207     void (*initialize_fn)(bool init_library, int num_args, const char** args);
208     initialize_fn = reinterpret_cast<decltype(initialize_fn)>(
209         dlsym(library_handle, "TfTpu_Initialize"));
210     (*initialize_fn)(/*init_library=*/true, args.second.size(),
211                      args.second.data());
212 
213     RegisterTpuPlatform();
214   }
215 
216   return s;
217 }
218 
219 namespace {
CreateGcsFilesystemFn()220 void* CreateGcsFilesystemFn() {
221   return new tensorflow::RetryingGcsFileSystem();
222 }
223 
224 // This is a temporary fix for including GCS file system on TPU builds.
225 // Will be removed once b/176954917 is fully resolved with the build fix.
InitializeCreateGcsFileSystemFnPtr()226 void InitializeCreateGcsFileSystemFnPtr() {
227   int fd = shm_open(absl::StrCat("/tmp_tf_gcs_fs_pointer_", getpid()).data(),
228                     O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
229   if (fd == -1) {
230     LOG(ERROR) << "Unable to open shared memory for GCS file system creator.";
231     return;
232   }
233 
234   if (ftruncate(fd, sizeof(tensorflow::FileSystem*)) == -1) {
235     LOG(ERROR)
236         << "Unable to allocate shared memory for GCS file system creator.";
237     return;
238   }
239 
240   void* (**fn)() = reinterpret_cast<void* (**)()>(mmap(
241       NULL, sizeof(void* (*)()), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
242   if (fn == MAP_FAILED) {
243     LOG(ERROR) << "Cannot mmap shared memory for GCS file system creator.";
244     return;
245   }
246 
247   *fn = &CreateGcsFilesystemFn;
248 
249   munmap(fn, sizeof(void* (*)()));
250   close(fd);
251 
252   // Clean up shared memory on a clean exit.
253   atexit([]() {
254     shm_unlink(absl::StrCat("/tmp_tf_gcs_fs_pointer_", getpid()).data());
255   });
256 }
257 }  // namespace
FindAndLoadTpuLibrary()258 Status FindAndLoadTpuLibrary() {
259   const char* env_value = getenv("TPU_LIBRARY_PATH");
260   const char* libtpu_path =
261       env_value && strlen(env_value) > 0 ? env_value : "libtpu.so";
262   LOG(INFO) << "Libtpu path is: " << libtpu_path;
263   void* library = dlopen(libtpu_path, RTLD_NOW);
264   if (library) {
265     // We can open the shared library which means we are in a TPU environment.
266     // Try to acquire exclusive access.
267     TF_RETURN_IF_ERROR(TryAcquireTpuLock());
268     TF_RETURN_IF_ERROR(InitializeTpuLibrary(library));
269   }
270 
271   InitializeCreateGcsFileSystemFnPtr();
272   return Status::OK();
273 }
274 
275 #elif defined(LIBTPU_STATIC)
276 
277 #include "tensorflow/core/tpu/tpu_library_init_fns.inc"
278 
InitializeTpuLibrary()279 Status InitializeTpuLibrary() {
280   // Retrieve arguments from environment if applicable
281   std::pair<std::vector<std::string>, std::vector<const char*>> args =
282       GetLibTpuInitArguments();
283 
284   TfTpu_Initialize(/*init_library*/ true, args.second.size(),
285                    args.second.data());
286 
287   RegisterTpuPlatform();
288   return Status::OK();
289 }
290 
FindAndLoadTpuLibrary()291 Status FindAndLoadTpuLibrary() {
292   // We can open the shared library which means we are in a TPU environment.
293   // Try to acquire exclusive access.
294   TF_RETURN_IF_ERROR(TryAcquireTpuLock());
295   TF_RETURN_IF_ERROR(InitializeTpuLibrary());
296   return Status::OK();
297 }
298 
299 #else   // PLATFORM_GOOGLE
// Fallback for builds where PLATFORM_GOOGLE is defined and LIBTPU_STATIC is
// not (see the surrounding #if chain): dynamic loading is unsupported there.
Status InitializeTpuLibrary(void* library_handle) {
  return errors::Unimplemented("You must statically link in a TPU library.");
}
303 #endif  // PLATFORM_GOOGLE
304 std::pair<std::vector<std::string>, std::vector<const char*>>
GetLibTpuInitArguments()305 GetLibTpuInitArguments() {
306   // We make copies of the arguments returned by getenv because the memory
307   // returned may be altered or invalidated by further calls to getenv.
308   std::vector<std::string> args;
309   std::vector<const char*> arg_ptrs;
310 
311   // Retrieve arguments from environment if applicable.
312   char* env = getenv("LIBTPU_INIT_ARGS");
313   if (env != nullptr) {
314     // TODO(frankchn): Handles quotes properly if necessary.
315     args = absl::StrSplit(env, ' ');
316   }
317 
318   arg_ptrs.reserve(args.size());
319   for (int i = 0; i < args.size(); ++i) {
320     arg_ptrs.push_back(args[i].data());
321   }
322 
323   return {std::move(args), std::move(arg_ptrs)};
324 }
325 
326 }  // namespace tpu
327 }  // namespace tensorflow
328