xref: /aosp_15_r20/external/federated-compute/fcp/client/cache/file_backed_resource_cache.cc (revision 14675a029014e728ec732f129a32e299b2da0601)
1 /*
2  * Copyright 2022 Google LLC
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "fcp/client/cache/file_backed_resource_cache.h"
18 
19 #include <algorithm>
20 #include <cstdint>
21 #include <filesystem>
22 #include <fstream>
23 #include <functional>
24 #include <map>
25 #include <memory>
26 #include <optional>
27 #include <set>
28 #include <string>
29 #include <system_error>  // NOLINT
30 #include <utility>
31 #include <vector>
32 
33 #include "google/protobuf/any.pb.h"
34 #include "google/protobuf/timestamp.pb.h"
35 #include "absl/cleanup/cleanup.h"
36 #include "absl/status/status.h"
37 #include "absl/status/statusor.h"
38 #include "absl/strings/str_cat.h"
39 #include "absl/strings/string_view.h"
40 #include "absl/time/time.h"
41 #include "fcp/base/monitoring.h"
42 #include "fcp/base/platform.h"
43 #include "fcp/base/time_util.h"
44 #include "fcp/client/cache/cache_manifest.pb.h"
45 #include "fcp/client/diag_codes.pb.h"
46 #include "protostore/file-storage.h"
47 #include "protostore/proto-data-store.h"
48 
49 namespace fcp {
50 namespace client {
51 namespace cache {
52 
// Directory created beneath both the caller-provided base dir (for the
// manifest) and the caller-provided cache dir (for cached files).
constexpr char kParentDir[] = "fcp";
// Cached files will be saved in <cache directory>/fcp/cache.
constexpr char kCacheDir[] = "cache";
// The manifest is stored at <base directory>/fcp/cache_manifest.pb.
constexpr char kCacheManifestFileName[] = "cache_manifest.pb";
57 
ReadInternal()58 absl::StatusOr<CacheManifest> FileBackedResourceCache::ReadInternal() {
59   absl::StatusOr<const CacheManifest*> data = pds_->Read();
60   if (data.ok()) {
61     return *data.value();
62   }
63   log_manager_.LogDiag(ProdDiagCode::RESOURCE_CACHE_MANIFEST_READ_FAILED);
64   // Ignore the status from DeleteManifest() even if it's an error, and bubble
65   // up the status from pds. We call DeleteManifest() here instead of
66   // Initialize(), as Initialize() calls ReadInternal(), potentially causing
67   // infinite recursion. This means that any resources that were tracked by the
68   // deleted manifest will not be cleaned up until the next time Initialize() is
69   // called.
70   auto ignored_status = DeleteManifest();
71   if (!ignored_status.ok()) {
72     FCP_LOG(INFO) << "Failed to delete manifest: " << ignored_status.ToString();
73   }
74   return absl::InternalError(
75       absl::StrCat("Failed to read from database, with error message: ",
76                    data.status().message()));
77 }
78 
WriteInternal(std::unique_ptr<CacheManifest> manifest)79 absl::Status FileBackedResourceCache::WriteInternal(
80     std::unique_ptr<CacheManifest> manifest) {
81   absl::Status status = pds_->Write(std::move(manifest));
82   if (!status.ok()) {
83     log_manager_.LogDiag(ProdDiagCode::RESOURCE_CACHE_MANIFEST_WRITE_FAILED);
84     // Ignore the status returned by DeleteManifest even if it's an error and
85     // instead return the status from pds. We call DeleteManifest() here instead
86     // of Initialize(), as Initialize() calls WriteInternal(), potentially
87     // causing infinite recursion. This means that any resources that were
88     // tracked by the deleted manifest will not be cleaned up until the next
89     // time Initialize() is called.
90     auto ignored_status = DeleteManifest();
91     if (!ignored_status.ok()) {
92       FCP_LOG(INFO) << "Failed to delete manifest: "
93                     << ignored_status.ToString();
94     }
95   }
96   return status;
97 }
98 
99 absl::StatusOr<std::unique_ptr<FileBackedResourceCache>>
Create(absl::string_view base_dir,absl::string_view cache_dir,LogManager * log_manager,fcp::Clock * clock,int64_t max_cache_size_bytes)100 FileBackedResourceCache::Create(absl::string_view base_dir,
101                                 absl::string_view cache_dir,
102                                 LogManager* log_manager, fcp::Clock* clock,
103                                 int64_t max_cache_size_bytes) {
104   // Create <cache root>/fcp.
105   // Unfortunately NDK's flavor of std::filesystem::path does not support using
106   // absl::string_view.
107   std::filesystem::path cache_root_path((std::string(cache_dir)));
108   if (!cache_root_path.is_absolute()) {
109     log_manager->LogDiag(
110         ProdDiagCode::RESOURCE_CACHE_CACHE_ROOT_PATH_NOT_ABSOLUTE);
111     return absl::InvalidArgumentError(
112         absl::StrCat("The provided path: ", cache_dir,
113                      " is invalid. The path must be absolute"));
114   }
115   std::filesystem::path cache_dir_path =
116       cache_root_path / kParentDir / kCacheDir;
117   std::error_code error;
118   std::filesystem::create_directories(cache_dir_path, error);
119   if (error.value() != 0) {
120     log_manager->LogDiag(
121         ProdDiagCode::RESOURCE_CACHE_FAILED_TO_CREATE_CACHE_DIR);
122     return absl::InternalError(absl::StrCat(
123         "Failed to create FileBackedResourceCache cache directory ",
124         cache_dir_path.string()));
125   }
126   // Create <files root>/fcp/cache_manifest.pb.s
127   std::filesystem::path manifest_path((std::string(base_dir)));
128   if (!manifest_path.is_absolute()) {
129     log_manager->LogDiag(ProdDiagCode::RESOURCE_CACHE_INVALID_MANIFEST_PATH);
130     return absl::InvalidArgumentError(
131         absl::StrCat("The provided path: ", manifest_path.string(),
132                      " is invalid. The path must start with \"/\""));
133   }
134   manifest_path /= kParentDir;
135   std::filesystem::create_directories(manifest_path, error);
136   if (error.value() != 0) {
137     log_manager->LogDiag(RESOURCE_CACHE_FAILED_TO_CREATE_MANIFEST_DIR);
138     return absl::InternalError(
139         absl::StrCat("Failed to create directory ", manifest_path.string()));
140   }
141   manifest_path /= kCacheManifestFileName;
142 
143   auto file_storage = std::make_unique<protostore::FileStorage>();
144   auto pds = std::make_unique<protostore::ProtoDataStore<CacheManifest>>(
145       *file_storage, manifest_path.string());
146   std::unique_ptr<FileBackedResourceCache> resource_cache =
147       absl::WrapUnique(new FileBackedResourceCache(
148           std::move(pds), std::move(file_storage), cache_dir_path,
149           manifest_path, log_manager, clock, max_cache_size_bytes));
150   {
151     absl::MutexLock lock(&resource_cache->mutex_);
152     FCP_RETURN_IF_ERROR(resource_cache->Initialize());
153   }
154 
155   return resource_cache;
156 }
157 
Put(absl::string_view cache_id,const absl::Cord & resource,const google::protobuf::Any & metadata,absl::Duration max_age)158 absl::Status FileBackedResourceCache::Put(absl::string_view cache_id,
159                                           const absl::Cord& resource,
160                                           const google::protobuf::Any& metadata,
161                                           absl::Duration max_age) {
162   absl::MutexLock lock(&mutex_);
163 
164   if (resource.size() > max_cache_size_bytes_ / 2) {
165     return absl::ResourceExhaustedError(absl::StrCat(cache_id, " too large"));
166   }
167 
168   FCP_ASSIGN_OR_RETURN(CacheManifest manifest, ReadInternal());
169   FCP_RETURN_IF_ERROR(CleanUp(resource.size(), manifest));
170 
171   std::string cache_id_str(cache_id);
172   std::filesystem::path cached_file_path = cache_dir_path_ / cache_id_str;
173   absl::Time now = clock_.Now();
174   absl::Time expiry = now + max_age;
175   CachedResource cached_resource;
176   cached_resource.set_file_name(cache_id_str);
177   *cached_resource.mutable_metadata() = metadata;
178   *cached_resource.mutable_expiry_time() =
179       TimeUtil::ConvertAbslToProtoTimestamp(expiry);
180   *cached_resource.mutable_last_accessed_time() =
181       TimeUtil::ConvertAbslToProtoTimestamp(now);
182 
183   // Write the manifest back to disk before we write the file.
184   manifest.mutable_cache()->insert({cache_id_str, cached_resource});
185   FCP_RETURN_IF_ERROR(
186       WriteInternal(std::make_unique<CacheManifest>(std::move(manifest))));
187 
188   // Write file if it doesn't exist.
189   std::error_code exists_error;
190   bool cached_file_exists =
191       std::filesystem::exists(cached_file_path, exists_error);
192   if (exists_error.value() != 0) {
193     log_manager_.LogDiag(
194         ProdDiagCode::RESOURCE_CACHE_PUT_FAILED_TO_CHECK_IF_FILE_EXISTS);
195     return absl::InternalError(absl::StrCat(
196         "Failed to check if cached resource already exists with error code: ",
197         exists_error.value()));
198   }
199   if (!cached_file_exists) {
200     auto status = WriteCordToFile(cached_file_path.string(), resource);
201     if (!status.ok()) {
202       log_manager_.LogDiag(ProdDiagCode::RESOURCE_CACHE_RESOURCE_WRITE_FAILED);
203       return status;
204     }
205   }
206 
207   return absl::OkStatus();
208 }
209 
210 absl::StatusOr<FileBackedResourceCache::ResourceAndMetadata>
Get(absl::string_view cache_id,std::optional<absl::Duration> max_age)211 FileBackedResourceCache::Get(absl::string_view cache_id,
212                              std::optional<absl::Duration> max_age) {
213   // By default, set up a "CACHE_MISS" diag code to be logged when this method
214   // exits.
215   DebugDiagCode diag_code = DebugDiagCode::RESOURCE_CACHE_MISS;
216   absl::Cleanup diag_code_logger = [this, &diag_code] {
217     log_manager_.LogDiag(diag_code);
218   };
219   absl::MutexLock lock(&mutex_);
220   FCP_ASSIGN_OR_RETURN(CacheManifest manifest, ReadInternal());
221 
222   std::string cache_id_str(cache_id);
223   if (!manifest.cache().contains(cache_id_str)) {
224     return absl::NotFoundError(absl::StrCat(cache_id, " not found"));
225   }
226   CachedResource cached_resource = manifest.cache().at(cache_id_str);
227   std::filesystem::path cached_file_path = cache_dir_path_ / cache_id_str;
228   google::protobuf::Any metadata = cached_resource.metadata();
229   absl::Time now = clock_.Now();
230   *cached_resource.mutable_last_accessed_time() =
231       TimeUtil::ConvertAbslToProtoTimestamp(now);
232   if (max_age.has_value()) {
233     absl::Time expiry = now + max_age.value();
234     *cached_resource.mutable_expiry_time() =
235         TimeUtil::ConvertAbslToProtoTimestamp(expiry);
236   }
237 
238   absl::StatusOr<absl::Cord> contents =
239       ReadFileToCord(cached_file_path.string());
240   if (!contents.ok()) {
241     log_manager_.LogDiag(ProdDiagCode::RESOURCE_CACHE_RESOURCE_READ_FAILED);
242     manifest.mutable_cache()->erase(cache_id_str);
243     std::error_code error;
244     std::filesystem::remove(cached_file_path, error);
245     if (error.value() != 0) {
246       return absl::InternalError(error.message());
247     }
248     // Treat as not found, the resource should be fetched again.
249     return absl::NotFoundError(absl::StrCat(cache_id, " not found"));
250   }
251 
252   manifest.mutable_cache()->erase(cache_id_str);
253   manifest.mutable_cache()->insert({cache_id_str, cached_resource});
254 
255   absl::Status status =
256       WriteInternal(std::make_unique<CacheManifest>(std::move(manifest)));
257   if (!status.ok()) return status;
258 
259   // We've reached the end, this is a hit! The absl::Cleanup above has a
260   // reference to diag_code, so we update it to CACHE_HIT here.
261   diag_code = DebugDiagCode::RESOURCE_CACHE_HIT;
262   return FileBackedResourceCache::ResourceAndMetadata{*contents, metadata};
263 }
264 
Initialize()265 absl::Status FileBackedResourceCache::Initialize() {
266   absl::string_view errorInInitializePrefix = "Error in initialize: ";
267   std::string pds_path = manifest_path_.string();
268   if (!std::filesystem::exists(pds_path)) {
269     std::ofstream ofs(pds_path);
270   }
271   absl::StatusOr<int64_t> file_size = storage_->GetFileSize(pds_path);
272   if (!file_size.ok()) {
273     log_manager_.LogDiag(
274         ProdDiagCode::RESOURCE_CACHE_INIT_FAILED_TO_GET_MANIFEST_SIZE);
275     return absl::InternalError(absl::StrCat(
276         errorInInitializePrefix, "Failed to get file size of cache manifest: ",
277         file_size.status().message()));
278   }
279   // Initialize db if it's not initialized.
280   if (*file_size == 0) {
281     auto status = WriteInternal(std::make_unique<CacheManifest>());
282     if (!status.ok()) {
283       log_manager_.LogDiag(
284           ProdDiagCode::RESOURCE_CACHE_INIT_FAILED_TO_INITIALIZE_MANIFEST);
285       return absl::InternalError(absl::StrCat(
286           errorInInitializePrefix,
287           "Failed to initialize cache manifest for the first time: ",
288           status.message()));
289     }
290   }
291   // Then run CleanUp. Even if our manifest was empty we still might have
292   // stranded cache files to delete, i.e. in the case that the manifest was
293   // deleted but the cache dir was not deleted.
294   absl::StatusOr<CacheManifest> manifest = ReadInternal();
295   if (!manifest.ok()) {
296     return absl::InternalError(
297         absl::StrCat(errorInInitializePrefix,
298                      "Failed to read manifest: ", manifest.status().message()));
299   }
300   auto cleanup_status = CleanUp(std::nullopt, *manifest);
301   if (!cleanup_status.ok()) {
302     log_manager_.LogDiag(ProdDiagCode::RESOURCE_CACHE_INIT_FAILED_CLEANUP);
303     return absl::InternalError(absl::StrCat(
304         errorInInitializePrefix,
305         "Failed to clean up resource cache: ", cleanup_status.message()));
306   }
307   auto write_status = WriteInternal(std::make_unique<CacheManifest>(*manifest));
308   if (!write_status.ok()) {
309     return absl::InternalError(absl::StrCat(
310         errorInInitializePrefix,
311         "Failed to write cleaned up resource cache: ", write_status.message()));
312   }
313   return absl::OkStatus();
314 }
315 
CleanUp(std::optional<int64_t> reserved_space_bytes,CacheManifest & manifest)316 absl::Status FileBackedResourceCache::CleanUp(
317     std::optional<int64_t> reserved_space_bytes, CacheManifest& manifest) {
318   // Expire any cached resources past their expiry.
319   // Clean up any files that are not tracked in the manifest.
320   // Clean up any manifest entries that point to nonexistent files.
321 
322   // In order to delete files we don't track in the CacheManifest (or that
323   // became untracked due to a crash), fill cache_dir_files with every file in
324   // the cache dir. We'll then remove any file not actively tracked in the cache
325   // manifest.
326   std::set<std::filesystem::path> cache_dir_files;
327 
328   // We don't have any subdirectories in the cache, so we can use a directory
329   // iterator.
330   std::error_code directory_error;
331   auto cache_dir_iterator =
332       std::filesystem::directory_iterator(cache_dir_path_, directory_error);
333   if (directory_error.value() != 0) {
334     log_manager_.LogDiag(
335         ProdDiagCode::RESOURCE_CACHE_CLEANUP_FAILED_TO_ITERATE_OVER_CACHE_DIR);
336     return absl::InternalError(absl::StrCat(
337         "Error iterating over cache dir. Error code: ", directory_error.value(),
338         " message: ", directory_error.message()));
339   }
340   for (auto& file : cache_dir_iterator) {
341     cache_dir_files.insert(cache_dir_path_ / file);
342   }
343 
344   int64_t max_allowed_size_bytes = max_cache_size_bytes_;
345   max_allowed_size_bytes -= reserved_space_bytes.value_or(0);
346 
347   std::set<std::string> cache_ids_to_delete;
348   absl::Time now = clock_.Now();
349   for (const auto& [id, resource] : manifest.cache()) {
350     absl::Time expiry =
351         TimeUtil::ConvertProtoToAbslTime(resource.expiry_time());
352     std::filesystem::path resource_file =
353         cache_dir_path_ / resource.file_name();
354     // It's possible that this manifest entry points at a file in the cache dir
355     // that doesn't exist, i.e. due to a failed write. In this case, the entry
356     // should be deleted as well. cache_dir_files should contain a scan of the
357     // entire cache dir, so the file pointed at by this manifest entry should be
358     // there.
359     bool cached_resource_exists =
360         cache_dir_files.find(resource_file) != cache_dir_files.end();
361     if (expiry < now || !cached_resource_exists) {
362       cache_ids_to_delete.insert(id);
363     } else {
364       cache_dir_files.erase(resource_file);
365     }
366   }
367 
368   // Then delete CacheManifest entries.
369   for (const auto& cache_id : cache_ids_to_delete) {
370     manifest.mutable_cache()->erase(cache_id);
371   }
372 
373   // Then delete files.
374   absl::Status filesystem_status = absl::OkStatus();
375   for (const auto& file : cache_dir_files) {
376     std::error_code remove_error;
377     std::filesystem::remove(file, remove_error);
378     // We intentionally loop through all files and attempt to remove as many as
379     // we can, then return the first error we saw.
380     if (remove_error.value() != 0 && filesystem_status.ok()) {
381       log_manager_.LogDiag(
382           ProdDiagCode::RESOURCE_CACHE_CLEANUP_FAILED_TO_DELETE_CACHED_FILE);
383       filesystem_status = absl::InternalError(absl::StrCat(
384           "Failed to delete file. Error code: ", remove_error.value(),
385           ", message: ", remove_error.message()));
386     }
387   }
388 
389   FCP_RETURN_IF_ERROR(filesystem_status);
390 
391   // If we still exceed the allowed size of the cache, delete entries until
392   // we're under the allowed size, sorted by least recently used.
393 
394   // Build up a list of (cache_id, least recently used timestamp) and compute
395   // the total size of the cache.
396   std::vector<std::pair<std::string, absl::Time>> cache_id_lru;
397   cache_id_lru.reserve(manifest.cache().size());
398   uintmax_t cache_dir_size = 0;
399 
400   for (const auto& [id, resource] : manifest.cache()) {
401     cache_id_lru.emplace_back(std::make_pair(
402         id, TimeUtil::ConvertProtoToAbslTime(resource.last_accessed_time())));
403     std::filesystem::path resource_file =
404         cache_dir_path_ / resource.file_name();
405     // We calculate the sum of tracked files instead of taking the file_size()
406     // of the cache directory, because the latter generally does not reflect the
407     // total size of the sum of all the files inside a directory.
408     std::error_code ignored_exists_error;
409     if (!std::filesystem::exists(resource_file, ignored_exists_error)) {
410       // We log that the manifest entry pointed at a file in the cache that
411       // doesn't exist, but otherwise continue. The next time the cache is
412       // initialized, the manifest entry will be cleaned up.
413       log_manager_.LogDiag(
414           ProdDiagCode::RESOURCE_CACHE_CLEANUP_FAILED_TO_GET_FILE_SIZE);
415       continue;
416     }
417     std::error_code file_size_error;
418     std::uintmax_t size =
419         std::filesystem::file_size(resource_file, file_size_error);
420     // Loop through as many as we can and if there's an error, return the first
421     // error we saw.
422     if (file_size_error.value() != 0) {
423       log_manager_.LogDiag(
424           ProdDiagCode::RESOURCE_CACHE_CLEANUP_FAILED_TO_GET_FILE_SIZE);
425       if (filesystem_status.ok()) {
426         filesystem_status = absl::InternalError(absl::StrCat(
427             "Error getting file size. Error code: ", file_size_error.value(),
428             ", message: ", file_size_error.message()));
429       }
430       // If the file exists, but we failed to get the file size for some reason,
431       // try to delete it then continue.
432       std::error_code ignored_remove_error;
433       std::filesystem::remove(resource_file, ignored_remove_error);
434     } else {
435       cache_dir_size += size;
436     }
437   }
438 
439   FCP_RETURN_IF_ERROR(filesystem_status);
440 
441   // Then, if the cache is bigger than the allowed size, delete entries ordered
442   // by least recently used until we're below the threshold.
443   if (cache_dir_size > max_allowed_size_bytes) {
444     std::sort(cache_id_lru.begin(), cache_id_lru.end(),
445               [](std::pair<std::string, absl::Time> first,
446                  std::pair<std::string, absl::Time> second) -> bool {
447                 // Sort by least recently used timestamp.
448                 return first.second < second.second;
449               });
450     for (auto const& [cache_id, timestamp] : cache_id_lru) {
451       std::string id_to_remove = cache_id;
452       std::filesystem::path file_to_remove =
453           cache_dir_path_ / manifest.cache().at(id_to_remove).file_name();
454       manifest.mutable_cache()->erase(id_to_remove);
455       std::error_code remove_error;
456       uintmax_t file_size =
457           std::filesystem::file_size(file_to_remove, remove_error);
458       if (remove_error.value() != 0 && filesystem_status.ok()) {
459         log_manager_.LogDiag(
460             ProdDiagCode::RESOURCE_CACHE_CLEANUP_FAILED_TO_GET_FILE_SIZE);
461         filesystem_status = absl::InternalError(absl::StrCat(
462             "Error getting file size. Error code: ", remove_error.value(),
463             ", message: ", remove_error.message()));
464       }
465       std::filesystem::remove(file_to_remove, remove_error);
466       if (remove_error.value() != 0 && filesystem_status.ok()) {
467         log_manager_.LogDiag(
468             ProdDiagCode::RESOURCE_CACHE_CLEANUP_FAILED_TO_GET_FILE_SIZE);
469         filesystem_status = absl::InternalError(absl::StrCat(
470             "Failed to delete file. Error code: ", remove_error.value(),
471             ", message: ", remove_error.message()));
472       }
473       cache_dir_size -= file_size;
474       if (cache_dir_size < max_allowed_size_bytes) break;
475     }
476   }
477 
478   FCP_RETURN_IF_ERROR(filesystem_status);
479 
480   return absl::OkStatus();
481 }
482 
DeleteManifest()483 absl::Status FileBackedResourceCache::DeleteManifest() {
484   if (std::filesystem::exists(manifest_path_)) {
485     std::error_code error;
486     std::filesystem::remove(manifest_path_, error);
487     if (error.value() != 0) {
488       log_manager_.LogDiag(
489           ProdDiagCode::RESOURCE_CACHE_FAILED_TO_DELETE_MANIFEST);
490       return absl::InternalError(
491           absl::StrCat("Failed to delete manifest! error code: ", error.value(),
492                        ", message: ", error.message()));
493     }
494   }
495   return absl::OkStatus();
496 }
497 
498 }  // namespace cache
499 }  // namespace client
500 }  // namespace fcp
501