xref: /aosp_15_r20/system/update_engine/aosp/cleanup_previous_update_action.cc (revision 5a9231315b4521097b8dc3750bc806fcafe0c72f)
1 //
2 // Copyright (C) 2020 The Android Open Source Project
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "update_engine/aosp/cleanup_previous_update_action.h"
17 
18 #include <algorithm>
19 #include <chrono>
20 #include <functional>
21 #include <string>
22 #include <type_traits>
23 
24 #include <android-base/chrono_utils.h>
25 #include <android-base/properties.h>
26 #include <base/bind.h>
27 #include <libsnapshot/snapshot.h>
28 
29 #if !defined(__ANDROID_RECOVERY__) && !defined(UE_DISABLE_STATS)
30 #include <statslog_ue.h>
31 #endif
32 
33 #include "update_engine/common/utils.h"
34 #include "update_engine/payload_consumer/delta_performer.h"
35 
36 using android::base::GetBoolProperty;
37 using android::snapshot::ISnapshotManager;
38 using android::snapshot::SnapshotMergeStats;
39 using android::snapshot::UpdateState;
40 using brillo::MessageLoop;
41 
42 constexpr char kBootCompletedProp[] = "sys.boot_completed";
43 constexpr auto&& kMergeDelaySecondsProp = "ro.virtual_ab.merge_delay_seconds";
44 constexpr size_t kMaxMergeDelaySeconds = 600;
45 // Interval to check sys.boot_completed.
46 constexpr auto kCheckBootCompletedInterval = base::TimeDelta::FromSeconds(2);
47 // Interval to check IBootControl::isSlotMarkedSuccessful
48 constexpr auto kCheckSlotMarkedSuccessfulInterval =
49     base::TimeDelta::FromSeconds(2);
50 // Interval to call SnapshotManager::ProcessUpdateState
51 constexpr auto kWaitForMergeInterval = base::TimeDelta::FromSeconds(2);
52 
53 #ifdef __ANDROID_RECOVERY__
54 static constexpr bool kIsRecovery = true;
55 #else
56 static constexpr bool kIsRecovery = false;
57 #endif
58 
59 namespace chromeos_update_engine {
60 
CleanupPreviousUpdateAction(PrefsInterface * prefs,BootControlInterface * boot_control,android::snapshot::ISnapshotManager * snapshot,CleanupPreviousUpdateActionDelegateInterface * delegate)61 CleanupPreviousUpdateAction::CleanupPreviousUpdateAction(
62     PrefsInterface* prefs,
63     BootControlInterface* boot_control,
64     android::snapshot::ISnapshotManager* snapshot,
65     CleanupPreviousUpdateActionDelegateInterface* delegate)
66     : prefs_(prefs),
67       boot_control_(boot_control),
68       snapshot_(snapshot),
69       delegate_(delegate),
70       running_(false),
71       cancel_failed_(false),
72       last_percentage_(0),
73       merge_stats_(nullptr) {}
74 
~CleanupPreviousUpdateAction()75 CleanupPreviousUpdateAction::~CleanupPreviousUpdateAction() {
76   StopActionInternal();
77 }
78 
PerformAction()79 void CleanupPreviousUpdateAction::PerformAction() {
80   StartActionInternal();
81 }
82 
TerminateProcessing()83 void CleanupPreviousUpdateAction::TerminateProcessing() {
84   StopActionInternal();
85 }
86 
ResumeAction()87 void CleanupPreviousUpdateAction::ResumeAction() {
88   StartActionInternal();
89 }
90 
SuspendAction()91 void CleanupPreviousUpdateAction::SuspendAction() {
92   StopActionInternal();
93 }
94 
ActionCompleted(ErrorCode error_code)95 void CleanupPreviousUpdateAction::ActionCompleted(ErrorCode error_code) {
96   StopActionInternal();
97   ReportMergeStats();
98   metadata_device_ = nullptr;
99 }
100 
Type() const101 std::string CleanupPreviousUpdateAction::Type() const {
102   return StaticType();
103 }
104 
StaticType()105 std::string CleanupPreviousUpdateAction::StaticType() {
106   return "CleanupPreviousUpdateAction";
107 }
108 
109 // This function is called at the beginning of all delayed functions. By
110 // resetting |scheduled_task_|, the delayed function acknowledges that the task
111 // has already been executed, therefore there's no need to cancel it in the
112 // future. This avoids StopActionInternal() from resetting task IDs in an
113 // unexpected way because task IDs could be reused.
AcknowledgeTaskExecuted()114 void CleanupPreviousUpdateAction::AcknowledgeTaskExecuted() {
115   if (scheduled_task_.IsScheduled()) {
116     LOG(INFO) << "Executing task " << scheduled_task_;
117   }
118 }
119 
120 // Check that scheduled_task_ is a valid task ID. Otherwise, terminate the
121 // action.
CheckTaskScheduled(std::string_view name)122 void CleanupPreviousUpdateAction::CheckTaskScheduled(std::string_view name) {
123   if (!scheduled_task_.IsScheduled()) {
124     LOG(ERROR) << "Unable to schedule " << name;
125     processor_->ActionComplete(this, ErrorCode::kError);
126   } else {
127     LOG(INFO) << "CleanupPreviousUpdateAction scheduled task ID "
128               << scheduled_task_ << " for " << name;
129   }
130 }
131 
StopActionInternal()132 void CleanupPreviousUpdateAction::StopActionInternal() {
133   LOG(INFO) << "Stopping/suspending/completing CleanupPreviousUpdateAction";
134   running_ = false;
135 
136   if (scheduled_task_.IsScheduled()) {
137     if (scheduled_task_.Cancel()) {
138       LOG(INFO) << "CleanupPreviousUpdateAction cancelled pending task ID "
139                 << scheduled_task_;
140     } else {
141       LOG(ERROR) << "CleanupPreviousUpdateAction unable to cancel task ID "
142                  << scheduled_task_;
143     }
144   }
145 }
146 
StartActionInternal()147 void CleanupPreviousUpdateAction::StartActionInternal() {
148   CHECK(prefs_);
149   CHECK(boot_control_);
150 
151   LOG(INFO) << "Starting/resuming CleanupPreviousUpdateAction";
152   running_ = true;
153   // Do nothing on non-VAB device.
154   if (!boot_control_->GetDynamicPartitionControl()
155            ->GetVirtualAbFeatureFlag()
156            .IsEnabled()) {
157     processor_->ActionComplete(this, ErrorCode::kSuccess);
158     return;
159   }
160   // SnapshotManager must be available on VAB devices.
161   CHECK(snapshot_ != nullptr);
162   merge_stats_ = snapshot_->GetSnapshotMergeStatsInstance();
163   CHECK(merge_stats_ != nullptr);
164   WaitBootCompletedOrSchedule();
165 }
166 
ScheduleWaitBootCompleted()167 void CleanupPreviousUpdateAction::ScheduleWaitBootCompleted() {
168   TEST_AND_RETURN(running_);
169   if (!scheduled_task_.PostTask(
170           FROM_HERE,
171           base::Bind(&CleanupPreviousUpdateAction::WaitBootCompletedOrSchedule,
172                      base::Unretained(this)),
173           kCheckBootCompletedInterval)) {
174     CheckTaskScheduled("WaitBootCompleted");
175   }
176 }
177 
WaitBootCompletedOrSchedule()178 void CleanupPreviousUpdateAction::WaitBootCompletedOrSchedule() {
179   AcknowledgeTaskExecuted();
180   TEST_AND_RETURN(running_);
181   if (!kIsRecovery &&
182       !android::base::GetBoolProperty(kBootCompletedProp, false)) {
183     // repeat
184     ScheduleWaitBootCompleted();
185     return;
186   }
187 
188   auto boot_time = std::chrono::duration_cast<std::chrono::milliseconds>(
189       android::base::boot_clock::now().time_since_epoch());
190   merge_stats_->set_boot_complete_time_ms(boot_time.count());
191 
192   LOG(INFO) << "Boot completed, waiting on markBootSuccessful()";
193   CheckSlotMarkedSuccessfulOrSchedule();
194 }
195 
ScheduleWaitMarkBootSuccessful()196 void CleanupPreviousUpdateAction::ScheduleWaitMarkBootSuccessful() {
197   TEST_AND_RETURN(running_);
198   if (!scheduled_task_.PostTask(
199           FROM_HERE,
200           base::Bind(
201               &CleanupPreviousUpdateAction::CheckSlotMarkedSuccessfulOrSchedule,
202               base::Unretained(this)),
203           kCheckSlotMarkedSuccessfulInterval)) {
204     CheckTaskScheduled("WaitMarkBootSuccessful");
205   }
206 }
207 
CheckForMergeDelay()208 void CleanupPreviousUpdateAction::CheckForMergeDelay() {
209   if (!android::snapshot::SnapshotManager::IsSnapshotManagerNeeded()) {
210     StartMerge();
211     return;
212   }
213   const auto merge_delay_seconds =
214       std::clamp<int>(android::base::GetIntProperty(kMergeDelaySecondsProp, 0),
215                       0,
216                       kMaxMergeDelaySeconds);
217   if (merge_delay_seconds != 0) {
218     LOG(INFO) << "Merge is ready to start, but " << kMergeDelaySecondsProp
219               << " is set, delaying merge by " << merge_delay_seconds
220               << " seconds";
221   }
222   if (!scheduled_task_.PostTask(
223           FROM_HERE,
224           [this]() { StartMerge(); },
225           base::TimeDelta::FromSeconds(merge_delay_seconds))) {
226     LOG(ERROR) << "Unable to schedule " << __FUNCTION__;
227     processor_->ActionComplete(this, ErrorCode::kError);
228   }
229 }
230 
CheckSlotMarkedSuccessfulOrSchedule()231 void CleanupPreviousUpdateAction::CheckSlotMarkedSuccessfulOrSchedule() {
232   AcknowledgeTaskExecuted();
233   TEST_AND_RETURN(running_);
234   if (!kIsRecovery &&
235       !boot_control_->IsSlotMarkedSuccessful(boot_control_->GetCurrentSlot())) {
236     ScheduleWaitMarkBootSuccessful();
237     return;
238   }
239   CheckForMergeDelay();
240 }
241 
StartMerge()242 void CleanupPreviousUpdateAction::StartMerge() {
243   if (metadata_device_ == nullptr) {
244     metadata_device_ = snapshot_->EnsureMetadataMounted();
245   }
246 
247   if (metadata_device_ == nullptr) {
248     LOG(ERROR) << "Failed to mount /metadata.";
249     // If metadata is erased but not formatted, it is possible to not mount
250     // it in recovery. It is safe to skip CleanupPreviousUpdateAction.
251     processor_->ActionComplete(
252         this, kIsRecovery ? ErrorCode::kSuccess : ErrorCode::kError);
253     return;
254   }
255 
256   if (kIsRecovery) {
257     auto snapshots_created =
258         snapshot_->RecoveryCreateSnapshotDevices(metadata_device_);
259     switch (snapshots_created) {
260       case android::snapshot::CreateResult::CREATED: {
261         // If previous update has not finished merging, snapshots exists and are
262         // created here so that ProcessUpdateState can proceed.
263         LOG(INFO) << "Snapshot devices are created";
264         break;
265       }
266       case android::snapshot::CreateResult::NOT_CREATED: {
267         // If there is no previous update, no snapshot devices are created and
268         // ProcessUpdateState will return immediately. Hence, NOT_CREATED is not
269         // considered an error.
270         LOG(INFO) << "Snapshot devices are not created";
271         break;
272       }
273       case android::snapshot::CreateResult::ERROR:
274       default: {
275         LOG(ERROR)
276             << "Failed to create snapshot devices (CreateResult = "
277             << static_cast<
278                    std::underlying_type_t<android::snapshot::CreateResult>>(
279                    snapshots_created);
280         processor_->ActionComplete(this, ErrorCode::kError);
281         return;
282       }
283     }
284   }
285 
286   if (!merge_stats_->Start()) {
287     // Not an error because CleanupPreviousUpdateAction may be paused and
288     // resumed while kernel continues merging snapshots in the background.
289     LOG(WARNING) << "SnapshotMergeStats::Start failed.";
290   }
291   LOG(INFO) << "Waiting for any previous merge request to complete. "
292             << "This can take up to several minutes.";
293   WaitForMergeOrSchedule();
294 }
295 
ScheduleWaitForMerge()296 void CleanupPreviousUpdateAction::ScheduleWaitForMerge() {
297   TEST_AND_RETURN(running_);
298   if (!scheduled_task_.PostTask(
299           FROM_HERE,
300           base::Bind(&CleanupPreviousUpdateAction::WaitForMergeOrSchedule,
301                      base::Unretained(this)),
302           kWaitForMergeInterval)) {
303     CheckTaskScheduled("WaitForMerge");
304   }
305 }
306 
WaitForMergeOrSchedule()307 void CleanupPreviousUpdateAction::WaitForMergeOrSchedule() {
308   AcknowledgeTaskExecuted();
309   TEST_AND_RETURN(running_);
310 
311   snapshot_->SetMergeStatsFeatures(merge_stats_);
312 
313   // Propagate the merge failure code to the merge stats. If we wait until
314   // after ProcessUpdateState, then a successful merge could overwrite the
315   // state of the previous failure.
316   auto failure_code = snapshot_->ReadMergeFailureCode();
317   if (failure_code != android::snapshot::MergeFailureCode::Ok) {
318     merge_stats_->set_merge_failure_code(failure_code);
319   }
320 
321   auto state = snapshot_->ProcessUpdateState(
322       std::bind(&CleanupPreviousUpdateAction::OnMergePercentageUpdate, this),
323       std::bind(&CleanupPreviousUpdateAction::BeforeCancel, this));
324   merge_stats_->set_state(state);
325 
326   switch (state) {
327     case UpdateState::None: {
328       LOG(INFO) << "Can't find any snapshot to merge.";
329       ErrorCode error_code = ErrorCode::kSuccess;
330       if (!snapshot_->CancelUpdate()) {
331         error_code = ErrorCode::kError;
332         LOG(INFO) << "Failed to call SnapshotManager::CancelUpdate().";
333       }
334       processor_->ActionComplete(this, error_code);
335       return;
336     }
337 
338     case UpdateState::Initiated: {
339       LOG(ERROR) << "Previous update has not been completed, not cleaning up";
340       processor_->ActionComplete(this, ErrorCode::kSuccess);
341       return;
342     }
343 
344     case UpdateState::Unverified: {
345       InitiateMergeAndWait();
346       return;
347     }
348 
349     case UpdateState::Merging: {
350       ScheduleWaitForMerge();
351       return;
352     }
353 
354     case UpdateState::MergeNeedsReboot: {
355       LOG(ERROR) << "Need reboot to finish merging.";
356       processor_->ActionComplete(this, ErrorCode::kError);
357       return;
358     }
359 
360     case UpdateState::MergeCompleted: {
361       LOG(INFO) << "Merge finished with state MergeCompleted.";
362       boot_control_->MarkSlotUnbootable(1 - boot_control_->GetCurrentSlot());
363       processor_->ActionComplete(this, ErrorCode::kSuccess);
364       return;
365     }
366 
367     case UpdateState::MergeFailed: {
368       LOG(ERROR) << "Merge failed. Device may be corrupted.";
369       merge_stats_->set_merge_failure_code(snapshot_->ReadMergeFailureCode());
370       processor_->ActionComplete(this, ErrorCode::kDeviceCorrupted);
371       return;
372     }
373 
374     case UpdateState::Cancelled: {
375       // DeltaPerformer::ResetUpdateProgress failed, hence snapshots are
376       // not deleted to avoid inconsistency.
377       // Nothing can be done here; just try next time.
378       ErrorCode error_code =
379           cancel_failed_ ? ErrorCode::kError : ErrorCode::kSuccess;
380       processor_->ActionComplete(this, error_code);
381       return;
382     }
383 
384     default: {
385       // Protobuf has some reserved enum values, so a default case is needed.
386       LOG(FATAL) << "SnapshotManager::ProcessUpdateState returns "
387                  << static_cast<int32_t>(state);
388     }
389   }
390 }
391 
OnMergePercentageUpdate()392 bool CleanupPreviousUpdateAction::OnMergePercentageUpdate() {
393   double percentage = 0.0;
394   snapshot_->GetUpdateState(&percentage);
395   if (delegate_) {
396     // libsnapshot uses [0, 100] percentage but update_engine uses [0, 1].
397     delegate_->OnCleanupProgressUpdate(percentage / 100);
398   }
399 
400   // Log if percentage increments by at least 1.
401   if (last_percentage_ < static_cast<unsigned int>(percentage)) {
402     last_percentage_ = percentage;
403     LOG(INFO) << "Waiting for merge to complete: " << last_percentage_ << "%.";
404   }
405 
406   // Do not continue to wait for merge. Instead, let ProcessUpdateState
407   // return Merging directly so that we can ScheduleWaitForMerge() in
408   // MessageLoop.
409   return false;
410 }
411 
BeforeCancel()412 bool CleanupPreviousUpdateAction::BeforeCancel() {
413   if (DeltaPerformer::ResetUpdateProgress(
414           prefs_,
415           false /* quick */,
416           false /* skip dynamic partitions metadata*/)) {
417     return true;
418   }
419 
420   // ResetUpdateProgress might not work on stub prefs. Do additional checks.
421   LOG(WARNING) << "ProcessUpdateState returns Cancelled but cleanup failed.";
422 
423   std::string val;
424   ignore_result(prefs_->GetString(kPrefsDynamicPartitionMetadataUpdated, &val));
425   if (val.empty()) {
426     LOG(INFO) << kPrefsDynamicPartitionMetadataUpdated
427               << " is empty, assuming successful cleanup";
428     return true;
429   }
430   LOG(WARNING)
431       << kPrefsDynamicPartitionMetadataUpdated << " is " << val
432       << ", not deleting snapshots even though UpdateState is Cancelled.";
433   cancel_failed_ = true;
434   return false;
435 }
436 
InitiateMergeAndWait()437 void CleanupPreviousUpdateAction::InitiateMergeAndWait() {
438   TEST_AND_RETURN(running_);
439   LOG(INFO) << "Attempting to initiate merge.";
440   // suspend the VAB merge when running a DSU
441   if (GetBoolProperty("ro.gsid.image_running", false)) {
442     LOG(WARNING) << "Suspend the VAB merge when running a DSU.";
443     processor_->ActionComplete(this, ErrorCode::kError);
444     return;
445   }
446 
447   snapshot_->UpdateCowStats(merge_stats_);
448 
449   auto merge_start_time = std::chrono::duration_cast<std::chrono::milliseconds>(
450       android::base::boot_clock::now().time_since_epoch());
451   merge_stats_->set_boot_complete_to_merge_start_time_ms(
452       merge_start_time.count() - merge_stats_->boot_complete_time_ms());
453 
454   auto source_build_fingerprint = snapshot_->ReadSourceBuildFingerprint();
455   merge_stats_->set_source_build_fingerprint(source_build_fingerprint);
456 
457   if (!merge_stats_->WriteState()) {
458     LOG(ERROR) << "Failed to write merge stats; record may be unreliable if "
459                   "merge is interrupted.";
460   }
461 
462   if (snapshot_->InitiateMerge()) {
463     WaitForMergeOrSchedule();
464     return;
465   }
466 
467   LOG(WARNING) << "InitiateMerge failed.";
468   auto state = snapshot_->GetUpdateState();
469   merge_stats_->set_state(state);
470   if (state == UpdateState::Unverified) {
471     // We are stuck at unverified state. This can happen if the update has
472     // been applied, but it has not even been attempted yet (in libsnapshot,
473     // rollback indicator does not exist); for example, if update_engine
474     // restarts before the device reboots, then this state may be reached.
475     // Nothing should be done here.
476     LOG(WARNING) << "InitiateMerge leaves the device at "
477                  << "UpdateState::Unverified. (Did update_engine "
478                  << "restarted?)";
479     processor_->ActionComplete(this, ErrorCode::kSuccess);
480     return;
481   }
482 
483   // State does seems to be advanced.
484   // It is possibly racy. For example, on a userdebug build, the user may
485   // manually initiate a merge with snapshotctl between last time
486   // update_engine checks UpdateState. Hence, just call
487   // WaitForMergeOrSchedule one more time.
488   LOG(WARNING) << "IniitateMerge failed but GetUpdateState returned "
489                << android::snapshot::UpdateState_Name(state)
490                << ", try to wait for merge again.";
491   WaitForMergeOrSchedule();
492   return;
493 }
494 
ReportMergeStats()495 void CleanupPreviousUpdateAction::ReportMergeStats() {
496   auto result = merge_stats_->Finish();
497   if (result == nullptr) {
498     LOG(WARNING) << "Not reporting merge stats because "
499                     "SnapshotMergeStats::Finish failed.";
500     return;
501   }
502 
503 #ifdef __ANDROID_RECOVERY__
504   LOG(INFO) << "Skip reporting merge stats in recovery.";
505 #elif defined(UE_DISABLE_STATS)
506   LOG(INFO) << "Skip reporting merge stats because metrics are disabled.";
507 #else
508   const auto& report = result->report();
509 
510   if (report.state() == UpdateState::None ||
511       report.state() == UpdateState::Initiated ||
512       report.state() == UpdateState::Unverified) {
513     LOG(INFO) << "Not reporting merge stats because state is "
514               << android::snapshot::UpdateState_Name(report.state());
515     return;
516   }
517 
518   auto passed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
519       result->merge_time());
520 
521   bool vab_retrofit = boot_control_->GetDynamicPartitionControl()
522                           ->GetVirtualAbFeatureFlag()
523                           .IsRetrofit();
524   bool vab_compression_enabled = boot_control_->GetDynamicPartitionControl()
525                                      ->GetVirtualAbCompressionFeatureFlag()
526                                      .IsEnabled();
527   // The snapshot has been merged, so we can no longer call
528   // DynamicPartitionControlInterface::UpdateUsesSnapshotCompression.
529   // However, we have saved the flag in the snapshot report.
530   bool vab_compression_used = report.compression_enabled();
531   bool userspace_snapshots_enabled =
532       boot_control_->GetDynamicPartitionControl()
533           ->GetVirtualAbUserspaceSnapshotsFeatureFlag()
534           .IsEnabled();
535   bool userspace_snapshots_used = report.userspace_snapshots_used();
536   bool xor_compression_enabled = boot_control_->GetDynamicPartitionControl()
537                                      ->GetVirtualAbCompressionXorFeatureFlag()
538                                      .IsEnabled();
539   bool xor_compression_used = report.xor_compression_used();
540   bool iouring_used = report.iouring_used();
541 
542   auto target_build_fingerprint =
543       android::base::GetProperty("ro.build.fingerprint", "");
544 
545   LOG(INFO) << "Reporting merge stats: "
546             << android::snapshot::UpdateState_Name(report.state()) << " in "
547             << passed_ms.count() << "ms (resumed " << report.resume_count()
548             << " times), using " << report.cow_file_size()
549             << " bytes of COW image.";
550   statsd::stats_write(statsd::SNAPSHOT_MERGE_REPORTED,
551                       static_cast<int32_t>(report.state()),
552                       static_cast<int64_t>(passed_ms.count()),
553                       static_cast<int32_t>(report.resume_count()),
554                       vab_retrofit,
555                       static_cast<int64_t>(report.cow_file_size()),
556                       vab_compression_enabled,
557                       vab_compression_used,
558                       report.total_cow_size_bytes(),
559                       report.estimated_cow_size_bytes(),
560                       report.boot_complete_time_ms(),
561                       report.boot_complete_to_merge_start_time_ms(),
562                       static_cast<int32_t>(report.merge_failure_code()),
563                       report.source_build_fingerprint().c_str(),
564                       target_build_fingerprint.c_str(),
565                       userspace_snapshots_enabled,
566                       userspace_snapshots_used,
567                       xor_compression_enabled,
568                       xor_compression_used,
569                       iouring_used);
570 #endif
571 }
572 
573 }  // namespace chromeos_update_engine
574