1 //
2 // Copyright (C) 2020 The Android Open Source Project
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "update_engine/aosp/cleanup_previous_update_action.h"
17
18 #include <algorithm>
19 #include <chrono>
20 #include <functional>
21 #include <string>
22 #include <type_traits>
23
24 #include <android-base/chrono_utils.h>
25 #include <android-base/properties.h>
26 #include <base/bind.h>
27 #include <libsnapshot/snapshot.h>
28
29 #if !defined(__ANDROID_RECOVERY__) && !defined(UE_DISABLE_STATS)
30 #include <statslog_ue.h>
31 #endif
32
33 #include "update_engine/common/utils.h"
34 #include "update_engine/payload_consumer/delta_performer.h"
35
36 using android::base::GetBoolProperty;
37 using android::snapshot::ISnapshotManager;
38 using android::snapshot::SnapshotMergeStats;
39 using android::snapshot::UpdateState;
40 using brillo::MessageLoop;
41
42 constexpr char kBootCompletedProp[] = "sys.boot_completed";
43 constexpr auto&& kMergeDelaySecondsProp = "ro.virtual_ab.merge_delay_seconds";
44 constexpr size_t kMaxMergeDelaySeconds = 600;
45 // Interval to check sys.boot_completed.
46 constexpr auto kCheckBootCompletedInterval = base::TimeDelta::FromSeconds(2);
47 // Interval to check IBootControl::isSlotMarkedSuccessful
48 constexpr auto kCheckSlotMarkedSuccessfulInterval =
49 base::TimeDelta::FromSeconds(2);
50 // Interval to call SnapshotManager::ProcessUpdateState
51 constexpr auto kWaitForMergeInterval = base::TimeDelta::FromSeconds(2);
52
53 #ifdef __ANDROID_RECOVERY__
54 static constexpr bool kIsRecovery = true;
55 #else
56 static constexpr bool kIsRecovery = false;
57 #endif
58
59 namespace chromeos_update_engine {
60
CleanupPreviousUpdateAction(PrefsInterface * prefs,BootControlInterface * boot_control,android::snapshot::ISnapshotManager * snapshot,CleanupPreviousUpdateActionDelegateInterface * delegate)61 CleanupPreviousUpdateAction::CleanupPreviousUpdateAction(
62 PrefsInterface* prefs,
63 BootControlInterface* boot_control,
64 android::snapshot::ISnapshotManager* snapshot,
65 CleanupPreviousUpdateActionDelegateInterface* delegate)
66 : prefs_(prefs),
67 boot_control_(boot_control),
68 snapshot_(snapshot),
69 delegate_(delegate),
70 running_(false),
71 cancel_failed_(false),
72 last_percentage_(0),
73 merge_stats_(nullptr) {}
74
~CleanupPreviousUpdateAction()75 CleanupPreviousUpdateAction::~CleanupPreviousUpdateAction() {
76 StopActionInternal();
77 }
78
PerformAction()79 void CleanupPreviousUpdateAction::PerformAction() {
80 StartActionInternal();
81 }
82
TerminateProcessing()83 void CleanupPreviousUpdateAction::TerminateProcessing() {
84 StopActionInternal();
85 }
86
ResumeAction()87 void CleanupPreviousUpdateAction::ResumeAction() {
88 StartActionInternal();
89 }
90
SuspendAction()91 void CleanupPreviousUpdateAction::SuspendAction() {
92 StopActionInternal();
93 }
94
ActionCompleted(ErrorCode error_code)95 void CleanupPreviousUpdateAction::ActionCompleted(ErrorCode error_code) {
96 StopActionInternal();
97 ReportMergeStats();
98 metadata_device_ = nullptr;
99 }
100
Type() const101 std::string CleanupPreviousUpdateAction::Type() const {
102 return StaticType();
103 }
104
StaticType()105 std::string CleanupPreviousUpdateAction::StaticType() {
106 return "CleanupPreviousUpdateAction";
107 }
108
109 // This function is called at the beginning of all delayed functions. By
110 // resetting |scheduled_task_|, the delayed function acknowledges that the task
111 // has already been executed, therefore there's no need to cancel it in the
112 // future. This avoids StopActionInternal() from resetting task IDs in an
113 // unexpected way because task IDs could be reused.
AcknowledgeTaskExecuted()114 void CleanupPreviousUpdateAction::AcknowledgeTaskExecuted() {
115 if (scheduled_task_.IsScheduled()) {
116 LOG(INFO) << "Executing task " << scheduled_task_;
117 }
118 }
119
120 // Check that scheduled_task_ is a valid task ID. Otherwise, terminate the
121 // action.
CheckTaskScheduled(std::string_view name)122 void CleanupPreviousUpdateAction::CheckTaskScheduled(std::string_view name) {
123 if (!scheduled_task_.IsScheduled()) {
124 LOG(ERROR) << "Unable to schedule " << name;
125 processor_->ActionComplete(this, ErrorCode::kError);
126 } else {
127 LOG(INFO) << "CleanupPreviousUpdateAction scheduled task ID "
128 << scheduled_task_ << " for " << name;
129 }
130 }
131
StopActionInternal()132 void CleanupPreviousUpdateAction::StopActionInternal() {
133 LOG(INFO) << "Stopping/suspending/completing CleanupPreviousUpdateAction";
134 running_ = false;
135
136 if (scheduled_task_.IsScheduled()) {
137 if (scheduled_task_.Cancel()) {
138 LOG(INFO) << "CleanupPreviousUpdateAction cancelled pending task ID "
139 << scheduled_task_;
140 } else {
141 LOG(ERROR) << "CleanupPreviousUpdateAction unable to cancel task ID "
142 << scheduled_task_;
143 }
144 }
145 }
146
StartActionInternal()147 void CleanupPreviousUpdateAction::StartActionInternal() {
148 CHECK(prefs_);
149 CHECK(boot_control_);
150
151 LOG(INFO) << "Starting/resuming CleanupPreviousUpdateAction";
152 running_ = true;
153 // Do nothing on non-VAB device.
154 if (!boot_control_->GetDynamicPartitionControl()
155 ->GetVirtualAbFeatureFlag()
156 .IsEnabled()) {
157 processor_->ActionComplete(this, ErrorCode::kSuccess);
158 return;
159 }
160 // SnapshotManager must be available on VAB devices.
161 CHECK(snapshot_ != nullptr);
162 merge_stats_ = snapshot_->GetSnapshotMergeStatsInstance();
163 CHECK(merge_stats_ != nullptr);
164 WaitBootCompletedOrSchedule();
165 }
166
ScheduleWaitBootCompleted()167 void CleanupPreviousUpdateAction::ScheduleWaitBootCompleted() {
168 TEST_AND_RETURN(running_);
169 if (!scheduled_task_.PostTask(
170 FROM_HERE,
171 base::Bind(&CleanupPreviousUpdateAction::WaitBootCompletedOrSchedule,
172 base::Unretained(this)),
173 kCheckBootCompletedInterval)) {
174 CheckTaskScheduled("WaitBootCompleted");
175 }
176 }
177
WaitBootCompletedOrSchedule()178 void CleanupPreviousUpdateAction::WaitBootCompletedOrSchedule() {
179 AcknowledgeTaskExecuted();
180 TEST_AND_RETURN(running_);
181 if (!kIsRecovery &&
182 !android::base::GetBoolProperty(kBootCompletedProp, false)) {
183 // repeat
184 ScheduleWaitBootCompleted();
185 return;
186 }
187
188 auto boot_time = std::chrono::duration_cast<std::chrono::milliseconds>(
189 android::base::boot_clock::now().time_since_epoch());
190 merge_stats_->set_boot_complete_time_ms(boot_time.count());
191
192 LOG(INFO) << "Boot completed, waiting on markBootSuccessful()";
193 CheckSlotMarkedSuccessfulOrSchedule();
194 }
195
ScheduleWaitMarkBootSuccessful()196 void CleanupPreviousUpdateAction::ScheduleWaitMarkBootSuccessful() {
197 TEST_AND_RETURN(running_);
198 if (!scheduled_task_.PostTask(
199 FROM_HERE,
200 base::Bind(
201 &CleanupPreviousUpdateAction::CheckSlotMarkedSuccessfulOrSchedule,
202 base::Unretained(this)),
203 kCheckSlotMarkedSuccessfulInterval)) {
204 CheckTaskScheduled("WaitMarkBootSuccessful");
205 }
206 }
207
CheckForMergeDelay()208 void CleanupPreviousUpdateAction::CheckForMergeDelay() {
209 if (!android::snapshot::SnapshotManager::IsSnapshotManagerNeeded()) {
210 StartMerge();
211 return;
212 }
213 const auto merge_delay_seconds =
214 std::clamp<int>(android::base::GetIntProperty(kMergeDelaySecondsProp, 0),
215 0,
216 kMaxMergeDelaySeconds);
217 if (merge_delay_seconds != 0) {
218 LOG(INFO) << "Merge is ready to start, but " << kMergeDelaySecondsProp
219 << " is set, delaying merge by " << merge_delay_seconds
220 << " seconds";
221 }
222 if (!scheduled_task_.PostTask(
223 FROM_HERE,
224 [this]() { StartMerge(); },
225 base::TimeDelta::FromSeconds(merge_delay_seconds))) {
226 LOG(ERROR) << "Unable to schedule " << __FUNCTION__;
227 processor_->ActionComplete(this, ErrorCode::kError);
228 }
229 }
230
CheckSlotMarkedSuccessfulOrSchedule()231 void CleanupPreviousUpdateAction::CheckSlotMarkedSuccessfulOrSchedule() {
232 AcknowledgeTaskExecuted();
233 TEST_AND_RETURN(running_);
234 if (!kIsRecovery &&
235 !boot_control_->IsSlotMarkedSuccessful(boot_control_->GetCurrentSlot())) {
236 ScheduleWaitMarkBootSuccessful();
237 return;
238 }
239 CheckForMergeDelay();
240 }
241
StartMerge()242 void CleanupPreviousUpdateAction::StartMerge() {
243 if (metadata_device_ == nullptr) {
244 metadata_device_ = snapshot_->EnsureMetadataMounted();
245 }
246
247 if (metadata_device_ == nullptr) {
248 LOG(ERROR) << "Failed to mount /metadata.";
249 // If metadata is erased but not formatted, it is possible to not mount
250 // it in recovery. It is safe to skip CleanupPreviousUpdateAction.
251 processor_->ActionComplete(
252 this, kIsRecovery ? ErrorCode::kSuccess : ErrorCode::kError);
253 return;
254 }
255
256 if (kIsRecovery) {
257 auto snapshots_created =
258 snapshot_->RecoveryCreateSnapshotDevices(metadata_device_);
259 switch (snapshots_created) {
260 case android::snapshot::CreateResult::CREATED: {
261 // If previous update has not finished merging, snapshots exists and are
262 // created here so that ProcessUpdateState can proceed.
263 LOG(INFO) << "Snapshot devices are created";
264 break;
265 }
266 case android::snapshot::CreateResult::NOT_CREATED: {
267 // If there is no previous update, no snapshot devices are created and
268 // ProcessUpdateState will return immediately. Hence, NOT_CREATED is not
269 // considered an error.
270 LOG(INFO) << "Snapshot devices are not created";
271 break;
272 }
273 case android::snapshot::CreateResult::ERROR:
274 default: {
275 LOG(ERROR)
276 << "Failed to create snapshot devices (CreateResult = "
277 << static_cast<
278 std::underlying_type_t<android::snapshot::CreateResult>>(
279 snapshots_created);
280 processor_->ActionComplete(this, ErrorCode::kError);
281 return;
282 }
283 }
284 }
285
286 if (!merge_stats_->Start()) {
287 // Not an error because CleanupPreviousUpdateAction may be paused and
288 // resumed while kernel continues merging snapshots in the background.
289 LOG(WARNING) << "SnapshotMergeStats::Start failed.";
290 }
291 LOG(INFO) << "Waiting for any previous merge request to complete. "
292 << "This can take up to several minutes.";
293 WaitForMergeOrSchedule();
294 }
295
ScheduleWaitForMerge()296 void CleanupPreviousUpdateAction::ScheduleWaitForMerge() {
297 TEST_AND_RETURN(running_);
298 if (!scheduled_task_.PostTask(
299 FROM_HERE,
300 base::Bind(&CleanupPreviousUpdateAction::WaitForMergeOrSchedule,
301 base::Unretained(this)),
302 kWaitForMergeInterval)) {
303 CheckTaskScheduled("WaitForMerge");
304 }
305 }
306
WaitForMergeOrSchedule()307 void CleanupPreviousUpdateAction::WaitForMergeOrSchedule() {
308 AcknowledgeTaskExecuted();
309 TEST_AND_RETURN(running_);
310
311 snapshot_->SetMergeStatsFeatures(merge_stats_);
312
313 // Propagate the merge failure code to the merge stats. If we wait until
314 // after ProcessUpdateState, then a successful merge could overwrite the
315 // state of the previous failure.
316 auto failure_code = snapshot_->ReadMergeFailureCode();
317 if (failure_code != android::snapshot::MergeFailureCode::Ok) {
318 merge_stats_->set_merge_failure_code(failure_code);
319 }
320
321 auto state = snapshot_->ProcessUpdateState(
322 std::bind(&CleanupPreviousUpdateAction::OnMergePercentageUpdate, this),
323 std::bind(&CleanupPreviousUpdateAction::BeforeCancel, this));
324 merge_stats_->set_state(state);
325
326 switch (state) {
327 case UpdateState::None: {
328 LOG(INFO) << "Can't find any snapshot to merge.";
329 ErrorCode error_code = ErrorCode::kSuccess;
330 if (!snapshot_->CancelUpdate()) {
331 error_code = ErrorCode::kError;
332 LOG(INFO) << "Failed to call SnapshotManager::CancelUpdate().";
333 }
334 processor_->ActionComplete(this, error_code);
335 return;
336 }
337
338 case UpdateState::Initiated: {
339 LOG(ERROR) << "Previous update has not been completed, not cleaning up";
340 processor_->ActionComplete(this, ErrorCode::kSuccess);
341 return;
342 }
343
344 case UpdateState::Unverified: {
345 InitiateMergeAndWait();
346 return;
347 }
348
349 case UpdateState::Merging: {
350 ScheduleWaitForMerge();
351 return;
352 }
353
354 case UpdateState::MergeNeedsReboot: {
355 LOG(ERROR) << "Need reboot to finish merging.";
356 processor_->ActionComplete(this, ErrorCode::kError);
357 return;
358 }
359
360 case UpdateState::MergeCompleted: {
361 LOG(INFO) << "Merge finished with state MergeCompleted.";
362 boot_control_->MarkSlotUnbootable(1 - boot_control_->GetCurrentSlot());
363 processor_->ActionComplete(this, ErrorCode::kSuccess);
364 return;
365 }
366
367 case UpdateState::MergeFailed: {
368 LOG(ERROR) << "Merge failed. Device may be corrupted.";
369 merge_stats_->set_merge_failure_code(snapshot_->ReadMergeFailureCode());
370 processor_->ActionComplete(this, ErrorCode::kDeviceCorrupted);
371 return;
372 }
373
374 case UpdateState::Cancelled: {
375 // DeltaPerformer::ResetUpdateProgress failed, hence snapshots are
376 // not deleted to avoid inconsistency.
377 // Nothing can be done here; just try next time.
378 ErrorCode error_code =
379 cancel_failed_ ? ErrorCode::kError : ErrorCode::kSuccess;
380 processor_->ActionComplete(this, error_code);
381 return;
382 }
383
384 default: {
385 // Protobuf has some reserved enum values, so a default case is needed.
386 LOG(FATAL) << "SnapshotManager::ProcessUpdateState returns "
387 << static_cast<int32_t>(state);
388 }
389 }
390 }
391
OnMergePercentageUpdate()392 bool CleanupPreviousUpdateAction::OnMergePercentageUpdate() {
393 double percentage = 0.0;
394 snapshot_->GetUpdateState(&percentage);
395 if (delegate_) {
396 // libsnapshot uses [0, 100] percentage but update_engine uses [0, 1].
397 delegate_->OnCleanupProgressUpdate(percentage / 100);
398 }
399
400 // Log if percentage increments by at least 1.
401 if (last_percentage_ < static_cast<unsigned int>(percentage)) {
402 last_percentage_ = percentage;
403 LOG(INFO) << "Waiting for merge to complete: " << last_percentage_ << "%.";
404 }
405
406 // Do not continue to wait for merge. Instead, let ProcessUpdateState
407 // return Merging directly so that we can ScheduleWaitForMerge() in
408 // MessageLoop.
409 return false;
410 }
411
BeforeCancel()412 bool CleanupPreviousUpdateAction::BeforeCancel() {
413 if (DeltaPerformer::ResetUpdateProgress(
414 prefs_,
415 false /* quick */,
416 false /* skip dynamic partitions metadata*/)) {
417 return true;
418 }
419
420 // ResetUpdateProgress might not work on stub prefs. Do additional checks.
421 LOG(WARNING) << "ProcessUpdateState returns Cancelled but cleanup failed.";
422
423 std::string val;
424 ignore_result(prefs_->GetString(kPrefsDynamicPartitionMetadataUpdated, &val));
425 if (val.empty()) {
426 LOG(INFO) << kPrefsDynamicPartitionMetadataUpdated
427 << " is empty, assuming successful cleanup";
428 return true;
429 }
430 LOG(WARNING)
431 << kPrefsDynamicPartitionMetadataUpdated << " is " << val
432 << ", not deleting snapshots even though UpdateState is Cancelled.";
433 cancel_failed_ = true;
434 return false;
435 }
436
InitiateMergeAndWait()437 void CleanupPreviousUpdateAction::InitiateMergeAndWait() {
438 TEST_AND_RETURN(running_);
439 LOG(INFO) << "Attempting to initiate merge.";
440 // suspend the VAB merge when running a DSU
441 if (GetBoolProperty("ro.gsid.image_running", false)) {
442 LOG(WARNING) << "Suspend the VAB merge when running a DSU.";
443 processor_->ActionComplete(this, ErrorCode::kError);
444 return;
445 }
446
447 snapshot_->UpdateCowStats(merge_stats_);
448
449 auto merge_start_time = std::chrono::duration_cast<std::chrono::milliseconds>(
450 android::base::boot_clock::now().time_since_epoch());
451 merge_stats_->set_boot_complete_to_merge_start_time_ms(
452 merge_start_time.count() - merge_stats_->boot_complete_time_ms());
453
454 auto source_build_fingerprint = snapshot_->ReadSourceBuildFingerprint();
455 merge_stats_->set_source_build_fingerprint(source_build_fingerprint);
456
457 if (!merge_stats_->WriteState()) {
458 LOG(ERROR) << "Failed to write merge stats; record may be unreliable if "
459 "merge is interrupted.";
460 }
461
462 if (snapshot_->InitiateMerge()) {
463 WaitForMergeOrSchedule();
464 return;
465 }
466
467 LOG(WARNING) << "InitiateMerge failed.";
468 auto state = snapshot_->GetUpdateState();
469 merge_stats_->set_state(state);
470 if (state == UpdateState::Unverified) {
471 // We are stuck at unverified state. This can happen if the update has
472 // been applied, but it has not even been attempted yet (in libsnapshot,
473 // rollback indicator does not exist); for example, if update_engine
474 // restarts before the device reboots, then this state may be reached.
475 // Nothing should be done here.
476 LOG(WARNING) << "InitiateMerge leaves the device at "
477 << "UpdateState::Unverified. (Did update_engine "
478 << "restarted?)";
479 processor_->ActionComplete(this, ErrorCode::kSuccess);
480 return;
481 }
482
483 // State does seems to be advanced.
484 // It is possibly racy. For example, on a userdebug build, the user may
485 // manually initiate a merge with snapshotctl between last time
486 // update_engine checks UpdateState. Hence, just call
487 // WaitForMergeOrSchedule one more time.
488 LOG(WARNING) << "IniitateMerge failed but GetUpdateState returned "
489 << android::snapshot::UpdateState_Name(state)
490 << ", try to wait for merge again.";
491 WaitForMergeOrSchedule();
492 return;
493 }
494
ReportMergeStats()495 void CleanupPreviousUpdateAction::ReportMergeStats() {
496 auto result = merge_stats_->Finish();
497 if (result == nullptr) {
498 LOG(WARNING) << "Not reporting merge stats because "
499 "SnapshotMergeStats::Finish failed.";
500 return;
501 }
502
503 #ifdef __ANDROID_RECOVERY__
504 LOG(INFO) << "Skip reporting merge stats in recovery.";
505 #elif defined(UE_DISABLE_STATS)
506 LOG(INFO) << "Skip reporting merge stats because metrics are disabled.";
507 #else
508 const auto& report = result->report();
509
510 if (report.state() == UpdateState::None ||
511 report.state() == UpdateState::Initiated ||
512 report.state() == UpdateState::Unverified) {
513 LOG(INFO) << "Not reporting merge stats because state is "
514 << android::snapshot::UpdateState_Name(report.state());
515 return;
516 }
517
518 auto passed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
519 result->merge_time());
520
521 bool vab_retrofit = boot_control_->GetDynamicPartitionControl()
522 ->GetVirtualAbFeatureFlag()
523 .IsRetrofit();
524 bool vab_compression_enabled = boot_control_->GetDynamicPartitionControl()
525 ->GetVirtualAbCompressionFeatureFlag()
526 .IsEnabled();
527 // The snapshot has been merged, so we can no longer call
528 // DynamicPartitionControlInterface::UpdateUsesSnapshotCompression.
529 // However, we have saved the flag in the snapshot report.
530 bool vab_compression_used = report.compression_enabled();
531 bool userspace_snapshots_enabled =
532 boot_control_->GetDynamicPartitionControl()
533 ->GetVirtualAbUserspaceSnapshotsFeatureFlag()
534 .IsEnabled();
535 bool userspace_snapshots_used = report.userspace_snapshots_used();
536 bool xor_compression_enabled = boot_control_->GetDynamicPartitionControl()
537 ->GetVirtualAbCompressionXorFeatureFlag()
538 .IsEnabled();
539 bool xor_compression_used = report.xor_compression_used();
540 bool iouring_used = report.iouring_used();
541
542 auto target_build_fingerprint =
543 android::base::GetProperty("ro.build.fingerprint", "");
544
545 LOG(INFO) << "Reporting merge stats: "
546 << android::snapshot::UpdateState_Name(report.state()) << " in "
547 << passed_ms.count() << "ms (resumed " << report.resume_count()
548 << " times), using " << report.cow_file_size()
549 << " bytes of COW image.";
550 statsd::stats_write(statsd::SNAPSHOT_MERGE_REPORTED,
551 static_cast<int32_t>(report.state()),
552 static_cast<int64_t>(passed_ms.count()),
553 static_cast<int32_t>(report.resume_count()),
554 vab_retrofit,
555 static_cast<int64_t>(report.cow_file_size()),
556 vab_compression_enabled,
557 vab_compression_used,
558 report.total_cow_size_bytes(),
559 report.estimated_cow_size_bytes(),
560 report.boot_complete_time_ms(),
561 report.boot_complete_to_merge_start_time_ms(),
562 static_cast<int32_t>(report.merge_failure_code()),
563 report.source_build_fingerprint().c_str(),
564 target_build_fingerprint.c_str(),
565 userspace_snapshots_enabled,
566 userspace_snapshots_used,
567 xor_compression_enabled,
568 xor_compression_used,
569 iouring_used);
570 #endif
571 }
572
573 } // namespace chromeos_update_engine
574