// SPDX-License-Identifier: GPL-2.0
#include "qcow2.h"

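//Flushing of dirty qcow2 meta data: L2 mapping table slices and refcount
//block slices, plus the top tables (L1 table and refcount table) that point
//to them. Two MetaFlushingState instances exist, one for the mapping tables
//and one for the refcount tables, and each walks a single 512-byte block of
//its top table at a time; the state machine is summarized in the comment
//above MetaFlushingState::run_flush().
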
MetaFlushingState::MetaFlushingState(Qcow2TopTable &t, bool is_mapping):
	mapping(is_mapping), top(t)
{
	state = qcow2_meta_flush::IDLE;
	slice_dirtied = 0;
	parent_blk_idx = -1;
	last_flush = std::chrono::system_clock::now();
}

void MetaFlushingState::del_meta_from_list(std::vector <Qcow2SliceMeta *> &v,
		const Qcow2SliceMeta *t)
{
	auto it = find(v.cbegin(), v.cend(), t);

	qcow2_assert(it != v.cend());
	v.erase(it);
}

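//called when the flush of one slice completes: once all slices of the
//current top-table entry are written, move on to the next entry of this
//512-byte top-table block, or to WRITE_TOP when the block is exhausted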
void MetaFlushingState::slice_is_done(const Qcow2SliceMeta *t)
{
	del_meta_from_list(slices_in_flight, t);

	qcow2_assert(state == WRITE_SLICES);

	if (slices_in_flight.empty() && slices_to_flush.empty()) {
		if (++parent_entry_idx >= (512/8))
			set_state(qcow2_meta_flush::WRITE_TOP);
		else
			//handle next entry in this block of top table
			set_state(qcow2_meta_flush::PREP_WRITE_SLICES);
	}
}

void MetaFlushingState::add_slice_to_flush(Qcow2SliceMeta *m)
{
	qcow2_assert(state == PREP_WRITE_SLICES);
	qcow2_assert(m->get_dirty(-1));

	auto it = find(slices_to_flush.cbegin(), slices_to_flush.cend(), m);
	qcow2_assert(it == slices_to_flush.cend());

	auto it1 = find(slices_in_flight.cbegin(), slices_in_flight.cend(), m);
	qcow2_assert(it1 == slices_in_flight.cend());

	slices_to_flush.push_back(m);
}

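//coroutine flushing one dirty slice: resubmit while the meta update is
//still in progress (MetaUpdateException), wait for the io_uring completion,
//resubmit on -EAGAIN, and finally hand the slice to slice_is_done() on
//success so the state machine can advance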
co_io_job MetaFlushingState::__write_slice_co(Qcow2State &qs,
		const struct ublksrv_queue *q, Qcow2SliceMeta *m,
		struct ublk_io_tgt *io, int tag)
{
	int ret;
	qcow2_io_ctx_t ioc(tag, q->q_id);
	bool wait;

	slices_in_flight.push_back(m);
again:
	try {
		ret = m->flush(qs, ioc, m->get_offset(), m->get_buf_size());
		wait = false;
	} catch (MetaUpdateException &meta_update_error) {
		wait = true;
	}

	if (wait) {
		co_await__suspend_always(tag);
		goto again;
	}

	if (ret < 0) {
		ublk_err("%s: flush slice failed %d\n",
				__func__, ret);
		goto exit;
	}

	if (ret > 0) {
		const struct io_uring_cqe *cqe;
		bool done = false;
		int io_ret = 0;

		co_await__suspend_always(tag);

		cqe = io->tgt_io_cqe;
		done = (cqe && cqe->res != -EAGAIN);
		if (done)
			io_ret = cqe->res;
		ret = qcow2_meta_io_done(q, cqe);
		if (!done && ret == -EAGAIN)
			goto again;

		//here we can't retry since the slice may be
		//dirtied just after io_done()
		if (!done) {
			if (ret < 0)
				goto exit;
		} else {
			if (io_ret < 0)
				goto exit;
			ret = io_ret;
		}
	}
exit:
	if (m->get_prep_flush()) {
		m->set_prep_flush(false);
		m->wakeup_all(q, tag);
	}
	qs.meta_flushing.free_tag(q, tag);
	if (ret >= 0)
		slice_is_done(m);
	else
		del_meta_from_list(slices_in_flight, m);
	m->put_ref();
}

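//kick one flush coroutine for each slice queued in slices_to_flush; stop
//early when no meta tag is available, leaving the remaining slices for the
//next run_flush() pass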
void MetaFlushingState::__write_slices(Qcow2State &qs,
		const struct ublksrv_queue *q)
{
	std::vector<Qcow2SliceMeta *> &v1 = slices_to_flush;
	std::vector<Qcow2SliceMeta *>::const_iterator it = v1.cbegin();

	flush_log("%s: mapping %d to_flush %d, in_flight %d\n",
			__func__, mapping, v1.size(), slices_in_flight.size());

	if (v1.empty())
		return;

	while (it != v1.cend()) {
		int tag;
		struct ublk_io_tgt *io;
		Qcow2SliceMeta *m;

		tag = qs.meta_flushing.alloc_tag(q);
		if (tag == -1)
			return;
		m = *it;
		it = v1.erase(it);
		m->get_ref();
		io = ublk_get_io_tgt_data(q, tag);
		io->co = __write_slice_co(qs, q, m, io, tag);
	}
}

//todo: run fsync before flushing top table, and global fsync should be
//fine, given top table seldom becomes dirty
co_io_job MetaFlushingState::__write_top_co(Qcow2State &qs,
		const struct ublksrv_queue *q, struct ublk_io_tgt *io, int tag)
{
	int ret;
	qcow2_io_ctx_t ioc(tag, q->q_id);
	bool wait;

again:
	try {
		ret = top.flush(qs, ioc,
				top.get_offset() + parent_blk_idx * 512, 512);
		wait = false;
	} catch (MetaUpdateException &meta_update_error) {
		wait = true;
	}

	if (wait) {
		co_await__suspend_always(tag);
		goto again;
	}

	if (ret < 0) {
		ublk_err("%s: flush top table failed %d\n",
				__func__, ret);
		goto exit;
	}

	if (ret > 0) {
		const struct io_uring_cqe *cqe;

		co_await__suspend_always(tag);

		cqe = io->tgt_io_cqe;
		ret = qcow2_meta_io_done(q, cqe);
		if (ret == -EAGAIN)
			goto again;
		if (ret < 0)
			goto exit;
	}
exit:
	qs.meta_flushing.free_tag(q, tag);

	if (!top.get_blk_dirty(parent_blk_idx))
		set_state(qcow2_meta_flush::DONE);
}

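//write out the current 512-byte block of the top table, unless it is
//already being flushed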
void MetaFlushingState::__write_top(Qcow2State &qs,
		const struct ublksrv_queue *q)
{
	int tag;
	struct ublk_io_tgt *io;

	if (top.is_flushing(parent_blk_idx))
		return;

	tag = qs.meta_flushing.alloc_tag(q);
	if (tag == -1)
		return;

	io = ublk_get_io_tgt_data(q, tag);
	io->co = __write_top_co(qs, q, io, tag);
}

void MetaFlushingState::__done(Qcow2State &qs, const struct ublksrv_queue *q)
{
	set_state(qcow2_meta_flush::IDLE);
	last_flush = std::chrono::system_clock::now();
}

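//mark every queued slice as prep_flush, which appears to hold further
//updates to the slice off until its flush has been submitted; waiters are
//woken up again in __write_slice_co()'s exit path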
void MetaFlushingState::mark_no_update()
{
	auto it = slices_to_flush.begin();

	for (; it != slices_to_flush.end(); it++)
		(*it)->set_prep_flush(true);
}

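//scan the entries of the current top-table block, starting at
//parent_entry_idx, until one with dirty cached slices is found; collect
//those dirty slices and go to ZERO_MY_CLUSTER, or to WRITE_TOP once the
//whole block has been walked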
void MetaFlushingState::__prep_write_slice(Qcow2State &qs,
		const struct ublksrv_queue *q)
{
	u64 entry;
	u64 idx = -1;
	u64 start, end, offset, step;

	do {
		qcow2_assert(parent_entry_idx >= 0 && parent_entry_idx < (512/8));

		idx = (parent_blk_idx * 512 / 8) + parent_entry_idx;

		qcow2_assert(idx >= 0 && idx < top.get_nr_entries());

		entry = top.get_entry(idx);
		if (entry && top.has_dirty_slices(qs, idx))
			break;

		if (++parent_entry_idx == (512/8)) {
			parent_entry_idx = 0;
			set_state(qcow2_meta_flush::WRITE_TOP);
			return;
		}
	} while (true);

	if (mapping)
		step = 1ULL << (QCOW2_PARA::L2_TABLE_SLICE_BITS - 3 +
				qs.header.cluster_bits);
	else
		step = 1ULL << (QCOW2_PARA::REFCOUNT_BLK_SLICE_BITS - 3 +
				qs.header.cluster_bits);

	start = idx << top.single_entry_order();
	end = start + (1ULL << top.single_entry_order());
	for (offset = start; offset < end; offset += step) {
		Qcow2SliceMeta *t;

		if (mapping)
			t = qs.cluster_map.__find_slice(offset);
		else
			t = qs.cluster_allocator.__find_slice(offset);

		if (t && t->get_dirty(-1)) {
			qcow2_assert(!t->is_flushing());
			add_slice_to_flush(t);
		}
	}

	if (slices_to_flush.size() > 0)
		set_state(qcow2_meta_flush::ZERO_MY_CLUSTER);
	else
		set_state(qcow2_meta_flush::WRITE_TOP);
}

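//coroutine that zeroes the host cluster backing this slice first, if it
//still needs zeroing; once cluster_allocator reports the cluster as zeroed,
//the refcount instance moves straight to WRITE_SLICES, while the mapping
//instance marks its slices no-update and enters WAIT until the refcount
//blocks it depends on have been flushed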
co_io_job MetaFlushingState::__zero_my_cluster_co(Qcow2State &qs,
		const struct ublksrv_queue *q, struct ublk_io_tgt *io, int tag,
		Qcow2SliceMeta *m)

{
	int ret;
	qcow2_io_ctx_t ioc(tag, q->q_id);
	u64 cluster_off = m->get_offset() &
		~((1ULL << qs.header.cluster_bits) - 1);
	bool wait;

again:
	try {
		ret = m->zero_my_cluster(qs, ioc);
		wait = false;
	} catch (MetaUpdateException &meta_update_error) {
		wait = true;
	}

	if (wait) {
		co_await__suspend_always(tag);
		goto again;
	}

	if (ret < 0) {
		ublk_err("%s: zero my cluster failed %d\n",
				__func__, ret);
		goto exit;
	}

	if (ret > 0) {
		const struct io_uring_cqe *cqe;

		co_await__suspend_always(tag);

		cqe = io->tgt_io_cqe;
		ret = qcow2_meta_io_done(q, cqe);
		if (ret == -EAGAIN)
			goto again;
		if (ret < 0)
			goto exit;
	}
exit:
	qs.meta_flushing.free_tag(q, tag);
	if (qs.cluster_allocator.alloc_cluster_is_zeroed(cluster_off)) {
		//for mapping table, wait until the associated refcount
		//tables are flushed out
		if (mapping) {
			mark_no_update();
			set_state(qcow2_meta_flush::WAIT);
		} else
			set_state(qcow2_meta_flush::WRITE_SLICES);
	}
	m->put_ref();
}


void MetaFlushingState::__zero_my_cluster(Qcow2State &qs,
		const struct ublksrv_queue *q)
{
	int tag;
	struct ublk_io_tgt *io;
	Qcow2SliceMeta *m = slices_to_flush[0];
	u64 cluster_off = m->get_offset() &
		~((1ULL << qs.header.cluster_bits) - 1);
	Qcow2ClusterState *s =
		qs.cluster_allocator.get_cluster_state(cluster_off);

	if (s != nullptr && s->get_state() == QCOW2_ALLOC_ZEROING)
		return;

	tag = qs.meta_flushing.alloc_tag(q);
	if (tag == -1)
		return;

	m->get_ref();
	io = ublk_get_io_tgt_data(q, tag);
	io->co = __zero_my_cluster_co(qs, q, io, tag, m);
}

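//state machine over the current 512-byte top-table block (parent_blk_idx):
//
//  IDLE              -> PREP_WRITE_SLICES   run_flush() called with a valid top_blk_idx
//  PREP_WRITE_SLICES -> ZERO_MY_CLUSTER     an entry with dirty slices was found
//                    -> WRITE_TOP           no (more) dirty entries in this block
//  ZERO_MY_CLUSTER   -> WAIT                mapping only: wait for refcount flushing
//                    -> WRITE_SLICES        refcount instance
//  WAIT              -> WRITE_SLICES        via Qcow2MetaFlushing::handle_mapping_dependency()
//  WRITE_SLICES      -> PREP_WRITE_SLICES   this entry done, more entries left in the block
//                    -> WRITE_TOP           last entry of the block done
//  WRITE_TOP         -> DONE                the top-table block is no longer dirty
//  DONE              -> IDLE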
void MetaFlushingState::run_flush(Qcow2State &qs,
		const struct ublksrv_queue *q, int top_blk_idx)
{
	if (state == qcow2_meta_flush::IDLE) {
		if (top_blk_idx >= 0 && top_blk_idx < top.dirty_blk_size()) {
			parent_blk_idx = top_blk_idx;
			parent_entry_idx = 0;
			set_state(qcow2_meta_flush::PREP_WRITE_SLICES);
		}
	}
again:
	if (state == qcow2_meta_flush::PREP_WRITE_SLICES)
		__prep_write_slice(qs, q);

	if (state == qcow2_meta_flush::ZERO_MY_CLUSTER)
		__zero_my_cluster(qs, q);

	if (state == qcow2_meta_flush::WAIT) {
		qcow2_assert(mapping);
		return;
	}

	if (state == qcow2_meta_flush::WRITE_SLICES)
		__write_slices(qs, q);

	if (state == qcow2_meta_flush::WRITE_TOP)
		__write_top(qs, q);

	if (state == qcow2_meta_flush::DONE)
		__done(qs, q);

	if (state == qcow2_meta_flush::PREP_WRITE_SLICES)
		goto again;
}

void MetaFlushingState::dump(const char *func, int line) const {
	qcow2_log("%s %d: mapping %d state %d blk_idx %d entry_idx %d list size(%ld %ld)"
			" dirty slices %u, top table dirty blocks %u\n",
			func, line, mapping, state,
			parent_blk_idx, parent_entry_idx,
			slices_to_flush.size(),
			slices_in_flight.size(),
			slice_dirtied, top.dirty_blks());
}

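//flush policy: there must be dirty slices or dirty top-table blocks; while
//normal IO is still queued, delay flushing until MAX_META_FLUSH_DELAY_MS has
//passed since the last flush, and flush immediately once the queue is idle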
bool MetaFlushingState::__need_flush(int queued)
{
	bool need_flush = slice_dirtied > 0;

	if (!need_flush)
		need_flush = top.dirty_blks() > 0;

	if (!need_flush)
		return false;

	if (queued) {
		auto diff = std::chrono::system_clock::now() - last_flush;
		std::chrono::milliseconds ms = std::chrono::duration_cast<
			std::chrono::milliseconds>(diff);

		//timeout, so flush now
		if (ms.count() > MAX_META_FLUSH_DELAY_MS)
			return true;
		else
			return false;
	}

	/* queue is idle, so have to flush immediately */
	return true;
}

bool MetaFlushingState::need_flush(Qcow2State &qs, int *top_idx,
		unsigned queued)
{
	bool need_flush = get_state() > qcow2_meta_flush::IDLE;
	int idx = -1;

	if (!need_flush) {
		if (mapping)
			need_flush = qs.cluster_map.
				has_evicted_dirty_slices();
		else
			need_flush = qs.cluster_allocator.
				has_evicted_dirty_slices();

		//only flush refcount tables actively if there
		//are evicted dirty refcount slices
		if (!need_flush)
			need_flush = __need_flush(queued);
	}

	if (need_flush && get_state() == qcow2_meta_flush::IDLE) {
		if (mapping)
			idx = qs.cluster_map.figure_group_from_l1_table();
		else
			idx = qs.cluster_allocator.figure_group_from_refcount_table();

		//idx is more accurate than slice_dirtied
		//FIXME: make slice_dirtied more accurate
		if (idx == -1) {
			need_flush = false;
			slice_dirtied = 0;
		}
	}

	*top_idx = idx;
	return need_flush;
}

//calculate the range of refcount table blocks which the to-be-flushed
//l2 slices' entries depend on
int MetaFlushingState::calc_refcount_dirty_blk_range(Qcow2State& qs,
		int *refcnt_blk_start, int *refcnt_blk_end)
{
	u64 s = (u64)-1;
	u64 e = 0;
	u64 l2_offset = 0;
	int start_idx, end_idx;

	qcow2_assert(mapping);

	for (auto it = slices_to_flush.begin(); it != slices_to_flush.end();
			it++) {
		u64 ts, te;

		qcow2_assert((*it)->get_dirty(-1));

		(*it)->get_dirty_range(&ts, &te);

		if (!l2_offset)
			l2_offset = (*it)->get_offset() & ~((1ULL <<
					qs.header.cluster_bits) - 1);

		if (ts > te)
			continue;
		if (ts < s)
			s = ts;
		if (te > e)
			e = te;
	}

	if (s > e)
		return -EINVAL;

	//this l2 should be considered too
	if (l2_offset && l2_offset < s)
		s = l2_offset;

	start_idx = qs.refcount_table.offset_to_idx(s);
	*refcnt_blk_start = start_idx >> (qs.get_min_flush_unit_bits() - 3);

	end_idx = qs.refcount_table.offset_to_idx(e);
	*refcnt_blk_end = end_idx >> (qs.get_min_flush_unit_bits() - 3);
	*refcnt_blk_end += 1;

	flush_log("%s: %lx-%lx idx (%d %d) blk idx(%d %d)\n", __func__, s, e,
			start_idx, end_idx, *refcnt_blk_start, *refcnt_blk_end);

	if (*refcnt_blk_start == *refcnt_blk_end)
		*refcnt_blk_end = *refcnt_blk_start + 1;

	if (*refcnt_blk_start >= *refcnt_blk_end)
		qcow2_log("%s: %lx-%lx bad idx %d %d\n", __func__, s, e,
				*refcnt_blk_start, *refcnt_blk_end);

	qcow2_assert(*refcnt_blk_start < *refcnt_blk_end);

	return 0;
}

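//Qcow2MetaFlushing combines the two state machines: mapping_stat flushes L2
//slices under the L1 table, refcount_stat flushes refcount-block slices
//under the refcount table; all meta tags start out free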
Qcow2MetaFlushing::Qcow2MetaFlushing(Qcow2State &qs):
	tags(QCOW2_PARA::META_MAX_TAGS),
	refcnt_blk_start(-1),
	refcnt_blk_end(-1),
	state(qs),
	mapping_stat(qs.l1_table, true),
	refcount_stat(qs.refcount_table, false)
{
	for (int i = 0; i < tags.size(); i++)
		tags[i] = true;
}

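//meta IO tags occupy the range [q_depth, q_depth + META_MAX_TAGS), so they
//never clash with the tags of normal IO commands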
int Qcow2MetaFlushing::alloc_tag(const struct ublksrv_queue *q) {
	for (size_t i = 0; i < tags.size(); i++) {
		if (tags[i]) {
			tags[i] = false;
			return i + q->q_depth;
		}
	}
	return -1;
}

void Qcow2MetaFlushing::free_tag(const struct ublksrv_queue *q, int tag) {
	int depth = q->q_depth;

	qcow2_assert(tag >= depth && tag < depth + tags.size());
	tags[tag - depth] = true;
}

void Qcow2MetaFlushing::dump()
{
	ublk_err("meta flushing: mapping: dirty slices %u, l1 dirty blocks %u\n",
			mapping_stat.slice_dirtied,
			state.l1_table.dirty_blks());
	ublk_err("meta flushing: refcount: dirty slices %u, refcount table dirty blocks %u\n",
			refcount_stat.slice_dirtied,
			state.refcount_table.dirty_blks());
}

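//the mapping flush depends on the refcount flush (presumably so that L2
//entries never point at clusters whose refcounts aren't persistent yet):
//while the mapping instance is in WAIT, the refcount blocks in
//[refcnt_blk_start, refcnt_blk_end) are flushed first; returns true once
//the mapping instance has been moved on to WRITE_SLICES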
bool Qcow2MetaFlushing::handle_mapping_dependency_start_end(Qcow2State *qs,
		const struct ublksrv_queue *q)
{
	if (refcount_stat.get_state() == qcow2_meta_flush::IDLE &&
			(refcnt_blk_start == refcnt_blk_end)) {
		int ret;

		//current flushing refcnt is done
		if (refcnt_blk_start >= 0) {
			mapping_stat.set_state(
					qcow2_meta_flush::WRITE_SLICES);
			refcnt_blk_start = refcnt_blk_end = -1;
			mapping_stat.run_flush(state, q, -1);

			return true;
		} else { //current flushing is just started
			ret = mapping_stat.calc_refcount_dirty_blk_range(
					*qs, &refcnt_blk_start, &refcnt_blk_end);

			if (ret < 0) {
				mapping_stat.set_state(
						qcow2_meta_flush::WRITE_SLICES);
				mapping_stat.run_flush(state, q, -1);
				return true;
			}
		}
	}

	return false;
}

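//called while the mapping instance is in WAIT: work out (or finish) the
//refcount block range it depends on, push the refcount state machine
//through those blocks, then let the mapping flush continue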
void Qcow2MetaFlushing::handle_mapping_dependency(Qcow2State *qs,
		const struct ublksrv_queue *q)
{
	qcow2_assert(mapping_stat.get_state() == qcow2_meta_flush::WAIT);

	if (!handle_mapping_dependency_start_end(qs, q)) {

		refcount_stat.run_flush(state, q, refcnt_blk_start);

		while (refcount_stat.get_state() == qcow2_meta_flush::IDLE &&
				(++refcnt_blk_start < refcnt_blk_end))
			refcount_stat.run_flush(state, q, refcnt_blk_start);
		handle_mapping_dependency_start_end(qs, q);
	}

	if (mapping_stat.get_state() != qcow2_meta_flush::WAIT)
		mapping_stat.run_flush(state, q, -1);
}

bool Qcow2MetaFlushing::is_flushing()
{
	return mapping_stat.get_state() != qcow2_meta_flush::IDLE ||
		refcount_stat.get_state() != qcow2_meta_flush::IDLE;
}

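//top-level entry of meta flushing: query both instances, then drive the
//mapping state machine; refcount flushing is only triggered from the
//mapping instance's WAIT dependency (handle_mapping_dependency())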
void Qcow2MetaFlushing::run_flush(const struct ublksrv_queue *q, int queued)
{
	Qcow2State *qs = queue_to_qcow2state(q);
	bool need_flush;
	int map_idx = -1;
	int refcnt_idx = -1;

	need_flush = mapping_stat.need_flush(*qs, &map_idx, queued);
	need_flush |= refcount_stat.need_flush(*qs, &refcnt_idx, queued);

	if (need_flush)
		flush_log("%s: enter flush: state %d/%d top blk idx %d/%d queued %d, refcnt blks(%d %d)\n",
				__func__, mapping_stat.get_state(),
				refcount_stat.get_state(), map_idx, refcnt_idx,
				queued, refcnt_blk_start, refcnt_blk_end);

	//refcount tables flushing is always triggered by flushing mapping
	//tables
	if (need_flush)
		mapping_stat.run_flush(state, q, map_idx);

	if (mapping_stat.get_state() == qcow2_meta_flush::WAIT)
		handle_mapping_dependency(qs, q);

	if (need_flush)
		flush_log("%s: exit flush: state %d/%d queued %d refcnt blks(%d %d) has dirty slice %d\n",
				__func__, mapping_stat.get_state(),
				refcount_stat.get_state(), queued,
				refcnt_blk_start, refcnt_blk_end,
				qs->has_dirty_slice());
}