// SPDX-License-Identifier: GPL-2.0
#include <cassert>

#include "qcow2.h"
#include "ublksrv_tgt.h"


// refcnt applies to slices only, and is initialized to two: one reference
// for the submission side and one for the free side. This guarantees that
// the slice returned from alloc_slice is always valid.
Qcow2Meta::Qcow2Meta(Qcow2Header &h, u64 off, u32 sz, const char *name, u32 f):
	header(h), offset(off), buf_sz(sz), flags(f), refcnt(2)
{
	//used for implementing slice's ->reset() only
	if (f & QCOW2_META_DONT_ALLOC_BUF)
		return;

	if (posix_memalign((void **)&addr, getpagesize(), sz))
		ublk_err("allocate memory %d bytes failed, %s\n",
				sz, name);
#ifdef DEBUG_QCOW2_META_OBJ
	id = name;
	qcow2_log("%s: constructed, obj %p, buf size %d off %lx flags %x\n",
			name, this, sz, off, flags);
#endif
}

void Qcow2Meta::show(const char *func, int line)
{
#ifdef DEBUG_QCOW2_META_OBJ
	qcow2_log("%s:%d id %s obj %p flags %x off %lx ref %d\n",
			func, line, id, this, flags, offset, refcnt);
#else
	qcow2_log("%s:%d obj %p flags %x off %lx ref %d\n",
			func, line, this, flags, offset, refcnt);
#endif
}

Qcow2Meta::~Qcow2Meta()
{
#ifdef DEBUG_QCOW2_META_OBJ
	qcow2_log("%s: destructed, obj %p flags %x off %lx ref %d\n",
			id, this, flags, offset, refcnt);
#endif
	if (flags & QCOW2_META_DONT_ALLOC_BUF)
		return;

	if (!is_top_meta() && (get_dirty(-1) || is_flushing() ||
				(!get_update() && !get_evicted()))) {
		qcow2_log("BUG %s: obj %p flags %x off %lx\n",
				__func__, this, flags, offset);
		qcow2_assert(0);
	}
	free(addr);
}

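// Synchronous base-class load: read 'len' bytes from 'offset' in the image
// into the meta buffer and set QCOW2_META_UPDATE on success; async loading
// is implemented by subclasses (see Qcow2SliceMeta::load()).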
int Qcow2Meta::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u32 len, bool sync)
{
	int fd;

	if (addr == NULL)
		return -EINVAL;
	if (len > buf_sz) {
		ublk_err("%s %s: load too much %d(%d) \n",
				__func__, typeid(*this).name(), len, buf_sz);
		return -EINVAL;
	}
	if (!sync)
		return -EOPNOTSUPP;

	//qcow2_log("%s: read %s offset %llx len %lu \n", __func__,
	//		typeid(*this).name(), offset, len);
	fd = qs.img.fd;
	lseek(fd, offset, SEEK_SET);
	data_len = read(fd, addr, len);
	if (data_len != len)
		qcow2_log("%s: read %u(%u)\n", __func__, len, data_len);
	if (data_len > 0)
		flags |= QCOW2_META_UPDATE;
	return data_len;
}

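// Synchronous base-class flush: write the dirty buffer back to the image and
// clear QCOW2_META_DIRTY on success; async flushing of mapping meta goes
// through Qcow2MappingMeta::__flush().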
int Qcow2Meta::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u64 off,
		u32 len)
{
	int fd = qs.img.fd;
	int ret;

	if (!(flags & QCOW2_META_DIRTY))
		return 0;

	if (!(flags & QCOW2_META_UPDATE))
		ublk_err("%s %s: buf isn't updated\n", __func__,
				typeid(*this).name());

	//qcow2_log("%s: write %s offset %llx len %lu \n", __func__,
	//		typeid(*this).name(), offset, buf_sz);
	lseek(fd, off, SEEK_SET);
	ret = write(fd, addr, len);
	if (len != ret)
		qcow2_log("%s: write %u(%u)\n", __func__, len, ret);
	if (ret > 0)
		flags &= ~QCOW2_META_DIRTY;

	return len;
}

void Qcow2Meta::zero_buf() {
	memset((void *)addr, 0, buf_sz);
}

// The base class is constructed first, then member objects in the order of
// their declaration, so __a can be set up correctly here.
Qcow2HeaderExtFeatureNameTable::Qcow2HeaderExtFeatureNameTable(
		char *addr, u64 offset): Qcow2HeaderExt(addr, offset),
	__a(len / sizeof(struct feature_entry))
{
	unsigned off = offset;

	for (int i = 0; i < __a.size(); i++) {
		__a[i].feature_type = *(addr + off + 8);
		__a[i].bit_num = *(addr + off + 9);
		strncpy(__a[i].feature_name, addr + off + 10, 46);
		off += 48;
	}
}

void Qcow2HeaderExtFeatureNameTable::dump() const
{
	Qcow2HeaderExt::dump();

	for (int i = 0; i < __a.size(); i++)
		qcow2_log("\t %d: type %x bit_num %u name %s\n",
				i, __a[i].feature_type, __a[i].bit_num,
				__a[i].feature_name);
}

Qcow2Header::Qcow2Header(Qcow2State &state): Qcow2Meta(*this, 0, 4096,
		typeid(this).name(), 0), magic(0), version(0), cluster_bits(0),
	refcount_order(0), qs(state)
{
	backingfile_format_name = NULL;
	feature_name_table = NULL;
	enc_header_pointer = NULL;
	bitmaps = NULL;
	ext_data_file_name = NULL;

	load(state, 0, buf_sz, true);
}

int Qcow2Header::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u64 off,
		u32 len)
{
	return Qcow2Meta::flush(qs, ioc, off, len);
}

Qcow2Header::~Qcow2Header()
{
	delete backingfile_format_name;
	delete feature_name_table;
	delete enc_header_pointer;
	delete bitmaps;
	delete ext_data_file_name;
}

void Qcow2Header::dump_ext() const
{
	if (backingfile_format_name)
		backingfile_format_name->dump();

	if (ext_data_file_name)
		ext_data_file_name->dump();

	if (feature_name_table)
		feature_name_table->dump();

	if (bitmaps)
		bitmaps->dump();

	if (enc_header_pointer)
		enc_header_pointer->dump();
}

/*
 * populate header extensions
 *
 * The header may take more than 4k; its real size is decided by
 * backing_file_offset & backing_file_size __or__ by populating the
 * header extensions.
 */
int Qcow2Header::populate()
{
	char *buf = (char *)addr;
	u64 start = (get_header_length() + 7) & ~0x7ULL;
	u32 *p_magic = const_cast<u32 *> (&magic);
	u32 *p_version = const_cast<u32 *> (&version);
	u32 *p_cluster_bits = const_cast<u32 *> (&cluster_bits);
	u32 *p_refcount_order = const_cast<u32 *> (&refcount_order);

	*p_magic = get_magic();
	*p_version = get_version();
	*p_cluster_bits = get_cluster_bits();
	*p_refcount_order = get_refcount_order();

	if (version == 2)
		goto exit;

	//todo: populate extensions
	while (true) {
		Qcow2HeaderExt ext(buf, start);

		switch (ext.type) {
		case QCOW2_EXT_MAGIC_END:
			goto exit;
		case QCOW2_EXT_MAGIC_BACKING_FORMAT:
			this->backingfile_format_name =
				new Qcow2HeaderExtString(buf, start);
			break;
		case QCOW2_EXT_MAGIC_FEATURE_TABLE:
			this->feature_name_table =
				new Qcow2HeaderExtFeatureNameTable(
						buf, start);
			break;
		case QCOW2_EXT_MAGIC_CRYPTO_HEADER:
			this->enc_header_pointer =
				new Qcow2HeaderExtEncHeader(buf, start);
			break;
		case QCOW2_EXT_MAGIC_BITMAPS:
			this->bitmaps =
				new Qcow2HeaderExtBitmaps(buf, start);
			break;
		case QCOW2_EXT_MAGIC_DATA_FILE:
			this->ext_data_file_name =
				new Qcow2HeaderExtString(buf, start);
			break;
		};
		start += 8 + ((ext.len + 7) & ~0x7ULL);
	}
exit:
	return 0;
}

int Qcow2Header::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u32 len, bool sync)
{
	int ret;

	ret = Qcow2Meta::load(qs, ioc, len, sync);
	if (ret <= 0)
		goto fail;

	ret = populate();
	return ret;
fail:
	ublk_err("%s: load failed %d", __func__, ret);
	return ret;
}

std::ostream & operator<<(std::ostream &os, const Qcow2Header &h)
{
	char buf[256];

	sprintf(buf, "magic: %x", h.magic);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	sprintf(buf, "version: %x\n", h.version);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	sprintf(buf, "cluster_bits: %x\n", h.cluster_bits);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	sprintf(buf, "refcount_order: %x\n", h.refcount_order);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	return os;
}

Qcow2MappingMeta::Qcow2MappingMeta(Qcow2State &qs, u64 off, u32 buf_sz,
		const char *cls_name, u32 f):
	Qcow2Meta(qs.header, off, buf_sz, cls_name, f)
{
	//default each entry is 64bits(8bytes) except for:
	//	extended l2 entry is 128bit, refcount blk has refcount_order
	entry_bits_order = 6;
	next_free_idx = -1;
}

/*
 * __flush() is just one worker; state check/update is done before calling
 * __flush()
 */
int Qcow2MappingMeta::__flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len, bool run_fsync)
{
	int fd = qs.img.fd;
	u32 qid = ioc.get_qid();
	u32 tag = ioc.get_tag();
	const struct ublksrv_queue *q = ublksrv_get_queue(qs.dev, qid);
	struct io_uring_sqe *sqe, *sqe2;
	unsigned mio_id;

	qcow2_assert(flags & QCOW2_META_DIRTY);

	if (!(flags & QCOW2_META_UPDATE))
		ublk_err("%s %s: buf isn't updated\n", __func__,
				typeid(*this).name());

	if (off < offset || off >= offset + buf_sz) {
		ublk_err("%s %s: offset %" PRIx64 " is wrong\n", __func__,
				typeid(*this).name(), offset);
		return -EINVAL;
	}

	if (len > offset + buf_sz - off) {
		ublk_err("%s %s: len %x is wrong\n", __func__,
				typeid(*this).name(), len);
		return -EINVAL;
	}

	sqe = io_uring_get_sqe(q->ring_ptr);
	if (!sqe) {
		ublk_err("%s %s: not get sqe allocated",
				__func__, typeid(*this).name());
		return -ENOMEM;
	}

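	// When run_fsync is requested, queue an fsync first and link it to the
	// write below via IOSQE_IO_LINK, so the meta write isn't started until
	// the fsync completes.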
	if (run_fsync) {
		sqe2 = io_uring_get_sqe(q->ring_ptr);
		if (!sqe2) {
			ublk_err("%s %s: not get sqe2 allocated",
					__func__, typeid(*this).name());
			return -ENOMEM;
		}
		io_uring_prep_fsync(sqe2, fd, IORING_FSYNC_DATASYNC);
		sqe2->user_data = build_user_data(0xffff, IORING_OP_FSYNC, 0, 1);
		sqe2->flags |= IOSQE_IO_LINK;
	}

	mio_id = qs.add_meta_io(qid, this);

	io_uring_prep_write(sqe, fd, (void *)((u64)addr + (off - offset)),
			len, off);
	sqe->user_data = build_user_data(tag, IORING_OP_WRITE, mio_id + 1, 1);
	ublk_dbg(UBLK_DBG_QCOW2_META, "%s %s: flushing %p tag %d off %lx sz %d flags %x refcnt %d\n",
			__func__, typeid(*this).name(), this, tag, off,
			len, flags, read_ref());
	return 1;
}


void Qcow2MappingMeta::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
		const struct io_uring_cqe *cqe)
{
	u32 tag = user_data_to_tag(cqe->user_data);
	u32 meta_id = user_data_to_tgt_data(cqe->user_data) - 1;
	u32 op = user_data_to_op(cqe->user_data);

	qs.del_meta_io(q->q_id, meta_id);

	//zeroing my cluster needn't wake up events on me
	if (op != IORING_OP_FALLOCATE)
		wakeup_all(q, tag);
}

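// The top table (L1/refcount table) tracks dirtiness at min_bs granularity:
// one flag per min_bs-sized block, so only the touched blocks are written
// back (see Qcow2TopTable::flush(), which flushes 512 bytes at a time).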
Qcow2TopTable::Qcow2TopTable(Qcow2State &qs, u64 off, u32 buf_sz,
		const char *cls_name, u32 f):
	Qcow2MappingMeta(qs, off, buf_sz, cls_name, f),
	min_bs_bits(qs.min_bs_bits),
	dirty(qs.get_l1_table_max_size() >> qs.min_bs_bits)
{
	ublk_dbg(UBLK_DBG_QCOW2_META_L1, "%s: %s dirty size %zd %u/%u\n",
			__func__,
			cls_name, dirty.size(),
			qs.get_l1_table_max_size(), qs.min_bs_bits);
	for (int i = 0; i < dirty.size(); i++)
		dirty[i] = false;
}

bool Qcow2TopTable::prep_flush(const qcow2_io_ctx_t &ioc, u32 blk_idx)
{
	if (!(flags & QCOW2_META_DIRTY))
		return false;

	//so far, just allow one in-progress unit for l1/refcount table
	if (flags & QCOW2_META_FLUSHING)
		return false;

	flags |= QCOW2_META_FLUSHING;
	return true;
}

void Qcow2TopTable::unprep_flush(u32 blk_idx) {
	flags &= ~QCOW2_META_FLUSHING;
}

void Qcow2TopTable::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
		const struct io_uring_cqe *cqe)
{
	u32 op = user_data_to_op(cqe->user_data);

	//only for write l1 or refcount table
	qcow2_assert(op == IORING_OP_WRITE);

	unprep_flush(get_flush_blk_idx());

	if (cqe->res < 0)
		return;

	set_blk_dirty(get_flush_blk_idx(), false);

	Qcow2MappingMeta::io_done(qs, q, cqe);
}

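// Flush one 512-byte block of the top table: 'off' selects the block, only a
// dirty block is actually written, and an fsync is linked ahead of the write
// so L2/refcount blocks reach the disk before the top table points at them.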
int Qcow2TopTable::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len)
{
	int blk_idx = (off - offset) >> min_bs_bits;
	int ret;

	qcow2_assert(len == 512 && blk_idx < dirty.size());

	if (!prep_flush(ioc, blk_idx))
		return 0;

	if (!get_blk_dirty(blk_idx)) {
		ret = 0;
		goto exit;
	}

	set_flush_blk_idx(blk_idx);

	//need to run fsync before writing l1/refcount table, so
	//that write order between top and l2/refcount blk is respected
	ret = Qcow2MappingMeta::__flush(qs, ioc, off, len, true);
exit:
	if (ret <= 0)
		unprep_flush(blk_idx);
	return ret;
}

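// Check whether any cached slice covered by top-table entry 'idx' is still
// dirty; 'step' is the virtual range covered by one L2-table or refcount-block
// slice, so the loop visits every possible slice under this entry.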
bool Qcow2TopTable::has_dirty_slices(Qcow2State &qs, int idx)
{
	u64 entry = get_entry(idx);
	u64 start, end, step, offset;

	if (!entry)
		return false;

	if (is_mapping_meta())
		step = 1ULL << (QCOW2_PARA::L2_TABLE_SLICE_BITS - 3 +
				qs.header.cluster_bits);
	else
		step = 1ULL << (QCOW2_PARA::REFCOUNT_BLK_SLICE_BITS - 3 +
				qs.header.cluster_bits);

	start = ((u64)idx) << single_entry_order();
	end = start + (1ULL << single_entry_order());
	for (offset = start; offset < end; offset += step) {
		Qcow2SliceMeta *t;

		if (is_mapping_meta())
			t = qs.cluster_map.__find_slice(offset);
		else
			t = qs.cluster_allocator.__find_slice(offset);

		if (t && t->get_dirty(-1))
			return true;
	}

	return false;
}

Qcow2L1Table::Qcow2L1Table(Qcow2State &qs): Qcow2TopTable(qs,
		qs.get_l1_table_offset(), qs.get_l1_table_max_size(),
		typeid(*this).name(), QCOW2_META_TOP | QCOW2_META_MAPPING)
{
}

int Qcow2L1Table::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u32 len, bool sync)
{
	int ret;

	ret = Qcow2Meta::load(qs, ioc, len, sync);
	if (ret < 0)
		ublk_err("%s %s: load failed %d", __func__,
				typeid(*this).name(), ret);
	return ret;
}

void Qcow2L1Table::dump()
{
	qcow2_log("%s %s: sizeof %zd\n", __func__, typeid(*this).name(),
			sizeof(*this));
	for (int i = 0; i < header.get_l1_size(); i++)
		qcow2_log("%d: %lx\n", i, get_entry(i));
}

u64 Qcow2L1Table::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2L1Table::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);
}

Qcow2RefcountTable::Qcow2RefcountTable(Qcow2State &qs):
	Qcow2TopTable(qs, qs.get_refcount_table_offset(),
			qs.get_refcount_table_max_size(),
			typeid(*this).name(), QCOW2_META_TOP)
{
}

int Qcow2RefcountTable::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u32 len, bool sync)
{
	int ret;

	ret = Qcow2Meta::load(qs, ioc, len, sync);
	if (ret < 0)
		ublk_err("%s %s: load failed %d", __func__,
				typeid(*this).name(), ret);
	return ret;
}

u64 Qcow2RefcountTable::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2RefcountTable::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);
}

void Qcow2RefcountTable::dump()
{
	qcow2_log("%s %s: sizeof %zd\n", __func__, typeid(*this).name(),
			sizeof(*this));
	for (int i = 0; i < data_len / 8; i++) {
		u64 entry = get_entry(i);

		if (entry != 0)
			qcow2_log("%d: %lx\n", i, entry);
	}
}

Qcow2SliceMeta::Qcow2SliceMeta(Qcow2State &qs, u64 off, u32 buf_sz,
		const char *cls_name, u32 p_idx, u32 f):
	Qcow2MappingMeta(qs, off, buf_sz, cls_name, f),
	parent_idx(p_idx)
{
#ifdef QCOW2_CACHE_DEBUG
	qcow2_log("slice meta %llx/%p/%d allocated\n", off, addr, buf_sz);
#endif
#ifdef DEBUG_QCOW2_META_VALIDATE
	if (posix_memalign((void **)&validate_addr, getpagesize(), buf_sz))
		ublk_err("%s: allocate validate memory %d bytes failed\n",
				__func__, buf_sz);
#endif
}

Qcow2SliceMeta::~Qcow2SliceMeta() {
#ifdef DEBUG_QCOW2_META_VALIDATE
	free(validate_addr);
#endif
}

bool Qcow2SliceMeta::prep_flush(const qcow2_io_ctx_t &ioc)
{
	if (!(flags & QCOW2_META_DIRTY))
		return false;

	if (flags & QCOW2_META_FLUSHING) {
		add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}
	flags |= QCOW2_META_FLUSHING;
	return true;
}

void Qcow2SliceMeta::unprep_flush() {
	flags &= ~QCOW2_META_FLUSHING;
}

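// Make sure the cluster holding this slice is zeroed (FALLOC_FL_ZERO_RANGE)
// before the slice is ever written back; if zeroing is already in flight, the
// current tag is parked as a waiter and the request is retried later.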
int Qcow2SliceMeta::zero_my_cluster(Qcow2State &qs,
		const qcow2_io_ctx_t &ioc)
{
	u64 cluster_off = offset & ~((1ULL << qs.header.cluster_bits) - 1);
	Qcow2ClusterState *s = qs.cluster_allocator.get_cluster_state(
			cluster_off);
	u32 qid = ioc.get_qid();
	u32 tag = ioc.get_tag();
	const struct ublksrv_queue *q = ublksrv_get_queue(qs.dev, qid);
	int fd = q->dev->tgt.fds[1];
	struct io_uring_sqe *sqe;
	int mode = FALLOC_FL_ZERO_RANGE;
	unsigned mio_id;

	if (s == nullptr)
		return 0;

	if (s->get_state() >= QCOW2_ALLOC_ZEROED)
		return 0;

	if (s->get_state() == QCOW2_ALLOC_ZEROING) {
		s->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	sqe = io_uring_get_sqe(q->ring_ptr);
	if (!sqe) {
		ublk_err("%s: tag %d offset %" PRIu64 " op %d, no sqe for zeroing\n",
				__func__, tag, offset, IORING_OP_FALLOCATE);
		return -ENOMEM;
	}

	get_ref();

	mio_id = qs.add_meta_io(qid, this);
	s->set_state(QCOW2_ALLOC_ZEROING);
	io_uring_prep_fallocate(sqe, fd, mode, cluster_off,
			(1ULL << qs.header.cluster_bits));
	sqe->user_data = build_user_data(tag,
			IORING_OP_FALLOCATE, mio_id + 1, 1);
	ublk_dbg(UBLK_DBG_QCOW2_META, "%s %s: zeroing %p tag %d off %lx sz %d flags %x ref %d\n",
			__func__, typeid(*this).name(), this, tag, cluster_off,
			(1ULL << qs.header.cluster_bits), flags, refcnt);
	return 1;
}

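// Async load of one slice: submit an io_uring READ against the registered
// image fd (fixed file index 1). A reference is taken here and dropped in
// io_done().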
int Qcow2SliceMeta::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u32 len, bool sync)
{
	int ret = -EINVAL;
	u32 qid = ioc.get_qid();
	u32 tag = ioc.get_tag();
	const struct ublksrv_queue *q = ublksrv_get_queue(qs.dev, qid);
	struct io_uring_sqe *sqe;
	int mio_id;

	if (sync) {
		ublk_err("%s %s: we only support async load",
				__func__, typeid(*this).name());
		return -EINVAL;
	}

	if (flags & QCOW2_META_UPDATE) {
		ublk_err("%s %s: buffer is already updated, no need to load",
				__func__, typeid(*this).name());
		return -EINVAL;
	}

	sqe = io_uring_get_sqe(q->ring_ptr);
	if (!sqe) {
		ublk_err("%s %s: not get sqe allocated",
				__func__, typeid(*this).name());
		return ret;
	}

	get_ref();

	mio_id = qs.add_meta_io(qid, this);

	io_uring_prep_read(sqe, 1, (void *)addr, buf_sz, offset);
	sqe->flags = IOSQE_FIXED_FILE;
	/* meta io id starts from one and zero is reserved for plain ublk io */
	sqe->user_data = build_user_data(tag, IORING_OP_READ, mio_id + 1, 1);

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: queue io op %d(%llx %x %llx)"
			" (qid %d tag %u, cmd_op %u target: %d tgt_data %d)\n",
			__func__, sqe->opcode, sqe->off, sqe->len, sqe->addr,
			q->q_id, tag, sqe->opcode, 1, mio_id + 1);
	ublk_dbg(UBLK_DBG_QCOW2_META, "%s %s: loading %p tag %d off %lx sz %d flags %x ref %d\n",
			__func__, typeid(*this).name(), this, tag,
			offset, buf_sz, flags, refcnt);

	return 0;
}

#ifdef DEBUG_QCOW2_META_VALIDATE
void Qcow2SliceMeta::io_done_validate(Qcow2State &qs, const struct ublksrv_queue *q,
		struct io_uring_cqe *cqe)
{
	u32 tag = user_data_to_tag(cqe->user_data);
	u32 meta_id = user_data_to_tgt_data(cqe->user_data) - 1;
	u32 op = user_data_to_op(cqe->user_data);
	u64 cluster_off = offset & ~((1ULL << qs.header.cluster_bits) - 1);
	bool res;

	//for write, buffer data has been saved to validate_addr before
	//submitting the WRITE io
	if (op != IORING_OP_WRITE) {
		lseek(qs.img.fd, offset, SEEK_SET);
		read(qs.img.fd, validate_addr, buf_sz);
	}

	if (op == IORING_OP_FALLOCATE) {
		for (int i = 0; i < buf_sz; i++) {
			char *buf = (char *)validate_addr;

			qcow2_assert(buf[i] == 0);
		}
	} else if (op == IORING_OP_WRITE || op == IORING_OP_READ) {
		unsigned long *buf = (unsigned long *)addr;
		unsigned long *buf2 = (unsigned long *)validate_addr;

		res = bcmp(addr, validate_addr, buf_sz);

		if (res == 0)
			return;

		for (int i = 0; i < buf_sz / 8; i++) {
			if (buf[i] != buf2[i]) {
				qcow2_log("%s: not same in %d %lx %lx\n",
						__func__, i, buf[i], buf2[i]);
				qcow2_log("%s: tag %d, tgt_data %d op %d meta (%p %x %lx %d) res %d\n",
						__func__, tag, meta_id, op, this,
						get_flags(), get_offset(),
						refcnt, cqe->res);
			}
		}
		qcow2_assert(0);
	}
}
#endif

/* called for both load() and flush() */
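// On success: READ marks the buffer updated, WRITE clears the dirty/flushing
// state (and queues an evicted slice for freeing), FALLOCATE completes the
// cluster-zeroing step; in all cases waiters are woken and the reference
// taken at submission time is dropped.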
void Qcow2SliceMeta::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
		const struct io_uring_cqe *cqe)
{
	u32 tag = user_data_to_tag(cqe->user_data);
	u32 meta_id = user_data_to_tgt_data(cqe->user_data) - 1;
	u32 op = user_data_to_op(cqe->user_data);
	u64 cluster_off = offset & ~((1ULL << qs.header.cluster_bits) - 1);

	if (cqe->res < 0) {
		qcow2_log("%s: failure: tag %d, tgt_data %d op %d meta (%p %x %lx %d) res %d\n",
				__func__, tag, meta_id, op, this,
				get_flags(), get_offset(), refcnt, cqe->res);
		//zeroing the cluster for holding me is done
		if (op == IORING_OP_FALLOCATE) {
			if (qs.cluster_allocator.
					alloc_cluster_reset(cluster_off))
				goto exit;
		} else if (op == IORING_OP_WRITE) {
			unprep_flush();
			goto exit;
		} else
			goto exit;
	}

	io_done_validate(qs, q, cqe);

	if (op == IORING_OP_READ)
		set_update(true);
	else if (op == IORING_OP_WRITE) {
		unprep_flush();
		qs.meta_flushing.dec_dirtied_slice(is_mapping_meta());
		set_dirty(-1, false);
		set_prep_flush(false);
	} else if (op == IORING_OP_FALLOCATE)
		qs.cluster_allocator.alloc_cluster_zeroed(q, tag, cluster_off);
	else
		ublk_err("%s: unknown op: tag %d op %d meta_id %d res %d\n",
				__func__, tag, op, meta_id, cqe->res);

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: tag %d, tgt_data %d op %d meta (%p %x %lx %d) res %d\n",
			__func__, tag, meta_id, op, this,
			get_flags(), get_offset(), refcnt, cqe->res);

	//wake up waiters
	Qcow2MappingMeta::io_done(qs, q, cqe);

	//if it is evicted, now it is ready to free it
	if ((op == IORING_OP_WRITE) && cqe->res >= 0 && get_evicted())
		qs.add_slice_to_free_list(this);

exit:
	//drop the reference grabbed in either load() or flush()
	put_ref();
	return;
}

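// Before this slice can be flushed, every cluster referenced by its entries
// must have finished zeroing; otherwise the current tag is parked on the
// cluster state and the request is retried once zeroing completes.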
void Qcow2SliceMeta::wait_clusters(Qcow2State &qs,
		const qcow2_io_ctx_t &ioc)
{
	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry) {
			u64 cluster_off;

			//mapping meta means this is one l2 table, otherwise
			//it is one refcount block table
			if (is_mapping_meta())
				cluster_off = entry & L1E_OFFSET_MASK;
			else
				cluster_off = virt_offset() +
					((u64)i << qs.header.cluster_bits);

			Qcow2ClusterState *s = qs.cluster_allocator.
				get_cluster_state(cluster_off);

			if (s == nullptr)
				continue;

			if (s->get_state() < QCOW2_ALLOC_ZEROED) {
				s->add_waiter(ioc.get_tag());
				throw MetaUpdateException();
			}
		}
	}
}

void Qcow2SliceMeta::reclaim_me()
{
	unsigned queues = header.qs.dev_info->nr_hw_queues;

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: %p off %llx flags %x\n", __func__,
			this, get_offset(), flags);

	header.qs.remove_slice_from_evicted_list(this);

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: %p off %llx\n", __func__, this, get_offset());

	//Tell the whole world, I am leaving
	for (int i = 0; i < queues; i++) {
		const struct ublksrv_queue *q = ublksrv_get_queue(header.qs.dev, i);

		wakeup_all(q, -1);
	}
	header.qs.reclaim_slice(this);
}

Qcow2RefcountBlock::Qcow2RefcountBlock(Qcow2State &qs, u64 off, u32 p_idx, u32 f):
	Qcow2SliceMeta(qs, off, QCOW2_PARA::REFCOUNT_BLK_SLICE_BYTES,
			typeid(*this).name(), p_idx, f),
	dirty_start_idx((unsigned)-1)
{
	entry_bits_order = qs.header.refcount_order;
	ublk_dbg(UBLK_DBG_QCOW2_META_RB, "rb meta %p %llx -> %llx \n", this, virt_offset(), off);
}

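// reset() recycles a freed slice object in place: a temporary constructed with
// QCOW2_META_DONT_ALLOC_BUF (so no buffer is allocated) supplies fresh field
// values while the existing data buffer is kept.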
void Qcow2RefcountBlock::reset(Qcow2State &qs, u64 off, u32 p_idx, u32 f)
{
	Qcow2RefcountBlock tmp(qs, off, p_idx, f | QCOW2_META_DONT_ALLOC_BUF);

	qcow2_assert(refcnt == 0);

	offset = tmp.get_offset();
	flags = tmp.get_flags() & ~QCOW2_META_DONT_ALLOC_BUF;
	refcnt = tmp.read_ref();

	ublk_dbg(UBLK_DBG_QCOW2_META_RB, "%s: %p refcnt %d flags %x offset %lx \n",
			__func__, this, refcnt, flags, offset);

	next_free_idx = tmp.get_next_free_idx();

	parent_idx = tmp.parent_idx;

	dirty_start_idx = tmp.dirty_start_idx;
}

u64 Qcow2RefcountBlock::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2RefcountBlock::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);

	if (is_flushing() || !get_update()) {
		qcow2_log("BUG %s: obj %p flags %x off %lx\n",
				__func__, this, flags, offset);
		qcow2_assert(0);
	}
}

int Qcow2RefcountBlock::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len)
{
	int ret;

	//wait_clusters(qs, ioc);

	if (!prep_flush(ioc))
		return 0;

	//flush can't be started unless the above two are done
	//
	//the ref is released in io_done()
	get_ref();
#ifdef DEBUG_QCOW2_META_VALIDATE
	memcpy(validate_addr, addr, buf_sz);
#endif
	ret = Qcow2MappingMeta::__flush(qs, ioc, off, len);
	if (ret <= 0) {
		unprep_flush();
		put_ref();
	}
	return ret;
}

Qcow2RefcountBlock::~Qcow2RefcountBlock()
{
}

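// Refcount blocks don't track a per-entry dirty range; report an empty range
// (start > end).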
void Qcow2RefcountBlock::get_dirty_range(u64 *start, u64 *end)
{
	*start = 1;
	*end = 0;
}

void Qcow2RefcountBlock::dump()
{
	unsigned cnt = 0;
	int f = -1, l;
	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry != 0) {
			if (f == -1)
				f = i;
			l = i;
			cnt++; //qcow2_log("%d: %lx\n", i, entry);
		}
	}

	if (!cnt)
		return;

	qcow2_log("%s %s: buf_sz %u offset %" PRIx64 " sizeof %zd entries %u parent_idx %u virt_off %" PRIx64 " flags %x\n",
			__func__, typeid(*this).name(), buf_sz, offset, sizeof(*this),
			cnt, parent_idx, virt_offset(),
			flags);
	qcow2_log("\t [%d] = %" PRIx64 "/%" PRIx64 " [%d] = %" PRIx64 "/%" PRIx64 "\n",
			f, get_entry(f),
			virt_offset() + (f << header.cluster_bits),
			l, get_entry(l),
			virt_offset() + (l << header.cluster_bits));
}

Qcow2L2Table::Qcow2L2Table(Qcow2State &qs, u64 off, u32 p_idx, u32 f):
	Qcow2SliceMeta(qs, off, QCOW2_PARA::L2_TABLE_SLICE_BYTES,
			typeid(*this).name(), p_idx, f | QCOW2_META_MAPPING)
{
	if (header.is_extended_l2_entries())
		entry_bits_order <<= 1;
	dirty_start = (u64)-1;
	dirty_end = 0;
	ublk_dbg(UBLK_DBG_QCOW2_META_L2, "l2 meta %p %llx -> %llx \n", this, virt_offset(), off);
}

void Qcow2L2Table::reset(Qcow2State &qs, u64 off, u32 p_idx, u32 f)
{
	Qcow2L2Table tmp(qs, off, p_idx, f | QCOW2_META_DONT_ALLOC_BUF);

	qcow2_assert(refcnt == 0);

	offset = tmp.get_offset();
	flags = tmp.get_flags() & ~QCOW2_META_DONT_ALLOC_BUF;
	refcnt = tmp.read_ref();

	ublk_dbg(UBLK_DBG_QCOW2_META_L2, "%s: %p refcnt %d flags %x offset %lx \n",
			__func__, this, refcnt, flags, offset);

	next_free_idx = tmp.get_next_free_idx();

	parent_idx = tmp.parent_idx;

	tmp.get_dirty_range(&dirty_start, &dirty_end);
}

Qcow2L2Table::~Qcow2L2Table()
{
}

void Qcow2L2Table::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
		const struct io_uring_cqe *cqe)
{
	get_ref();
	Qcow2SliceMeta::io_done(qs, q, cqe);
	check(qs, __func__, __LINE__);
	put_ref();
}

u64 Qcow2L2Table::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2L2Table::get_dirty_range(u64 *start, u64 *end)
{
	*start = dirty_start;
	*end = dirty_end;
}

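// Besides storing the entry, set_entry() tracks the min/max host cluster
// offset mapped through this slice in dirty_start/dirty_end (exposed via
// get_dirty_range()); new mappings must be cluster aligned.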
void Qcow2L2Table::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);

	if (is_flushing() || !get_update()) {
		qcow2_log("BUG %s: obj %p flags %x off %lx\n",
				__func__, this, flags, offset);
		qcow2_assert(0);
	}

	val &= L2E_OFFSET_MASK;

	qcow2_assert(!(val & ((1ULL << header.cluster_bits) - 1)));

	if (val < dirty_start)
		dirty_start = val;
	if (val > dirty_end)
		dirty_end = val;
}

int Qcow2L2Table::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len)
{
	int ret;

	wait_clusters(qs, ioc);

	if (!prep_flush(ioc))
		return 0;

	//flush can't be started unless the above two are done
	//
	//the ref is released in io_done()
	get_ref();
#ifdef DEBUG_QCOW2_META_VALIDATE
	memcpy(validate_addr, addr, buf_sz);
	check_duplicated_clusters(qs, ioc.get_tag(), __func__, __LINE__);
#endif
	ret = Qcow2MappingMeta::__flush(qs, ioc, off, len);
	if (ret <= 0) {
		unprep_flush();
		put_ref();
	}
	return ret;
}

void Qcow2L2Table::dump()
{
	unsigned cnt = 0;
	int f = -1, l;

	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry != 0) {
			if (f == -1)
				f = i;
			l = i;
			cnt++; //qcow2_log("%d: %lx\n", i, entry);
		}
	}

	if (!cnt)
		return;

	qcow2_log("%s %s: buf_sz %u offset %" PRIx64 " sizeof %zd entries %u parent_idx %u virt_off %" PRIx64 " flags %x\n",
			__func__, typeid(*this).name(), buf_sz, offset, sizeof(*this),
			cnt, parent_idx, virt_offset(), flags);
	qcow2_log("\t [%d] = %" PRIx64 " [%u] = %" PRIx64 "\n", f,
			get_entry(f), l, get_entry(l));
}

#ifdef DEBUG_QCOW2_META_VALIDATE
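// Debug-only sanity check: every mapped entry must lie inside the image's
// maximum physical size and be cluster aligned; dump details and assert
// otherwise.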
void Qcow2L2Table::check(Qcow2State &qs, const char *func, int line)
{
	int i, cnt = 0;
	bool bad = false;

	if (!get_update())
		return;

	//don't check evicted obj, which can't be used by anyone
	if (get_evicted())
		return;

	for (i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i) & ((1ULL << 63) - 1);

		if (entry == 0)
			continue;

		cnt++;

		if (entry + (1ULL << qs.header.cluster_bits) >
				qs.cluster_allocator.max_physical_size) {
			qcow2_log("%s %d: entry %llx(parent idx %d, idx %d) offset %llx is too big\n",
					func, line, entry, parent_idx, i,
					get_offset());
			bad = true;
		}

		if (entry & ((1ULL << qs.header.cluster_bits) - 1)) {
			qcow2_log("%s %d: entry %llx(parent idx %d, idx %d) offset %llx isn't aligned\n",
					func, line, entry, parent_idx, i,
					get_offset());
			bad = true;
		}
	}

	if (bad) {
		qcow2_log("%s %s: %p buf_sz %u offset %llx sizeof %d parent_idx %u virt_off %llx flags %x refcnt %d\n",
				__func__, typeid(*this).name(), this, buf_sz, offset, sizeof(*this),
				parent_idx, virt_offset(), flags, read_ref());
		qcow2_log("\t total entries %d\n", cnt);
		assert(0);
	}
}

void Qcow2L2Table::check_duplicated_clusters(Qcow2State &qs, int tag,
		const char *func, int line)
{
	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry != 0) {
			u64 host_off = entry & ((1ULL << 63) - 1);
			u64 virt_off = virt_offset() + (((u64)i) <<
					qs.header.cluster_bits);

			if (qs.validate_cluster_map(host_off, virt_off))
				continue;
			qcow2_log("BUG %s %d: tag %d obj %p flags %x off %lx virt_off "
					"%lx(#%d) parent_idx %d\n",
					func, line, tag, this, flags, offset,
					virt_offset(), i, parent_idx);
			qcow2_assert(0);
		}
	}
}
#endif
