// SPDX-License-Identifier: GPL-2.0
#include <cassert>

#include "qcow2.h"
#include "ublksrv_tgt.h"


// refcnt applies to slices only and is initialized to two: one reference
// belongs to the submission side and the other to the free side. This
// guarantees that the slice returned from alloc_slice() is always valid.
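//
// Rough lifecycle sketch (inferred from the code below, not a spec):
// load()/flush()/zero_my_cluster() take an extra reference via get_ref()
// before queuing io_uring SQEs and drop it in io_done(); the free-side
// reference is presumably dropped by the cache when the slice is evicted,
// and reclaim_me() finally recycles the object once both sides are done.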
Qcow2Meta::Qcow2Meta(Qcow2Header &h, u64 off, u32 sz, const char *name, u32 f):
	header(h), offset(off), buf_sz(sz), flags(f), refcnt(2)
{
	//used for implementing slice's ->reset() only
	if (f & QCOW2_META_DONT_ALLOC_BUF)
		return;

	if (posix_memalign((void **)&addr, getpagesize(), sz))
		ublk_err( "allocate memory %d bytes failed, %s\n",
				sz, name);
#ifdef DEBUG_QCOW2_META_OBJ
	id = name;
	qcow2_log("%s: constructed, obj %p, buf size %d off %lx flags %x\n",
			name, this, sz, off, flags);
#endif
}

void Qcow2Meta::show(const char *func, int line)
{
#ifdef DEBUG_QCOW2_META_OBJ
	qcow2_log("%s:%d id %s obj %p flags %x off %lx ref %d\n",
			func, line, id, this, flags, offset, refcnt);
#else
	qcow2_log("%s:%d obj %p flags %x off %lx ref %d\n",
			func, line, this, flags, offset, refcnt);
#endif
}

Qcow2Meta::~Qcow2Meta()
{
#ifdef DEBUG_QCOW2_META_OBJ
	qcow2_log("%s: destructed, obj %p flags %x off %lx ref %d\n",
			id, this, flags, offset, refcnt);
#endif
	if (flags & QCOW2_META_DONT_ALLOC_BUF)
		return;

	if (!is_top_meta() && (get_dirty(-1) || is_flushing() ||
				(!get_update() && !get_evicted()))) {
		qcow2_log("BUG %s: obj %p flags %x off %lx\n",
				__func__, this, flags, offset);
		qcow2_assert(0);
	}
	free(addr);
}

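// The base-class load()/flush() below do plain synchronous read()/write()
// on the image fd; they are only suitable for metadata that is read or
// written in one shot, such as the header and the top-level L1/refcount
// tables. The async io_uring paths are provided by the Qcow2MappingMeta
// and Qcow2SliceMeta overrides further down.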
int Qcow2Meta::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u32 len, bool sync)
{
	int fd;

	if (addr == NULL)
		return -EINVAL;
	if (len > buf_sz) {
		ublk_err( "%s %s: load too much %d(%d) \n",
				__func__, typeid(*this).name(), len, buf_sz);
		return -EINVAL;
	}
	if (!sync)
		return -EOPNOTSUPP;

	//qcow2_log("%s: read %s offset %llx len %lu  \n", __func__,
	//		typeid(*this).name(), offset, len);
	fd = qs.img.fd;
	lseek(fd, offset, SEEK_SET);
	data_len = read(fd, addr, len);
	if (data_len != len)
		qcow2_log("%s: read %u(%u)\n", __func__, len, data_len);
	if (data_len > 0)
		flags |= QCOW2_META_UPDATE;
	return data_len;
}

int Qcow2Meta::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u64 off,
		u32 len)
{
	int fd = qs.img.fd;
	int ret;

	if (!(flags & QCOW2_META_DIRTY))
		return 0;

	if (!(flags & QCOW2_META_UPDATE))
		ublk_err( "%s %s: buf isn't update\n", __func__,
				typeid(*this).name());

	//qcow2_log("%s: write %s offset %llx len %lu  \n", __func__,
	//		typeid(*this).name(), offset, buf_sz);
	lseek(fd, off, SEEK_SET);
	ret = write(fd, addr, len);
	if (len != ret)
		qcow2_log("%s: write %u(%u)\n", __func__, len, ret);
	if (ret > 0)
		flags &= ~QCOW2_META_DIRTY;

	return len;
}

void Qcow2Meta::zero_buf() {
	memset((void *)addr, 0, buf_sz);
}

// The base class is constructed first, then the data members in declaration
// order, so by the time __a is constructed the base Qcow2HeaderExt has
// already parsed 'len' and __a gets sized correctly.
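//
// Each feature-name-table entry is 48 bytes on disk: one byte of feature
// type, one byte of bit number, then a 46-byte zero-padded name. The
// offsets +8/+9/+10 in the loop below skip the extension's own 8-byte
// magic/length header.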
Qcow2HeaderExtFeatureNameTable::Qcow2HeaderExtFeatureNameTable(
		char *addr, u64 offset): Qcow2HeaderExt(addr, offset),
	__a(len / sizeof(struct feature_entry))
{
	unsigned off = offset;

	for (int i = 0; i < __a.size(); i++) {
		__a[i].feature_type = *(addr + off + 8);
		__a[i].bit_num = *(addr + off + 9);
		strncpy(__a[i].feature_name, addr + off + 10, 46);
		off += 48;
	}
}

void Qcow2HeaderExtFeatureNameTable::dump() const
{
	Qcow2HeaderExt::dump();

	for (int i = 0; i < __a.size(); i++)
		qcow2_log("\t %d: type %x bit_num %u name %s\n",
			i, __a[i].feature_type, __a[i].bit_num,
			__a[i].feature_name);
}

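// The header lives at offset 0 of the image. It is loaded synchronously at
// construction time into a fixed 4 KiB buffer, and populate() then parses
// the version-3 header extensions that follow the fixed-size part.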
Qcow2Header::Qcow2Header(Qcow2State &state): Qcow2Meta(*this, 0, 4096,
	typeid(this).name(), 0), magic(0), version(0), cluster_bits(0),
	refcount_order(0), qs(state)
{
	backingfile_format_name = NULL;
	feature_name_table = NULL;
	enc_header_pointer = NULL;
	bitmaps = NULL;
	ext_data_file_name = NULL;

	load(state, 0, buf_sz, true);
}

int Qcow2Header::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u64 off,
			u32 len)
{
	return Qcow2Meta::flush(qs, ioc, off, len);
}

Qcow2Header::~Qcow2Header()
{
	delete	backingfile_format_name;
	delete	feature_name_table;
	delete	enc_header_pointer;
	delete	bitmaps;
	delete	ext_data_file_name;
}

void Qcow2Header::dump_ext() const
{
	if (backingfile_format_name)
		backingfile_format_name->dump();

	if (ext_data_file_name)
		ext_data_file_name->dump();

	if (feature_name_table)
		feature_name_table->dump();

	if (bitmaps)
		bitmaps->dump();

	if (enc_header_pointer)
		enc_header_pointer->dump();
}

/*
 * populate header extensions
 *
 * The header may take more than 4k; its real size has to be derived either
 * from backing_file_offset & backing_file_size or by walking the header
 * extensions populated here.
 */
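/*
 * Each extension starts on an 8-byte boundary right after the fixed
 * header: a 4-byte type magic, a 4-byte length, then 'length' bytes of
 * data padded up to the next multiple of 8. The loop below walks that
 * chain until it reaches the end-marker extension.
 */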
int Qcow2Header::populate()
{
	char *buf = (char *)addr;
	u64 start = (get_header_length() + 7) & ~0x7ULL;
	u32 *p_magic =  const_cast<u32 *> (&magic);
	u32 *p_version =  const_cast<u32 *> (&version);
	u32 *p_cluster_bits = const_cast<u32 *> (&cluster_bits);
	u32 *p_refcount_order = const_cast<u32 *> (&refcount_order);

	*p_magic = get_magic();
	*p_version = get_version();
	*p_cluster_bits = get_cluster_bits();
	*p_refcount_order = get_refcount_order();

	if (version == 2)
		goto exit;

	//todo: populate extensions
	while (true) {
		Qcow2HeaderExt ext(buf, start);

		switch (ext.type) {
		case QCOW2_EXT_MAGIC_END:
			goto exit;
		case QCOW2_EXT_MAGIC_BACKING_FORMAT:
			this->backingfile_format_name =
				new Qcow2HeaderExtString(buf, start);
			break;
		case QCOW2_EXT_MAGIC_FEATURE_TABLE:
			this->feature_name_table =
				new Qcow2HeaderExtFeatureNameTable(
						buf, start);
			break;
		case QCOW2_EXT_MAGIC_CRYPTO_HEADER:
			this->enc_header_pointer =
				new Qcow2HeaderExtEncHeader(buf, start);
			break;
		case QCOW2_EXT_MAGIC_BITMAPS:
			this->bitmaps =
				new Qcow2HeaderExtBitmaps(buf, start);
			break;
		case QCOW2_EXT_MAGIC_DATA_FILE:
			this->ext_data_file_name =
				new Qcow2HeaderExtString(buf, start);
			break;
		};
		start += 8 + ((ext.len + 7) & ~0x7ULL);
	}
 exit:
	return 0;
}

int Qcow2Header::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u32 len, bool sync)
{
	int ret;

	ret = Qcow2Meta::load(qs, ioc, len, sync);
	if (ret <= 0)
		goto fail;

	ret = populate();
	return ret;
 fail:
	ublk_err( "%s: load failed %d", __func__, ret);
	return ret;
}

std::ostream & operator<<(std::ostream &os, const Qcow2Header &h)
{
	char buf[256];

	sprintf(buf, "magic: %x", h.magic);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	sprintf(buf, "version: %x\n", h.version);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	sprintf(buf, "cluster_bits: %x\n", h.cluster_bits);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	sprintf(buf, "refcount_order: %x\n", h.refcount_order);
	std::cout << std::string(buf) << std::endl;
	qcow2_log("%s", buf);

	return os;
}

Qcow2MappingMeta::Qcow2MappingMeta(Qcow2State &qs, u64 off, u32 buf_sz,
		const char *cls_name, u32 f):
	Qcow2Meta(qs.header, off, buf_sz, cls_name, f)
{
	//default each entry is 64bits(8bytes) except for:
	// extended l2 entry is 128bit, refcount blk has refcount_order
	entry_bits_order = 6;
	next_free_idx = -1;
}

/*
 * __flush() is just the worker; flag checking/updating is done by the
 * callers before __flush() is invoked.
 */
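/*
 * A note on the SQEs built below: the write SQE's user_data packs the io
 * tag, IORING_OP_WRITE and the meta-io slot id (mio_id + 1; id 0 is
 * reserved for plain ublk io) so that io_done() can recover all three via
 * the user_data_to_*() helpers. When run_fsync is set, an extra fsync SQE
 * is queued and flagged with IOSQE_IO_LINK, the intent being that the data
 * sync is ordered ahead of the table write (see Qcow2TopTable::flush()).
 */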
int Qcow2MappingMeta::__flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len, bool run_fsync)
{
	int fd = qs.img.fd;
	u32 qid = ioc.get_qid();
	u32 tag = ioc.get_tag();
	const struct ublksrv_queue *q = ublksrv_get_queue(qs.dev, qid);
	struct io_uring_sqe *sqe, *sqe2;
	unsigned mio_id;

	qcow2_assert(flags & QCOW2_META_DIRTY);

	if (!(flags & QCOW2_META_UPDATE))
		ublk_err( "%s %s: buf isn't update\n", __func__,
				typeid(*this).name());

	if (off < offset || off >= offset + buf_sz) {
		ublk_err( "%s %s: offset %" PRIx64 " is wrong\n", __func__,
				typeid(*this).name(), offset);
		return -EINVAL;
	}

	if (len > offset + buf_sz - off) {
		ublk_err( "%s %s: len %x is wrong\n", __func__,
				typeid(*this).name(), len);
		return -EINVAL;
	}

	sqe = io_uring_get_sqe(q->ring_ptr);
	if (!sqe) {
		ublk_err( "%s %s: not get sqe allocated",
				__func__, typeid(*this).name());
		return -ENOMEM;
	}

	if (run_fsync) {
		sqe2 = io_uring_get_sqe(q->ring_ptr);
		if (!sqe2) {
			ublk_err( "%s %s: not get sqe2 allocated",
				__func__, typeid(*this).name());
			return -ENOMEM;
		}
		io_uring_prep_fsync(sqe2, fd, IORING_FSYNC_DATASYNC);
		sqe2->user_data = build_user_data(0xffff, IORING_OP_FSYNC, 0, 1);
		sqe2->flags |= IOSQE_IO_LINK;
	}

	mio_id = qs.add_meta_io(qid, this);

	io_uring_prep_write(sqe, fd, (void *)((u64)addr + (off - offset)),
			len, off);
	sqe->user_data = build_user_data(tag, IORING_OP_WRITE, mio_id + 1, 1);
	ublk_dbg(UBLK_DBG_QCOW2_META, "%s %s: flushing %p tag %d off %lx sz %d flags %x refcnt %d\n",
			__func__, typeid(*this).name(), this, tag, off,
			len, flags, read_ref());
	return 1;
}

void Qcow2MappingMeta::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
			const struct io_uring_cqe *cqe)
{
	u32 tag = user_data_to_tag(cqe->user_data);
	u32 meta_id = user_data_to_tgt_data(cqe->user_data) - 1;
	u32 op = user_data_to_op(cqe->user_data);

	qs.del_meta_io(q->q_id, meta_id);

	//zeroing my cluster doesn't need to wake up any waiter on me
	if (op != IORING_OP_FALLOCATE)
		wakeup_all(q, tag);
}

Qcow2TopTable::Qcow2TopTable(Qcow2State &qs, u64 off, u32 buf_sz,
		const char *cls_name, u32 f):
	Qcow2MappingMeta(qs, off, buf_sz, cls_name, f),
	min_bs_bits(qs.min_bs_bits),
	dirty(qs.get_l1_table_max_size() >> qs.min_bs_bits)
{
	ublk_dbg(UBLK_DBG_QCOW2_META_L1, "%s: %s dirty size %zd %u/%u\n",
			__func__,
			cls_name, dirty.size(),
			qs.get_l1_table_max_size(), qs.min_bs_bits);
	for (int i = 0; i < dirty.size(); i++)
		dirty[i] = false;
}

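// The top tables (L1 table and refcount table) are flushed in units of
// min_bs (512-byte) blocks: the 'dirty' vector tracks one flag per block,
// and only a single block may be in flight at a time because
// QCOW2_META_FLUSHING is table-wide, which is what prep_flush() and
// unprep_flush() below enforce.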
bool Qcow2TopTable::prep_flush(const qcow2_io_ctx_t &ioc, u32 blk_idx)
{
	if (!(flags & QCOW2_META_DIRTY))
		return false;

	//so far, just allow one in-progress unit for l1/refcount table
	if (flags & QCOW2_META_FLUSHING)
		return false;

	flags |= QCOW2_META_FLUSHING;
	return true;
}

void Qcow2TopTable::unprep_flush(u32 blk_idx) {
	flags &= ~QCOW2_META_FLUSHING;
}

void Qcow2TopTable::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
			const struct io_uring_cqe *cqe)
{
	u32 op = user_data_to_op(cqe->user_data);

	//only for write l1 or refcount table
	qcow2_assert(op == IORING_OP_WRITE);

	unprep_flush(get_flush_blk_idx());

	if (cqe->res < 0)
		return;

	set_blk_dirty(get_flush_blk_idx(), false);

	Qcow2MappingMeta::io_done(qs, q, cqe);
}

int Qcow2TopTable::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len)
{
	int blk_idx = (off - offset) >> min_bs_bits;
	int ret;

	qcow2_assert(len == 512 && blk_idx < dirty.size());

	if (!prep_flush(ioc, blk_idx))
		return 0;

	if (!get_blk_dirty(blk_idx)) {
		ret = 0;
		goto exit;
	}

	set_flush_blk_idx(blk_idx);

	//need to run fsync before writing the l1/refcount table, so that
	//the write order between the top table and l2/refcount blocks is
	//respected
	ret = Qcow2MappingMeta::__flush(qs, ioc, off, len, true);
exit:
	if (ret <= 0)
		unprep_flush(blk_idx);
	return ret;
}

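// has_dirty_slices(): a single top-table entry covers a whole L2 table or
// refcount block, but those are cached as smaller slices. 'step' below is
// the address range covered by one cached slice, so the loop probes the
// slice cache once per slice and reports whether any of them is still
// dirty.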
bool Qcow2TopTable::has_dirty_slices(Qcow2State &qs, int idx)
{
	u64 entry = get_entry(idx);
	u64 start, end, step, offset;

	if (!entry)
		return false;

	if (is_mapping_meta())
		step = 1ULL << (QCOW2_PARA::L2_TABLE_SLICE_BITS - 3 +
				qs.header.cluster_bits);
	else
		step = 1ULL << (QCOW2_PARA::REFCOUNT_BLK_SLICE_BITS - 3 +
				qs.header.cluster_bits);

	start = ((u64)idx) << single_entry_order();
	end = start + (1ULL << single_entry_order());
	for (offset = start; offset < end; offset += step) {
		Qcow2SliceMeta *t;

		if (is_mapping_meta())
			t = qs.cluster_map.__find_slice(offset);
		else
			t = qs.cluster_allocator.__find_slice(offset);

		if (t && t->get_dirty(-1))
			return true;
	}

	return false;
}

Qcow2L1Table::Qcow2L1Table(Qcow2State &qs): Qcow2TopTable(qs,
		qs.get_l1_table_offset(), qs.get_l1_table_max_size(),
		typeid(*this).name(), QCOW2_META_TOP | QCOW2_META_MAPPING)
{
}

int Qcow2L1Table::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc, u32 len, bool sync)
{
	int ret;

	ret = Qcow2Meta::load(qs, ioc, len, sync);
	if (ret < 0)
		ublk_err( "%s %s: load failed %d", __func__,
				typeid(*this).name(), ret);
	return ret;
}

void Qcow2L1Table::dump()
{
	qcow2_log("%s %s: sizeof %zd\n", __func__, typeid(*this).name(),
			sizeof(*this));
	for (int i = 0; i < header.get_l1_size(); i++)
		qcow2_log("%d: %lx\n", i, get_entry(i));
}

u64  Qcow2L1Table::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2L1Table::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);
}

Qcow2RefcountTable::Qcow2RefcountTable(Qcow2State &qs):
	Qcow2TopTable(qs, qs.get_refcount_table_offset(),
		qs.get_refcount_table_max_size(),
		typeid(*this).name(), QCOW2_META_TOP)
{
}

int Qcow2RefcountTable::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u32 len, bool sync)
{
	int ret;

	ret = Qcow2Meta::load(qs, ioc, len, sync);
	if (ret < 0)
		ublk_err( "%s %s: load failed %d", __func__,
				typeid(*this).name(), ret);
	return ret;
}

u64  Qcow2RefcountTable::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2RefcountTable::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);
}

void Qcow2RefcountTable::dump()
{
	qcow2_log("%s %s: sizeof %zd\n", __func__, typeid(*this).name(),
			sizeof(*this));
	for (int i = 0; i < data_len / 8; i++) {
		u64 entry = get_entry(i);

		if (entry != 0)
			qcow2_log("%d: %lx\n", i, entry);
	}
}

Qcow2SliceMeta::Qcow2SliceMeta(Qcow2State &qs, u64 off, u32 buf_sz,
		const char *cls_name, u32 p_idx, u32 f):
	Qcow2MappingMeta(qs, off, buf_sz, cls_name, f),
	parent_idx(p_idx)
{
#ifdef QCOW2_CACHE_DEBUG
	qcow2_log("slice meta %llx/%p/%d allocated\n", off, addr, buf_sz);
#endif
#ifdef DEBUG_QCOW2_META_VALIDATE
	if (posix_memalign((void **)&validate_addr, getpagesize(), buf_sz))
		ublk_err( "%s: allocate validate memory %d bytes failed\n",
				__func__, buf_sz);
#endif
}

Qcow2SliceMeta::~Qcow2SliceMeta() {
#ifdef DEBUG_QCOW2_META_VALIDATE
	free(validate_addr);
#endif
}

bool Qcow2SliceMeta::prep_flush(const qcow2_io_ctx_t &ioc)
{
	if (!(flags & QCOW2_META_DIRTY))
		return false;

	if (flags & QCOW2_META_FLUSHING) {
		add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}
	flags |= QCOW2_META_FLUSHING;
	return true;
}

void Qcow2SliceMeta::unprep_flush() {
	flags &= ~QCOW2_META_FLUSHING;
}

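// zero_my_cluster(): a slice may live in a cluster that has only just been
// allocated for metadata; before anything is written there the whole
// cluster is zeroed with fallocate(FALLOC_FL_ZERO_RANGE) so that stale
// on-disk data is never read back as valid entries. Concurrent users of
// the same cluster are parked on its waiter list (MetaUpdateException)
// until the zeroing completes.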
int Qcow2SliceMeta::zero_my_cluster(Qcow2State &qs,
		const qcow2_io_ctx_t &ioc)
{
	u64 cluster_off = offset & ~((1ULL << qs.header.cluster_bits) - 1);
	Qcow2ClusterState *s = qs.cluster_allocator.get_cluster_state(
			 cluster_off);
	u32 qid = ioc.get_qid();
	u32 tag = ioc.get_tag();
	const struct ublksrv_queue *q = ublksrv_get_queue(qs.dev, qid);
	int fd = q->dev->tgt.fds[1];
	struct io_uring_sqe *sqe;
	int mode = FALLOC_FL_ZERO_RANGE;
	unsigned mio_id;

	if (s == nullptr)
		return 0;

	if (s->get_state() >= QCOW2_ALLOC_ZEROED)
		return 0;

	if (s->get_state() == QCOW2_ALLOC_ZEROING) {
		s->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	sqe = io_uring_get_sqe(q->ring_ptr);
	if (!sqe) {
		ublk_err("%s: tag %d offset %" PRIu64 " op %d, no sqe for zeroing\n",
			__func__, tag, offset, IORING_OP_FALLOCATE);
		return -ENOMEM;
	}

	get_ref();

	mio_id = qs.add_meta_io(qid, this);
	s->set_state(QCOW2_ALLOC_ZEROING);
	io_uring_prep_fallocate(sqe, fd, mode, cluster_off,
			(1ULL << qs.header.cluster_bits));
	sqe->user_data = build_user_data(tag,
			IORING_OP_FALLOCATE, mio_id + 1, 1);
	ublk_dbg(UBLK_DBG_QCOW2_META, "%s %s: zeroing %p tag %d off %lx sz %d flags %x ref %d\n",
			__func__, typeid(*this).name(), this, tag, cluster_off,
			(1ULL << qs.header.cluster_bits), flags, refcnt);
	return 1;
}

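// Slice loading is async only: the read is issued on the per-queue io_uring
// with IOSQE_FIXED_FILE (registered file index 1 is the image file, cf.
// q->dev->tgt.fds[1] in zero_my_cluster()), and QCOW2_META_UPDATE is only
// set from io_done() once the read completes.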
int Qcow2SliceMeta::load(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u32 len, bool sync)
{
	int ret = -EINVAL;
	u32 qid = ioc.get_qid();
	u32 tag = ioc.get_tag();
	const struct ublksrv_queue *q = ublksrv_get_queue(qs.dev, qid);
	struct io_uring_sqe *sqe;
	int mio_id;

	if (sync) {
		ublk_err( "%s %s: we only support async load",
				__func__, typeid(*this).name());
		return -EINVAL;
	}

	if (flags & QCOW2_META_UPDATE) {
		ublk_err( "%s %s: we are update, need to load?",
				__func__, typeid(*this).name());
		return -EINVAL;
	}

	sqe = io_uring_get_sqe(q->ring_ptr);
	if (!sqe) {
		ublk_err( "%s %s: not get sqe allocated",
				__func__, typeid(*this).name());
		return ret;
	}

	get_ref();

	mio_id = qs.add_meta_io(qid, this);

	io_uring_prep_read(sqe, 1, (void *)addr, buf_sz, offset);
	sqe->flags = IOSQE_FIXED_FILE;
	/* meta io id starts from one and zero is reserved for plain ublk io */
	sqe->user_data = build_user_data(tag, IORING_OP_READ, mio_id + 1, 1);

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: queue io op %d(%llx %x %llx)"
				" (qid %d tag %u, cmd_op %u target: %d tgt_data %d)\n",
			__func__, sqe->opcode, sqe->off, sqe->len, sqe->addr,
			q->q_id, tag, sqe->opcode, 1, mio_id + 1);
	ublk_dbg(UBLK_DBG_QCOW2_META, "%s %s: loading %p tag %d off %lx sz %d flags %x ref %d\n",
			__func__, typeid(*this).name(), this, tag,
			offset, buf_sz, flags, refcnt);

	return 0;
}

#ifdef DEBUG_QCOW2_META_VALIDATE
void Qcow2SliceMeta::io_done_validate(Qcow2State &qs, const struct ublksrv_queue *q,
			struct io_uring_cqe *cqe)
{
	u32 tag = user_data_to_tag(cqe->user_data);
	u32 meta_id = user_data_to_tgt_data(cqe->user_data) - 1;
	u32 op = user_data_to_op(cqe->user_data);
	u64 cluster_off = offset & ~((1ULL << qs.header.cluster_bits) - 1);
	bool res;

	//for write, buffer data has been saved to validate_addr before
	//submitting the WRITE io
	if (op != IORING_OP_WRITE) {
		lseek(qs.img.fd, offset, SEEK_SET);
		read(qs.img.fd, validate_addr, buf_sz);
	}

	if (op == IORING_OP_FALLOCATE) {
		for (int i = 0; i < buf_sz; i++) {
			char *buf = (char *)validate_addr;

			qcow2_assert(buf[i] == 0);
		}
	} else if (op == IORING_OP_WRITE || op == IORING_OP_READ) {
		unsigned long *buf = (unsigned long *)addr;
		unsigned long *buf2 = (unsigned long *)validate_addr;

		res = bcmp(addr, validate_addr, buf_sz);

		if (res == 0)
			return;

		for (int i = 0; i < buf_sz / 8; i++) {
			if (buf[i] != buf2[i]) {
				qcow2_log("%s: not same in %d %lx %lx\n",
					__func__, i, buf[i], buf2[i]);
				qcow2_log("%s: tag %d, tgt_data %d op %d meta (%p %x %lx %d) res %d\n",
					__func__, tag, meta_id, op, this,
					get_flags(), get_offset(),
					refcnt, cqe->res);
			}
		}
		qcow2_assert(0);
	}
}
#endif

/* called for both load() and flush() */
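/*
 * Completion handling per op:
 *  - READ: the slice buffer now holds valid data, so mark it UPDATE
 *  - WRITE: clear the DIRTY/FLUSHING accounting and, if the slice has
 *    already been evicted from the cache, queue it to the free list
 *  - FALLOCATE: the cluster backing this slice has been zeroed, so the
 *    allocator advances its state and wakes up anyone waiting on it
 * In every case the reference taken in load()/flush()/zero_my_cluster()
 * is dropped at the end.
 */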
void Qcow2SliceMeta::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
			const struct io_uring_cqe *cqe)
{
	u32 tag = user_data_to_tag(cqe->user_data);
	u32 meta_id = user_data_to_tgt_data(cqe->user_data) - 1;
	u32 op = user_data_to_op(cqe->user_data);
	u64 cluster_off = offset & ~((1ULL << qs.header.cluster_bits) - 1);

	if (cqe->res < 0) {
		qcow2_log("%s: failure: tag %d, tgt_data %d op %d meta (%p %x %lx %d) res %d\n",
			__func__, tag, meta_id, op, this,
			get_flags(), get_offset(), refcnt, cqe->res);
		//zeroing of the cluster holding me failed, so reset the
		//cluster allocation state
		if (op == IORING_OP_FALLOCATE) {
			if (qs.cluster_allocator.
			    alloc_cluster_reset(cluster_off))
				goto exit;
		} else if (op == IORING_OP_WRITE) {
			unprep_flush();
			goto exit;
		} else
			goto exit;
	}

	io_done_validate(qs, q, cqe);

	if (op == IORING_OP_READ)
		set_update(true);
	else if (op == IORING_OP_WRITE) {
		unprep_flush();
		qs.meta_flushing.dec_dirtied_slice(is_mapping_meta());
		set_dirty(-1, false);
		set_prep_flush(false);
	} else if (op == IORING_OP_FALLOCATE)
		qs.cluster_allocator.alloc_cluster_zeroed(q, tag, cluster_off);
	else
		ublk_err( "%s: unknown op: tag %d op %d meta_id %d res %d\n",
			__func__, tag, op, meta_id, cqe->res);

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: tag %d, tgt_data %d op %d meta (%p %x %lx %d) res %d\n",
			__func__, tag, meta_id, op, this,
			get_flags(), get_offset(), refcnt, cqe->res);

	//wake up waiters
	Qcow2MappingMeta::io_done(qs, q, cqe);

	//if it has been evicted, it is now ready to be freed
	if ((op == IORING_OP_WRITE) && cqe->res >= 0 && get_evicted())
		qs.add_slice_to_free_list(this);

exit:
	//drop the reference grabbed in load(), flush() or zero_my_cluster()
	put_ref();
	return;
}

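// wait_clusters(): before a slice is flushed, every cluster it references
// must have reached the ZEROED state; otherwise the current io tag is
// parked on that cluster's waiter list and the operation is retried later
// via MetaUpdateException.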
void Qcow2SliceMeta::wait_clusters(Qcow2State &qs,
		const qcow2_io_ctx_t &ioc)
{
	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry) {
			u64 cluster_off;

			//mapping meta means this is one l2 table, otherwise
			//it is one refcount block table
			if (is_mapping_meta())
				cluster_off = entry & L1E_OFFSET_MASK;
			else
				cluster_off = virt_offset() +
					((u64)i << qs.header.cluster_bits);

			Qcow2ClusterState *s = qs.cluster_allocator.
				get_cluster_state(cluster_off);

			if (s == nullptr)
				continue;

			if (s->get_state() < QCOW2_ALLOC_ZEROED) {
				s->add_waiter(ioc.get_tag());
				throw MetaUpdateException();
			}
		}
	}
}

void Qcow2SliceMeta::reclaim_me()
{
	unsigned queues = header.qs.dev_info->nr_hw_queues;

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: %p off %llx flags %x\n", __func__,
			this, get_offset(), flags);

	header.qs.remove_slice_from_evicted_list(this);

	ublk_dbg(UBLK_DBG_QCOW2_META, "%s: %p off %llx\n", __func__, this, get_offset());

	//Tell the whole world, I am leaving
	for (int i = 0; i < queues; i++) {
		const struct ublksrv_queue *q = ublksrv_get_queue(header.qs.dev, i);

		wakeup_all(q, -1);
	}
	header.qs.reclaim_slice(this);
}

Qcow2RefcountBlock::Qcow2RefcountBlock(Qcow2State &qs, u64 off, u32 p_idx, u32 f):
	Qcow2SliceMeta(qs, off, QCOW2_PARA::REFCOUNT_BLK_SLICE_BYTES,
			typeid(*this).name(), p_idx, f),
	dirty_start_idx((unsigned)-1)
{
	entry_bits_order = qs.header.refcount_order;
	ublk_dbg(UBLK_DBG_QCOW2_META_RB, "rb meta %p %llx -> %llx \n", this, virt_offset(), off);
}

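// reset() re-initializes a recycled slice object without reallocating its
// page-aligned buffer: a temporary is constructed with
// QCOW2_META_DONT_ALLOC_BUF just to recompute offset/flags/refcnt and the
// per-type bookkeeping, and those fields are then copied over while the
// existing buffer is kept.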
void Qcow2RefcountBlock::reset(Qcow2State &qs, u64 off, u32 p_idx, u32 f)
{
	Qcow2RefcountBlock tmp(qs, off, p_idx, f | QCOW2_META_DONT_ALLOC_BUF);

	qcow2_assert(refcnt == 0);

	offset = tmp.get_offset();
	flags  = tmp.get_flags() & ~QCOW2_META_DONT_ALLOC_BUF;
	refcnt = tmp.read_ref();

	ublk_dbg(UBLK_DBG_QCOW2_META_RB, "%s: %p refcnt %d flags %x offset %lx \n",
			__func__, this, refcnt, flags, offset);

	next_free_idx = tmp.get_next_free_idx();

	parent_idx = tmp.parent_idx;

	dirty_start_idx = tmp.dirty_start_idx;
}

u64  Qcow2RefcountBlock::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2RefcountBlock::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);

	if (is_flushing() || !get_update()) {
		qcow2_log("BUG %s: obj %p flags %x off %lx\n",
				__func__, this, flags, offset);
		qcow2_assert(0);
	}
}

int Qcow2RefcountBlock::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len)
{
	int ret;

	//wait_clusters(qs, ioc);

	if (!prep_flush(ioc))
		return 0;

	//flush can't be started unless the above two are done
	//
	//the ref is released in io_done()
	get_ref();
#ifdef DEBUG_QCOW2_META_VALIDATE
	memcpy(validate_addr, addr, buf_sz);
#endif
	ret = Qcow2MappingMeta::__flush(qs, ioc, off, len);
	if (ret <= 0) {
		unprep_flush();
		put_ref();
	}
	return ret;
}

Qcow2RefcountBlock::~Qcow2RefcountBlock()
{
}

void Qcow2RefcountBlock::get_dirty_range(u64 *start, u64 *end)
{
	*start = 1;
	*end = 0;
}

void Qcow2RefcountBlock::dump()
{
	unsigned cnt = 0;
	int f = -1, l;
	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry != 0) {
			if (f == -1)
				f = i;
			l = i;
			cnt++; //qcow2_log("%d: %lx\n", i, entry);
		}
	}

	if (!cnt)
		return;

	qcow2_log("%s %s: buf_sz %u offset %" PRIx64 " sizeof %zd entries %u parent_idx %u virt_off %" PRIx64 " flags %x\n",
			__func__, typeid(*this).name(), buf_sz, offset, sizeof(*this),
			cnt, parent_idx, virt_offset(),
			flags);
	qcow2_log("\t [%d] = %" PRIx64 "/%" PRIx64 " [%d] = %" PRIx64 "/%" PRIx64 "\n",
			f, get_entry(f),
			virt_offset() + (f << header.cluster_bits),
			l, get_entry(l),
			virt_offset() + (l << header.cluster_bits));
}

Qcow2L2Table::Qcow2L2Table(Qcow2State &qs, u64 off, u32 p_idx, u32 f):
	Qcow2SliceMeta(qs, off, QCOW2_PARA::L2_TABLE_SLICE_BYTES,
		typeid(*this).name(), p_idx, f | QCOW2_META_MAPPING)
{
	if (header.is_extended_l2_entries())
		entry_bits_order <<= 1;
	dirty_start = (u64)-1;
	dirty_end = 0;
	ublk_dbg(UBLK_DBG_QCOW2_META_L2, "l2 meta %p %llx -> %llx \n", this, virt_offset(), off);
}

void Qcow2L2Table::reset(Qcow2State &qs, u64 off, u32 p_idx, u32 f)
{
	Qcow2L2Table tmp(qs, off, p_idx, f | QCOW2_META_DONT_ALLOC_BUF);

	qcow2_assert(refcnt == 0);

	offset = tmp.get_offset();
	flags = tmp.get_flags() & ~QCOW2_META_DONT_ALLOC_BUF;
	refcnt = tmp.read_ref();

	ublk_dbg(UBLK_DBG_QCOW2_META_L2, "%s: %p refcnt %d flags %x offset %lx \n",
			__func__, this, refcnt, flags, offset);

	next_free_idx = tmp.get_next_free_idx();

	parent_idx = tmp.parent_idx;

	tmp.get_dirty_range(&dirty_start, &dirty_end);
}

Qcow2L2Table::~Qcow2L2Table()
{
}

void Qcow2L2Table::io_done(Qcow2State &qs, const struct ublksrv_queue *q,
			const struct io_uring_cqe *cqe)
{
	get_ref();
	Qcow2SliceMeta::io_done(qs, q, cqe);
	check(qs, __func__, __LINE__);
	put_ref();
}

u64  Qcow2L2Table::get_entry(u32 idx) {
	return get_entry_fast(idx);
}

void Qcow2L2Table::get_dirty_range(u64 *start, u64 *end)
{
	*start = dirty_start;
	*end = dirty_end;
}

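// set_entry() below also maintains dirty_start/dirty_end, the lowest and
// highest host cluster offsets mapped through this slice since it was
// (re)initialized; this range is presumably what the flushing code uses to
// order the newly written data clusters against the L2 update.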
void Qcow2L2Table::set_entry(u32 idx, u64 val) {
	set_entry_fast(idx, val);

	if (is_flushing() || !get_update()) {
		qcow2_log("BUG %s: obj %p flags %x off %lx\n",
				__func__, this, flags, offset);
		qcow2_assert(0);
	}

	val &= L2E_OFFSET_MASK;

	qcow2_assert(!(val & ((1ULL << header.cluster_bits) - 1)));

	if (val < dirty_start)
		dirty_start = val;
	if (val > dirty_end)
		dirty_end = val;
}

int Qcow2L2Table::flush(Qcow2State &qs, const qcow2_io_ctx_t &ioc,
		u64 off, u32 len)
{
	int ret;

	wait_clusters(qs, ioc);

	if (!prep_flush(ioc))
		return 0;

	//flush can't be started unless the above two are done
	//
	//the ref is released in io_done()
	get_ref();
#ifdef DEBUG_QCOW2_META_VALIDATE
	memcpy(validate_addr, addr, buf_sz);
	check_duplicated_clusters(qs, ioc.get_tag(), __func__, __LINE__);
#endif
	ret = Qcow2MappingMeta::__flush(qs, ioc, off, len);
	if (ret <= 0) {
		unprep_flush();
		put_ref();
	}
	return ret;
}

void Qcow2L2Table::dump()
{
	unsigned cnt = 0;
	int f = -1, l;

	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry != 0) {
			if (f == -1)
				f = i;
			l = i;
			cnt++; //qcow2_log("%d: %lx\n", i, entry);
		}
	}

	if (!cnt)
		return;

	qcow2_log("%s %s: buf_sz %u offset %" PRIx64 " sizeof %zd entries %u parent_idx %u virt_off %" PRIx64 " flags %x\n",
			__func__, typeid(*this).name(), buf_sz, offset, sizeof(*this),
			cnt, parent_idx, virt_offset(), flags);
	qcow2_log("\t [%d] = %" PRIx64 " [%u] = %" PRIx64 "\n", f,
			get_entry(f), l, get_entry(l));
}

#ifdef DEBUG_QCOW2_META_VALIDATE
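/*
 * Debug-only sanity checks: check() verifies that every mapped host
 * cluster is cluster-aligned and lies inside the physical image, and
 * check_duplicated_clusters() asks Qcow2State to validate the mapping,
 * presumably to confirm that no host cluster is mapped by more than one
 * virtual offset.
 */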
void Qcow2L2Table::check(Qcow2State &qs, const char *func, int line)
{
	int i, cnt = 0;
	bool bad = false;

	if (!get_update())
		return;

	//don't check evicted objects, which can't be used by anyone
	if (get_evicted())
		return;

	for (i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i) & ((1ULL << 63) - 1);

		if (entry == 0)
			continue;

		cnt++;

		if (entry + (1ULL << qs.header.cluster_bits) >
				qs.cluster_allocator.max_physical_size) {
			qcow2_log("%s %d: entry %llx(parent idx %d, idx %d) offset %llx is too big\n",
					func, line, entry, parent_idx, i,
					get_offset());
			bad = true;
		}

		if (entry & ((1ULL << qs.header.cluster_bits) - 1)) {
			qcow2_log("%s %d: entry %llx(parent idx %d, idx %d) offset %llx isn't aligned\n",
					func, line, entry, parent_idx, i,
					get_offset());
			bad = true;
		}
	}

	if (bad) {
		qcow2_log("%s %s: %p buf_sz %u offset %llx sizeof %d parent_idx %u virt_off %llx flags %x refcnt %d\n",
				__func__, typeid(*this).name(), this, buf_sz, offset, sizeof(*this),
				parent_idx, virt_offset(), flags, read_ref());
		qcow2_log("\t total entries %d\n", cnt);
		assert(0);
	}
}

void Qcow2L2Table::check_duplicated_clusters(Qcow2State &qs, int tag,
		const char *func, int line)
{
	for (int i = 0; i < get_nr_entries(); i++) {
		u64 entry = get_entry(i);

		if (entry != 0) {
			u64 host_off = entry & ((1ULL << 63) - 1);
			u64 virt_off = virt_offset() + (((u64)i) <<
				qs.header.cluster_bits);

			if (qs.validate_cluster_map(host_off, virt_off))
				continue;
			qcow2_log("BUG %s %d: tag %d obj %p flags %x off %lx virt_off "
					"%lx(#%d) parent_idx %d\n",
				func, line, tag, this, flags, offset,
				virt_offset(), i, parent_idx);
			qcow2_assert(0);
		}
	}
}
#endif