// SPDX-License-Identifier: GPL-2.0
#include "qcow2.h"

Qcow2Image::Qcow2Image(const char *path): fpath(path) {
	fd = open(path, O_RDWR);
	if (fd < 0)
		ublk_err( "%s: backing file %s can't be opened %d\n",
				__func__, path, fd);
	fcntl(fd, F_SETFL, O_DIRECT);
}

Qcow2Image::~Qcow2Image() {
	if (fd >= 0)
		close(fd);
}

Qcow2State::Qcow2State(const char *path, const struct ublksrv_dev *d):
	dev_info(ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(d))),
	min_bs_bits(9), dev(d), img(path), header(*this), l1_table(*this),
	refcount_table(*this), cluster_allocator(*this),
	cluster_map(*this),
	meta_io_map(dev_info->nr_hw_queues),
	meta_flushing(*this)
{
	u64 l1_bytes = get_l1_table_max_size();
	u64 ref_table_bytes = get_refcount_table_act_size();

	l1_table.load(*this, 0, l1_bytes, true);
	//l1_table.dump();

	refcount_table.load(*this, 0, ref_table_bytes, true);
	//refcount_table.dump();

	cluster_allocator.setup();
}

Qcow2State::~Qcow2State() {
}

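/*
 * Upper bound, in bytes, of the in-ram L1 table: one 8-byte L1 entry per
 * L2 table worth of guest data, rounded up to the logical block size
 * (1 << min_bs_bits) and capped at QCOW_MAX_L1_SIZE.
 *
 * Each L1 entry covers (cluster_size / l2_entry_size) * cluster_size
 * bytes; for example, with 64KB clusters and normal 8-byte L2 entries one
 * L1 entry maps (65536 / 8) * 65536 = 512MB of the virtual disk.
 */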
u32 Qcow2State::get_l1_table_max_size()
{
	u32 l2_entry_size = 8;
	u64 l2_size, res;

	l2_entry_size = header.is_extended_l2_entries() ? 16 : 8;

	l2_size = ((1 << header.cluster_bits) / l2_entry_size) <<
		header.cluster_bits;
	res = (header.get_size() + l2_size - 1) / l2_size;
	res *= 8;

	//qcow2_log("%s: cls bit %d, l2 entry size %d, l2_size %d, l1 tbl size %d\n",
	//		__func__, header.cluster_bits, l2_entry_size, l2_size, res);
	if (res < QCOW_MAX_L1_SIZE)
		return round_up(res, 1UL << min_bs_bits);
	return  QCOW_MAX_L1_SIZE;
}

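/*
 * Upper bound, in bytes, of the refcount table. One 8-byte table entry
 * points at one refcount block, and a refcount block (one cluster of
 * 2^refcount_order-bit counters) covers
 * 2^(2 * cluster_bits + 3 - refcount_order) bytes of image data.
 */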
u32 Qcow2State::get_refcount_table_max_size()
{
	u64 blk_size, res;

	blk_size = 1ULL << (2 * header.cluster_bits + 3 - header.refcount_order);
	res = (header.get_size() + blk_size - 1) / blk_size;
	res *= 8;

	//qcow2_log("%s: cls bit %d, refcount_order %d, blk_size %llu, ref tbl size %d\n",
	//		__func__, header.cluster_bits, header.refcount_order, blk_size, res);
	if (res < QCOW_MAX_REFTABLE_SIZE)
		return round_up(res, 1UL << min_bs_bits);
	return  QCOW_MAX_REFTABLE_SIZE;
}

u32 Qcow2State::get_refcount_table_act_size()
{
	u64 ref_table_bytes = header.get_refcount_table_clusters() <<
		header.cluster_bits;

	if (ref_table_bytes > get_refcount_table_max_size())
		ref_table_bytes = get_refcount_table_max_size();

	return round_up(ref_table_bytes, 1UL << min_bs_bits);
}

u64  Qcow2State::get_l1_table_offset()
{
	return header.get_l1_table_offset();
}

u64 Qcow2State::get_refcount_table_offset()
{
	return header.get_refcount_table_offset();
}

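/*
 * How many L2 slices the mapping cache may hold: the device needs
 * (device_size >> (cluster_bits - 3)) bytes of L2 tables (8 bytes per
 * cluster), capped at 32MB to match qemu, then split into slices of
 * (1 << L2_TABLE_SLICE_BITS) bytes.
 */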
u32 Qcow2State::get_l2_slices_count()
{
	u32 mapping_bytes = get_dev_size() >> (header.cluster_bits - 3);

	//align with qemu, at most 32MB
	if (mapping_bytes > (32U << 20))
		mapping_bytes = 32U << 20;

	return mapping_bytes >> QCOW2_PARA::L2_TABLE_SLICE_BITS;
}

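/*
 * Register a mapping-meta object with in-flight IO on queue 'qid' and
 * return its index in the per-queue table; the first free (nullptr) slot
 * is reused before the vector is grown.
 */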
u32 Qcow2State::add_meta_io(u32 qid, Qcow2MappingMeta *m)
{
	struct meta_mapping *map = &meta_io_map[qid];
	std::vector <Qcow2MappingMeta *> &v = map->meta;
	int i;

	for (i = 0; i < v.size(); i++)
		if (v[i] == nullptr)
			break;
	if (i < v.size()) {
		v[i] = m;
	} else {
		v.push_back(m);
		i = v.size() - 1;
	}

	map->nr += 1;

	return i;
}

bool Qcow2State::has_dirty_slice()
{
	return cluster_map.cache.has_dirty_slice(*this) ||
		cluster_allocator.cache.has_dirty_slice(*this);
}

void Qcow2State::reclaim_slice(Qcow2SliceMeta *m)
{
	if (m->is_mapping_meta()) {
		Qcow2L2Table *t =
			static_cast<Qcow2L2Table *>(m);

		cluster_map.cache.add_slice_to_reclaim_list(t);
	} else {
		Qcow2RefcountBlock *t =
			static_cast<Qcow2RefcountBlock *>(m);

		cluster_allocator.cache.add_slice_to_reclaim_list(t);
	}
}

void Qcow2State::remove_slice_from_evicted_list(Qcow2SliceMeta *m)
{
	if (m->is_mapping_meta()) {
		Qcow2L2Table *t =
			static_cast<Qcow2L2Table *>(m);

		cluster_map.cache.remove_slice_from_evicted_list(t);
	} else {
		Qcow2RefcountBlock *t =
			static_cast<Qcow2RefcountBlock *>(m);

		cluster_allocator.cache.remove_slice_from_evicted_list(t);
	}
}

void Qcow2State::dump_meta()
{
	cluster_allocator.dump_meta();
	cluster_map.dump_meta();
	meta_flushing.dump();
}

//todo: allocate slices from reclaim_slices
void Qcow2State::kill_slices(const struct ublksrv_queue *q)
{
	std::vector<Qcow2SliceMeta *> tmp(move(freed_slices));

	if (tmp.empty())
		return;

	qcow2_assert(!tmp.empty() && freed_slices.empty());

	//newly added slices can't be freed from ->wakeup_all()
	for (auto it = tmp.cbegin(); it != tmp.cend(); ++it) {
		auto m = *it;

		m->put_ref();
	}
}

void Qcow2State::shrink_cache()
{
	cluster_map.cache.shrink(*this);
	cluster_allocator.cache.shrink(*this);
}

#ifdef DEBUG_QCOW2_META_VALIDATE
void Qcow2State::validate_cluster_use(u64 host_off, u64 virt_off, u32 use) {
	auto it = cluster_use.find(host_off);

	if (it == cluster_use.end())
		cluster_use[host_off] = ((u64)use << 56) | virt_off;
	else {
		qcow2_log("%s: duplicated cluster assignment host off "
				"%llx, virt_off %llx use %d, old entry %llx\n",
				__func__, host_off, virt_off, use,
				it->second);
		qcow2_assert(0);
	}
}

// call it for each entry before flushing the slice
bool Qcow2State::validate_cluster_map(u64 host_off, u64 virt_off) {
	auto it = cluster_validate_map.find(host_off);

	if (it == cluster_validate_map.end()) {
		cluster_validate_map[host_off] = virt_off;
		return true;
	}

	if (virt_off == it->second)
		return true;

	qcow2_log("%s: duplicated cluster assignment host off "
			"%llx, virt_off %llx old virt_offset %llx\n",
			__func__, host_off, virt_off, it->second);
	return false;
}
#endif

/* Make any kind of Qcow2State; so far only the plain one is supported */
Qcow2State *make_qcow2state(const char *file, struct ublksrv_dev *dev)
{
	return new Qcow2StatePlain(file, dev);
}

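/*
 * slice_cache: cache of meta slices (L2 table or refcount block slices)
 * keyed by virtual offset on top of an LRU map. Slices pushed out of the
 * LRU are parked in 'evicted_slices' until their in-flight load/flush
 * completes, and retired objects can be recycled via the reclaim list.
 */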
template <class T>
slice_cache<T>::slice_cache(u8 slice_bits, u8 cluster_bits, u8 slice_virt_bits,
		u32 max_size):
	slice_size_bits(slice_bits),
	cluster_size_bits(cluster_bits),
	slice_virt_size_bits(slice_virt_bits),
	slices(max_size >> slice_bits),
	evicted_slices({})
{
}

template <class T>
T *slice_cache<T>::__find_slice(u64 key, bool use_evicted_cache) {
	T *t = slices.__get(key);

	if (t)
		return t;

	if (use_evicted_cache) {
		auto it = evicted_slices.find(key);

		if (it != evicted_slices.end())
			return it->second;
	}
	return nullptr;
}

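/*
 * Allocate (or recycle from the reclaim list) one slice for 'virt_offset'
 * and insert it into the LRU cache. If the backing cluster is freshly
 * allocated and not yet zeroed on disk, the buffer is zeroed in ram and
 * the slice is marked dirty so it gets written out. A slice displaced by
 * this insertion is moved to the evicted list, and also to the free list
 * once it is clean.
 */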
template <class T>
T *slice_cache<T>::alloc_slice(Qcow2State &state, const qcow2_io_ctx_t &ioc,
		u64 virt_offset, u64 host_offset, u32 parent_idx)
{
	T *t;
	u32 flags;
	bool zero_buf;

	qcow2_assert(__find_slice(virt_offset, true) == nullptr);
	qcow2_assert(!(virt_offset & ((1ULL << cluster_size_bits) - 1)));

	if (!state.cluster_allocator.alloc_cluster_is_zeroed(host_offset &
				~((1ULL << cluster_size_bits) - 1))) {
		flags = QCOW2_META_UPDATE | QCOW2_META_DIRTY;
		zero_buf = true;
	} else {
		flags = 0;
		zero_buf = false;
	}

	t = pick_slice_from_reclaim_list();
	if (t == nullptr)
		t = new T(state, host_offset, parent_idx, flags);
	else
		t->reset(state, host_offset, parent_idx, flags);

	if (t->get_dirty(-1))
		state.meta_flushing.inc_dirtied_slice(t->is_mapping_meta());

	if (zero_buf)
		t->zero_buf();

	T *old = slices.put(virt_offset, t);
	if (old) {
#ifdef DEBUG_QCOW2_META_OBJ
		qcow2_assert(__find_slice(old->virt_offset(), true)
				== nullptr);
#endif
		//loading or flushing may be in-progress, that is allowed.
		//and we guarantee that the slice isn't released until
		//the loading or flushing is done
		old->set_evicted();
		add_slice_to_evicted_list(old->virt_offset(), old);

		//can't free one dirty slice, but one clean slice can't
		//be dirtied after it is evicted, so safe to move clean
		//slice into free list for release
		if (!old->get_dirty(-1))
			state.add_slice_to_free_list(old);
		old->put_ref();

#ifdef QCOW2_DEBUG
		ublk_dbg(UBLK_DBG_QCOW2_META, "%s: %s evicted from tag %d, obj %p flags %x offset %lx ref %d\n",
				__func__, old->get_id(), ioc.get_tag(), old,
				old->get_flags(), old->get_offset(),
				old->read_ref());
#endif
	}

	if (virt_offset != t->virt_offset()) {
		ublk_err( "%s %d: %s %" PRIx64 "/%" PRIx64 " parent_idx %d host_off %" PRIx64 " flags %x\n",
			__func__, __LINE__, typeid(*t).name(),
			virt_offset, t->virt_offset(), parent_idx,
			host_offset, flags);
		qcow2_assert(virt_offset == t->virt_offset());
	}

	return t;
}

template <class T>
void slice_cache<T>::add_slice_to_evicted_list(u64 virt_offset, T *t)
{
	auto it = evicted_slices.find(virt_offset);

	qcow2_assert(virt_offset == t->virt_offset());

	if (it == evicted_slices.end())
		evicted_slices[virt_offset] = t;
	else {
#if 1
		auto m = it->second;
		qcow2_log("%s: add duplicated cache virt_offset %" PRIx64 ", remove old entry(%p %lx/%lx %x %d)\n",
				__func__, virt_offset, m, m->virt_offset(),
				m->get_offset(), m->get_flags(), m->read_ref());
		it->second->show(__func__, __LINE__);
		qcow2_assert(0);
#endif

		//this slice has already been handled in prep_flushing,
		//so it is fine to remove it from the evicted list now
		evicted_slices.erase(it);
		evicted_slices[virt_offset] = t;
	}
}

template <class T>
void slice_cache<T>::dump(Qcow2State &qs) {
	auto lru_list = slices.get_lru_list_ro();

	ublk_log("cache size %zu, dirty cache size %zu\n",
			slices.size(), evicted_slices.size());

	//todo: use lrucache iterator to cut the loop time
	for (auto it = lru_list.cbegin(); it != lru_list.cend(); ++it) {
		T *t = it->second;

		if (t)
			t->dump();
	}
}

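/*
 * Pick the flush group with the most evicted slices. Slices are grouped
 * by the 512-byte block of the parent table that contains their entry
 * (parent_idx * 8 / 512), so slices whose parent-table entries share one
 * block land in the same group.
 */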
template <class T>
int slice_cache<T>::figure_group_from_dirty_list(Qcow2State &qs) {
	std::unordered_map<u32, int> cnt;
	int val = -1;
	int idx = -1;

	for (auto it = evicted_slices.cbegin(); it != evicted_slices.cend(); ++it) {
		u32 key = (it->second->parent_idx * 8) / 512;
		auto it1 = cnt.find(key);

		if (it1 == cnt.end())
			cnt[key] = 0;
		else
			cnt[key] += 1;
	}

	for (auto it = cnt.cbegin(); it != cnt.cend(); ++it) {
		if (it->second > val) {
			idx = it->first;
			val = it->second;
		}
	}

	flush_log("%s: dirty list: idx %d cnt %u\n", __func__, idx, val);

	qcow2_assert(idx != -1);
	return idx;
}

template <class T>
int slice_cache<T>::__figure_group_for_flush(Qcow2State &qs)
{
	std::unordered_map<u32, int> cnt;
	int val = -1;
	int idx = -1;
	auto lru_list = slices.get_lru_list_ro();

	//todo: use lrucache iterator to cut the loop time
	for (auto it = lru_list.cbegin(); it != lru_list.cend(); ++it) {
		T *t = it->second;

		if (t != nullptr && t->get_dirty(-1) && !t->is_flushing()) {
			u32 key = (t->parent_idx * 8) / 512;
			auto it1 = cnt.find(key);

			if (it1 == cnt.end())
				cnt[key] = 0;
			else
				cnt[key] += 1;
		}
	}

	if (cnt.size() == 0)
		return -1;

	for (auto it = cnt.cbegin(); it != cnt.cend(); ++it) {
		if (it->second > val) {
			idx = it->first;
			val = it->second;
		}
	}
	qcow2_assert(idx != -1);
	flush_log("%s: lru list: idx %d cnt %u\n", __func__, idx, val);
	return idx;
}

template <class T>
int slice_cache<T>::figure_group_for_flush(Qcow2State &qs)
{
	if (evicted_slices.size() > 0)
		return figure_group_from_dirty_list(qs);

	return __figure_group_for_flush(qs);
}

template <class T>
bool slice_cache<T>::has_dirty_slice(Qcow2State &qs)
{
	auto lru_list = slices.get_lru_list_ro();

	//todo: use lrucache iterator to cut the loop time
	for (auto it = lru_list.cbegin(); it != lru_list.cend(); ++it) {
		T *t = it->second;

		if (t != nullptr && t->get_dirty(-1) && !t->is_flushing())
			return true;
	}

	return has_evicted_dirty_slices();
}

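/*
 * Release cached memory: delete everything on the reclaim list, then trim
 * the LRU cache down to 1/8 of the L2 slice count.
 */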
template <class T>
void slice_cache<T>::shrink(Qcow2State &qs)
{
	u32 cnt = qs.get_l2_slices_count();

	for (auto it = reclaimed_slices.cbegin();
			it != reclaimed_slices.cend(); ++it) {
		delete *it;
	}

	reclaimed_slices.clear();

	cnt >>= 3;

	//shrink the cache until only 1/8 of the slices are kept
	while (slices.size() > cnt) {
		auto t = slices.remove_last();

		delete t;
	}
}

// refcount table shouldn't be so big
Qcow2ClusterAllocator::Qcow2ClusterAllocator(Qcow2State &qs): state(qs),
	cache(REFCOUNT_BLK_SLICE_BITS, qs.header.cluster_bits,
		qs.header.cluster_bits + 3 - qs.header.refcount_order +
		QCOW2_PARA::REFCOUNT_BLK_SLICE_BITS,
		QCOW2_PARA::REFCOUNT_BLK_MAX_CACHE_BYTES),
	alloc_state({})
{
	max_alloc_states = 0;
};

Qcow2RefcountBlock* Qcow2ClusterAllocator::__find_slice(u64 key)
{
	return cache.__find_slice(key, true);
}

int Qcow2ClusterAllocator::figure_group_from_refcount_table()
{
	int ret = cache.figure_group_for_flush(state);

	if (ret == -1)
		return state.refcount_table.get_1st_dirty_blk();
	return ret;
}

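/*
 * Cluster allocation lifecycle: a new cluster is tracked in 'alloc_state'
 * as QCOW2_ALLOC_STARTED, moves to QCOW2_ALLOC_ZEROING while it is being
 * zeroed and to QCOW2_ALLOC_ZEROED once zeroing completes; the tracking
 * entry is dropped after waiters are woken (alloc_cluster_zeroed) or when
 * the mapping using the cluster has been set up (alloc_cluster_done).
 */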
void Qcow2ClusterAllocator::alloc_cluster_started(const qcow2_io_ctx_t &ioc,
		u64 cluster_offset, u8 purpose)
{
	auto it = alloc_state.find(cluster_offset);
	u32 sz;

	qcow2_assert(it == alloc_state.end());

	alloc_state[cluster_offset] = new Qcow2ClusterState(
			QCOW2_ALLOC_STARTED, purpose);

	sz = alloc_state.size();

	if (sz > max_alloc_states)
		max_alloc_states = sz;

	alloc_log("%s: offset %lx state %d purpose %d\n",
			__func__, cluster_offset,
			QCOW2_ALLOC_STARTED, purpose);
}

void Qcow2ClusterAllocator::alloc_cluster_zeroing(const qcow2_io_ctx_t &ioc,
		u64 cluster_offset)
{
	auto it = alloc_state.find(cluster_offset);

	qcow2_assert(it != alloc_state.end());

	it->second->set_state(QCOW2_ALLOC_ZEROING);

	alloc_log("%s: offset %lx state %d purpose %d\n", __func__,
			cluster_offset, it->second->get_state(),
			it->second->get_purpose());
}

void Qcow2ClusterAllocator::alloc_cluster_zeroed(const struct ublksrv_queue *q,
		int tag, u64 cluster_offset)
{
	auto it = alloc_state.find(cluster_offset);

	if (it == alloc_state.end())
		ublk_err( "%s: offset %lx\n", __func__, cluster_offset);
	qcow2_assert(it != alloc_state.end());

	it->second->set_state(QCOW2_ALLOC_ZEROED);
	alloc_log("%s: offset %lx state %d purpose %d\n", __func__,
			cluster_offset, it->second->get_state(),
			it->second->get_purpose());

	it->second->wakeup_all(q, tag);

	/* safe to remove it now */
	delete it->second;
	alloc_state.erase(it);
}

//called after the mapping is set up for this cluster
void Qcow2ClusterAllocator::alloc_cluster_done(const qcow2_io_ctx_t &ioc,
		u64 cluster_offset)
{
	auto it = alloc_state.find(cluster_offset);

	qcow2_assert(it != alloc_state.end());

	delete it->second;

	alloc_state.erase(it);
}

void Qcow2ClusterAllocator::dump_meta() {

	qcow2_log("cluster allocator %s: total allocates %" PRIu64 " clusters, bytes %" PRIu64 "KB, max states %u/%lu\n",
			__func__, alloc_cnt, (alloc_cnt <<
			state.header.cluster_bits) >> 10,
			max_alloc_states, alloc_state.size());
	state.refcount_table.dump();
	cache.dump(state);
}

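/*
 * Scan the refcount table backwards for the last allocated refcount block
 * and seed the allocator from it: the next free table index, the virtual
 * size covered by one table entry, and a rough physical-size cap that is
 * only used for runtime checks.
 */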
void Qcow2ClusterAllocator::setup() {
	long i = 0;

	for (i = (state.refcount_table.get_data_len() / 8) - 1; i >= 0; i--)
		if (state.refcount_table.get_entry(i) != 0)
			break;
	/*
	 * most of the time this entry still has slots available; otherwise
	 * allocate_cluster() will move to the next refcount block cache
	 */
	state.refcount_table.set_next_free_idx(i);

	table_entry_virt_size_bits = 2 * state.header.cluster_bits + 3 -
		state.header.refcount_order;
	slice_idx = 0;
	alloc_cnt = 0;

	//just a rough estimate, for runtime checks only
	max_physical_size = ((u64)(i + 1)) << table_entry_virt_size_bits;
}

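/*
 * Install a brand-new refcount block at refcount table slot 'idx'. The
 * block describes itself: it lives in the first cluster it covers, so
 * entry 0 is set to 1 and data allocation continues from entry 1.
 */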
void Qcow2ClusterAllocator::allocate_refcount_blk(const qcow2_io_ctx_t &ioc,
		s32 idx)
{
	Qcow2RefcountBlock *rb;
	u64 virt_offset = (u64)idx << table_entry_virt_size_bits;
	u64 host_offset = virt_offset;

	if (state.refcount_table.is_flushing(idx)) {
		state.refcount_table.add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	max_physical_size = ((u64)(idx + 1)) << table_entry_virt_size_bits;
	state.refcount_table.set_next_free_idx(idx);
	qcow2_assert(!state.refcount_table.get_entry(idx));
	state.refcount_table.set_entry(idx, host_offset);

	//track the newly allocated cluster
	alloc_cluster_started(ioc, host_offset,
			QCOW2_CLUSTER_USE::REFCOUNT_BLK);
	state.validate_cluster_use(host_offset, virt_offset,
			QCOW2_CLUSTER_USE::REFCOUNT_BLK);

	rb = cache.alloc_slice(state, ioc, virt_offset, host_offset, idx);
	qcow2_assert(rb != nullptr);
	qcow2_assert(rb->get_update() && !rb->get_evicted() &&
			!rb->is_flushing());

	//the first cluster is for this refcount block itself
	rb->set_entry(0, 1);
	rb->set_next_free_idx(1);
}

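/*
 * Allocate one free cluster and return its host offset. May throw:
 * MetaUpdateException when the caller must wait for an in-flight update
 * or flush, MetaIoException after a refcount-block load has been started.
 * The search scans the current refcount block slice for a zero entry,
 * then advances through the remaining slices, and finally installs a new
 * refcount block.
 */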
u64 Qcow2ClusterAllocator::allocate_cluster(const qcow2_io_ctx_t &ioc)
{
	Qcow2RefcountBlock *rb;
	s32 free_idx;
	u64 virt_offset, host_offset;

again:
	free_idx = state.refcount_table.get_next_free_idx();
	virt_offset = ((u64)free_idx << table_entry_virt_size_bits) +
		((u64)slice_idx << cache.get_slice_virt_size_bits());
	rb = cache.find_slice(virt_offset, true);
	if (rb == nullptr)
		goto alloc_refcount_blk;
	qcow2_assert(rb->read_ref() > 0);

check_new:
	/* the cache has been allocated & is being loaded */
	if (!rb->get_update()) {
		rb->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	//if we are being flushed, we can't touch the in-ram table,
	//so wait until the flushing is done
	if (rb->is_flushing() || rb->get_evicted()) {
		rb->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

#ifdef QCOW2_CACHE_DEBUG
	qcow2_log("%s: hit: next free %d entries %d virt_off %llx slice_idx %d\n",
			__func__, rb->get_next_free_idx(), rb->get_nr_entries(),
			virt_offset, slice_idx);
#endif
	//todo: cache the last free entry
	for (int i = rb->get_next_free_idx(); i < rb->get_nr_entries(); i++) {
		if (i < 0)
			continue;
		//qcow2_log("\t entry[%d]=%llx\n", i, rb->get_entry(i));
		if (rb->get_entry_fast(i) == 0) {
			u64 res = virt_offset + (((u64)i) <<
					state.header.cluster_bits);

			if (!rb->get_dirty(-1))
				state.meta_flushing.inc_dirtied_slice(false);
			qcow2_assert(rb->get_update() && !rb->is_flushing() &&
				!rb->get_evicted());
			rb->set_entry(i, 1);
			rb->set_next_free_idx(i + 1);

			alloc_cnt++;
			return res;
		}
	}

	if (++slice_idx < cache.get_nr_slices())
		goto again;

	// the current refcount block cache is full, so move to the next one.
	//
	// Here it is different from the l2 table cache, which is sliced;
	// the refcount blk cache size is always equal to one cluster
	qcow2_assert(free_idx < state.refcount_table.get_nr_entries());
	allocate_refcount_blk(ioc, free_idx + 1);
	slice_idx = 0;
	goto again;

alloc_refcount_blk:
	//start is the host offset of the refcount block object
	host_offset = state.refcount_table.get_entry(free_idx) +
			    + (u64(slice_idx) << cache.get_slice_size_bits());

	rb = cache.alloc_slice(state, ioc, virt_offset, host_offset, free_idx);

	/* the cluster may be allocated just in ram, no need to load */
	if (rb->get_update())
		goto check_new;

	rb->load(state, ioc, QCOW2_PARA::REFCOUNT_BLK_SLICE_BYTES, false);

	//add our tag into io_waiters, so once we get updated,
	//the current io context will be resumed when handling the cqe
	//
	//we have to call it explicitly here for both io contexts:
	//the one starting to load meta and the one waiting for in-flight meta
	rb->add_waiter(ioc.get_tag());

	//->handle_io_async() has to handle this exception
	throw MetaIoException();

	return 0;
}

// L2 mapping cache: capacity is bounded by get_l2_slices_count()
Qcow2ClusterMapping::Qcow2ClusterMapping(Qcow2State &qs): state(qs),
	cache(QCOW2_PARA::L2_TABLE_SLICE_BITS,
		qs.header.cluster_bits,
		qs.header.cluster_bits + L2_TABLE_SLICE_BITS - 3,
		qs.get_l2_slices_count() * QCOW2_PARA::L2_TABLE_SLICE_BYTES),
	cluster_bits(state.header.cluster_bits),
	l2_entries_order(state.header.cluster_bits - 3),
	max_alloc_entries(0)
{
}

Qcow2L2Table* Qcow2ClusterMapping::__find_slice(u64 key, bool use_dirty)
{
	return cache.__find_slice(key, use_dirty);
}

int Qcow2ClusterMapping::figure_group_from_l1_table()
{
	int ret = cache.figure_group_for_flush(state);

	if (ret == -1)
		return state.l1_table.get_1st_dirty_blk();
	return ret;
}

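/*
 * Allocate a cluster for a brand-new L2 table, add an L2 slice for it to
 * the cache, and publish the cluster address (with bit 63, the qcow2
 * COPIED flag, set) in the L1 entry. Concurrent allocations for the same
 * L1 index are serialized via the entry_mark_allocating() bookkeeping;
 * waiters on this L1 index are woken once the entry is marked allocated.
 */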
Qcow2L2Table *Qcow2ClusterMapping::create_and_add_l2(const qcow2_io_ctx_t &ioc,
		u64 offset)
{
	const unsigned idx = l1_idx(offset);
	u64 l1_entry = state.l1_table.get_entry(idx);
	u64 l2_cluster = -1;
	const struct ublksrv_queue *q = ublksrv_get_queue(state.dev, ioc.get_qid());
	Qcow2L2Table *l2 = nullptr;

	qcow2_assert(!state.l1_table.entry_allocated(l1_entry));

	//in case of being flushed, we can't update in-ram meta, so
	//exit and wait for flush completion
	if (state.l1_table.is_flushing(idx)) {
		state.l1_table.add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	//if someone is allocating cluster for this entry, wait until
	//the entry becomes valid or failed
	if (entry_is_allocating(offset, true)) {
		u32 owner = entry_get_alloc_owner(offset, true);

		if (owner != ioc.get_tag()) {
			state.l1_table.add_waiter_idx(ioc.get_tag(), idx);
			throw MetaUpdateException();
		}
	} else {
		//store owner into the entry for marking we are allocating, so
		//others can't allocate for this entry any more, and others
		//just need to wait until the allocation is done
		entry_mark_allocating(offset, ioc.get_tag(), true);
	}

	l2_cluster = state.cluster_allocator.allocate_cluster(ioc);
	if (l2_cluster == -1) {
		state.l1_table.set_entry(idx, 0);
	} else {
		unsigned long s_idx = cache.get_slice_idx(l2_slice_key(offset));
		u64 host_offset = l2_cluster +
			(s_idx << cache.get_slice_size_bits());

		state.cluster_allocator.alloc_cluster_started(ioc,
				l2_cluster, QCOW2_CLUSTER_USE::L2_TABLE);
		state.validate_cluster_use(l2_cluster, l2_slice_key(offset),
				QCOW2_CLUSTER_USE::L2_TABLE);
		//allocate l2 cache
		l2 = cache.alloc_slice(state, ioc, l2_slice_key(offset),
				host_offset, idx);
		l2->get_ref();
		qcow2_assert(l2->get_update());

		l2_cluster |= 1ULL << 63;
		state.l1_table.set_entry(idx, l2_cluster);
	}

	entry_mark_allocated(offset, true);
	state.l1_table.wakeup_all_idx(q, ioc.get_tag(), idx);

	return l2;
}

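/*
 * Return the L2 slice covering 'offset' (with an extra reference) when it
 * is already up to date in ram; otherwise start an async load, queue the
 * caller's tag as a waiter and throw MetaIoException so the io context is
 * resumed once the load completes.
 */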
Qcow2L2Table *Qcow2ClusterMapping::load_l2_slice(const qcow2_io_ctx_t &ioc, u64 offset,
		u64 l1_entry)
{
	const u64 slice_offset = (l2_idx(offset) << 3) &
		~(QCOW2_PARA::L2_TABLE_SLICE_BYTES - 1);
	u64 start = (l1_entry & ((1ULL << 63) - 1)) + slice_offset;
	Qcow2L2Table *l2;

	l2 = cache.alloc_slice(state, ioc, l2_slice_key(offset), start,
			l1_idx(offset));
	//start may point to a newly allocated cluster
	if (l2->get_update()) {
		l2->get_ref();
		return l2;
	}

	ublk_dbg(UBLK_DBG_QCOW2_META_L2, "cache: alloc: key %" PRIx64 " val %p, update %d\n",
			start, l2, l2->get_update());
	l2->load(state, ioc, QCOW2_PARA::L2_TABLE_SLICE_BYTES, false);
	l2->add_waiter(ioc.get_tag());
	throw MetaIoException();

	return l2;
}

//return the l2 slice object with one extra reference held
Qcow2L2Table *Qcow2ClusterMapping::create_l2_map(const qcow2_io_ctx_t &ioc,
		u64 offset, bool create_l2)
{
	u64 l1_entry = state.l1_table.get_entry_fast(l1_idx(offset));
	Qcow2L2Table *l2 = nullptr;

	if (state.l1_table.entry_allocated(l1_entry))
		return load_l2_slice(ioc, offset, l1_entry);

	if (create_l2) {
		// the l2 table isn't allocated yet, so create one and add it here
		l2 = create_and_add_l2(ioc, offset);
		if (!l2)
			ublk_err( "%s: tag %d: allocate l2 failed for %" PRIx64 "\n",
				__func__, ioc.get_tag(), offset);
	}
	return l2;
}

//virt_offset's l2 table doesn't include this entry yet, so allocate
//one cluster and install the mapping
int Qcow2ClusterMapping::build_mapping(const qcow2_io_ctx_t &ioc,
		u64 virt_offset, Qcow2L2Table *l2, u32 idx_in_slice,
		u64 *l2_entry)
{
	const struct ublksrv_queue *q = ublksrv_get_queue(state.dev, ioc.get_qid());
	u64 data_cluster = -1;
	int ret;

	qcow2_assert(l2->get_update());

	//in case of being flushed, we can't update in-ram meta, so
	//exit and wait for flush completion
	//
	//If this slice is marked as PREP_FLUSH, the dependent refcount
	//block tables are being flushed, so delay this slice update
	//until our flushing is done
	if (l2->is_flushing() || l2->get_evicted() || l2->get_prep_flush()) {
		l2->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	qcow2_assert(l2->read_ref() > 0);

	if (entry_is_allocating(virt_offset, false)) {
		u32 owner = entry_get_alloc_owner(virt_offset, false);

		if (owner != ioc.get_tag()) {
			l2->add_waiter_idx(ioc.get_tag(), idx_in_slice);
			throw MetaUpdateException();
		}
	} else {
		entry_mark_allocating(virt_offset, ioc.get_tag(), false);
	}

	data_cluster = state.cluster_allocator.allocate_cluster(ioc);
	qcow2_assert(l2->get_update() && !l2->is_flushing() &&
			!l2->get_evicted());
	if (data_cluster == -1) {
		l2->set_entry(idx_in_slice, 0);
		ret = -ENOSPC;
	} else {
		state.cluster_allocator.alloc_cluster_started(ioc,
				data_cluster, QCOW2_CLUSTER_USE::DATA);
		state.validate_cluster_use(data_cluster, virt_offset,
				QCOW2_CLUSTER_USE::DATA);
		data_cluster |= 1ULL << 63;
		*l2_entry = data_cluster;
		if (!l2->get_dirty(-1))
			state.meta_flushing.inc_dirtied_slice(true);
		l2->set_entry(idx_in_slice, data_cluster);
		ret = 0;
	}

	l2->check(state, __func__, __LINE__);

	entry_mark_allocated(virt_offset, false);
	l2->wakeup_all_idx(q, ioc.get_tag(), idx_in_slice);
	return ret;
}

//we get one extra reference of l2 when calling this function.
u64 Qcow2ClusterMapping::__map_cluster(const qcow2_io_ctx_t &ioc,
		Qcow2L2Table *l2, u64 offset, bool create_l2)
{
	const u32 idx_in_slice = ((l2_idx(offset) << 3) &
			(QCOW2_PARA::L2_TABLE_SLICE_BYTES - 1)) >> 3;
	u64 l2_entry;
	int ret;

	qcow2_assert(l2->read_ref() > 0);
	l2->check(state, __func__, __LINE__);

	/* the cache is being loaded */
	if (!l2->get_update()) {
		l2->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	l2_entry = l2->get_entry_fast(idx_in_slice);
	if (l2->entry_allocated(l2_entry))
		goto exit;

	if (!create_l2)
		return 0;

	ret = build_mapping(ioc, offset, l2, idx_in_slice, &l2_entry);
	if (ret) {
		qcow2_log("%s %d: tag %d build l2 mapping failed %d\n",
				__func__, __LINE__, ioc.get_tag(), ret);
		return 0;
	}
exit:
	qcow2_assert(l2->entry_allocated(l2_entry));
	return l2_entry & ((1ULL << 63) - 1);
}


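/*
 * Translate one guest offset into a host offset: the in-cluster part of
 * the offset is preserved, and the cluster part is resolved (or, with
 * create_l2 set, allocated) through the L2 slice cache. Compressed
 * clusters are not handled here and yield (u64)-1.
 */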
//any caller has to catch both MetaIoException and MetaUpdateException
u64 Qcow2ClusterMapping::map_cluster(const qcow2_io_ctx_t &ioc, u64 offset,
		bool create_l2)
{
	Qcow2L2Table *l2 = cache.find_slice(l2_slice_key(offset), true);
	u64 off_in_cls = offset & ((1ULL << cluster_bits) - 1);
	u64 host_off = 0;

	offset = offset & ~((1ULL << cluster_bits) - 1);

	// l2 could be freed when wakeup() is called, so the refcount
	// has to be grabbed
	if (l2) {
		l2->get_ref();
	} else {
		try {
			l2 = create_l2_map(ioc, offset, create_l2);
		} catch (MetaIoException &meta_error) {
			throw MetaIoException();
		} catch (MetaUpdateException &meta_update_error) {
			throw MetaUpdateException();
		}
	}

	if (l2 == nullptr)
		return 0;

	try {
		host_off = __map_cluster(ioc, l2, offset, create_l2);
	} catch (MetaIoException &meta_error) {
		l2->put_ref();
		throw MetaIoException();
	} catch (MetaUpdateException &meta_update_error) {
		l2->put_ref();
		throw MetaUpdateException();
	}

	l2->put_ref();

	if (host_off & QCOW_OFLAG_COMPRESSED)
		return (u64)-1;

	return host_off + off_in_cls;
}

void Qcow2ClusterMapping::dump_meta()
{
	qcow2_log("cluster mapping %s: max_alloc_entries %u/%lu\n", __func__,
			max_alloc_entries, entry_alloc.size());
	state.l1_table.dump();
	cache.dump(state);
}