// SPDX-License-Identifier: GPL-2.0
#include "qcow2.h"

Qcow2Image::Qcow2Image(const char *path): fpath(path) {
	fd = open(path, O_RDWR);
	if (fd < 0)
		ublk_err("%s: backing file %s can't be opened %d\n",
				__func__, path, fd);
	fcntl(fd, F_SETFL, O_DIRECT);
}

Qcow2Image::~Qcow2Image() {
	if (fd >= 0)
		close(fd);
}

Qcow2State::Qcow2State(const char *path, const struct ublksrv_dev *d):
	dev_info(ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(d))),
	min_bs_bits(9), dev(d), img(path), header(*this), l1_table(*this),
	refcount_table(*this), cluster_allocator(*this),
	cluster_map(*this),
	meta_io_map(dev_info->nr_hw_queues),
	meta_flushing(*this)
{
	u64 l1_bytes = get_l1_table_max_size();
	u64 ref_table_bytes = get_refcount_table_act_size();

	l1_table.load(*this, 0, l1_bytes, true);
	//l1_table.dump();

	refcount_table.load(*this, 0, ref_table_bytes, true);
	//refcount_table.dump();

	cluster_allocator.setup();
}

Qcow2State::~Qcow2State() {
}

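/*
 * Worked example (assuming the common 64KB cluster, i.e. cluster_bits = 16,
 * and 8-byte L2 entries): one L2 table holds 65536 / 8 = 8192 entries, so
 * each L1 entry covers 8192 * 64KB = 512MB of virtual space.  A 1TB image
 * then needs 2048 L1 entries, i.e. 16KB of L1 table, which is rounded up
 * to 1 << min_bs_bits and capped at QCOW_MAX_L1_SIZE below.
 */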
u32 Qcow2State::get_l1_table_max_size()
{
	u32 l2_entry_size = 8;
	u64 l2_size, res;

	l2_entry_size = header.is_extended_l2_entries() ? 16 : 8;

	l2_size = ((1 << header.cluster_bits) / l2_entry_size) <<
		header.cluster_bits;
	res = (header.get_size() + l2_size - 1) / l2_size;
	res *= 8;

	//qcow2_log("%s: cls bit %d, l2 entry size %d, l2_size %d, l1 tbl size %d\n",
	//		__func__, header.cluster_bits, l2_entry_size, l2_size, res);
	if (res < QCOW_MAX_L1_SIZE)
		return round_up(res, 1UL << min_bs_bits);
	return QCOW_MAX_L1_SIZE;
}

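/*
 * Worked example (assuming a 64KB cluster and the default refcount_order of
 * 4, i.e. 16-bit refcounts): one refcount block is a single cluster holding
 * 65536 * 8 / 16 = 32768 refcounts, so each refcount table entry covers
 * 32768 * 64KB = 2GB of host clusters.  A 1TB image then needs 512 entries
 * (4KB of refcount table), rounded up to 1 << min_bs_bits and capped at
 * QCOW_MAX_REFTABLE_SIZE below.
 */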
u32 Qcow2State::get_refcount_table_max_size()
{
	u64 blk_size, res;

	blk_size = 1ULL << (2 * header.cluster_bits + 3 - header.refcount_order);
	res = (header.get_size() + blk_size - 1) / blk_size;
	res *= 8;

	//qcow2_log("%s: cls bit %d, refcount_order %d, blk_size %llu, ref tbl size %d\n",
	//		__func__, header.cluster_bits, header.refcount_order, blk_size, res);
	if (res < QCOW_MAX_REFTABLE_SIZE)
		return round_up(res, 1UL << min_bs_bits);
	return QCOW_MAX_REFTABLE_SIZE;
}

u32 Qcow2State::get_refcount_table_act_size()
{
	u64 ref_table_bytes = header.get_refcount_table_clusters() <<
		header.cluster_bits;

	if (ref_table_bytes > get_refcount_table_max_size())
		ref_table_bytes = get_refcount_table_max_size();

	return round_up(ref_table_bytes, 1UL << min_bs_bits);
}

u64 Qcow2State::get_l1_table_offset()
{
	return header.get_l1_table_offset();
}

u64 Qcow2State::get_refcount_table_offset()
{
	return header.get_refcount_table_offset();
}

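/*
 * mapping_bytes is the total L2 table size needed to map the whole device:
 * 8 bytes of L2 entry per cluster, hence dev_size >> (cluster_bits - 3).
 * The in-ram L2 cache is capped at 32MB (the same upper bound qemu uses)
 * and is managed in slices of (1 << L2_TABLE_SLICE_BITS) bytes each.
 */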
u32 Qcow2State::get_l2_slices_count()
{
	u32 mapping_bytes = get_dev_size() >> (header.cluster_bits - 3);

	//align with qemu, at most 32MB
	if (mapping_bytes > (32U << 20))
		mapping_bytes = 32U << 20;

	return mapping_bytes >> QCOW2_PARA::L2_TABLE_SLICE_BITS;
}

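/*
 * Store @m into the first free slot of this queue's meta io table (or
 * append a new slot) and return the slot index; the index is presumably
 * used later to find the mapping meta again when its io completes.
 */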
u32 Qcow2State::add_meta_io(u32 qid, Qcow2MappingMeta *m)
{
	struct meta_mapping *map = &meta_io_map[qid];
	std::vector<Qcow2MappingMeta *> &v = map->meta;
	int i;

	for (i = 0; i < v.size(); i++)
		if (v[i] == nullptr)
			break;
	if (i < v.size()) {
		v[i] = m;
	} else {
		v.push_back(m);
		i = v.size() - 1;
	}

	map->nr += 1;

	return i;
}

bool Qcow2State::has_dirty_slice()
{
	return cluster_map.cache.has_dirty_slice(*this) ||
		cluster_allocator.cache.has_dirty_slice(*this);
}

void Qcow2State::reclaim_slice(Qcow2SliceMeta *m)
{
	if (m->is_mapping_meta()) {
		Qcow2L2Table *t =
			static_cast<Qcow2L2Table *>(m);

		cluster_map.cache.add_slice_to_reclaim_list(t);
	} else {
		Qcow2RefcountBlock *t =
			static_cast<Qcow2RefcountBlock *>(m);

		cluster_allocator.cache.add_slice_to_reclaim_list(t);
	}
}

void Qcow2State::remove_slice_from_evicted_list(Qcow2SliceMeta *m)
{
	if (m->is_mapping_meta()) {
		Qcow2L2Table *t =
			static_cast<Qcow2L2Table *>(m);

		cluster_map.cache.remove_slice_from_evicted_list(t);
	} else {
		Qcow2RefcountBlock *t =
			static_cast<Qcow2RefcountBlock *>(m);

		cluster_allocator.cache.remove_slice_from_evicted_list(t);
	}
}

void Qcow2State::dump_meta()
{
	cluster_allocator.dump_meta();
	cluster_map.dump_meta();
	meta_flushing.dump();
}

//todo: allocate slices from reclaim_slices
void Qcow2State::kill_slices(const struct ublksrv_queue *q)
{
	std::vector<Qcow2SliceMeta *> tmp(std::move(freed_slices));

	if (tmp.empty())
		return;

	qcow2_assert(!tmp.empty() && freed_slices.empty());

	//newly added slices can't be freed from ->wakeup_all()
	for (auto it = tmp.cbegin(); it != tmp.cend(); ++it) {
		auto m = *it;

		m->put_ref();
	}
}

void Qcow2State::shrink_cache()
{
	cluster_map.cache.shrink(*this);
	cluster_allocator.cache.shrink(*this);
}

#ifdef DEBUG_QCOW2_META_VALIDATE
void Qcow2State::validate_cluster_use(u64 host_off, u64 virt_off, u32 use) {
	auto it = cluster_use.find(host_off);

	if (it == cluster_use.end())
		cluster_use[host_off] = ((u64)use << 56) | virt_off;
	else {
		qcow2_log("%s: duplicated cluster assignment host off "
				"%llx, virt_off %llx use %d, old entry %llx\n",
				__func__, host_off, virt_off, use,
				it->second);
		qcow2_assert(0);
	}
}

// call it for each entry before flushing the slice
bool Qcow2State::validate_cluster_map(u64 host_off, u64 virt_off) {
	auto it = cluster_validate_map.find(host_off);

	if (it == cluster_validate_map.end()) {
		cluster_validate_map[host_off] = virt_off;
		return true;
	}

	if (virt_off == it->second)
		return true;

	qcow2_log("%s: duplicated cluster assignment host off "
			"%llx, virt_off %llx old virt_offset %llx\n",
			__func__, host_off, virt_off, it->second);
	return false;
}
#endif

/* Make any kind of Qcow2State; so far only the plain one is supported */
Qcow2State *make_qcow2state(const char *file, struct ublksrv_dev *dev)
{
	return new Qcow2StatePlain(file, dev);
}

template <class T>
slice_cache<T>::slice_cache(u8 slice_bits, u8 cluster_bits, u8 slice_virt_bits,
		u32 max_size):
	slice_size_bits(slice_bits),
	cluster_size_bits(cluster_bits),
	slice_virt_size_bits(slice_virt_bits),
	slices(max_size >> slice_bits),
	evicted_slices({})
{
}

template <class T>
T *slice_cache<T>::__find_slice(u64 key, bool use_evicted_cache) {
	T *t = slices.__get(key);

	if (t)
		return t;

	if (use_evicted_cache) {
		auto it = evicted_slices.find(key);

		if (it != evicted_slices.end())
			return it->second;
	}
	return nullptr;
}

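/*
 * Allocate (or recycle from the reclaim list) one cache slice for
 * @virt_offset backed by @host_offset.  When the backing cluster isn't
 * known to be zeroed on disk yet (a fresh allocation), the slice buffer is
 * zeroed in ram and marked updated + dirty, so nothing has to be read
 * back; otherwise the caller is expected to load() the slice.  Inserting
 * into the LRU cache may evict an older slice, which is parked on the
 * evicted list until any in-flight load or flush on it completes.
 */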
template <class T>
T *slice_cache<T>::alloc_slice(Qcow2State &state, const qcow2_io_ctx_t &ioc,
		u64 virt_offset, u64 host_offset, u32 parent_idx)
{
	T *t;
	u32 flags;
	bool zero_buf;

	qcow2_assert(__find_slice(virt_offset, true) == nullptr);
	qcow2_assert(!(virt_offset & ((1ULL << cluster_size_bits) - 1)));

	if (!state.cluster_allocator.alloc_cluster_is_zeroed(host_offset &
			~((1ULL << cluster_size_bits) - 1))) {
		flags = QCOW2_META_UPDATE | QCOW2_META_DIRTY;
		zero_buf = true;
	} else {
		flags = 0;
		zero_buf = false;
	}

	t = pick_slice_from_reclaim_list();
	if (t == nullptr)
		t = new T(state, host_offset, parent_idx, flags);
	else
		t->reset(state, host_offset, parent_idx, flags);

	if (t->get_dirty(-1))
		state.meta_flushing.inc_dirtied_slice(t->is_mapping_meta());

	if (zero_buf)
		t->zero_buf();

	T *old = slices.put(virt_offset, t);
	if (old) {
#ifdef DEBUG_QCOW2_META_OBJ
		qcow2_assert(__find_slice(old->virt_offset(), true)
				== nullptr);
#endif
		//loading or flushing may be in progress, which is allowed,
		//and we guarantee that the slice isn't released until
		//the loading or flushing is done
		old->set_evicted();
		add_slice_to_evicted_list(old->virt_offset(), old);

		//a dirty slice can't be freed, but a clean slice can't be
		//dirtied after it is evicted, so it is safe to move a clean
		//slice into the free list for release
		if (!old->get_dirty(-1))
			state.add_slice_to_free_list(old);
		old->put_ref();

#ifdef QCOW2_DEBUG
		ublk_dbg(UBLK_DBG_QCOW2_META, "%s: %s evicted from tag %d, obj %p flags %x offset %lx ref %d\n",
				__func__, old->get_id(), ioc.get_tag(), old,
				old->get_flags(), old->get_offset(),
				old->read_ref());
#endif
	}

	if (virt_offset != t->virt_offset()) {
		ublk_err("%s %d: %s %" PRIx64 "/%" PRIx64 " parent_idx %d host_off %" PRIx64 " flags %x\n",
				__func__, __LINE__, typeid(*t).name(),
				virt_offset, t->virt_offset(), parent_idx,
				host_offset, flags);
		qcow2_assert(virt_offset == t->virt_offset());
	}

	return t;
}

template <class T>
void slice_cache<T>::add_slice_to_evicted_list(u64 virt_offset, T *t)
{
	auto it = evicted_slices.find(virt_offset);

	qcow2_assert(virt_offset == t->virt_offset());

	if (it == evicted_slices.end())
		evicted_slices[virt_offset] = t;
	else {
#if 1
		auto m = it->second;
		qcow2_log("%s: add duplicated cache virt_offset %" PRIx64 ", remove old entry(%p %lx/%lx %x %d)\n",
				__func__, virt_offset, m, m->virt_offset(),
				m->get_offset(), m->get_flags(), m->read_ref());
		it->second->show(__func__, __LINE__);
		qcow2_assert(0);
#endif

		//this slice has already been handled in prep_flushing,
		//so it is fine to remove the old entry now
		evicted_slices.erase(it);
		evicted_slices[virt_offset] = t;
	}
}

template <class T>
void slice_cache<T>::dump(Qcow2State &qs) {
	auto lru_list = slices.get_lru_list_ro();

	ublk_log("cache size %zu, dirty cache size %zu\n",
			slices.size(), evicted_slices.size());

	//todo: use lrucache iterator to cut the loop time
	for (auto it = lru_list.cbegin(); it != lru_list.cend(); ++it) {
		T *t = it->second;

		if (t)
			t->dump();
	}
}

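/*
 * Group the evicted (dirty) slices by which 512-byte block of the parent
 * table their entries live in (each parent entry is 8 bytes, hence
 * parent_idx * 8 / 512) and return the most populated group, so that
 * flushing one parent-table block covers as many dirty slices as possible.
 */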
template <class T>
int slice_cache<T>::figure_group_from_dirty_list(Qcow2State &qs) {
	std::unordered_map<u32, int> cnt;
	int val = -1;
	int idx = -1;

	for (auto it = evicted_slices.cbegin(); it != evicted_slices.cend(); ++it) {
		u32 key = (it->second->parent_idx * 8) / 512;
		auto it1 = cnt.find(key);

		if (it1 == cnt.end())
			cnt[key] = 0;
		else
			cnt[key] += 1;
	}

	for (auto it = cnt.cbegin(); it != cnt.cend(); ++it) {
		if (it->second > val) {
			idx = it->first;
			val = it->second;
		}
	}

	flush_log("%s: dirty list: idx %d cnt %u\n", __func__, idx, val);

	qcow2_assert(idx != -1);
	return idx;
}

template <class T>
int slice_cache<T>::__figure_group_for_flush(Qcow2State &qs)
{
	std::unordered_map<u32, int> cnt;
	int val = -1;
	int idx = -1;
	auto lru_list = slices.get_lru_list_ro();

	//todo: use lrucache iterator to cut the loop time
	for (auto it = lru_list.cbegin(); it != lru_list.cend(); ++it) {
		T *t = it->second;

		if (t != nullptr && t->get_dirty(-1) && !t->is_flushing()) {
			u32 key = (t->parent_idx * 8) / 512;
			auto it1 = cnt.find(key);

			if (it1 == cnt.end())
				cnt[key] = 0;
			else
				cnt[key] += 1;
		}
	}

	if (cnt.size() == 0)
		return -1;

	for (auto it = cnt.cbegin(); it != cnt.cend(); ++it) {
		if (it->second > val) {
			idx = it->first;
			val = it->second;
		}
	}
	qcow2_assert(idx != -1);
	flush_log("%s: lru list: idx %d cnt %u\n", __func__, idx, val);
	return idx;
}

template <class T>
int slice_cache<T>::figure_group_for_flush(Qcow2State &qs)
{
	if (evicted_slices.size() > 0)
		return figure_group_from_dirty_list(qs);

	return __figure_group_for_flush(qs);
}

template <class T>
bool slice_cache<T>::has_dirty_slice(Qcow2State &qs)
{
	auto lru_list = slices.get_lru_list_ro();

	//todo: use lrucache iterator to cut the loop time
	for (auto it = lru_list.cbegin(); it != lru_list.cend(); ++it) {
		T *t = it->second;

		if (t != nullptr && t->get_dirty(-1) && !t->is_flushing())
			return true;
	}

	return has_evicted_dirty_slices();
}

template <class T>
void slice_cache<T>::shrink(Qcow2State &qs)
{
	u32 cnt = qs.get_l2_slices_count();

	for (auto it = reclaimed_slices.cbegin();
			it != reclaimed_slices.cend(); ++it) {
		delete *it;
	}

	reclaimed_slices.clear();

	cnt >>= 3;

	//shrink the cache until only 1/8 of the slices are kept
	while (slices.size() > cnt) {
		auto t = slices.remove_last();

		delete t;
	}
}

// the refcount table shouldn't be so big
Qcow2ClusterAllocator::Qcow2ClusterAllocator(Qcow2State &qs): state(qs),
	cache(REFCOUNT_BLK_SLICE_BITS, qs.header.cluster_bits,
		qs.header.cluster_bits + 3 - qs.header.refcount_order +
		QCOW2_PARA::REFCOUNT_BLK_SLICE_BITS,
		QCOW2_PARA::REFCOUNT_BLK_MAX_CACHE_BYTES),
	alloc_state({})
{
	max_alloc_states = 0;
}

Qcow2RefcountBlock* Qcow2ClusterAllocator::__find_slice(u64 key)
{
	return cache.__find_slice(key, true);
}

int Qcow2ClusterAllocator::figure_group_from_refcount_table()
{
	int ret = cache.figure_group_for_flush(state);

	if (ret == -1)
		return state.refcount_table.get_1st_dirty_blk();
	return ret;
}

void Qcow2ClusterAllocator::alloc_cluster_started(const qcow2_io_ctx_t &ioc,
		u64 cluster_offset, u8 purpose)
{
	auto it = alloc_state.find(cluster_offset);
	u32 sz;

	qcow2_assert(it == alloc_state.end());

	alloc_state[cluster_offset] = new Qcow2ClusterState(
			QCOW2_ALLOC_STARTED, purpose);

	sz = alloc_state.size();

	if (sz > max_alloc_states)
		max_alloc_states = sz;

	alloc_log("%s: offset %lx state %d purpose %d\n",
			__func__, cluster_offset,
			QCOW2_ALLOC_STARTED, purpose);
}

void Qcow2ClusterAllocator::alloc_cluster_zeroing(const qcow2_io_ctx_t &ioc,
		u64 cluster_offset)
{
	auto it = alloc_state.find(cluster_offset);

	qcow2_assert(it != alloc_state.end());

	it->second->set_state(QCOW2_ALLOC_ZEROING);

	alloc_log("%s: offset %lx state %d purpose %d\n", __func__,
			cluster_offset, it->second->get_state(),
			it->second->get_purpose());
}

void Qcow2ClusterAllocator::alloc_cluster_zeroed(const struct ublksrv_queue *q,
		int tag, u64 cluster_offset)
{
	auto it = alloc_state.find(cluster_offset);

	if (it == alloc_state.end())
		ublk_err("%s: offset %lx\n", __func__, cluster_offset);
	qcow2_assert(it != alloc_state.end());

	it->second->set_state(QCOW2_ALLOC_ZEROED);
	alloc_log("%s: offset %lx state %d purpose %d\n", __func__,
			cluster_offset, it->second->get_state(),
			it->second->get_purpose());

	it->second->wakeup_all(q, tag);

	/* safe to remove it now */
	delete it->second;
	alloc_state.erase(it);
}

//called after the mapping is set up for this cluster
void Qcow2ClusterAllocator::alloc_cluster_done(const qcow2_io_ctx_t &ioc,
		u64 cluster_offset)
{
	auto it = alloc_state.find(cluster_offset);

	qcow2_assert(it != alloc_state.end());

	delete it->second;

	alloc_state.erase(it);
}

void Qcow2ClusterAllocator::dump_meta() {

	qcow2_log("cluster allocator %s: total allocates %" PRIu64 " clusters, bytes %" PRIu64 "KB, max states %u/%lu\n",
			__func__, alloc_cnt, (alloc_cnt <<
			state.header.cluster_bits) >> 10,
			max_alloc_states, alloc_state.size());
	state.refcount_table.dump();
	cache.dump(state);
}

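/*
 * Scan the refcount table backwards for the last allocated refcount block:
 * that is where cluster allocation resumes, and (i + 1) blocks also give a
 * rough upper bound of the currently allocated physical size.
 */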
void Qcow2ClusterAllocator::setup() {
	long i = 0;

	for (i = (state.refcount_table.get_data_len() / 8) - 1; i >= 0; i--)
		if (state.refcount_table.get_entry(i) != 0)
			break;
	/*
	 * most of the time this entry still has free slots; otherwise
	 * allocate_cluster() will move to the next refcount block cache
	 */
	state.refcount_table.set_next_free_idx(i);

	table_entry_virt_size_bits = 2 * state.header.cluster_bits + 3 -
		state.header.refcount_order;
	slice_idx = 0;
	alloc_cnt = 0;

	//just an estimate, for runtime checks only
	max_physical_size = ((u64)(i + 1)) << table_entry_virt_size_bits;
}

void Qcow2ClusterAllocator::allocate_refcount_blk(const qcow2_io_ctx_t &ioc,
		s32 idx)
{
	Qcow2RefcountBlock *rb;
	u64 virt_offset = (u64)idx << table_entry_virt_size_bits;
	u64 host_offset = virt_offset;

	if (state.refcount_table.is_flushing(idx)) {
		state.refcount_table.add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	max_physical_size = ((u64)(idx + 1)) << table_entry_virt_size_bits;
	state.refcount_table.set_next_free_idx(idx);
	qcow2_assert(!state.refcount_table.get_entry(idx));
	state.refcount_table.set_entry(idx, host_offset);

	//track the newly allocated cluster
	alloc_cluster_started(ioc, host_offset,
			QCOW2_CLUSTER_USE::REFCOUNT_BLK);
	state.validate_cluster_use(host_offset, virt_offset,
			QCOW2_CLUSTER_USE::REFCOUNT_BLK);

	rb = cache.alloc_slice(state, ioc, virt_offset, host_offset, idx);
	qcow2_assert(rb != nullptr);
	qcow2_assert(rb->get_update() && !rb->get_evicted() &&
			!rb->is_flushing());

	//the first cluster is used by this refcount block itself
	rb->set_entry(0, 1);
	rb->set_next_free_idx(1);
}

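/*
 * Allocate one free cluster and return its host offset:
 *
 * 1) walk the refcount block slice selected by the refcount table's
 *    next-free index plus the current slice_idx, and grab the first entry
 *    whose refcount is zero;
 * 2) if that slice isn't cached yet, allocate the cache slice and, when it
 *    has to be read from disk, start the load and throw MetaIoException so
 *    the io context is retried once the load completes;
 * 3) if every slice of the current refcount block is used up, allocate a
 *    brand-new refcount block via allocate_refcount_blk() and retry.
 *
 * MetaUpdateException is thrown whenever the needed meta is still being
 * loaded, flushed or evicted, and the caller simply has to wait.
 */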
u64 Qcow2ClusterAllocator::allocate_cluster(const qcow2_io_ctx_t &ioc)
{
	Qcow2RefcountBlock *rb;
	s32 free_idx;
	u64 virt_offset, host_offset;

again:
	free_idx = state.refcount_table.get_next_free_idx();
	virt_offset = ((u64)free_idx << table_entry_virt_size_bits) +
		((u64)slice_idx << cache.get_slice_virt_size_bits());
	rb = cache.find_slice(virt_offset, true);
	if (rb == nullptr)
		goto alloc_refcount_blk;
	qcow2_assert(rb->read_ref() > 0);

check_new:
	/* the cache slice has been allocated and is being loaded */
	if (!rb->get_update()) {
		rb->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	//if the slice is being flushed, we can't touch the in-ram table,
	//so wait until the flushing is done
	if (rb->is_flushing() || rb->get_evicted()) {
		rb->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

#ifdef QCOW2_CACHE_DEBUG
	qcow2_log("%s: hit: next free %d entries %d virt_off %llx slice_idx %d\n",
			__func__, rb->get_next_free_idx(), rb->get_nr_entries(),
			virt_offset, slice_idx);
#endif
	//todo: cache the last free entry
	for (int i = rb->get_next_free_idx(); i < rb->get_nr_entries(); i++) {
		if (i < 0)
			continue;
		//qcow2_log("\t entry[%d]=%llx\n", i, rb->get_entry(i));
		if (rb->get_entry_fast(i) == 0) {
			u64 res = virt_offset + (((u64)i) <<
					state.header.cluster_bits);

			if (!rb->get_dirty(-1))
				state.meta_flushing.inc_dirtied_slice(false);
			qcow2_assert(rb->get_update() && !rb->is_flushing() &&
					!rb->get_evicted());
			rb->set_entry(i, 1);
			rb->set_next_free_idx(i + 1);

			alloc_cnt++;
			return res;
		}
	}

	if (++slice_idx < cache.get_nr_slices())
		goto again;

	// the current refcount block is full, so move to the next one.
	//
	// Here it is different from the l2 table's cache, which is sliced,
	// while the refcount blk cache size is always equal to one cluster
	qcow2_assert(free_idx < state.refcount_table.get_nr_entries());
	allocate_refcount_blk(ioc, free_idx + 1);
	slice_idx = 0;
	goto again;

alloc_refcount_blk:
	//start is the host offset of the refcount block object
	host_offset = state.refcount_table.get_entry(free_idx) +
		(u64(slice_idx) << cache.get_slice_size_bits());

	rb = cache.alloc_slice(state, ioc, virt_offset, host_offset, free_idx);

	/* the cluster may have been allocated just in ram, no need to load */
	if (rb->get_update())
		goto check_new;

	rb->load(state, ioc, QCOW2_PARA::REFCOUNT_BLK_SLICE_BYTES, false);

	//add our tag into io_waiters, so once the slice gets updated,
	//the current io context will be resumed when handling the cqe
	//
	//we have to call it explicitly here for both io contexts: the one
	//which starts to load the meta and the ones waiting for in-flight meta
	rb->add_waiter(ioc.get_tag());

	//->handle_io_async() has to handle this exception
	throw MetaIoException();

	return 0;
}

// the L2 table cache is sliced and capped at 32MB (see get_l2_slices_count())
Qcow2ClusterMapping::Qcow2ClusterMapping(Qcow2State &qs): state(qs),
	cache(QCOW2_PARA::L2_TABLE_SLICE_BITS,
		qs.header.cluster_bits,
		qs.header.cluster_bits + L2_TABLE_SLICE_BITS - 3,
		qs.get_l2_slices_count() * QCOW2_PARA::L2_TABLE_SLICE_BYTES),
	cluster_bits(state.header.cluster_bits),
	l2_entries_order(state.header.cluster_bits - 3),
	max_alloc_entries(0)
{
}

Qcow2L2Table* Qcow2ClusterMapping::__find_slice(u64 key, bool use_dirty)
{
	return cache.__find_slice(key, use_dirty);
}

int Qcow2ClusterMapping::figure_group_from_l1_table()
{
	int ret = cache.figure_group_for_flush(state);

	if (ret == -1)
		return state.l1_table.get_1st_dirty_blk();
	return ret;
}

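/*
 * No L2 table exists for this L1 index yet: allocate one cluster for it
 * and install the L1 entry.  Only one io context may allocate for a given
 * L1 entry at a time; the allocating context stores its tag as the entry's
 * owner, and every other context adds itself as a waiter and retries after
 * the allocation finishes.
 */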
Qcow2L2Table *Qcow2ClusterMapping::create_and_add_l2(const qcow2_io_ctx_t &ioc,
		u64 offset)
{
	const unsigned idx = l1_idx(offset);
	u64 l1_entry = state.l1_table.get_entry(idx);
	u64 l2_cluster = -1;
	const struct ublksrv_queue *q = ublksrv_get_queue(state.dev, ioc.get_qid());
	Qcow2L2Table *l2 = nullptr;

	qcow2_assert(!state.l1_table.entry_allocated(l1_entry));

	//in case of being flushed, we can't update the in-ram meta, so
	//exit and wait for flush completion
	if (state.l1_table.is_flushing(idx)) {
		state.l1_table.add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	//if someone else is allocating a cluster for this entry, wait until
	//the entry becomes valid or the allocation fails
	if (entry_is_allocating(offset, true)) {
		u32 owner = entry_get_alloc_owner(offset, true);

		if (owner != ioc.get_tag()) {
			state.l1_table.add_waiter_idx(ioc.get_tag(), idx);
			throw MetaUpdateException();
		}
	} else {
		//store the owner into the entry to mark that we are
		//allocating, so no one else can allocate for this entry any
		//more; they just need to wait until the allocation is done
		entry_mark_allocating(offset, ioc.get_tag(), true);
	}

	l2_cluster = state.cluster_allocator.allocate_cluster(ioc);
	if (l2_cluster == -1) {
		state.l1_table.set_entry(idx, 0);
	} else {
		unsigned long s_idx = cache.get_slice_idx(l2_slice_key(offset));
		u64 host_offset = l2_cluster +
			(s_idx << cache.get_slice_size_bits());

		state.cluster_allocator.alloc_cluster_started(ioc,
				l2_cluster, QCOW2_CLUSTER_USE::L2_TABLE);
		state.validate_cluster_use(l2_cluster, l2_slice_key(offset),
				QCOW2_CLUSTER_USE::L2_TABLE);
		//allocate the l2 cache slice
		l2 = cache.alloc_slice(state, ioc, l2_slice_key(offset),
				host_offset, idx);
		l2->get_ref();
		qcow2_assert(l2->get_update());

		l2_cluster |= 1ULL << 63;
		state.l1_table.set_entry(idx, l2_cluster);
	}

	entry_mark_allocated(offset, true);
	state.l1_table.wakeup_all_idx(q, ioc.get_tag(), idx);

	return l2;
}

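/*
 * The L1 entry is already valid: work out which L2_TABLE_SLICE_BYTES slice
 * of the L2 table covers @offset, allocate a cache slice for it, and if it
 * isn't up to date in ram, start an async load and throw MetaIoException
 * so the io is retried once the slice has been loaded.
 */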
Qcow2L2Table *Qcow2ClusterMapping::load_l2_slice(const qcow2_io_ctx_t &ioc, u64 offset,
		u64 l1_entry)
{
	const u64 slice_offset = (l2_idx(offset) << 3) &
		~(QCOW2_PARA::L2_TABLE_SLICE_BYTES - 1);
	u64 start = (l1_entry & ((1ULL << 63) - 1)) + slice_offset;
	Qcow2L2Table *l2;

	l2 = cache.alloc_slice(state, ioc, l2_slice_key(offset), start,
			l1_idx(offset));
	//start may point to a newly allocated cluster
	if (l2->get_update()) {
		l2->get_ref();
		return l2;
	}

	ublk_dbg(UBLK_DBG_QCOW2_META_L2, "cache: alloc: key %" PRIx64 " val %p, update %d\n",
			start, l2, l2->get_update());
	l2->load(state, ioc, QCOW2_PARA::L2_TABLE_SLICE_BYTES, false);
	l2->add_waiter(ioc.get_tag());
	throw MetaIoException();

	return l2;
}

//return the l2 slice object while holding one extra reference
Qcow2L2Table *Qcow2ClusterMapping::create_l2_map(const qcow2_io_ctx_t &ioc,
		u64 offset, bool create_l2)
{
	u64 l1_entry = state.l1_table.get_entry_fast(l1_idx(offset));
	Qcow2L2Table *l2 = nullptr;

	if (state.l1_table.entry_allocated(l1_entry))
		return load_l2_slice(ioc, offset, l1_entry);

	if (create_l2) {
		// the l2 table isn't allocated yet, so create one and add it here
		l2 = create_and_add_l2(ioc, offset);
		if (!l2)
			ublk_err("%s: tag %d: allocate l2 failed for %" PRIx64 "\n",
					__func__, ioc.get_tag(), offset);
	}
	return l2;
}

//virt_offset's l2 table doesn't include this entry yet, so allocate
//one cluster and install the mapping
int Qcow2ClusterMapping::build_mapping(const qcow2_io_ctx_t &ioc,
		u64 virt_offset, Qcow2L2Table *l2, u32 idx_in_slice,
		u64 *l2_entry)
{
	const struct ublksrv_queue *q = ublksrv_get_queue(state.dev, ioc.get_qid());
	u64 data_cluster = -1;
	int ret;

	qcow2_assert(l2->get_update());

	//in case of being flushed, we can't update the in-ram meta, so
	//exit and wait for flush completion
	//
	//If this slice is marked as PREP_FLUSH, the dependent refcount
	//block tables are being flushed, so delay this slice update
	//until the flushing is done
	if (l2->is_flushing() || l2->get_evicted() || l2->get_prep_flush()) {
		l2->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	qcow2_assert(l2->read_ref() > 0);

	if (entry_is_allocating(virt_offset, false)) {
		u32 owner = entry_get_alloc_owner(virt_offset, false);

		if (owner != ioc.get_tag()) {
			l2->add_waiter_idx(ioc.get_tag(), idx_in_slice);
			throw MetaUpdateException();
		}
	} else {
		entry_mark_allocating(virt_offset, ioc.get_tag(), false);
	}

	data_cluster = state.cluster_allocator.allocate_cluster(ioc);
	qcow2_assert(l2->get_update() && !l2->is_flushing() &&
			!l2->get_evicted());
	if (data_cluster == -1) {
		l2->set_entry(idx_in_slice, 0);
		ret = -ENOSPC;
	} else {
		state.cluster_allocator.alloc_cluster_started(ioc,
				data_cluster, QCOW2_CLUSTER_USE::DATA);
		state.validate_cluster_use(data_cluster, virt_offset,
				QCOW2_CLUSTER_USE::DATA);
		data_cluster |= 1ULL << 63;
		*l2_entry = data_cluster;
		if (!l2->get_dirty(-1))
			state.meta_flushing.inc_dirtied_slice(true);
		l2->set_entry(idx_in_slice, data_cluster);
		ret = 0;
	}

	l2->check(state, __func__, __LINE__);

	entry_mark_allocated(virt_offset, false);
	l2->wakeup_all_idx(q, ioc.get_tag(), idx_in_slice);
	return ret;
}

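/*
 * Resolve @offset inside an already cached L2 slice: idx_in_slice is the
 * entry index within the slice (the low bits of l2_idx() once the slice
 * offset is masked off).  If the entry isn't mapped yet and create_l2 is
 * set, build_mapping() allocates a data cluster and installs the entry.
 */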
//we get one extra reference of l2 when calling this function.
u64 Qcow2ClusterMapping::__map_cluster(const qcow2_io_ctx_t &ioc,
		Qcow2L2Table *l2, u64 offset, bool create_l2)
{
	const u32 idx_in_slice = ((l2_idx(offset) << 3) &
			(QCOW2_PARA::L2_TABLE_SLICE_BYTES - 1)) >> 3;
	u64 l2_entry;
	int ret;

	qcow2_assert(l2->read_ref() > 0);
	l2->check(state, __func__, __LINE__);

	/* the cache is being loaded */
	if (!l2->get_update()) {
		l2->add_waiter(ioc.get_tag());
		throw MetaUpdateException();
	}

	l2_entry = l2->get_entry_fast(idx_in_slice);
	if (l2->entry_allocated(l2_entry))
		goto exit;

	if (!create_l2)
		return 0;

	ret = build_mapping(ioc, offset, l2, idx_in_slice, &l2_entry);
	if (ret) {
		qcow2_log("%s %d: tag %d build l2 mapping failed %d\n",
				__func__, __LINE__, ioc.get_tag(), ret);
		return 0;
	}
exit:
	qcow2_assert(l2->entry_allocated(l2_entry));
	return l2_entry & ((1ULL << 63) - 1);
}


//any caller has to catch both MetaIoException and MetaUpdateException
u64 Qcow2ClusterMapping::map_cluster(const qcow2_io_ctx_t &ioc, u64 offset,
		bool create_l2)
{
	Qcow2L2Table *l2 = cache.find_slice(l2_slice_key(offset), true);
	u64 off_in_cls = offset & ((1ULL << cluster_bits) - 1);
	u64 host_off = 0;

	offset = offset & ~((1ULL << cluster_bits) - 1);

	// l2 could be freed when wakeup() is called, so the refcount
	// has to be grabbed
	if (l2) {
		l2->get_ref();
	} else {
		try {
			l2 = create_l2_map(ioc, offset, create_l2);
		} catch (MetaIoException &meta_error) {
			throw MetaIoException();
		} catch (MetaUpdateException &meta_update_error) {
			throw MetaUpdateException();
		}
	}

	if (l2 == nullptr)
		return 0;

	try {
		host_off = __map_cluster(ioc, l2, offset, create_l2);
	} catch (MetaIoException &meta_error) {
		l2->put_ref();
		throw MetaIoException();
	} catch (MetaUpdateException &meta_update_error) {
		l2->put_ref();
		throw MetaUpdateException();
	}

	l2->put_ref();

	if (host_off & QCOW_OFLAG_COMPRESSED)
		return (u64)-1;

	return host_off + off_in_cls;
}

void Qcow2ClusterMapping::dump_meta()
{
	qcow2_log("cluster mapping %s: max_alloc_entries %u/%lu\n", __func__,
			max_alloc_entries, entry_alloc.size());
	state.l1_table.dump();
	cache.dump(state);
}