// SPDX-License-Identifier: GPL-2.0
#include "ublksrv_tgt.h"
#include "qcow2_format.h"
#include "qcow2.h"

#define HEADER_SIZE 512
#define QCOW2_UNMAPPED (u64)(-1)

static int qcow2_init_tgt(struct ublksrv_dev *dev, int type, int argc, char
                *argv[])
{
        struct ublksrv_tgt_info *tgt = &dev->tgt;
        const struct ublksrv_ctrl_dev_info *info =
                ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(dev));
        static const struct option lo_longopts[] = {
                { "file", 1, NULL, 'f' },
                { NULL }
        };
        int jbuf_size;
        char *jbuf;
        int fd, opt, ret;
        void *header_buf;
        QCowHeader *header;
        char *file = NULL;
        struct ublksrv_tgt_base_json tgt_json = {
                .type = type,
        };
        struct ublk_params p = {
                .types = UBLK_PARAM_TYPE_BASIC,
                .basic = {
                        //.attrs = UBLK_ATTR_READ_ONLY,
                        .logical_bs_shift = 9,
                        .physical_bs_shift = 12,
                        .io_opt_shift = 12,
                        .io_min_shift = 9,
                        .max_sectors = info->max_io_buf_bytes >> 9,
                },
        };
        Qcow2State *qs;

        /* qcow2 doesn't support user copy yet */
        if (info->flags & UBLK_F_USER_COPY)
                return -EINVAL;

        //1024 queue depth is enough for qcow2, so that the tag and the
        //l1 entry index can be stored in a single u32 variable
        if (info->queue_depth > QCOW2_MAX_QUEUE_DEPTH)
                return -EINVAL;

        //qcow2 target doesn't support MQ yet
        if (info->nr_hw_queues > 1)
                return -EINVAL;

        strcpy(tgt_json.name, "qcow2");

        if (type != UBLKSRV_TGT_TYPE_QCOW2)
                return -EINVAL;

        while ((opt = getopt_long(argc, argv, "-:f:",
                                lo_longopts, NULL)) != -1) {
                switch (opt) {
                case 'f':
                        file = strdup(optarg);
                        break;
                }
        }

        if (!file)
                return -EINVAL;

        if (posix_memalign((void **)&header_buf, 512, HEADER_SIZE))
                return -EINVAL;

        header = (QCowHeader *)header_buf;
        fd = open(file, O_RDWR);
        if (fd < 0) {
                ublk_err("%s: backing file %s can't be opened\n",
                                __func__, file);
                return -EINVAL;
        }

        if (fcntl(fd, F_SETFL, O_DIRECT))
                ublk_err("%s direct io on file %s isn't supported\n",
                                __func__, file);

        ret = read(fd, header_buf, HEADER_SIZE);
        if (ret != HEADER_SIZE) {
                ublk_err("%s: reading header of %s failed, expected %d, got %d\n",
                                __func__, file, HEADER_SIZE, ret);
                return -EINVAL;
        }

        if (be64_to_cpu(header->nb_snapshots) != 0) {
                ublk_err("%s: snapshots aren't supported\n", __func__);
                return -EINVAL;
        }

        tgt_json.dev_size = tgt->dev_size = be64_to_cpu(header->size);
        p.basic.dev_sectors = tgt->dev_size >> 9;
        p.basic.chunk_sectors = 1 << (be32_to_cpu(header->cluster_bits) - 9);
        tgt->tgt_ring_depth = info->queue_depth * 4;
        tgt->extra_ios = QCOW2_PARA::META_MAX_TAGS;
        tgt->iowq_max_workers[0] = 1;
        tgt->nr_fds = 1;
        tgt->fds[1] = fd;
        tgt->tgt_data = qs = make_qcow2state(file, dev);
        ublksrv_tgt_set_io_data_size(tgt);

        jbuf = ublksrv_tgt_realloc_json_buf(dev, &jbuf_size);
        ublk_json_write_dev_info(dev, &jbuf, &jbuf_size);
        ublk_json_write_target_base(dev, &jbuf, &jbuf_size, &tgt_json);

        ublk_json_write_params(dev, &jbuf, &jbuf_size, &p);

        ublk_json_write_tgt_str(dev, &jbuf, &jbuf_size,
                        "backing_file", file);
        ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
                        "version", qs->header.get_version());
        ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
                        "cluster_bits", qs->header.get_cluster_bits());
        ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
                        "header_length", qs->header.get_header_length());
        ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
                        "l1_size", qs->header.get_l1_size());
        ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
                        "refcount_table_clusters",
                        qs->header.get_refcount_table_clusters());
        ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
                        "refcount_order", qs->header.get_refcount_order());

        qs->header.dump_ext();

        return 0;
}

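/*
 * Re-create the target after a daemon restart: read "backing_file" and
 * the saved ublk params back from the recovery json buffer, reopen the
 * backing file and rebuild the Qcow2State without re-exporting params.
 */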
static int qcow2_recovery_tgt(struct ublksrv_dev *dev, int type)
{
        const struct ublksrv_ctrl_dev *cdev = ublksrv_get_ctrl_dev(dev);
        const char *jbuf = ublksrv_ctrl_get_recovery_jbuf(cdev);
        const struct ublksrv_ctrl_dev_info *info =
                ublksrv_ctrl_get_dev_info(cdev);
        struct ublksrv_tgt_info *tgt = &dev->tgt;
        int fd, ret;
        char file[PATH_MAX];
        struct ublk_params p;
        int tgt_depth;

        ublk_assert(jbuf);
        ublk_assert(info->state == UBLK_S_DEV_QUIESCED);
        ublk_assert(type == UBLKSRV_TGT_TYPE_QCOW2);

        /* qcow2 doesn't support user copy yet */
        if (info->flags & UBLK_F_USER_COPY)
                return -EINVAL;

        ret = ublksrv_json_read_target_str_info(jbuf, PATH_MAX, "backing_file", file);
        if (ret < 0) {
                ublk_err("%s: backing file can't be retrieved from jbuf %d\n",
                                __func__, ret);
                return ret;
        }

        ret = ublksrv_json_read_params(&p, jbuf);
        if (ret) {
                ublk_err("%s: read ublk params failed %d\n",
                                __func__, ret);
                return ret;
        }

        fd = open(file, O_RDWR);
        if (fd < 0) {
                ublk_err("%s: backing file %s can't be opened\n",
                                __func__, file);
                return fd;
        }
        if (fcntl(fd, F_SETFL, O_DIRECT))
                ublk_err("%s direct io on file %s isn't supported\n",
                                __func__, file);

        tgt_depth = QCOW2_PARA::META_MAX_TAGS > info->queue_depth * 2 ?
                QCOW2_PARA::META_MAX_TAGS : info->queue_depth * 2;
        tgt->dev_size = p.basic.dev_sectors << 9;
        tgt->extra_ios = QCOW2_PARA::META_MAX_TAGS;
        tgt->tgt_ring_depth = tgt_depth;
        tgt->iowq_max_workers[0] = 1;
        tgt->nr_fds = 1;
        tgt->fds[1] = fd;
        tgt->tgt_data = make_qcow2state(file, dev);
        ublksrv_tgt_set_io_data_size(tgt);

        return 0;
}

static void qcow2_usage_for_add(void)
{
        printf(" qcow2: -f backing_file\n");
}

/* todo: flush meta dirty data */
static inline int qcow2_queue_tgt_fsync(const struct ublksrv_queue *q,
                unsigned io_op, int tag, u32 len, u64 offset)
{
        int fd = q->dev->tgt.fds[1];
        struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);

        if (!sqe) {
                ublk_err("%s: tag %d offset %lx op %d, no sqe\n",
                                __func__, tag, offset, io_op);
                return -ENOMEM;
        }

        io_uring_prep_sync_file_range(sqe, fd, len, offset,
                        IORING_FSYNC_DATASYNC);
        sqe->user_data = build_user_data(tag, io_op, 0, 1);
        qcow2_io_log("%s: queue io op %d(%llu %llx %llx)"
                        " (qid %d tag %u, cmd_op %u target: %d, user_data %llx)\n",
                        __func__, io_op, sqe->off, sqe->len, sqe->addr,
                        q->q_id, tag, io_op, 1, sqe->user_data);
        return 1;
}

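/*
 * Queue one fallocate(FALLOC_FL_ZERO_RANGE) SQE that zeroes the whole
 * cluster starting at @offset on the backing file, so that a write to
 * a freshly allocated cluster never exposes stale data.
 */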
static inline int qcow2_queue_tgt_zero_cluster(const Qcow2State *qs,
                const struct ublksrv_queue *q, int tag, u64 offset)
{
        int mode = FALLOC_FL_ZERO_RANGE;
        int fd = q->dev->tgt.fds[1];
        struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);

        if (!sqe) {
                ublk_err("%s: tag %d offset %lx op %d, no sqe for zeroing\n",
                                __func__, tag, offset, IORING_OP_FALLOCATE);
                return -ENOMEM;
        }

        io_uring_prep_fallocate(sqe, fd, mode, offset,
                        (1ULL << qs->header.cluster_bits));
        sqe->user_data = build_user_data(tag,
                        IORING_OP_FALLOCATE, 0, 1);
        qcow2_io_log("%s: queue io op %d(%llx %llx %llx)"
                        " (qid %d tag %u, target: %d, user_data %llx)\n",
                        __func__, IORING_OP_FALLOCATE, offset,
                        sqe->len, sqe->addr, q->q_id, tag, 1, sqe->user_data);
        return 1;
}

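/*
 * Fast path: the cluster backing @offset is already usable, so queue a
 * plain read/write SQE against the mapped host offset, using the
 * registered backing file (fixed file index 1).
 */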
static inline int qcow2_queue_tgt_rw_fast(const struct ublksrv_queue *q,
                unsigned io_op, int tag, u64 offset,
                const struct ublksrv_io_desc *iod)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);

        if (!sqe) {
                ublk_err("%s: tag %d offset %lx op %d, no sqe for rw\n",
                                __func__, tag, offset, io_op);
                return -ENOMEM;
        }

        io_uring_prep_rw(io_op, sqe, 1, (void *)iod->addr,
                        iod->nr_sectors << 9, offset);
        sqe->flags = IOSQE_FIXED_FILE;
        sqe->user_data = build_user_data(tag, io_op, 0, 1);
        qcow2_io_log("%s: queue io op %d(%llu %llx %llx)"
                        " (qid %d tag %u, cmd_op %u target: %d, user_data %llx)\n",
                        __func__, io_op, sqe->off, sqe->len, sqe->addr,
                        q->q_id, tag, io_op, 1, sqe->user_data);

        return 1;
}

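/*
 * Queue the data IO according to the allocation state of the target
 * cluster: take the fast path once the cluster is zeroed/done, make a
 * write wait (or trigger cluster zeroing first) otherwise, and serve a
 * read of a not-yet-zeroed cluster by returning zeroes directly.
 */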
static inline int qcow2_queue_tgt_rw(const struct ublksrv_queue *q, unsigned io_op,
                int tag, u64 offset, const struct ublksrv_io_desc *iod,
                u32 *expected_op)
{
        Qcow2State *qs = queue_to_qcow2state(q);
        u64 cluster_start = offset & ~((1ULL << qs->header.cluster_bits) - 1);
        Qcow2ClusterState *cs = qs->cluster_allocator.
                get_cluster_state(cluster_start);
        u8 cs_state = (cs == nullptr ? QCOW2_ALLOC_DONE : cs->get_state());

        if (cs_state >= QCOW2_ALLOC_ZEROED) {
                *expected_op = io_op;
                return qcow2_queue_tgt_rw_fast(q, io_op, tag, offset, iod);
        }

        if (io_op == IORING_OP_WRITE) {
                if (cs_state == QCOW2_ALLOC_ZEROING) {
                        cs->add_waiter(tag);
                        throw MetaUpdateException();
                }

                if (cs_state == QCOW2_ALLOC_STARTED) {
                        int ret = qcow2_queue_tgt_zero_cluster(qs, q, tag,
                                        cluster_start);
                        if (ret >= 0)
                                cs->set_state(QCOW2_ALLOC_ZEROING);
                        *expected_op = IORING_OP_FALLOCATE;
                        return ret;
                }
                return 0;
        } else {
                memset((void *)iod->addr, 0,
                                iod->nr_sectors << 9);
                return 0;
        }
}

/* return how many sqes queued */
static int qcow2_queue_tgt_io(const struct ublksrv_queue *q, unsigned io_op,
                int tag, u64 offset, u32 *exp_op,
                const struct ublksrv_io_desc *iod)
{
        int ret;

        //we don't support discard yet
        if (io_op == IORING_OP_FALLOCATE)
                return -ENOTSUP;

        if (io_op == IORING_OP_FSYNC) {
                ret = qcow2_queue_tgt_fsync(q, io_op, tag,
                                iod->nr_sectors << 9, offset);
                *exp_op = io_op;
        } else
                ret = qcow2_queue_tgt_rw(q, io_op, tag, offset, iod, exp_op);

        return ret;
}

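/*
 * An L2 entry of zero (unallocated) or with bit 0 set (the "all zeroes"
 * flag) means the covered virtual range reads back as zeroes.
 */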
static inline bool l2_entry_read_as_zero(u64 entry)
{
        if (!entry || (entry & 0x1))
                return true;
        return false;
}

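/*
 * Per-io coroutine: map the virtual offset to the host offset via the
 * cluster map, suspending whenever metadata IO or a metadata update is
 * in flight, then queue the actual read/write (plus cluster zeroing if
 * needed) and finally complete the ublk io with the result.
 */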
static co_io_job __qcow2_handle_io_async(const struct ublksrv_queue *q,
                const struct ublk_io_data *data, int tag)
{
        struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
        Qcow2State *qs = queue_to_qcow2state(q);
        const struct ublksrv_io_desc *iod = data->iod;
        unsigned long start = iod->start_sector << 9;
        u64 mapped_start;
        qcow2_io_ctx_t ioc(tag, q->q_id);
        const struct io_uring_cqe *cqe;
        int ret = 0;
        unsigned int op = ublksrv_get_op(iod);
        bool wait;

        qcow2_io_log("%s: tag %d, ublk op %x virt %llx/%u\n",
                        __func__, tag, op, start, (iod->nr_sectors << 9));

        qcow2_assert((start + (unsigned long)(iod->nr_sectors << 9)) <=
                        qs->get_dev_size());
again:
        try {
                mapped_start = qs->cluster_map.map_cluster(ioc, start,
                                op == UBLK_IO_OP_WRITE);
                wait = false;
        } catch (MetaIoException &meta_error) {
                wait = true;
        } catch (MetaUpdateException &meta_update_error) {
                wait = true;
        }

        if (wait) {
                co_await__suspend_always(tag);

                cqe = io->tgt_io_cqe;
                io->tgt_io_cqe = NULL;
                ret = qcow2_meta_io_done(q, cqe);
                if (ret == -EAGAIN)
                        goto again;
                if (ret < 0)
                        goto exit;
        }

        qcow2_io_log("%s: tag %d, ublk op %x virt %llx/%u to host %llx\n",
                        __func__, tag, op, start, (iod->nr_sectors << 9),
                        mapped_start);

        if (mapped_start == QCOW2_UNMAPPED) {
                ublk_err("%s: tag %d virt %lx op %d, unsupported format\n",
                                __func__, tag, start, op);
                ret = -EIO;
        } else if (!mapped_start) {
                //map_cluster() returned 0: a read of an unallocated or
                //all-zero cluster is served by zero-filling the buffer,
                //while a write landing here means the mapping failed
                if ((op == UBLK_IO_OP_READ) &&
                                l2_entry_read_as_zero(mapped_start)) {
                        ret = iod->nr_sectors << 9;
                        memset((void *)iod->addr, 0, ret);
                } else {
                        ublk_err("%s: tag %d virt %lx op %d map failed\n",
                                        __func__, tag, start, op);
                        ret = -EIO;
                }
        } else {
                unsigned io_op = ublksrv_convert_cmd_op(iod);
                unsigned exp_op;

                //strip the flag bit 63 so only the host offset is left
                mapped_start &= ((1ULL << 63) - 1);

                qcow2_assert(mapped_start + (iod->nr_sectors << 9) <=
                                qs->cluster_allocator.max_physical_size);
queue_io:
                //the only exception thrown here comes from handling cluster zeroing
                try {
                        ret = qcow2_queue_tgt_io(q, io_op, tag, mapped_start,
                                        &exp_op, iod);
                        wait = false;
                } catch (MetaUpdateException &meta_error) {
                        wait = true;
                }

                if (wait) {
                        co_await__suspend_always(tag);
                        goto queue_io;
                }

                if (ret > 0) {
                        u64 cluster_start = mapped_start &
                                ~((1ULL << qs->header.cluster_bits) - 1);

                        co_await__suspend_always(tag);
                        cqe = io->tgt_io_cqe;
                        ret = cqe->res;
                        if (ret == -EAGAIN) {
                                qcow2_log("%s zeroing cluster IO eagain\n",
                                                __func__);
                                //submit this write IO again
                                if (user_data_to_op(cqe->user_data) == io_op)
                                        goto queue_io;

                                //if the cluster zeroing IO isn't done, retry it
                                if (qs->cluster_allocator.
                                                alloc_cluster_reset(cluster_start))
                                        goto queue_io;
                        }

                        qcow2_io_log("%s: io done, tag %d res %d user_data %llx\n",
                                        __func__, tag, ret,
                                        cqe->user_data);
                        if (exp_op != io_op) {
                                if (user_data_to_op(cqe->user_data) == IORING_OP_FALLOCATE)
                                        qs->cluster_allocator.alloc_cluster_zeroed(q,
                                                        tag, cluster_start);
                                goto queue_io;
                        }
                } else if (ret == 0) {
                        ret = iod->nr_sectors << 9;
                }
        }
exit:
        if (ret < 0)
                ublk_err("%s io failed(%d %lx %u) ret %d\n", __func__,
                                op, start, iod->nr_sectors, ret);
        qcow2_io_log("%s tag %d io complete(%d %llx %lu) ret %d\n", __func__,
                        tag, op, start, iod->nr_sectors, ret);
        ublksrv_complete_io(q, tag, ret);
}

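/* entry for incoming ublk io: start one coroutine per io command */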
static int qcow2_handle_io_async(const struct ublksrv_queue *q,
                const struct ublk_io_data *data)
{
        struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);

        io->co = __qcow2_handle_io_async(q, data, data->tag);
        return 0;
}

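/* tear down the target: dump meta state and drop the Qcow2State */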
static void qcow2_deinit_tgt(const struct ublksrv_dev *dev)
{
        Qcow2State *qs = dev_to_qcow2state(dev);

        //now all io slots are available, just use the zero tag
        qcow2_io_ctx_t ioc(0, 0);

        qs->dump_meta();

        delete qs;
}

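/*
 * Completion hook for target SQEs: hand the cqe back to the io's
 * coroutine and resume it, unless the cqe carries the special tag.
 */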
static void qcow2_tgt_io_done(const struct ublksrv_queue *q,
                const struct ublk_io_data *data, const struct io_uring_cqe *cqe)
{
        unsigned tag = user_data_to_tag(cqe->user_data);

        qcow2_io_log("%s: res %d qid %u tag %u, cmd_op %u\n",
                        __func__, cqe->res, q->q_id,
                        user_data_to_tag(cqe->user_data),
                        user_data_to_op(cqe->user_data));
        //the special tag is ignored; so far it is only used for the
        //fsync issued while flushing meta
        if (tag != 0xffff) {
                struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);

                io->tgt_io_cqe = cqe;
                io->co.resume();
        }
}

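/*
 * Background work after each io batch: kill idle slices and run meta
 * flushing; when nothing is queued and no flush is in flight, keep
 * flushing until no dirty slice is left.
 */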
static void qcow2_handle_io_bg(const struct ublksrv_queue *q, int nr_queued_io)
{
        Qcow2State *qs = queue_to_qcow2state(q);

        ublk_dbg(UBLK_DBG_QCOW2_FLUSH | UBLK_DBG_QCOW2_META,
                        "%s %d, queued io %d\n", __func__, __LINE__, nr_queued_io);
        qs->kill_slices(q);
again:
        qs->meta_flushing.run_flush(q, nr_queued_io);

        if (!nr_queued_io && !qs->meta_flushing.is_flushing()) {
                if (qs->has_dirty_slice())
                        goto again;
        }
}

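/* shrink the meta cache when the queue enters the idle state */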
static void qcow2_idle(const struct ublksrv_queue *q, bool enter)
{
        Qcow2State *qs = queue_to_qcow2state(q);

        if (!enter)
                return;

        qs->shrink_cache();
}

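/* single queue only, so the queue simply points at the shared Qcow2State */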
static int qcow2_init_queue(const struct ublksrv_queue *q,
                void **queue_data_ptr)
{
        Qcow2State *qs = dev_to_qcow2state(q->dev);

        *queue_data_ptr = (void *)qs;

        return 0;
}

struct ublksrv_tgt_type qcow2_tgt_type = {
        .handle_io_async = qcow2_handle_io_async,
        .tgt_io_done = qcow2_tgt_io_done,
        .handle_io_background = qcow2_handle_io_bg,
        .usage_for_add = qcow2_usage_for_add,
        .init_tgt = qcow2_init_tgt,
        .deinit_tgt = qcow2_deinit_tgt,
        .idle_fn = qcow2_idle,
        .type = UBLKSRV_TGT_TYPE_QCOW2,
        .name = "qcow2",
        .recovery_tgt = qcow2_recovery_tgt,
        .init_queue = qcow2_init_queue,
};

static void tgt_qcow2_init() __attribute__((constructor));

static void tgt_qcow2_init(void)
{
        ublksrv_register_tgt_type(&qcow2_tgt_type);
}