// SPDX-License-Identifier: GPL-2.0
#include "ublksrv_tgt.h"
#include "qcow2_format.h"
#include "qcow2.h"

#define HEADER_SIZE  512
#define QCOW2_UNMAPPED   (u64)(-1)
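/* QCOW2_UNMAPPED: no host mapping exists for the virtual offset */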

static int qcow2_init_tgt(struct ublksrv_dev *dev, int type, int argc, char
		*argv[])
{
	struct ublksrv_tgt_info *tgt = &dev->tgt;
	const struct ublksrv_ctrl_dev_info *info =
		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(dev));
	static const struct option lo_longopts[] = {
		{ "file",		1,	NULL, 'f' },
		{ NULL }
	};
	int jbuf_size;
	char *jbuf;
	int fd, opt, ret;
	void *header_buf;
	QCowHeader *header;
	char *file = NULL;
	struct ublksrv_tgt_base_json tgt_json = {
		.type = type,
	};
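	/*
	 * advertise 512-byte logical / 4KB physical blocks; max_sectors is
	 * capped by the per-IO buffer size set up by the ublk driver
	 */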
	struct ublk_params p = {
		.types = UBLK_PARAM_TYPE_BASIC,
		.basic = {
			//.attrs = UBLK_ATTR_READ_ONLY,
			.logical_bs_shift	= 9,
			.physical_bs_shift	= 12,
			.io_opt_shift	= 12,
			.io_min_shift	= 9,
			.max_sectors		= info->max_io_buf_bytes >> 9,
		},
	};
	Qcow2State *qs;

	/* qcow2 doesn't support user copy yet */
	if (info->flags & UBLK_F_USER_COPY)
		return -EINVAL;

	//a queue depth of 1024 is enough for qcow2, and it lets us store
	//the tag & l1 entry index in a single u32 variable
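	//(e.g. a 10-bit tag leaves the remaining bits of the u32 for the
	//l1 entry index)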
	if (info->queue_depth > QCOW2_MAX_QUEUE_DEPTH)
		return -EINVAL;

	//qcow2 target doesn't support MQ yet
	if (info->nr_hw_queues > 1)
		return -EINVAL;

	strcpy(tgt_json.name, "qcow2");

	if (type != UBLKSRV_TGT_TYPE_QCOW2)
		return -EINVAL;

	while ((opt = getopt_long(argc, argv, "-:f:",
				  lo_longopts, NULL)) != -1) {
		switch (opt) {
		case 'f':
			file = strdup(optarg);
			break;
		}
	}

	if (!file)
		return -EINVAL;

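	/* the header is read with O_DIRECT, which needs a 512-byte aligned buffer */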
	if (posix_memalign((void **)&header_buf, 512, HEADER_SIZE))
		return -EINVAL;

	header = (QCowHeader *)header_buf;
	fd = open(file, O_RDWR);
	if (fd < 0) {
		ublk_err("%s backing file %s can't be opened\n",
				__func__, file);
		free(header_buf);
		return -EINVAL;
	}

	if (fcntl(fd, F_SETFL, O_DIRECT))
		ublk_err("%s direct io on file %s isn't supported\n",
				__func__, file);

	ret = read(fd, header_buf, HEADER_SIZE);
	if (ret != HEADER_SIZE) {
		ublk_err("%s: reading %d header bytes from %s failed, ret %d\n",
				__func__, HEADER_SIZE, file, ret);
		close(fd);
		free(header_buf);
		return -EINVAL;
	}
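	/* on-disk qcow2 header fields are big-endian */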

	if (be64_to_cpu(header->nb_snapshots) != 0) {
		ublk_err("%s: snapshots aren't supported\n", __func__);
		close(fd);
		free(header_buf);
		return -EINVAL;
	}

	tgt_json.dev_size = tgt->dev_size = be64_to_cpu(header->size);
	p.basic.dev_sectors = tgt->dev_size >> 9;
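	/* expose the qcow2 cluster size as chunk_sectors */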
	p.basic.chunk_sectors = 1 << (be32_to_cpu(header->cluster_bits) - 9);
	tgt->tgt_ring_depth = info->queue_depth * 4;
	tgt->extra_ios = QCOW2_PARA::META_MAX_TAGS;
	tgt->iowq_max_workers[0] = 1;
	tgt->nr_fds = 1;
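	/* slot 0 belongs to the ublk device itself; the backing image goes in slot 1 */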
	tgt->fds[1] = fd;
	tgt->tgt_data = qs = make_qcow2state(file, dev);
	ublksrv_tgt_set_io_data_size(tgt);

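	/* persist dev info, params and qcow2 geometry so recovery can rebuild state */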
	jbuf = ublksrv_tgt_realloc_json_buf(dev, &jbuf_size);
	ublk_json_write_dev_info(dev, &jbuf, &jbuf_size);
	ublk_json_write_target_base(dev, &jbuf, &jbuf_size, &tgt_json);

	ublk_json_write_params(dev, &jbuf, &jbuf_size, &p);

	ublk_json_write_tgt_str(dev, &jbuf, &jbuf_size,
			"backing_file", file);
	ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
		"version", qs->header.get_version());
	ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
		"cluster_bits", qs->header.get_cluster_bits());
	ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
		"header_length", qs->header.get_header_length());
	ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
		"l1_size", qs->header.get_l1_size());
	ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
		"refcount_table_clusters",
		qs->header.get_refcount_table_clusters());
	ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
			"refcount_order", qs->header.get_refcount_order());

	qs->header.dump_ext();

	free(header_buf);

	return 0;
}

static int qcow2_recovery_tgt(struct ublksrv_dev *dev, int type)
{
	const struct ublksrv_ctrl_dev *cdev = ublksrv_get_ctrl_dev(dev);
	const char *jbuf = ublksrv_ctrl_get_recovery_jbuf(cdev);
	const struct ublksrv_ctrl_dev_info *info =
		ublksrv_ctrl_get_dev_info(cdev);
	struct ublksrv_tgt_info *tgt = &dev->tgt;
	int fd, ret;
	char file[PATH_MAX];
	struct ublk_params p;
	int tgt_depth;

	ublk_assert(jbuf);
	ublk_assert(info->state == UBLK_S_DEV_QUIESCED);
	ublk_assert(type == UBLKSRV_TGT_TYPE_QCOW2);

	/* qcow2 doesn't support user copy yet */
	if (info->flags & UBLK_F_USER_COPY)
		return -EINVAL;

	ret = ublksrv_json_read_target_str_info(jbuf, PATH_MAX, "backing_file", file);
	if (ret < 0) {
		ublk_err("%s: backing file can't be retrieved from jbuf %d\n",
				__func__, ret);
		return ret;
	}

	ret = ublksrv_json_read_params(&p, jbuf);
	if (ret) {
		ublk_err("%s: read ublk params failed %d\n",
				__func__, ret);
		return ret;
	}

	fd = open(file, O_RDWR);
	if (fd < 0) {
		ublk_err("%s: backing file %s can't be opened\n",
				__func__, file);
		return fd;
	}
	if (fcntl(fd, F_SETFL, O_DIRECT))
		ublk_err("%s direct io on file %s isn't supported\n",
				__func__, file);

	tgt_depth = QCOW2_PARA::META_MAX_TAGS > info->queue_depth * 2 ?
			QCOW2_PARA::META_MAX_TAGS : info->queue_depth * 2;
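	/* the ring must be deep enough for both data IO and meta IO tags */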
	tgt->dev_size = p.basic.dev_sectors << 9;
	tgt->extra_ios = QCOW2_PARA::META_MAX_TAGS;
	tgt->tgt_ring_depth = tgt_depth;
	tgt->iowq_max_workers[0] = 1;
	tgt->nr_fds = 1;
	tgt->fds[1] = fd;
	tgt->tgt_data = make_qcow2state(file, dev);
	ublksrv_tgt_set_io_data_size(tgt);

	return 0;
}

static void qcow2_usage_for_add(void)
{
	printf("           qcow2: -f backing_file\n");
}

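/*
 * sqe->user_data packs the tag, the io op and a target-io flag via
 * build_user_data(), so each completion can be routed back to the right
 * coroutine in qcow2_tgt_io_done()
 */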
/* todo: flush meta dirty data */
static inline int qcow2_queue_tgt_fsync(const struct ublksrv_queue *q,
		unsigned io_op, int tag, u32 len, u64 offset)
{
	int fd = q->dev->tgt.fds[1];
	struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);

	if (!sqe) {
		ublk_err("%s: tag %d offset %lx op %d, no sqe\n",
				__func__, tag, offset, io_op);
		return -ENOMEM;
	}

	io_uring_prep_sync_file_range(sqe, fd, len, offset,
			IORING_FSYNC_DATASYNC);
	sqe->user_data = build_user_data(tag, io_op, 0, 1);
	qcow2_io_log("%s: queue io op %d(%llu %llx %llx)"
				" (qid %d tag %u, cmd_op %u target: %d, user_data %llx)\n",
			__func__, io_op, sqe->off, sqe->len, sqe->addr,
			q->q_id, tag, io_op, 1, sqe->user_data);
	return 1;
}

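/*
 * a freshly allocated cluster has to read back as zeros before any
 * partial write lands in it, so zero the whole cluster range first
 */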
static inline int qcow2_queue_tgt_zero_cluster(const Qcow2State *qs,
		const struct ublksrv_queue *q, int tag, u64 offset)
{
	int mode = FALLOC_FL_ZERO_RANGE;
	int fd = q->dev->tgt.fds[1];
	struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);

	if (!sqe) {
		ublk_err("%s: tag %d offset %lx op %d, no sqe for zeroing\n",
			__func__, tag, offset, IORING_OP_FALLOCATE);
		return -ENOMEM;
	}

	io_uring_prep_fallocate(sqe, fd, mode, offset,
			(1ULL << qs->header.cluster_bits));
	sqe->user_data = build_user_data(tag,
			IORING_OP_FALLOCATE, 0, 1);
	qcow2_io_log("%s: queue io op %d(%llx %llx %llx)"
				" (qid %d tag %u, target: %d, user_data %llx)\n",
			__func__, IORING_OP_FALLOCATE, offset,
			sqe->len, sqe->addr, q->q_id, tag, 1, sqe->user_data);
	return 1;
}

static inline int qcow2_queue_tgt_rw_fast(const struct ublksrv_queue *q,
		unsigned io_op, int tag, u64 offset,
		const struct ublksrv_io_desc *iod)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);

	if (!sqe) {
		ublk_err("%s: tag %d offset %lx op %d, no sqe for rw\n",
				__func__, tag, offset, io_op);
		return -ENOMEM;
	}

	io_uring_prep_rw(io_op, sqe, 1, (void *)iod->addr,
			iod->nr_sectors << 9, offset);
	sqe->flags = IOSQE_FIXED_FILE;
	sqe->user_data = build_user_data(tag, io_op, 0, 1);
	qcow2_io_log("%s: queue io op %d(%llu %llx %llx)"
				" (qid %d tag %u, cmd_op %u target: %d, user_data %llx)\n",
			__func__, io_op, sqe->off, sqe->len, sqe->addr,
			q->q_id, tag, io_op, 1, sqe->user_data);

	return 1;
}

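/*
 * cluster allocation state machine: STARTED -> ZEROING (fallocate in
 * flight) -> ZEROED/DONE; writes wait for zeroing to finish, and reads
 * of not-yet-zeroed clusters are served as zeros from memory
 */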
static inline int qcow2_queue_tgt_rw(const struct ublksrv_queue *q, unsigned io_op,
		int tag, u64 offset, const struct ublksrv_io_desc *iod,
		u32 *expected_op)
{
	Qcow2State *qs = queue_to_qcow2state(q);
	u64 cluster_start = offset & ~((1ULL << qs->header.cluster_bits) - 1);
	Qcow2ClusterState *cs = qs->cluster_allocator.
		get_cluster_state(cluster_start);
	u8 cs_state = (cs == nullptr ? QCOW2_ALLOC_DONE : cs->get_state());

	if (cs_state >= QCOW2_ALLOC_ZEROED) {
		*expected_op = io_op;
		return qcow2_queue_tgt_rw_fast(q, io_op, tag, offset, iod);
	}

	if (io_op == IORING_OP_WRITE) {
		if (cs_state == QCOW2_ALLOC_ZEROING) {
			cs->add_waiter(tag);
			throw MetaUpdateException();
		}

		if (cs_state == QCOW2_ALLOC_STARTED) {
			int ret = qcow2_queue_tgt_zero_cluster(qs, q, tag,
					cluster_start);
			if (ret >= 0)
				cs->set_state(QCOW2_ALLOC_ZEROING);
			*expected_op = IORING_OP_FALLOCATE;
			return ret;
		}
		return 0;
	} else {
		memset((void *)iod->addr, 0,
				iod->nr_sectors << 9);
		return 0;
	}
}

/* return how many sqes queued */
static int qcow2_queue_tgt_io(const struct ublksrv_queue *q, unsigned io_op,
		int tag, u64 offset, u32 *exp_op,
		const struct ublksrv_io_desc *iod)
{
	int ret;

	//we don't support discard yet
	if (io_op == IORING_OP_FALLOCATE)
		return -ENOTSUP;

	if (io_op == IORING_OP_FSYNC) {
		ret = qcow2_queue_tgt_fsync(q, io_op, tag,
				iod->nr_sectors << 9, offset);
		*exp_op = io_op;
	} else
		ret = qcow2_queue_tgt_rw(q, io_op, tag, offset, iod, exp_op);

	return ret;
}

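/*
 * a zero L2 entry means the cluster is unallocated; bit 0 set is the
 * qcow2 "all zeroes" flag - either way the guest data reads as zeros
 */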
static inline bool l2_entry_read_as_zero(u64 entry)
{
	if (!entry || (entry & 0x1))
		return true;
	return false;
}

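/*
 * each ublk IO runs as a coroutine: metadata lookups that miss throw,
 * the coroutine suspends on its tag, and qcow2_tgt_io_done() resumes it
 * once the meta IO completes
 */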
static co_io_job __qcow2_handle_io_async(const struct ublksrv_queue *q,
		const struct ublk_io_data *data, int tag)
{
	struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
	Qcow2State *qs = queue_to_qcow2state(q);
	const struct ublksrv_io_desc *iod = data->iod;
	unsigned long start = iod->start_sector << 9;
	u64 mapped_start;
	qcow2_io_ctx_t ioc(tag, q->q_id);
	const struct io_uring_cqe *cqe;
	int ret = 0;
	unsigned int op = ublksrv_get_op(iod);
	bool wait;

	qcow2_io_log("%s: tag %d, ublk op %x virt %llx/%u\n",
			__func__, tag, op, start, (iod->nr_sectors << 9));

	qcow2_assert((start + (unsigned long)(iod->nr_sectors << 9)) <=
			qs->get_dev_size());
again:
	try {
		mapped_start = qs->cluster_map.map_cluster(ioc, start,
				op == UBLK_IO_OP_WRITE);
		wait = false;
	} catch (MetaIoException &meta_error) {
		wait = true;
	} catch (MetaUpdateException &meta_update_error) {
		wait = true;
	}

	if (wait) {
		co_await__suspend_always(tag);

		cqe = io->tgt_io_cqe;
		io->tgt_io_cqe = NULL;
		ret = qcow2_meta_io_done(q, cqe);
		if (ret == -EAGAIN)
			goto again;
		if (ret < 0)
			goto exit;
	}

	qcow2_io_log("%s: tag %d, ublk op %x virt %llx/%u to host %llx\n",
			__func__, tag, op, start, (iod->nr_sectors << 9),
			mapped_start);

	if (mapped_start == QCOW2_UNMAPPED) {
		ublk_err("%s: tag %d virt %lx op %d, unsupported format\n",
				__func__, tag, start, op);
		ret = -EIO;
	} else if (!mapped_start) {
		//unallocated cluster: reads are served as zeros, while a
		//write landing here means the mapping couldn't be allocated
		if ((op == UBLK_IO_OP_READ) &&
			l2_entry_read_as_zero(mapped_start)) {
			ret = iod->nr_sectors << 9;
			memset((void *)iod->addr, 0, ret);
		} else {
			ublk_err("%s: tag %d virt %lx op %d map failed\n",
					__func__, tag, start, op);
			ret = -EIO;
		}
	} else {
		unsigned io_op = ublksrv_convert_cmd_op(iod);
		unsigned exp_op;

		mapped_start &= ((1ULL << 63) - 1);
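		//bit 63 is the qcow2 COPIED flag, not part of the host offset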

		qcow2_assert(mapped_start + (iod->nr_sectors << 9) <=
				qs->cluster_allocator.max_physical_size);
queue_io:
		//the only exception thrown here comes from handling cluster zeroing
		try {
			ret = qcow2_queue_tgt_io(q, io_op, tag, mapped_start,
					&exp_op, iod);
			wait = false;
		} catch (MetaUpdateException &meta_error) {
			wait = true;
		}

		if (wait) {
			co_await__suspend_always(tag);
			goto queue_io;
		}

		if (ret > 0) {
			u64 cluster_start = mapped_start &
				~((1ULL << qs->header.cluster_bits) - 1);

			co_await__suspend_always(tag);
			cqe = io->tgt_io_cqe;
			ret = cqe->res;
			if (ret == -EAGAIN) {
				qcow2_log("%s zeroing cluster IO eagain\n",
							__func__);
				//submit this write IO again
				if (user_data_to_op(cqe->user_data) == io_op)
					goto queue_io;

				//if the cluster zeroing IO isn't done, retry
				if (qs->cluster_allocator.
				    alloc_cluster_reset(cluster_start))
					goto queue_io;
			}

			qcow2_io_log("%s: io done, tag %d res %d user_data %llx\n",
							__func__, tag, ret,
							cqe->user_data);
			if (exp_op != io_op) {
				if (user_data_to_op(cqe->user_data) == IORING_OP_FALLOCATE)
					qs->cluster_allocator.alloc_cluster_zeroed(q,
						tag, cluster_start);
				goto queue_io;
			}
		} else if (ret == 0) {
			ret = iod->nr_sectors << 9;
		}
	}
exit:
	if (ret < 0)
		ublk_err("%s io failed(%d %lx %u) ret %d\n", __func__,
				op, start, iod->nr_sectors, ret);
	qcow2_io_log("%s tag %d io complete(%d %llx %lu) ret %d\n", __func__,
				tag, op, start, iod->nr_sectors, ret);
	ublksrv_complete_io(q, tag, ret);
}

static int qcow2_handle_io_async(const struct ublksrv_queue *q,
		const struct ublk_io_data *data)
{
	struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);

	io->co = __qcow2_handle_io_async(q, data, data->tag);
	return 0;
}

static void qcow2_deinit_tgt(const struct ublksrv_dev *dev)
{
	Qcow2State *qs = dev_to_qcow2state(dev);

	//now all io slots are available, just use the zero tag
	qcow2_io_ctx_t ioc(0, 0);

	qs->dump_meta();

	delete qs;
}

static void qcow2_tgt_io_done(const struct ublksrv_queue *q,
		const struct ublk_io_data *data, const struct io_uring_cqe *cqe)
{
	unsigned tag = user_data_to_tag(cqe->user_data);

	qcow2_io_log("%s: res %d qid %u tag %u, cmd_op %u\n",
			__func__, cqe->res, q->q_id,
			user_data_to_tag(cqe->user_data),
			user_data_to_op(cqe->user_data));
	//the special tag is ignored; so far it is only used for the
	//fsync sent while flushing meta data
	if (tag != 0xffff) {
		struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
		io->tgt_io_cqe = cqe;
		io->co.resume();
	}
}

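/*
 * called after queued IOs are handled: kick meta flushing, and while the
 * queue is otherwise idle keep flushing as long as dirty slices remain
 */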
static void qcow2_handle_io_bg(const struct ublksrv_queue *q, int nr_queued_io)
{
	Qcow2State *qs = queue_to_qcow2state(q);

	ublk_dbg(UBLK_DBG_QCOW2_FLUSH | UBLK_DBG_QCOW2_META,
			"%s %d, queued io %d\n", __func__, __LINE__, nr_queued_io);
	qs->kill_slices(q);
again:
	qs->meta_flushing.run_flush(q, nr_queued_io);

	if (!nr_queued_io && !qs->meta_flushing.is_flushing()) {
		if (qs->has_dirty_slice())
			goto again;
	}
}

static void qcow2_idle(const struct ublksrv_queue *q, bool enter)
{
	Qcow2State *qs = queue_to_qcow2state(q);

	if (!enter)
		return;

	qs->shrink_cache();
}

static int qcow2_init_queue(const struct ublksrv_queue *q,
		void **queue_data_ptr)
{
	Qcow2State *qs = dev_to_qcow2state(q->dev);

	*queue_data_ptr = (void *)qs;

	return 0;
}

struct ublksrv_tgt_type  qcow2_tgt_type = {
	.handle_io_async = qcow2_handle_io_async,
	.tgt_io_done = qcow2_tgt_io_done,
	.handle_io_background = qcow2_handle_io_bg,
	.usage_for_add	=  qcow2_usage_for_add,
	.init_tgt = qcow2_init_tgt,
	.deinit_tgt	=  qcow2_deinit_tgt,
	.idle_fn	=  qcow2_idle,
	.type	= UBLKSRV_TGT_TYPE_QCOW2,
	.name	=  "qcow2",
	.recovery_tgt = qcow2_recovery_tgt,
	.init_queue = qcow2_init_queue,
};

static void tgt_qcow2_init() __attribute__((constructor));

static void tgt_qcow2_init(void)
{
	ublksrv_register_tgt_type(&qcow2_tgt_type);
}