// xref: /aosp_15_r20/external/ublksrv/lib/ublksrv.c (revision 94c4a1e103eb1715230460aab379dff275992c20)
// SPDX-License-Identifier: MIT or LGPL-2.1-only

#include <config.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>

#include "ublksrv_priv.h"
#include "ublksrv_aio.h"

bool ublksrv_is_recovering(const struct ublksrv_ctrl_dev *ctrl_dev)
{
	return ctrl_dev->tgt_argc == -1;
}

static inline struct ublksrv_io_desc *ublksrv_get_iod(
		const struct _ublksrv_queue *q, int tag)
{
	return (struct ublksrv_io_desc *)
		&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
}

/*
 * /dev/ublkbN shares the same lifetime with the ublk io daemon:
 *
 * 1) IO from /dev/ublkbN is handled by the io daemon directly
 *
 * 2) the io cmd buffer is allocated by the ublk driver, mapped into
 * the io daemon's vm space via mmap, and each hw queue has its own
 * io cmd buffer
 *
 * 3) io buffers are pre-allocated by the io daemon and passed
 * to the ublk driver via io command; meanwhile the ublk driver may
 * choose to pin these user pages before starting the device
 *
 * Each /dev/ublkcN is owned by only one io daemon, and can't be
 * opened by another daemon. The io daemon uses its allocated
 * io_uring to communicate with the ublk driver.
 *
 * For each request of /dev/ublkbN, the io daemon submits one
 * sqe for both fetching IO from the ublk driver and committing the IO
 * result to the ublk driver, and the io daemon has to issue all sqes
 * to /dev/ublkcN before sending START_DEV to /dev/ublk-control.
 *
 * After STOP_DEV is sent to /dev/ublk-control, the ublk driver needs
 * to freeze the request queue and complete all pending sqes, meanwhile
 * telling the io daemon via cqe->res not to issue sqes any more; it
 * also deletes /dev/ublkbN. After the io daemon figures out that all
 * sqes have been freed, it exits. Then STOP_DEV returns.
 */
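
/*
 * A minimal sketch of that ordering as seen from a daemon built on this
 * library (the control-side helpers come from the ublksrv ctrl API; error
 * handling omitted, and the per-queue pthreads are assumed to be created
 * by the caller):
 *
 *	const struct ublksrv_dev *dev = ublksrv_dev_init(ctrl_dev);
 *
 *	// each queue pthread runs ublksrv_queue_init(), which issues all
 *	// FETCH_REQ sqes to /dev/ublkcN, before ...
 *	ublksrv_ctrl_start_dev(ctrl_dev, getpid());	// ... /dev/ublkbN appears
 *
 *	// later, from the control path:
 *	ublksrv_ctrl_stop_dev(ctrl_dev);	// queues see UBLK_IO_RES_ABORT, exit
 *	ublksrv_dev_deinit(dev);
 */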

/*
 * If the ublksrv queue has been idle for the past 20 seconds, start to
 * discard pages mapped to io buffers via madvise(MADV_DONTNEED), so
 * these pages become available for others without needing to be
 * swapped out
 */
#define UBLKSRV_IO_IDLE_SECS    20

static int __ublksrv_tgt_init(struct _ublksrv_dev *dev, const char *type_name,
		const struct ublksrv_tgt_type *ops, int type,
		int argc, char *argv[])
{
	struct ublksrv_tgt_info *tgt = &dev->tgt;
	int ret;

	if (!ops)
		return -EINVAL;

	if (strcmp(ops->name, type_name))
		return -EINVAL;

	if (!ops->init_tgt)
		return -EINVAL;
	if (!ops->handle_io_async)
		return -EINVAL;
	/* ->alloc_io_buf and ->free_io_buf have to be provided in pairs */
	if (!ops->alloc_io_buf ^ !ops->free_io_buf)
		return -EINVAL;

	optind = 0;     /* so that we can parse our arguments */
	tgt->ops = ops;

	if (!ublksrv_is_recovering(dev->ctrl_dev))
		ret = ops->init_tgt(local_to_tdev(dev), type, argc, argv);
	else {
		if (ops->recovery_tgt)
			ret = ops->recovery_tgt(local_to_tdev(dev), type);
		else
			ret = -ENOTSUP;
	}
	if (ret) {
		tgt->ops = NULL;
		return ret;
	}
	return 0;
}

static int ublksrv_tgt_init(struct _ublksrv_dev *dev, const char *type_name,
		const struct ublksrv_tgt_type *ops,
		int argc, char *argv[])
{
	if (type_name == NULL)
		return -EINVAL;

	if (ops)
		return __ublksrv_tgt_init(dev, type_name, ops,
				ops->type, argc, argv);

	return -EINVAL;
}

static inline void ublksrv_tgt_exit(struct ublksrv_tgt_info *tgt)
{
	int i;

	/* fds[0] is the cdev fd, owned by the device rather than the target */
	for (i = 1; i < tgt->nr_fds; i++)
		close(tgt->fds[i]);
}

static void ublksrv_tgt_deinit(struct _ublksrv_dev *dev)
{
	struct ublksrv_tgt_info *tgt = &dev->tgt;

	ublksrv_tgt_exit(tgt);

	if (tgt->ops && tgt->ops->deinit_tgt)
		tgt->ops->deinit_tgt(local_to_tdev(dev));
}

static inline int ublksrv_queue_io_cmd(struct _ublksrv_queue *q,
		struct ublk_io *io, unsigned tag)
{
	struct ublksrv_io_cmd *cmd;
	struct io_uring_sqe *sqe;
	unsigned int cmd_op = 0;
	__u64 user_data;

	/* only freed io can be issued */
	if (!(io->flags & UBLKSRV_IO_FREE))
		return 0;

	/* we issue because we need either fetching or committing */
	if (!(io->flags &
		(UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_GET_DATA |
		 UBLKSRV_NEED_COMMIT_RQ_COMP)))
		return 0;

	if (io->flags & UBLKSRV_NEED_GET_DATA)
		cmd_op = UBLK_IO_NEED_GET_DATA;
	else if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
		cmd_op = UBLK_IO_COMMIT_AND_FETCH_REQ;
	else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
		cmd_op = UBLK_IO_FETCH_REQ;

	sqe = io_uring_get_sqe(&q->ring);
	if (!sqe) {
		ublk_err("%s: queue %d run out of sqe, tag %d\n",
				__func__, q->q_id, tag);
		return -1;
	}

	cmd = (struct ublksrv_io_cmd *)ublksrv_get_sqe_cmd(sqe);

	if (cmd_op == UBLK_IO_COMMIT_AND_FETCH_REQ)
		cmd->result = io->result;

	if (q->state & UBLKSRV_QUEUE_IOCTL_OP)
		cmd_op = _IOWR('u', _IOC_NR(cmd_op), struct ublksrv_io_cmd);

	/* These fields should be written once, and never change */
	ublksrv_set_sqe_cmd_op(sqe, cmd_op);
	sqe->fd		= 0;	/* dev->cdev_fd */
	sqe->opcode	= IORING_OP_URING_CMD;
	sqe->flags	= IOSQE_FIXED_FILE;
	sqe->rw_flags	= 0;
	cmd->tag	= tag;
	if (!(q->state & UBLKSRV_USER_COPY))
		cmd->addr	= (__u64)io->buf_addr;
	else
		cmd->addr	= 0;
	cmd->q_id	= q->q_id;

	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
	io_uring_sqe_set_data64(sqe, user_data);

	io->flags = 0;

	q->cmd_inflight += 1;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
			__func__, q->q_id, tag, cmd_op,
			io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
	return 1;
}

int ublksrv_complete_io(const struct ublksrv_queue *tq, unsigned tag, int res)
{
	struct _ublksrv_queue *q = tq_to_local(tq);
	struct ublk_io *io = &q->ios[tag];

	ublksrv_mark_io_done(io, res);

	return ublksrv_queue_io_cmd(q, io, tag);
}
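
/*
 * A minimal sketch of how a target typically calls the API above, modeled
 * on a trivial null target's ->handle_io_async() (iod field names follow
 * include/ublksrv.h; error handling omitted):
 *
 *	static int null_handle_io_async(const struct ublksrv_queue *q,
 *			const struct ublk_io_data *data)
 *	{
 *		const struct ublksrv_io_desc *iod = data->iod;
 *
 *		// "handle" the io, then commit the result and re-fetch
 *		ublksrv_complete_io(q, data->tag, iod->nr_sectors << 9);
 *		return 0;
 *	}
 */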

/*
 * eventfd is treated as a special target IO which has to be queued
 * when the queue is set up
 */
static inline int __ublksrv_queue_event(struct _ublksrv_queue *q)
{
	if (q->efd >= 0) {
		struct io_uring_sqe *sqe;
		__u64 user_data = build_eventfd_data();

		if (q->state & UBLKSRV_QUEUE_STOPPING)
			return -EINVAL;

		sqe = io_uring_get_sqe(&q->ring);
		if (!sqe) {
			ublk_err("%s: queue %d run out of sqe\n",
				__func__, q->q_id);
			return -1;
		}

		io_uring_prep_poll_add(sqe, q->efd, POLLIN);
		io_uring_sqe_set_data64(sqe, user_data);
	}
	return 0;
}

/*
 * This API is supposed to be called in ->handle_event() after the current
 * events are handled.
 */
int ublksrv_queue_handled_event(const struct ublksrv_queue *tq)
{
	struct _ublksrv_queue *q = tq_to_local(tq);

	if (q->efd >= 0) {
		uint64_t data;
		const int cnt = sizeof(uint64_t);

		/* the read has to be done, otherwise the poll event won't be stopped */
		if (read(q->efd, &data, cnt) != cnt)
			ublk_err("%s: read wrong bytes from eventfd\n",
					__func__);
		/*
		 * the poll sqe has to be re-issued immediately, since other
		 * io may rely on it
		 */
		if (!__ublksrv_queue_event(q))
			io_uring_submit_and_wait(&q->ring, 0);
	}
	return 0;
}
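
/*
 * A sketch of a ->handle_event() implementation that pairs with the API
 * above, in the spirit of the demo event target (the completion list, its
 * lock and the pop_completed() helper are hypothetical names, not part of
 * this library):
 *
 *	static void tgt_handle_event(const struct ublksrv_queue *q)
 *	{
 *		struct tgt_queue_data *d = q->private_data;
 *		struct tgt_io *io;
 *
 *		pthread_spin_lock(&d->lock);
 *		while ((io = pop_completed(d)) != NULL)
 *			ublksrv_complete_io(q, io->tag, io->res);
 *		pthread_spin_unlock(&d->lock);
 *
 *		ublksrv_queue_handled_event(q);
 *	}
 */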

/*
 * Send an event to the io command uring context, so that the queue pthread
 * can be woken up for handling io; ->handle_event() will then be called
 * to notify the target code.
 *
 * This API is usually called from another context.
 */
int ublksrv_queue_send_event(const struct ublksrv_queue *tq)
{
	struct _ublksrv_queue *q = tq_to_local(tq);

	if (q->efd >= 0) {
		uint64_t data = 1;
		const int cnt = sizeof(uint64_t);

		if (write(q->efd, &data, cnt) != cnt) {
			ublk_err("%s: wrote wrong bytes to eventfd\n",
					__func__);
			return -EPIPE;
		}
	}
	return 0;
}
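
/*
 * And a sketch of the matching producer side, typically a target worker
 * thread outside the queue context (push_completed() is a hypothetical
 * helper mirroring the sketch above):
 *
 *	static void tgt_io_done_in_worker(const struct ublksrv_queue *q,
 *			struct tgt_io *io)
 *	{
 *		struct tgt_queue_data *d = q->private_data;
 *
 *		pthread_spin_lock(&d->lock);
 *		push_completed(d, io);
 *		pthread_spin_unlock(&d->lock);
 *
 *		// wake the queue pthread up so ->handle_event() runs
 *		ublksrv_queue_send_event(q);
 *	}
 */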

/*
 * Issue all available commands to /dev/ublkcN; the exact command is figured
 * out in ublksrv_queue_io_cmd() with the help of each io->flags.
 *
 * todo: queue io commands with batching
 */
static void ublksrv_submit_fetch_commands(struct _ublksrv_queue *q)
{
	int i = 0;

	for (i = 0; i < q->q_depth; i++)
		ublksrv_queue_io_cmd(q, &q->ios[i], i);

	__ublksrv_queue_event(q);
}

static int ublksrv_queue_is_done(struct _ublksrv_queue *q)
{
	return (q->state & UBLKSRV_QUEUE_STOPPING) &&
		!io_uring_sq_ready(&q->ring);
}

/* used for allocating zero copy vma space */
static inline int ublk_queue_single_io_buf_size(struct _ublksrv_dev *dev)
{
	unsigned max_io_sz = dev->ctrl_dev->dev_info.max_io_buf_bytes;
	unsigned int page_sz = getpagesize();

	return round_up(max_io_sz, page_sz);
}

static inline int ublk_queue_io_buf_size(struct _ublksrv_dev *dev)
{
	unsigned depth = dev->ctrl_dev->dev_info.queue_depth;

	return ublk_queue_single_io_buf_size(dev) * depth;
}

static inline int ublk_io_buf_size(struct _ublksrv_dev *dev)
{
	unsigned nr_queues = dev->ctrl_dev->dev_info.nr_hw_queues;

	return ublk_queue_io_buf_size(dev) * nr_queues;
}

static int ublksrv_queue_cmd_buf_sz(struct _ublksrv_queue *q)
{
	int size = q->q_depth * sizeof(struct ublksrv_io_desc);
	unsigned int page_sz = getpagesize();

	return round_up(size, page_sz);
}

int ublksrv_queue_unconsumed_cqes(const struct ublksrv_queue *tq)
{
	if (tq->ring_ptr)
		return io_uring_cq_ready(tq->ring_ptr);

	return -1;
}

void ublksrv_queue_deinit(const struct ublksrv_queue *tq)
{
	struct _ublksrv_queue *q = tq_to_local(tq);
	int i;
	int nr_ios = q->dev->tgt.extra_ios + q->q_depth;

	if (q->dev->tgt.ops->deinit_queue)
		q->dev->tgt.ops->deinit_queue(tq);

	if (q->efd >= 0)
		close(q->efd);

	io_uring_unregister_ring_fd(&q->ring);

	if (q->ring.ring_fd > 0) {
		io_uring_unregister_files(&q->ring);
		close(q->ring.ring_fd);
		q->ring.ring_fd = -1;
	}
	if (q->io_cmd_buf) {
		munmap(q->io_cmd_buf, ublksrv_queue_cmd_buf_sz(q));
		q->io_cmd_buf = NULL;
	}
	for (i = 0; i < nr_ios; i++) {
		if (q->ios[i].buf_addr) {
			if (q->dev->tgt.ops->free_io_buf)
				q->dev->tgt.ops->free_io_buf(tq,
						q->ios[i].buf_addr, i);
			else
				free(q->ios[i].buf_addr);
			q->ios[i].buf_addr = NULL;
		}
		free(q->ios[i].data.private_data);
	}
	q->dev->__queues[q->q_id] = NULL;
	free(q);
}

void ublksrv_build_cpu_str(char *buf, int len, const cpu_set_t *cpuset)
{
	int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
	int i, offset = 0;

	for (i = 0; i < nr_cores; i++) {
		int n;

		if (!CPU_ISSET(i, cpuset))
			continue;
		n = snprintf(&buf[offset], len - offset, "%d ", i);
		if (n < 0 || n >= len - offset)
			break;
		offset += n;
	}
}

static void ublksrv_set_sched_affinity(struct _ublksrv_dev *dev,
		unsigned short q_id)
{
	const struct ublksrv_ctrl_dev *cdev = dev->ctrl_dev;
	unsigned dev_id = cdev->dev_info.dev_id;
	cpu_set_t *cpuset = ublksrv_get_queue_affinity(cdev, q_id);
	pthread_t thread = pthread_self();
	int ret;

	ret = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpuset);
	if (ret)
		ublk_err("ublk dev %u queue %u set affinity failed",
				dev_id, q_id);
}

static void ublksrv_kill_eventfd(struct _ublksrv_queue *q)
{
	if ((q->state & UBLKSRV_QUEUE_STOPPING) && q->efd >= 0) {
		uint64_t data = 1;
		int ret;

		ret = write(q->efd, &data, sizeof(uint64_t));
		if (ret != sizeof(uint64_t))
			ublk_err("%s:%d write fail %d/%zu\n",
					__func__, __LINE__, ret, sizeof(uint64_t));
	}
}

/*
 * Set up q->efd; return 0 on success or a negative errno
 */
static int ublksrv_setup_eventfd(struct _ublksrv_queue *q)
{
	const struct ublksrv_ctrl_dev_info *info = &q->dev->ctrl_dev->dev_info;

	if (!(info->ublksrv_flags & UBLKSRV_F_NEED_EVENTFD)) {
		q->efd = -1;
		return 0;
	}

	if (q->dev->tgt.tgt_ring_depth == 0) {
		ublk_err("ublk dev %d queue %d zero tgt queue depth",
			info->dev_id, q->q_id);
		return -EINVAL;
	}

	if (!q->dev->tgt.ops->handle_event) {
		ublk_err("ublk dev %d/%d doesn't define ->handle_event",
			info->dev_id, q->q_id);
		return -EINVAL;
	}

	q->efd = eventfd(0, 0);
	if (q->efd < 0)
		return -errno;
	return 0;
}

static void ublksrv_queue_adjust_uring_io_wq_workers(struct _ublksrv_queue *q)
{
	struct _ublksrv_dev *dev = q->dev;
	unsigned int val[2] = {0, 0};
	int ret;

	if (!dev->tgt.iowq_max_workers[0] && !dev->tgt.iowq_max_workers[1])
		return;

	/* passing zeroes just reads back the current limits */
	ret = io_uring_register_iowq_max_workers(&q->ring, val);
	if (ret)
		ublk_err("%s: register iowq max workers failed %d\n",
				__func__, ret);

	if (!dev->tgt.iowq_max_workers[0])
		dev->tgt.iowq_max_workers[0] = val[0];
	if (!dev->tgt.iowq_max_workers[1])
		dev->tgt.iowq_max_workers[1] = val[1];

	ret = io_uring_register_iowq_max_workers(&q->ring,
			dev->tgt.iowq_max_workers);
	if (ret)
		ublk_err("%s: register iowq max workers failed %d\n",
				__func__, ret);
}

static void ublksrv_calculate_depths(const struct _ublksrv_dev *dev,
		int *ring_depth, int *cq_depth, int *nr_ios)
{
	const struct ublksrv_ctrl_dev *cdev = dev->ctrl_dev;

	/*
	 * eventfd consumes one extra sqe, and it can be thought of as one
	 * extra unit of target depth
	 */
	int aio_depth = (cdev->dev_info.ublksrv_flags & UBLKSRV_F_NEED_EVENTFD)
		? 1 : 0;
	int depth = cdev->dev_info.queue_depth;
	int tgt_depth = dev->tgt.tgt_ring_depth + aio_depth;

	*nr_ios = depth + dev->tgt.extra_ios;

	/*
	 * queue_depth represents the max count of io commands issued from the
	 * ublk driver.
	 *
	 * After an io command is fetched from the ublk driver, the sqe consumed
	 * for fetching it becomes available for target usage, so the uring
	 * depth can be set to max(queue_depth, tgt_depth): e.g. with
	 * queue_depth 128, tgt_ring_depth 32 and eventfd enabled, the ring
	 * depth is max(128, 33) = 128.
	 */
	depth = depth > tgt_depth ? depth : tgt_depth;
	*ring_depth = depth;
	*cq_depth = dev->cq_depth ? dev->cq_depth : depth;
}

const struct ublksrv_queue *ublksrv_queue_init(const struct ublksrv_dev *tdev,
		unsigned short q_id, void *queue_data)
{
	struct _ublksrv_dev *dev = tdev_to_local(tdev);
	struct _ublksrv_queue *q;
	const struct ublksrv_ctrl_dev *ctrl_dev = dev->ctrl_dev;
	int depth = ctrl_dev->dev_info.queue_depth;
	int i, ret = -1;
	int cmd_buf_size, io_buf_size;
	unsigned long off;
	int io_data_size = round_up(dev->tgt.io_data_size,
			sizeof(unsigned long));
	int ring_depth, cq_depth, nr_ios;

	ublksrv_calculate_depths(dev, &ring_depth, &cq_depth, &nr_ios);

	/*
	 * Too many extra ios
	 */
	if (nr_ios > depth * 3)
		return NULL;

	q = (struct _ublksrv_queue *)malloc(sizeof(struct _ublksrv_queue) +
			sizeof(struct ublk_io) * nr_ios);
	if (!q)
		return NULL;
	dev->__queues[q_id] = q;

	q->tgt_ops = dev->tgt.ops;	/* cache ops for fast path */
	q->dev = dev;
	if (ctrl_dev->dev_info.flags & UBLK_F_CMD_IOCTL_ENCODE)
		q->state = UBLKSRV_QUEUE_IOCTL_OP;
	else
		q->state = 0;
	if (ctrl_dev->dev_info.flags & UBLK_F_USER_COPY)
		q->state |= UBLKSRV_USER_COPY;
	q->q_id = q_id;
	/* FIXME: depth has to be a power of 2 */
	q->q_depth = depth;
	q->io_cmd_buf = NULL;
	q->cmd_inflight = 0;
	q->tid = ublksrv_gettid();
	/* mark the eventfd as unset, so early failure paths don't close it */
	q->efd = -1;

	cmd_buf_size = ublksrv_queue_cmd_buf_sz(q);
	off = UBLKSRV_CMD_BUF_OFFSET +
		q_id * (UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc));
	q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ,
			MAP_SHARED | MAP_POPULATE, dev->cdev_fd, off);
	if (q->io_cmd_buf == MAP_FAILED) {
		ublk_err("ublk dev %d queue %d map io_cmd_buf failed",
				q->dev->ctrl_dev->dev_info.dev_id, q->q_id);
		/* don't let the deinit path munmap MAP_FAILED */
		q->io_cmd_buf = NULL;
		goto fail;
	}

	io_buf_size = ctrl_dev->dev_info.max_io_buf_bytes;
	for (i = 0; i < nr_ios; i++) {
		q->ios[i].buf_addr = NULL;

		/* extra ios don't need an io buffer */
		if (i >= q->q_depth)
			goto skip_alloc_buf;

		if (dev->tgt.ops->alloc_io_buf)
			q->ios[i].buf_addr =
				dev->tgt.ops->alloc_io_buf(local_to_tq(q),
					i, io_buf_size);
		else if (posix_memalign((void **)&q->ios[i].buf_addr,
					getpagesize(), io_buf_size)) {
			ublk_err("ublk dev %d queue %d io %d posix_memalign failed",
					q->dev->ctrl_dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
		//q->ios[i].buf_addr = malloc(io_buf_size);
		if (!q->ios[i].buf_addr) {
			ublk_err("ublk dev %d queue %d io %d alloc io_buf failed",
					q->dev->ctrl_dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
skip_alloc_buf:
		q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE;
		q->ios[i].data.private_data = malloc(io_data_size);
		q->ios[i].data.tag = i;
		if (i < q->q_depth)
			q->ios[i].data.iod = ublksrv_get_iod(q, i);
		else
			q->ios[i].data.iod = NULL;

		//ublk_assert(io_data_size ^ (unsigned long)q->ios[i].data.private_data);
	}

	ret = ublksrv_setup_ring(&q->ring, ring_depth, cq_depth,
			IORING_SETUP_SQE128 | IORING_SETUP_COOP_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d queue %d setup io_uring failed %d",
				q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret);
		goto fail;
	}

	q->ring_ptr = &q->ring;

	ret = io_uring_register_files(&q->ring, dev->tgt.fds,
			dev->tgt.nr_fds + 1);
	if (ret) {
		ublk_err("ublk dev %d queue %d register files failed %d",
				q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret);
		goto fail;
	}

	io_uring_register_ring_fd(&q->ring);

	/*
	 * N.B. PR_SET_IO_FLUSHER was added with Linux 5.6+.
	 */
#if defined(PR_SET_IO_FLUSHER)
	if (prctl(PR_SET_IO_FLUSHER, 0, 0, 0, 0) != 0)
		ublk_err("ublk dev %d queue %d set_io_flusher failed",
			q->dev->ctrl_dev->dev_info.dev_id, q->q_id);
#endif

	ublksrv_queue_adjust_uring_io_wq_workers(q);

	q->private_data = queue_data;

	if (ctrl_dev->tgt_ops->init_queue) {
		if (ctrl_dev->tgt_ops->init_queue(local_to_tq(q),
					&q->private_data))
			goto fail;
	}

	if (ctrl_dev->queues_cpuset)
		ublksrv_set_sched_affinity(dev, q_id);

	setpriority(PRIO_PROCESS, getpid(), -20);

	ret = ublksrv_setup_eventfd(q);
	if (ret < 0) {
		ublk_err("ublk dev %d queue %d setup eventfd failed: %s",
			q->dev->ctrl_dev->dev_info.dev_id, q->q_id,
			strerror(-ret));
		goto fail;
	}

	/* submit all io commands to the ublk driver */
	ublksrv_submit_fetch_commands(q);

	return (struct ublksrv_queue *)q;
fail:
	ublksrv_queue_deinit(local_to_tq(q));
	ublk_err("ublk dev %d queue %d failed",
			ctrl_dev->dev_info.dev_id, q_id);
	return NULL;
}
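
/*
 * A minimal sketch of the per-queue pthread that drives a queue created
 * by the function above (one thread per hw queue, as the demo targets do;
 * the queue_info type is a hypothetical carrier for dev/q_id):
 *
 *	static void *queue_fn(void *data)
 *	{
 *		struct queue_info *info = data;
 *		const struct ublksrv_queue *q;
 *
 *		q = ublksrv_queue_init(info->dev, info->q_id, NULL);
 *		if (!q)
 *			return NULL;
 *
 *		// returns -ENODEV once STOP_DEV has aborted the io commands
 *		while (ublksrv_process_io(q) >= 0)
 *			;
 *
 *		ublksrv_queue_deinit(q);
 *		return NULL;
 *	}
 */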

static int ublksrv_create_pid_file(struct _ublksrv_dev *dev)
{
	int dev_id = dev->ctrl_dev->dev_info.dev_id;
	char pid_file[64];
	int ret, pid_fd;

	if (!dev->ctrl_dev->run_dir)
		return 0;

	/* create the pid file and lock it, so that others can't */
	snprintf(pid_file, 64, "%s/%d.pid", dev->ctrl_dev->run_dir, dev_id);

	ret = create_pid_file(pid_file, &pid_fd);
	if (ret < 0) {
		/* -1 means the file is locked, and we need to remove it */
		if (ret == -1) {
			close(pid_fd);
			unlink(pid_file);
		}
		return ret;
	}
	dev->pid_file_fd = pid_fd;
	return 0;
}

static void ublksrv_remove_pid_file(const struct _ublksrv_dev *dev)
{
	int dev_id = dev->ctrl_dev->dev_info.dev_id;
	char pid_file[64];

	if (!dev->ctrl_dev->run_dir)
		return;

	close(dev->pid_file_fd);
	snprintf(pid_file, 64, "%s/%d.pid", dev->ctrl_dev->run_dir, dev_id);
	unlink(pid_file);
}

void ublksrv_dev_deinit(const struct ublksrv_dev *tdev)
{
	struct _ublksrv_dev *dev = tdev_to_local(tdev);

	ublksrv_remove_pid_file(dev);

	ublksrv_tgt_deinit(dev);
	free(dev->thread);

	if (dev->cdev_fd >= 0) {
		close(dev->cdev_fd);
		dev->cdev_fd = -1;
	}
	free(dev);
}

const struct ublksrv_dev *ublksrv_dev_init(const struct ublksrv_ctrl_dev *ctrl_dev)
{
	int dev_id = ctrl_dev->dev_info.dev_id;
	char buf[64];
	int ret = -1;
	struct _ublksrv_dev *dev = (struct _ublksrv_dev *)calloc(1, sizeof(*dev));
	struct ublksrv_tgt_info *tgt;

	if (!dev)
		return NULL;

	tgt = &dev->tgt;
	dev->ctrl_dev = ctrl_dev;
	dev->cdev_fd = -1;

	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);
	dev->cdev_fd = open(buf, O_RDWR | O_NONBLOCK);
	if (dev->cdev_fd < 0) {
		ublk_err("can't open %s, ret %d\n", buf, dev->cdev_fd);
		goto fail;
	}

	tgt->fds[0] = dev->cdev_fd;

	ret = ublksrv_tgt_init(dev, ctrl_dev->tgt_type, ctrl_dev->tgt_ops,
			ctrl_dev->tgt_argc, ctrl_dev->tgt_argv);
	if (ret) {
		ublk_err("can't init tgt %d/%s/%d, ret %d\n",
				dev_id, ctrl_dev->tgt_type, ctrl_dev->tgt_argc,
				ret);
		goto fail;
	}

	ret = ublksrv_create_pid_file(dev);
	if (ret) {
		ublk_err("can't create pid file for dev %d, ret %d\n",
				dev_id, ret);
		goto fail;
	}

	return local_to_tdev(dev);
fail:
	ublksrv_dev_deinit(local_to_tdev(dev));
	return NULL;
}

/* Be careful: a target io may not have a ublk_io associated with it */
static inline void ublksrv_handle_tgt_cqe(struct _ublksrv_queue *q,
		struct io_uring_cqe *cqe)
{
	unsigned tag = user_data_to_tag(cqe->user_data);

	if (cqe->res < 0 && cqe->res != -EAGAIN) {
		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
			__func__, cqe->res, q->q_id,
			user_data_to_tag(cqe->user_data),
			user_data_to_op(cqe->user_data));
	}

	if (is_eventfd_io(cqe->user_data)) {
		if (q->tgt_ops->handle_event)
			q->tgt_ops->handle_event(local_to_tq(q));
	} else {
		if (q->tgt_ops->tgt_io_done)
			q->tgt_ops->tgt_io_done(local_to_tq(q),
					&q->ios[tag].data, cqe);
	}
}

static void ublksrv_handle_cqe(struct io_uring *r,
		struct io_uring_cqe *cqe, void *data)
{
	struct _ublksrv_queue *q = container_of(r, struct _ublksrv_queue, ring);
	unsigned tag = user_data_to_tag(cqe->user_data);
	unsigned cmd_op = user_data_to_op(cqe->user_data);
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(q->state & UBLKSRV_QUEUE_STOPPING);
	struct ublk_io *io;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d event %d) stopping %d\n",
			__func__, cqe->res, q->q_id, tag, cmd_op,
			is_target_io(cqe->user_data),
			is_eventfd_io(cqe->user_data),
			(q->state & UBLKSRV_QUEUE_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(q, cqe);
		return;
	}

	io = &q->ios[tag];
	q->cmd_inflight--;

	if (!fetch) {
		q->state |= UBLKSRV_QUEUE_STOPPING;
		io->flags &= ~UBLKSRV_NEED_FETCH_RQ;
	}

	/*
	 * So far, only the sync tgt io handling is implemented.
	 *
	 * todo: support async tgt io handling via io_uring, so the ublksrv
	 * daemon can poll on both rings.
	 */
	if (cqe->res == UBLK_IO_RES_OK) {
		//ublk_assert(tag < q->q_depth);
		q->tgt_ops->handle_io_async(local_to_tq(q), &io->data);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
		ublksrv_queue_io_cmd(q, io, tag);
	} else {
		/*
		 * COMMIT_REQ will be completed immediately since no fetching
		 * piggyback is required.
		 *
		 * Marking IO_FREE only, then this io won't be issued since
		 * we only issue an io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*)
		 */
		io->flags = UBLKSRV_IO_FREE;
	}
}

static int ublksrv_reap_events_uring(struct io_uring *r)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int count = 0;

	io_uring_for_each_cqe(r, head, cqe) {
		ublksrv_handle_cqe(r, cqe, NULL);
		count += 1;
	}
	io_uring_cq_advance(r, count);

	return count;
}

static void ublksrv_queue_discard_io_pages(struct _ublksrv_queue *q)
{
	const struct ublksrv_ctrl_dev *cdev = q->dev->ctrl_dev;
	unsigned int io_buf_size = cdev->dev_info.max_io_buf_bytes;
	int i = 0;

	for (i = 0; i < q->q_depth; i++)
		madvise(q->ios[i].buf_addr, io_buf_size, MADV_DONTNEED);
}

static void ublksrv_queue_idle_enter(struct _ublksrv_queue *q)
{
	if (q->state & UBLKSRV_QUEUE_IDLE)
		return;

	ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: enter idle %x\n",
			q->dev->ctrl_dev->dev_info.dev_id, q->q_id, q->state);
	ublksrv_queue_discard_io_pages(q);
	q->state |= UBLKSRV_QUEUE_IDLE;

	if (q->tgt_ops->idle_fn)
		q->tgt_ops->idle_fn(local_to_tq(q), true);
}

static inline void ublksrv_queue_idle_exit(struct _ublksrv_queue *q)
{
	if (q->state & UBLKSRV_QUEUE_IDLE) {
		ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: exit idle %x\n",
			q->dev->ctrl_dev->dev_info.dev_id, q->q_id, q->state);
		q->state &= ~UBLKSRV_QUEUE_IDLE;
		if (q->tgt_ops->idle_fn)
			q->tgt_ops->idle_fn(local_to_tq(q), false);
	}
}

static void ublksrv_reset_aio_batch(struct _ublksrv_queue *q)
{
	q->nr_ctxs = 0;
}

static void ublksrv_submit_aio_batch(struct _ublksrv_queue *q)
{
	int i;

	for (i = 0; i < q->nr_ctxs; i++) {
		struct ublksrv_aio_ctx *ctx = q->ctxs[i];
		uint64_t data = 1;
		int ret;

		ret = write(ctx->efd, &data, sizeof(uint64_t));
		if (ret != sizeof(uint64_t))
			ublk_err("%s:%d write fail ctx[%d]: %d/%zu\n",
					__func__, __LINE__, i, ret, sizeof(uint64_t));
	}
}

int ublksrv_process_io(const struct ublksrv_queue *tq)
{
	struct _ublksrv_queue *q = tq_to_local(tq);
	int ret, reapped;
	struct __kernel_timespec ts = {
		.tv_sec = UBLKSRV_IO_IDLE_SECS,
		.tv_nsec = 0
	};
	struct __kernel_timespec *tsp = (q->state & UBLKSRV_QUEUE_IDLE) ?
		NULL : &ts;
	struct io_uring_cqe *cqe;

	ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight %u/%u stopping %d\n",
				q->dev->ctrl_dev->dev_info.dev_id,
				q->q_id, io_uring_sq_ready(&q->ring),
				q->cmd_inflight, q->tgt_io_inflight,
				(q->state & UBLKSRV_QUEUE_STOPPING));

	if (ublksrv_queue_is_done(q))
		return -ENODEV;

	ret = io_uring_submit_and_wait_timeout(&q->ring, &cqe, 1, tsp, NULL);

	ublksrv_reset_aio_batch(q);
	reapped = ublksrv_reap_events_uring(&q->ring);
	ublksrv_submit_aio_batch(q);

	if (q->tgt_ops->handle_io_background)
		q->tgt_ops->handle_io_background(local_to_tq(q),
				io_uring_sq_ready(&q->ring));

	ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
			(q->state & UBLKSRV_QUEUE_IDLE));

	if ((q->state & UBLKSRV_QUEUE_STOPPING))
		ublksrv_kill_eventfd(q);
	else {
		if (ret == -ETIME && reapped == 0 &&
				!io_uring_sq_ready(&q->ring))
			ublksrv_queue_idle_enter(q);
		else
			ublksrv_queue_idle_exit(q);
	}

	return reapped;
}

const struct ublksrv_queue *ublksrv_get_queue(const struct ublksrv_dev *dev,
		int q_id)
{
	return (const struct ublksrv_queue *)tdev_to_local(dev)->__queues[q_id];
}

/* called in ublksrv process context */
void ublksrv_apply_oom_protection(void)
{
	char oom_score_adj_path[64];
	pid_t pid = getpid();
	int fd;

	snprintf(oom_score_adj_path, 64, "/proc/%d/oom_score_adj", pid);

	fd = open(oom_score_adj_path, O_RDWR);
	if (fd >= 0) {
		char val[32];
		int len, ret;

		/* -1000 disables the oom killer for this process */
		len = snprintf(val, 32, "%d", -1000);
		ret = write(fd, val, len);
		if (ret != len)
			ublk_err("%s:%d write fail %d/%d\n",
					__func__, __LINE__, ret, len);
		close(fd);
	}
}

const struct ublksrv_ctrl_dev *ublksrv_get_ctrl_dev(
		const struct ublksrv_dev *dev)
{
	return tdev_to_local(dev)->ctrl_dev;
}

int ublksrv_get_pidfile_fd(const struct ublksrv_dev *dev)
{
	return tdev_to_local(dev)->pid_file_fd;
}

void *ublksrv_io_private_data(const struct ublksrv_queue *tq, int tag)
{
	struct _ublksrv_queue *q = tq_to_local(tq);

	return q->ios[tag].data.private_data;
}

unsigned int ublksrv_queue_state(const struct ublksrv_queue *q)
{
	return tq_to_local(q)->state;
}

const struct ublk_io_data *
ublksrv_queue_get_io_data(const struct ublksrv_queue *tq, int tag)
{
	struct _ublksrv_queue *q = tq_to_local(tq);

	return &q->ios[tag].data;
}

void *ublksrv_queue_get_io_buf(const struct ublksrv_queue *tq, int tag)
{
	struct _ublksrv_queue *q = tq_to_local(tq);

	if (tag < q->q_depth)
		return q->ios[tag].buf_addr;
	return NULL;
}

/*
 * The default io_uring cq depth is the SQ ring depth computed in
 * ublksrv_calculate_depths() (the larger of the queue depth and
 * .tgt_ring_depth), which is usually enough for typical ublk targets,
 * such as loop and qcow2, but may not be enough for nbd with send_zc,
 * which needs extra cqes for buffer notifications.
 *
 * So this API allows the target to override the default io_uring cq depth.
 */
void ublksrv_dev_set_cq_depth(struct ublksrv_dev *tdev, int cq_depth)
{
	tdev_to_local(tdev)->cq_depth = cq_depth;
}

int ublksrv_dev_get_cq_depth(struct ublksrv_dev *tdev)
{
	return tdev_to_local(tdev)->cq_depth;
}
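
/*
 * A sketch of a target overriding the cq depth from its ->init_tgt(),
 * before any queue is created (the factor of 2 is an arbitrary example,
 * not a recommendation from this library):
 *
 *	static int tgt_init(struct ublksrv_dev *dev, int type, int argc,
 *			char *argv[])
 *	{
 *		const struct ublksrv_ctrl_dev_info *info =
 *			ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(dev));
 *
 *		// leave room for one extra notification cqe per request
 *		ublksrv_dev_set_cq_depth(dev, 2 * info->queue_depth);
 *		...
 *		return 0;
 *	}
 */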