1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <[email protected]>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24 
25 
26 /*	Stochastic Fairness Queuing algorithm.
27 	=======================================
28 
29 	Source:
30 	Paul E. McKenney "Stochastic Fairness Queuing",
31 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32 
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35 
36 
37 	See also:
38 	M. Shreedhar and George Varghese "Efficient Fair
39 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40 
41 
42 	This is not the thing that is usually called (W)FQ nowadays.
43 	It does not use any timestamp mechanism, but instead
44 	processes queues in round-robin order.
45 
46 	ADVANTAGE:
47 
48 	- It is very cheap. Both CPU and memory requirements are minimal.
49 
50 	DRAWBACKS:
51 
52 	- "Stochastic" -> It is not 100% fair.
53 	When hash collisions occur, several flows are considered as one.
54 
55 	- "Round-robin" -> It introduces larger delays than virtual clock
56 	based schemes, and should not be used for isolating interactive
57 	traffic	from non-interactive. It means, that this scheduler
58 	should be used as leaf of CBQ or P3, which put interactive traffic
59 	to higher priority band.
60 
61 	We still need true WFQ for top level CSZ, but using WFQ
62 	for the best effort traffic is absolutely pointless:
63 	SFQ is superior for this purpose.
64 
65 	IMPLEMENTATION:
66 	This implementation limits :
67 	- maximal queue length per flow to 127 packets.
68 	- max mtu to 2^18-1;
69 	- max 65408 flows,
70 	- number of hash buckets to 65536.
71 
72 	It is easy to increase these values, but not in flight.  */
73 
74 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS	128
76 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT		0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79 
80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
81 typedef u16 sfq_index;
82 
83 /*
84  * We dont use pointers to save space.
85  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
86  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
87  * are 'pointers' to dep[] array
88  */
/* Doubly linked list anchor; links are sfq_index values (see comment above),
 * not pointers, to keep the per-flow footprint small.
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};
93 
/* Per-flow state: an intrusive skb list plus bookkeeping for the RR scheduler.
 * The skb list is circular and anchored at the slot itself (the slot pointer
 * is cast to struct sk_buff * to act as the list head sentinel).
 */
struct sfq_slot {
	struct sk_buff	*skblist_next;	/* first queued skb, or slot itself if empty */
	struct sk_buff	*skblist_prev;	/* last queued skb, or slot itself if empty */
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	int		allot; /* credit for this slot */

	unsigned int    backlog;	/* bytes queued in this slot */
	struct red_vars vars;		/* per-flow RED state (SFQRED variant) */
};
106 
struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;	/* drop from head (not tail) when a flow is full */
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t 	perturbation;	/* key for skb_get_hash_perturb() */
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;		/* TC_RED_* flags (ECN/HARDDROP) */
	struct tcf_proto __rcu *filter_list;	/* attached classifiers */
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;	/* non-NULL iff RED marking/dropping enabled */
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;	/* jiffies between rekeys; 0 = disabled */
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;	/* fires sfq_perturbation() */
	struct Qdisc	*sch;		/* back-pointer for the timer callback */
};
139 
140 /*
141  * sfq_head are either in a sfq_slot or in dep[] array
142  */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	/* Small indexes address slots[]; values >= SFQ_MAX_FLOWS address dep[]. */
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
149 
sfq_hash(const struct sfq_sched_data * q,const struct sk_buff * skb)150 static unsigned int sfq_hash(const struct sfq_sched_data *q,
151 			     const struct sk_buff *skb)
152 {
153 	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
154 }
155 
/* Classify an skb to a flow id in [1 .. divisor], or 0 to drop it.
 * skb->priority, when it addresses this qdisc, selects the bucket directly;
 * otherwise attached tc filters decide, falling back to the perturbed hash.
 * On a 0 return, *qerr carries the NET_XMIT_* code for the caller.
 */
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			/* Packet consumed by an action: not a local drop. */
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}
192 
193 /*
194  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
195  */
/* Insert slot x at the head of the dep[] list matching its current qlen,
 * so dep[d] always chains every flow currently holding d packets.
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	/* p is the encoded 'pointer' to dep[qlen] itself (the list head). */
	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}
211 
/* Unlink slot x from its dep[] chain; n and p (output args) receive the
 * encoded indexes of its former neighbours.
 */
#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)
219 
220 
/* Account one packet leaving slot x: move it to the dep[] list one level
 * down and shrink cur_depth if x was the last flow at the deepest level.
 */
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	/* post-decrement: d is the qlen *before* this packet left */
	d = q->slots[x].qlen--;
	/* n == p means x was the only slot on dep[d] */
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}
233 
/* Account one packet entering slot x: rehook it on the dep[] list one
 * level up and track the deepest slot seen so far.
 */
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index next, prev;
	int depth;

	sfq_unlink(q, x, next, prev);

	depth = ++q->slots[x].qlen;
	if (depth > q->cur_depth)
		q->cur_depth = depth;
	sfq_link(q, x);
}
246 
247 /* helper functions : might be changed when/if skb use a standard list_head */
248 
249 /* remove one skb from tail of slot queue */
/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	/* the slot itself serves as the circular list's sentinel node */
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}
259 
260 /* remove one skb from head of slot queue */
/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	/* the slot itself serves as the circular list's sentinel node */
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}
270 
slot_queue_init(struct sfq_slot * slot)271 static inline void slot_queue_init(struct sfq_slot *slot)
272 {
273 	memset(slot, 0, sizeof(*slot));
274 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
275 }
276 
277 /* add skb to slot queue (tail add) */
/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	/* skb->next points at the sentinel (the slot), closing the circle */
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
285 
/* Drop one packet to relieve pressure: take it from the deepest flow
 * (head or tail depending on q->headdrop). Returns the dropped packet's
 * length in bytes, or 0 if nothing could be dropped.
 */
static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch, to_free);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		/* The victim flow empties, so unlink it from the RR ring and
		 * the hash table before the shared drop path runs.
		 */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}
320 
321 /* Is ECN parameter configured */
/* Is ECN parameter configured */
/* Nonzero when packets in the RED probabilistic zone should be ECN-marked. */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}
326 
327 /* Should packets over max threshold just be marked */
/* Should packets over max threshold just be marked */
/* True only when ECN is on and HARDDROP is off. */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}
332 
/* Nonzero when overflow drops should take the head (oldest) packet. */
static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
337 
/* Enqueue entry point: classify the skb to a flow slot, optionally run
 * RED marking/dropping on that flow, append the packet, and trim the
 * qdisc back under q->limit if this enqueue pushed it over.
 */
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		/* Classifier dropped/stole it; count only real drops. */
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		/* Per-flow RED: average over this slot's backlog only. */
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		/* Head-drop: swap the old head for the new skb, adjusting
		 * backlogs by the size difference only (qlen is unchanged).
		 */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}
471 
/* Dequeue one packet using deficit round robin over the active flows:
 * a flow with exhausted allot is skipped (and recharged by one quantum)
 * until a flow with positive credit is found.
 */
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		/* Out of credit: recharge and move on to the next flow. */
		q->tail = slot;
		slot->allot += q->quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		/* Retire the flow: drop it from the hash and the RR ring. */
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= qdisc_pkt_len(skb);
	}
	return skb;
}
512 
513 static void
sfq_reset(struct Qdisc * sch)514 sfq_reset(struct Qdisc *sch)
515 {
516 	struct sk_buff *skb;
517 
518 	while ((skb = sfq_dequeue(sch)) != NULL)
519 		rtnl_kfree_skbs(skb, skb);
520 }
521 
522 /*
523  * When q->perturbation is changed, we rehash all queued skbs
524  * to avoid OOO (Out Of Order) effects.
525  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
526  * counters.
527  */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	/* Phase 1: pull every packet out of every slot onto a private list,
	 * resetting each slot's per-flow state along the way.
	 */
	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	/* Phase 2: re-enqueue under the new perturbation key; packets that
	 * no longer fit (no free slot, or flow over maxdepth) are dropped.
	 */
	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}
597 
/* Periodic timer: install a fresh random hash key and rehash all queued
 * packets (unless tc filters drive classification, where the hash is
 * unused). Runs under the root qdisc lock.
 */
static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;
	siphash_key_t nkey;
	int period;

	get_random_bytes(&nkey, sizeof(nkey));
	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	/* q->perturb_period can change under us from
	 * sfq_change() and sfq_destroy().
	 */
	period = READ_ONCE(q->perturb_period);
	if (period)
		mod_timer(&q->perturb_timer, jiffies + period);
	rcu_read_unlock();
}
623 
/* Apply a netlink configuration update (tc qdisc change). Validates the
 * request, commits the new parameters under the tree lock, then drops
 * packets as needed to honour a reduced limit. Returns 0 or -errno.
 */
static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;
	unsigned int maxflows;
	unsigned int quantum;
	unsigned int divisor;
	int perturb_period;
	u8 headdrop;
	u8 maxdepth;
	int limit;
	u8 flags;


	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	/* A larger attribute means the v1 (RED-capable) layout was sent. */
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	if ((int)ctl->quantum < 0) {
		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
		return -EINVAL;
	}
	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	/* qth_min != 0 enables the RED variant; allocate outside the lock. */
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}

	sch_tree_lock(sch);

	/* Start from the current configuration; user-supplied fields with
	 * value 0 generally mean "keep the existing setting".
	 */
	limit = q->limit;
	divisor = q->divisor;
	headdrop = q->headdrop;
	maxdepth = q->maxdepth;
	maxflows = q->maxflows;
	perturb_period = q->perturb_period;
	quantum = q->quantum;
	flags = q->flags;

	/* update and validate configuration */
	if (ctl->quantum)
		quantum = ctl->quantum;
	/* NOTE(review): ctl->perturb_period * HZ can overflow int for very
	 * large userspace values — confirm against current upstream handling.
	 */
	perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		divisor = ctl->divisor;
		maxflows = min_t(u32, maxflows, divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			red_set_parms(p,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		flags = ctl_v1->flags;
		headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
		maxflows = min_t(u32, maxflows, limit);
	}
	/* A one-packet limit would make the scheduler degenerate. */
	if (limit == 1) {
		sch_tree_unlock(sch);
		kfree(p);
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}

	/* commit configuration */
	q->limit = limit;
	q->divisor = divisor;
	q->headdrop = headdrop;
	q->maxdepth = maxdepth;
	q->maxflows = maxflows;
	WRITE_ONCE(q->perturb_period, perturb_period);
	q->quantum = quantum;
	q->flags = flags;
	if (p)
		swap(q->red_parms, p);

	/* Shrink the queue to the (possibly lower) new limit. */
	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	/* p now holds the previous red_parms (after swap) or the unused one. */
	kfree(p);
	return 0;
}
742 
/* Allocate a table of sz bytes; kvmalloc falls back to vmalloc for
 * large sizes, so big divisor/maxflows values still work.
 */
static void *sfq_alloc(size_t sz)
{
	void *buf = kvmalloc(sz, GFP_KERNEL);

	return buf;
}
747 
/* Release a table obtained from sfq_alloc(); kvfree handles both the
 * kmalloc and vmalloc cases (and NULL).
 */
static void sfq_free(void *addr)
{
	void *p = addr;

	kvfree(p);
}
752 
/* Tear down the qdisc: release filters, stop the perturbation timer
 * (clearing the period first so it cannot re-arm itself), then free
 * the hash table, slots and RED parameters.
 */
static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	WRITE_ONCE(q->perturb_period, 0);
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}
764 
sfq_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)765 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
766 		    struct netlink_ext_ack *extack)
767 {
768 	struct sfq_sched_data *q = qdisc_priv(sch);
769 	int i;
770 	int err;
771 
772 	q->sch = sch;
773 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
774 
775 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
776 	if (err)
777 		return err;
778 
779 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
780 		q->dep[i].next = i + SFQ_MAX_FLOWS;
781 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
782 	}
783 
784 	q->limit = SFQ_MAX_DEPTH;
785 	q->maxdepth = SFQ_MAX_DEPTH;
786 	q->cur_depth = 0;
787 	q->tail = NULL;
788 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
789 	q->maxflows = SFQ_DEFAULT_FLOWS;
790 	q->quantum = psched_mtu(qdisc_dev(sch));
791 	q->perturb_period = 0;
792 	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
793 
794 	if (opt) {
795 		int err = sfq_change(sch, opt, extack);
796 		if (err)
797 			return err;
798 	}
799 
800 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
801 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
802 	if (!q->ht || !q->slots) {
803 		/* Note: sfq_destroy() will be called by our caller */
804 		return -ENOMEM;
805 	}
806 
807 	for (i = 0; i < q->divisor; i++)
808 		q->ht[i] = SFQ_EMPTY_SLOT;
809 
810 	for (i = 0; i < q->maxflows; i++) {
811 		slot_queue_init(&q->slots[i]);
812 		sfq_link(q, i);
813 	}
814 	if (q->limit >= 1)
815 		sch->flags |= TCQ_F_CAN_BYPASS;
816 	else
817 		sch->flags &= ~TCQ_F_CAN_BYPASS;
818 	return 0;
819 }
820 
/* Dump the current configuration to userspace as a tc_sfq_qopt_v1
 * TCA_OPTIONS attribute. Returns the new skb length, or -1 on overflow
 * (with the skb trimmed back to its original tail).
 */
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		/* Thresholds are stored scaled by Wlog; undo for userspace. */
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
857 
/* SFQ classes are internal pseudo-classes and carry no child qdisc. */
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}
862 
/* Classes cannot be looked up or manipulated individually. */
static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}
867 
/* Filters may attach to the qdisc, but there is no per-class binding. */
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}
873 
/* Nothing to release: sfq_bind() never takes a reference. */
static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}
877 
sfq_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)878 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
879 				       struct netlink_ext_ack *extack)
880 {
881 	struct sfq_sched_data *q = qdisc_priv(sch);
882 
883 	if (cl)
884 		return NULL;
885 	return q->block;
886 }
887 
/* Report a class (one per hash bucket); only the minor handle is set. */
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
894 
/* Dump queue statistics for class cl (1-based hash bucket index).
 * Buckets with no active flow report all-zero stats.
 */
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
914 
/* Walk all classes (hash buckets), reporting only occupied ones to the
 * dump callback; empty buckets are counted but skipped.
 */
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT) {
			arg->count++;
			continue;
		}
		/* Class ids are 1-based (0 means "drop" in sfq_classify). */
		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
			break;
	}
}
932 
/* Class operations: SFQ exposes its hash buckets as read-only
 * pseudo-classes for filter attachment and statistics.
 */
static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};
943 
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,	/* no runtime change op; reconfigure via replace */
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
958 MODULE_ALIAS_NET_SCH("sfq");
959 
sfq_module_init(void)960 static int __init sfq_module_init(void)
961 {
962 	return register_qdisc(&sfq_qdisc_ops);
963 }
sfq_module_exit(void)964 static void __exit sfq_module_exit(void)
965 {
966 	unregister_qdisc(&sfq_qdisc_ops);
967 }
968 module_init(sfq_module_init)
969 module_exit(sfq_module_exit)
970 MODULE_LICENSE("GPL");
971 MODULE_DESCRIPTION("Stochastic Fairness qdisc");
972