Lines Matching +full:xps +full:- +full:timer +full:- +full:1

1 // SPDX-License-Identifier: GPL-2.0-or-later
5 * Copyright (C) 2013-2023 Eric Dumazet <[email protected]>
8 * Fast classification depends on skb->sk being set before reaching us.
17 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
22 * - lookup one RB tree (out of 1024 or more) to find the flow.
25 * - Use a special fifo for high prio packets
61 return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; in fq_skb_cb()
66 * If packets have monotonically increasing time_to_send, they are placed in O(1)
75 unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
79 /* Following field is only used for q->internal,
80 * because q->internal is not hashed in fq_root[]
93 struct rb_node rate_node; /* anchor in q->delayed tree */
109 #define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2)
162 /* return the i-th 2-bit value ("crumb") */
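
The prio2band table stores one band per skb priority as a 2-bit "crumb", so all 16 priorities fit in FQ_PRIO2BAND_CRUMB_SIZE (4) bytes. Below is a minimal userspace sketch, not the kernel code, of packing and reading such crumbs; the priomap used in main mirrors the conventional 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 default, and all names are illustrative.

#include <stdint.h>
#include <stdio.h>

/* 16 priorities, 2 bits each -> 4 bytes of storage. */
#define NUM_PRIOS   16
#define CRUMB_BYTES ((NUM_PRIOS + 3) / 4)

/* Read the i-th 2-bit value ("crumb"). */
static unsigned int crumb_get(const uint8_t *tab, unsigned int i)
{
    return (tab[i / 4] >> (2 * (i & 3))) & 0x3;
}

/* Pack an array of values <= 3 into 2-bit fields. */
static void crumb_compress(const uint8_t *in, uint8_t *out)
{
    unsigned int i;

    for (i = 0; i < CRUMB_BYTES; i++)
        out[i] = 0;
    for (i = 0; i < NUM_PRIOS; i++)
        out[i / 4] |= (in[i] & 0x3) << (2 * (i & 3));
}

int main(void)
{
    uint8_t prios[NUM_PRIOS] = { 1, 2, 2, 2, 1, 2, 0, 0,
                                 1, 1, 1, 1, 1, 1, 1, 1 };
    uint8_t packed[CRUMB_BYTES];
    unsigned int i;

    crumb_compress(prios, packed);
    for (i = 0; i < NUM_PRIOS; i++)
        printf("prio %u -> band %u\n", i, crumb_get(packed, i));
    return 0;
}
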
169 * f->tail and f->age share the same location.
172 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
176 f->age = jiffies | 1UL; in fq_flow_set_detached()
181 return !!(f->age & 1UL); in fq_flow_is_detached()
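
fq_flow overlays f->age on the same storage as the list tail pointer: since struct sk_buff is at least 2-byte aligned, a value with its low bit set (jiffies | 1UL) can never be a valid pointer, so the low bit doubles as the "detached" flag. A hedged userspace sketch of the same tagged-word trick, with illustrative types:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct pkt {
    struct pkt *next;
    int len;
};

/* Either a tail pointer (low bit clear) or a timestamp tag (low bit set). */
union tail_or_age {
    struct pkt *tail;
    uintptr_t age;
};

static void set_detached(union tail_or_age *u, uintptr_t now)
{
    /* Pointers to struct pkt are at least 2-byte aligned, so the low
     * bit can safely serve as the "detached" marker. */
    u->age = now | 1UL;
}

static int is_detached(const union tail_or_age *u)
{
    return u->age & 1UL;
}

int main(void)
{
    static struct pkt p = { .next = NULL, .len = 100 };
    union tail_or_age u;

    u.tail = &p;
    assert(!is_detached(&u));
    set_detached(&u, 12345);
    assert(is_detached(&u));
    printf("detached at tick %lu\n", (unsigned long)(u.age & ~1UL));
    return 0;
}
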
189 return f->next == &throttled; in fq_flow_is_throttled()
200 struct fq_perband_flows *pband = &q->band_flows[flow->band]; in fq_flow_add_tail()
202 &pband->new_flows : in fq_flow_add_tail()
203 &pband->old_flows; in fq_flow_add_tail()
205 if (head->first) in fq_flow_add_tail()
206 head->last->next = flow; in fq_flow_add_tail()
208 head->first = flow; in fq_flow_add_tail()
209 head->last = flow; in fq_flow_add_tail()
210 flow->next = NULL; in fq_flow_add_tail()
215 rb_erase(&f->rate_node, &q->delayed); in fq_flow_unset_throttled()
216 q->throttled_flows--; in fq_flow_unset_throttled()
222 struct rb_node **p = &q->delayed.rb_node, *parent = NULL; in fq_flow_set_throttled()
229 if (f->time_next_packet >= aux->time_next_packet) in fq_flow_set_throttled()
230 p = &parent->rb_right; in fq_flow_set_throttled()
232 p = &parent->rb_left; in fq_flow_set_throttled()
234 rb_link_node(&f->rate_node, parent, p); in fq_flow_set_throttled()
235 rb_insert_color(&f->rate_node, &q->delayed); in fq_flow_set_throttled()
236 q->throttled_flows++; in fq_flow_set_throttled()
237 q->stat_throttled++; in fq_flow_set_throttled()
239 f->next = &throttled; in fq_flow_set_throttled()
240 if (q->time_next_delayed_flow > f->time_next_packet) in fq_flow_set_throttled()
241 q->time_next_delayed_flow = f->time_next_packet; in fq_flow_set_throttled()
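
Throttled flows wait in the q->delayed tree ordered by time_next_packet, and q->time_next_delayed_flow caches the earliest deadline so the watchdog can be armed for exactly that instant. The sketch below models the same bookkeeping with a plain sorted array instead of an rbtree, a deliberate simplification with illustrative names.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_THROTTLED 64

struct throttled_set {
    uint64_t deadline[MAX_THROTTLED];
    unsigned int count;
    uint64_t next_wakeup;    /* cached minimum, ~0 when empty */
};

static void tset_init(struct throttled_set *s)
{
    s->count = 0;
    s->next_wakeup = ~0ULL;
}

/* Insert keeping the array sorted by deadline (earliest first). */
static void tset_add(struct throttled_set *s, uint64_t deadline)
{
    unsigned int i = s->count;

    while (i > 0 && s->deadline[i - 1] > deadline) {
        s->deadline[i] = s->deadline[i - 1];
        i--;
    }
    s->deadline[i] = deadline;
    s->count++;
    if (s->next_wakeup > deadline)
        s->next_wakeup = deadline;   /* arm the timer earlier */
}

/* Release every flow whose deadline has passed, recompute the cache. */
static unsigned int tset_expire(struct throttled_set *s, uint64_t now)
{
    unsigned int released = 0;

    while (s->count && s->deadline[0] <= now) {
        memmove(&s->deadline[0], &s->deadline[1],
                (--s->count) * sizeof(s->deadline[0]));
        released++;
    }
    s->next_wakeup = s->count ? s->deadline[0] : ~0ULL;
    return released;
}

int main(void)
{
    struct throttled_set s;
    unsigned int released;

    tset_init(&s);
    tset_add(&s, 3000);
    tset_add(&s, 1000);
    tset_add(&s, 2000);
    released = tset_expire(&s, 1500);
    printf("released %u, next wakeup %llu\n",
           released, (unsigned long long)s.next_wakeup);
    return 0;
}
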
255 time_after(jiffies, f->age + FQ_GC_AGE); in fq_gc_candidate()
267 p = &root->rb_node; in fq_gc()
273 if (f->sk == sk) in fq_gc()
282 if (f->sk > sk) in fq_gc()
283 p = &parent->rb_right; in fq_gc()
285 p = &parent->rb_left; in fq_gc()
292 f = tofree[--i]; in fq_gc()
293 rb_erase(&f->fq_node, root); in fq_gc()
295 q->flows -= fcnt; in fq_gc()
296 q->inactive_flows -= fcnt; in fq_gc()
297 q->stat_gc_flows += fcnt; in fq_gc()
303 * 1) Packet tstamp is in the past, or within the pacing offload horizon.
318 if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) in fq_fastpath_check()
321 if (sch->q.qlen != 0) { in fq_fastpath_check()
327 if (q->flows != q->inactive_flows + q->throttled_flows) in fq_fastpath_check()
333 if (q->internal.qlen >= 8) in fq_fastpath_check()
339 if (q->time_next_delayed_flow <= now + q->offload_horizon) in fq_fastpath_check()
343 sk = skb->sk; in fq_fastpath_check()
345 sk->sk_max_pacing_rate != ~0UL) in fq_fastpath_check()
348 if (q->flow_max_rate != ~0UL) in fq_fastpath_check()
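
fq_fastpath_check lets a packet skip per-flow queuing when nothing could delay it: its send time is within the offload horizon, every existing flow is inactive or throttled, the internal queue is short, no throttled flow is due soon, and neither the socket nor the qdisc sets a pacing cap. A rough userspace model of that predicate follows; the struct is illustrative and omits the socket-validity details visible in the listing.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fq_state {
    unsigned long flows, inactive_flows, throttled_flows;
    unsigned int internal_qlen;
    unsigned int qdisc_qlen;
    uint64_t time_next_delayed_flow;
    uint64_t offload_horizon;
    unsigned long flow_max_rate;      /* ~0UL means "no cap" */
    unsigned long sk_max_pacing_rate; /* ~0UL means "no cap" */
};

static bool fastpath_ok(const struct fq_state *q, uint64_t pkt_send_time,
                        uint64_t now)
{
    if (pkt_send_time > now + q->offload_horizon)
        return false;               /* must be paced, not fast-pathed */
    if (q->qdisc_qlen != 0) {
        /* Only safe if every queued flow is inactive or throttled. */
        if (q->flows != q->inactive_flows + q->throttled_flows)
            return false;
        if (q->internal_qlen >= 8)
            return false;
        if (q->time_next_delayed_flow <= now + q->offload_horizon)
            return false;
    }
    if (q->sk_max_pacing_rate != ~0UL)
        return false;
    if (q->flow_max_rate != ~0UL)
        return false;
    return true;
}

int main(void)
{
    struct fq_state q = {
        .flows = 4, .inactive_flows = 3, .throttled_flows = 1,
        .internal_qlen = 2, .qdisc_qlen = 10,
        .time_next_delayed_flow = 2000, .offload_horizon = 0,
        .flow_max_rate = ~0UL, .sk_max_pacing_rate = ~0UL,
    };

    printf("fastpath: %d\n", fastpath_ok(&q, 100, 100));
    return 0;
}
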
359 struct sock *sk = skb->sk; in fq_classify()
365 * 1) request sockets are not full blown, in fq_classify()
374 unsigned long hash = skb_get_hash(skb) & q->orphan_mask; in fq_classify()
376 /* By forcing low order bit to 1, we make sure to not in fq_classify()
379 sk = (struct sock *)((hash << 1) | 1UL); in fq_classify()
381 } else if (sk->sk_state == TCP_CLOSE) { in fq_classify()
382 unsigned long hash = skb_get_hash(skb) & q->orphan_mask; in fq_classify()
391 sk = (struct sock *)((hash << 1) | 1UL); in fq_classify()
395 q->internal.stat_fastpath_packets++; in fq_classify()
396 if (skb->sk == sk && q->rate_enable && in fq_classify()
397 READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ) in fq_classify()
398 smp_store_release(&sk->sk_pacing_status, in fq_classify()
400 return &q->internal; in fq_classify()
403 root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; in fq_classify()
407 p = &root->rb_node; in fq_classify()
413 if (f->sk == sk) { in fq_classify()
419 if (unlikely(skb->sk == sk && in fq_classify()
420 f->socket_hash != sk->sk_hash)) { in fq_classify()
421 f->credit = q->initial_quantum; in fq_classify()
422 f->socket_hash = sk->sk_hash; in fq_classify()
423 if (q->rate_enable) in fq_classify()
424 smp_store_release(&sk->sk_pacing_status, in fq_classify()
428 f->time_next_packet = 0ULL; in fq_classify()
432 if (f->sk > sk) in fq_classify()
433 p = &parent->rb_right; in fq_classify()
435 p = &parent->rb_left; in fq_classify()
440 q->stat_allocation_errors++; in fq_classify()
441 return &q->internal; in fq_classify()
443 /* f->t_root is already zeroed after kmem_cache_zalloc() */ in fq_classify()
446 f->sk = sk; in fq_classify()
447 if (skb->sk == sk) { in fq_classify()
448 f->socket_hash = sk->sk_hash; in fq_classify()
449 if (q->rate_enable) in fq_classify()
450 smp_store_release(&sk->sk_pacing_status, in fq_classify()
453 f->credit = q->initial_quantum; in fq_classify()
455 rb_link_node(&f->fq_node, parent, p); in fq_classify()
456 rb_insert_color(&f->fq_node, root); in fq_classify()
458 q->flows++; in fq_classify()
459 q->inactive_flows++; in fq_classify()
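
When there is no usable socket (or the socket is a listener or in TCP_CLOSE), the flow key is synthesized from the packet hash with the low bit forced to 1; genuine socket pointers are aligned, so the two kinds of keys cannot collide in the per-bucket tree. A small sketch of that idea, with illustrative names:

#include <stdint.h>
#include <stdio.h>

#define ORPHAN_MASK 1023UL   /* mirrors the default orphan_mask of 1024 - 1 */

struct sock;                 /* opaque stand-in */

/* Key used to look up a flow: the socket pointer itself when there is
 * a usable socket, otherwise a value derived from the packet hash with
 * the low bit forced to 1. Real pointers are aligned (low bit 0), so
 * the two kinds of keys never collide. */
static uintptr_t flow_key(const struct sock *sk, uint32_t skb_hash)
{
    if (sk)
        return (uintptr_t)sk;
    return ((uintptr_t)(skb_hash & ORPHAN_MASK) << 1) | 1UL;
}

int main(void)
{
    int anchor;
    const struct sock *sk = (const struct sock *)&anchor;

    printf("socket key: %#lx (low bit %lu)\n",
           (unsigned long)flow_key(sk, 0),
           (unsigned long)(flow_key(sk, 0) & 1));
    printf("orphan key: %#lx (low bit %lu)\n",
           (unsigned long)flow_key(NULL, 0x1234abcd),
           (unsigned long)(flow_key(NULL, 0x1234abcd) & 1));
    return 0;
}
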
465 struct sk_buff *skb = skb_rb_first(&flow->t_root); in fq_peek()
466 struct sk_buff *head = flow->head; in fq_peek()
474 if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) in fq_peek()
482 if (skb == flow->head) { in fq_erase_head()
483 flow->head = skb->next; in fq_erase_head()
485 rb_erase(&skb->rbnode, &flow->t_root); in fq_erase_head()
486 skb->dev = qdisc_dev(sch); in fq_erase_head()
499 sch->q.qlen--; in fq_dequeue_skb()
507 head = flow->head; in flow_queue_add()
509 fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { in flow_queue_add()
511 flow->head = skb; in flow_queue_add()
513 flow->tail->next = skb; in flow_queue_add()
514 flow->tail = skb; in flow_queue_add()
515 skb->next = NULL; in flow_queue_add()
519 p = &flow->t_root.rb_node; in flow_queue_add()
525 if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) in flow_queue_add()
526 p = &parent->rb_right; in flow_queue_add()
528 p = &parent->rb_left; in flow_queue_add()
530 rb_link_node(&skb->rbnode, parent, p); in flow_queue_add()
531 rb_insert_color(&skb->rbnode, &flow->t_root); in flow_queue_add()
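
Packets whose time_to_send never decreases are appended to the flow's plain head/tail list in O(1); only out-of-order packets go to the per-flow t_root tree, and fq_peek returns whichever head is earlier. The sketch below models that split with a ring buffer plus a sorted insert standing in for the rbtree; capacity checks are omitted for brevity and all names are illustrative.

#include <stdint.h>
#include <stdio.h>

#define QCAP 32

struct flow_model {
    /* In-order packets: O(1) tail append. */
    uint64_t fifo[QCAP];
    unsigned int fifo_head, fifo_len;
    /* Out-of-order packets: kept sorted (stand-in for the rbtree). */
    uint64_t sorted[QCAP];
    unsigned int sorted_len;
};

static void flow_add(struct flow_model *f, uint64_t t)
{
    if (!f->fifo_len ||
        t >= f->fifo[(f->fifo_head + f->fifo_len - 1) % QCAP]) {
        f->fifo[(f->fifo_head + f->fifo_len++) % QCAP] = t;
        return;
    }
    /* Out of order: sorted insert, earliest first. */
    unsigned int i = f->sorted_len++;

    while (i > 0 && f->sorted[i - 1] > t) {
        f->sorted[i] = f->sorted[i - 1];
        i--;
    }
    f->sorted[i] = t;
}

/* Return the earliest time_to_send across both structures. */
static uint64_t flow_peek(const struct flow_model *f)
{
    uint64_t best = ~0ULL;

    if (f->fifo_len && f->fifo[f->fifo_head] < best)
        best = f->fifo[f->fifo_head];
    if (f->sorted_len && f->sorted[0] < best)
        best = f->sorted[0];
    return best;
}

int main(void)
{
    struct flow_model f = { .fifo_head = 0 };
    uint64_t times[] = { 10, 20, 30, 15, 40 };

    for (unsigned int i = 0; i < 5; i++)
        flow_add(&f, times[i]);
    printf("next to send: %llu\n", (unsigned long long)flow_peek(&f));
    return 0;
}
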
537 return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); in fq_packet_beyond_horizon()
550 band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); in fq_enqueue()
551 if (unlikely(q->band_pkt_count[band] >= sch->limit)) { in fq_enqueue()
552 q->stat_band_drops[band]++; in fq_enqueue()
558 if (!skb->tstamp) { in fq_enqueue()
559 fq_skb_cb(skb)->time_to_send = now; in fq_enqueue()
563 if (q->horizon_drop) { in fq_enqueue()
564 q->stat_horizon_drops++; in fq_enqueue()
568 q->stat_horizon_caps++; in fq_enqueue()
569 skb->tstamp = now + q->horizon; in fq_enqueue()
571 fq_skb_cb(skb)->time_to_send = skb->tstamp; in fq_enqueue()
576 if (f != &q->internal) { in fq_enqueue()
577 if (unlikely(f->qlen >= q->flow_plimit)) { in fq_enqueue()
578 q->stat_flows_plimit++; in fq_enqueue()
585 if (time_after(jiffies, f->age + q->flow_refill_delay)) in fq_enqueue()
586 f->credit = max_t(u32, f->credit, q->quantum); in fq_enqueue()
589 f->band = band; in fq_enqueue()
590 q->band_pkt_count[band]++; in fq_enqueue()
591 fq_skb_cb(skb)->band = band; in fq_enqueue()
592 if (f->qlen == 0) in fq_enqueue()
593 q->inactive_flows--; in fq_enqueue()
596 f->qlen++; in fq_enqueue()
597 /* Note: this overwrites f->age */ in fq_enqueue()
601 sch->q.qlen++; in fq_enqueue()
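
On enqueue, a packet stamped further than q->horizon into the future is either dropped or has its timestamp capped to now + horizon, depending on horizon_drop; an unstamped packet is sent as soon as pacing allows. A hedged sketch of that decision, with illustrative names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum horizon_action { ACCEPT, CAPPED, DROPPED };

/* Decide what to do with a packet timestamp, mirroring the listing:
 * no tstamp -> send now; beyond the horizon -> drop or cap. */
static enum horizon_action apply_horizon(uint64_t now, uint64_t horizon,
                                         bool horizon_drop,
                                         uint64_t *tstamp /* 0 = unset */)
{
    if (*tstamp == 0) {
        *tstamp = now;
        return ACCEPT;
    }
    if ((int64_t)*tstamp > (int64_t)(now + horizon)) {
        if (horizon_drop)
            return DROPPED;
        *tstamp = now + horizon;
        return CAPPED;
    }
    return ACCEPT;
}

int main(void)
{
    uint64_t now = 1000, horizon = 10000;
    uint64_t t = 50000;     /* far beyond now + horizon */

    printf("action=%d tstamp=%llu\n",
           apply_horizon(now, horizon, false, &t),
           (unsigned long long)t);
    return 0;
}
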
612 if (q->time_next_delayed_flow > now + q->offload_horizon) in fq_check_throttled()
616 * This is cheap and can help diagnose timer/latency problems. in fq_check_throttled()
618 sample = (unsigned long)(now - q->time_next_delayed_flow); in fq_check_throttled()
620 q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; in fq_check_throttled()
621 q->unthrottle_latency_ns += sample >> 3; in fq_check_throttled()
623 now += q->offload_horizon; in fq_check_throttled()
625 q->time_next_delayed_flow = ~0ULL; in fq_check_throttled()
626 while ((p = rb_first(&q->delayed)) != NULL) { in fq_check_throttled()
629 if (f->time_next_packet > now) { in fq_check_throttled()
630 q->time_next_delayed_flow = f->time_next_packet; in fq_check_throttled()
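
The unthrottle latency estimate is an exponentially weighted moving average with weight 1/8, done with shifts: subtract an eighth of the current estimate, then add an eighth of the new sample, i.e. avg = 7/8 * avg + 1/8 * sample. A minimal sketch:

#include <stdint.h>
#include <stdio.h>

/* EWMA with alpha = 1/8, matching the ">> 3" in the listing:
 * avg <- avg - avg/8 + sample/8  ==  7/8 * avg + 1/8 * sample */
static uint64_t ewma_update(uint64_t avg, uint64_t sample)
{
    avg -= avg >> 3;
    avg += sample >> 3;
    return avg;
}

int main(void)
{
    uint64_t latency_ns = 0;
    uint64_t samples[] = { 8000, 8000, 16000, 8000 };

    for (unsigned int i = 0; i < 4; i++) {
        latency_ns = ewma_update(latency_ns, samples[i]);
        printf("sample %llu -> avg %llu ns\n",
               (unsigned long long)samples[i],
               (unsigned long long)latency_ns);
    }
    return 0;
}
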
639 if (pband->credit <= 0) in fq_pband_head_select()
642 if (pband->new_flows.first) in fq_pband_head_select()
643 return &pband->new_flows; in fq_pband_head_select()
645 return pband->old_flows.first ? &pband->old_flows : NULL; in fq_pband_head_select()
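
The three priority bands are served with deficit-style credits: a band with non-positive credit is skipped (and, within a band, new flows are served before old ones), and as dequeue rotates to the next band it tops that band's credit up by its quantum, capped at the quantum (the 9:3:1 defaults appear later in the listing). A simplified userspace sketch of the rotation, not the kernel structures:

#include <stdio.h>

#define NBANDS 3

struct band {
    int credit;
    int quantum;
};

/* Pick the next band with positive credit, refilling bands (capped at
 * their quantum) as we rotate past them, in the spirit of the listing. */
static int pick_band(struct band *b, int *band_nr)
{
    for (int tries = 0; tries < NBANDS; tries++) {
        if (b[*band_nr].credit > 0)
            return *band_nr;
        *band_nr = (*band_nr + 1) % NBANDS;
        b[*band_nr].credit += b[*band_nr].quantum;
        if (b[*band_nr].credit > b[*band_nr].quantum)
            b[*band_nr].credit = b[*band_nr].quantum;
    }
    return *band_nr;
}

int main(void)
{
    /* 9:3:1 weights, as in the defaults later in the listing. */
    struct band bands[NBANDS] = {
        { .credit = 9 << 16, .quantum = 9 << 16 },
        { .credit = 3 << 16, .quantum = 3 << 16 },
        { .credit = 1 << 16, .quantum = 1 << 16 },
    };
    int band_nr = 0;
    int sent[NBANDS] = { 0, 0, 0 };

    /* Send 1500-byte packets and count how many each band gets. */
    for (int i = 0; i < 1000; i++) {
        int b = pick_band(bands, &band_nr);

        bands[b].credit -= 1500;
        sent[b]++;
    }
    printf("band packets: %d %d %d\n", sent[0], sent[1], sent[2]);
    return 0;
}
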
660 if (!sch->q.qlen) in fq_dequeue()
663 skb = fq_peek(&q->internal); in fq_dequeue()
665 q->internal.qlen--; in fq_dequeue()
666 fq_dequeue_skb(sch, &q->internal, skb); in fq_dequeue()
673 pband = &q->band_flows[q->band_nr]; in fq_dequeue()
678 if (++q->band_nr == FQ_BANDS) in fq_dequeue()
679 q->band_nr = 0; in fq_dequeue()
680 pband = &q->band_flows[q->band_nr]; in fq_dequeue()
681 pband->credit = min(pband->credit + pband->quantum, in fq_dequeue()
682 pband->quantum); in fq_dequeue()
683 if (pband->credit > 0) in fq_dequeue()
687 if (q->time_next_delayed_flow != ~0ULL) in fq_dequeue()
688 qdisc_watchdog_schedule_range_ns(&q->watchdog, in fq_dequeue()
689 q->time_next_delayed_flow, in fq_dequeue()
690 q->timer_slack); in fq_dequeue()
693 f = head->first; in fq_dequeue()
695 if (f->credit <= 0) { in fq_dequeue()
696 f->credit += q->quantum; in fq_dequeue()
697 head->first = f->next; in fq_dequeue()
704 u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, in fq_dequeue()
705 f->time_next_packet); in fq_dequeue()
707 if (now + q->offload_horizon < time_next_packet) { in fq_dequeue()
708 head->first = f->next; in fq_dequeue()
709 f->time_next_packet = time_next_packet; in fq_dequeue()
713 prefetch(&skb->end); in fq_dequeue()
714 if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { in fq_dequeue()
716 q->stat_ce_mark++; in fq_dequeue()
718 if (--f->qlen == 0) in fq_dequeue()
719 q->inactive_flows++; in fq_dequeue()
720 q->band_pkt_count[fq_skb_cb(skb)->band]--; in fq_dequeue()
723 head->first = f->next; in fq_dequeue()
725 if (head == &pband->new_flows) { in fq_dequeue()
733 f->credit -= plen; in fq_dequeue()
734 pband->credit -= plen; in fq_dequeue()
736 if (!q->rate_enable) in fq_dequeue()
739 rate = q->flow_max_rate; in fq_dequeue()
742 * update f->time_next_packet only if this qdisc enforces in fq_dequeue()
745 if (!skb->tstamp) { in fq_dequeue()
746 if (skb->sk) in fq_dequeue()
747 rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate); in fq_dequeue()
749 if (rate <= q->low_rate_threshold) { in fq_dequeue()
750 f->credit = 0; in fq_dequeue()
752 plen = max(plen, q->quantum); in fq_dequeue()
753 if (f->credit > 0) in fq_dequeue()
763 * clamp the delay to 1 second. in fq_dequeue()
768 q->stat_pkts_too_long++; in fq_dequeue()
771 * f->time_next_packet was set when prior packet was sent, in fq_dequeue()
774 if (f->time_next_packet) in fq_dequeue()
775 len -= min(len/2, now - f->time_next_packet); in fq_dequeue()
776 f->time_next_packet = now + len; in fq_dequeue()
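
After a packet is sent, the flow's next transmit time moves forward by roughly plen * NSEC_PER_SEC / rate, clamped to one second, minus up to half that delay as credit for time already spent idle since the previous packet. A worked sketch of the arithmetic, assuming the rate is in bytes per second and using illustrative names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Compute the next transmit time after sending plen bytes at the given
 * pacing rate, in the spirit of the listing: delay = plen / rate,
 * clamped to 1 second, with up to half the delay forgiven for time
 * already spent idle since the last send. */
static uint64_t next_tx_time(uint64_t now, uint64_t prev_next_tx,
                             uint32_t plen, uint64_t rate_bytes_per_sec)
{
    uint64_t len = (uint64_t)plen * NSEC_PER_SEC / rate_bytes_per_sec;

    if (len > NSEC_PER_SEC)
        len = NSEC_PER_SEC;          /* clamp the delay to 1 second */

    /* Credit idle time since the previous packet, at most len/2. */
    if (prev_next_tx) {
        uint64_t idle = now - prev_next_tx;

        len -= idle < len / 2 ? idle : len / 2;
    }
    return now + len;
}

int main(void)
{
    /* 1500-byte packet at 12.5 MB/s (100 Mbit/s) -> 120 us of delay. */
    uint64_t t = next_tx_time(1000000, 0, 1500, 12500000);

    printf("next tx at %" PRIu64 " ns (delay %" PRIu64 " us)\n",
           t, (t - 1000000) / 1000);
    return 0;
}
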
785 struct rb_node *p = rb_first(&flow->t_root); in fq_flow_purge()
791 rb_erase(&skb->rbnode, &flow->t_root); in fq_flow_purge()
794 rtnl_kfree_skbs(flow->head, flow->tail); in fq_flow_purge()
795 flow->head = NULL; in fq_flow_purge()
796 flow->qlen = 0; in fq_flow_purge()
807 sch->q.qlen = 0; in fq_reset()
808 sch->qstats.backlog = 0; in fq_reset()
810 fq_flow_purge(&q->internal); in fq_reset()
812 if (!q->fq_root) in fq_reset()
815 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { in fq_reset()
816 root = &q->fq_root[idx]; in fq_reset()
827 q->band_flows[idx].new_flows.first = NULL; in fq_reset()
828 q->band_flows[idx].old_flows.first = NULL; in fq_reset()
830 q->delayed = RB_ROOT; in fq_reset()
831 q->flows = 0; in fq_reset()
832 q->inactive_flows = 0; in fq_reset()
833 q->throttled_flows = 0; in fq_reset()
846 for (idx = 0; idx < (1U << old_log); idx++) { in fq_rehash()
856 nroot = &new_array[hash_ptr(of->sk, new_log)]; in fq_rehash()
858 np = &nroot->rb_node; in fq_rehash()
864 BUG_ON(nf->sk == of->sk); in fq_rehash()
866 if (nf->sk > of->sk) in fq_rehash()
867 np = &parent->rb_right; in fq_rehash()
869 np = &parent->rb_left; in fq_rehash()
872 rb_link_node(&of->fq_node, parent, np); in fq_rehash()
873 rb_insert_color(&of->fq_node, nroot); in fq_rehash()
876 q->flows -= fcnt; in fq_rehash()
877 q->inactive_flows -= fcnt; in fq_rehash()
878 q->stat_gc_flows += fcnt; in fq_rehash()
893 if (q->fq_root && log == q->fq_trees_log) in fq_resize()
896 /* If XPS was set up, we can allocate memory on the right NUMA node */ in fq_resize()
898 netdev_queue_numa_node_read(sch->dev_queue)); in fq_resize()
900 return -ENOMEM; in fq_resize()
902 for (idx = 0; idx < (1U << log); idx++) in fq_resize()
907 old_fq_root = q->fq_root; in fq_resize()
909 fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); in fq_resize()
911 q->fq_root = array; in fq_resize()
912 WRITE_ONCE(q->fq_trees_log, log); in fq_resize()
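
fq_resize allocates a new array of 2^log tree roots (on the NUMA node of the device queue when XPS is configured) and fq_rehash re-inserts every flow into the bucket chosen by hashing its socket key with the new log. The sketch below models that with chained hash buckets instead of rb-trees and a cheap stand-in for hash_ptr(), a deliberate simplification with illustrative names.

#include <stdint.h>
#include <stdio.h>

struct flow {
    uintptr_t key;          /* stands in for the socket pointer */
    struct flow *next;
};

static unsigned int bucket_of(uintptr_t key, unsigned int log)
{
    /* Cheap stand-in for hash_ptr(): multiply-and-shift, keep log bits. */
    return (unsigned int)((key * 0x9E3779B97F4A7C15ULL) >> (64 - log));
}

/* Move every flow from the old table into a larger one. */
static void rehash(struct flow **old_tab, unsigned int old_log,
                   struct flow **new_tab, unsigned int new_log)
{
    for (unsigned int i = 0; i < (1U << old_log); i++) {
        struct flow *f = old_tab[i];

        while (f) {
            struct flow *next = f->next;
            unsigned int b = bucket_of(f->key, new_log);

            f->next = new_tab[b];
            new_tab[b] = f;
            f = next;
        }
        old_tab[i] = NULL;
    }
}

int main(void)
{
    struct flow flows[4] = {
        { .key = 0x1000 }, { .key = 0x2000 },
        { .key = 0x3000 }, { .key = 0x4000 },
    };
    struct flow *old_tab[1U << 2] = { 0 };
    struct flow *new_tab[1U << 4] = { 0 };

    for (int i = 0; i < 4; i++) {
        unsigned int b = bucket_of(flows[i].key, 2);

        flows[i].next = old_tab[b];
        old_tab[b] = &flows[i];
    }
    rehash(old_tab, 2, new_tab, 4);
    for (unsigned int b = 0; b < (1U << 4); b++)
        for (struct flow *f = new_tab[b]; f; f = f->next)
            printf("key %#lx -> bucket %u\n", (unsigned long)f->key, b);
    return 0;
}
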
925 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
948 /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
951 const int num_elems = TC_PRIO_MAX + 1; in fq_prio2band_compress_crumb()
965 const int num_elems = TC_PRIO_MAX + 1; in fq_prio2band_decompress_crumb()
983 return -EINVAL; in fq_load_weights()
987 WRITE_ONCE(q->band_flows[i].quantum, weights[i]); in fq_load_weights()
998 if (map->bands != FQ_BANDS) { in fq_load_priomap()
1000 return -EINVAL; in fq_load_priomap()
1002 for (i = 0; i < TC_PRIO_MAX + 1; i++) { in fq_load_priomap()
1003 if (map->priomap[i] >= FQ_BANDS) { in fq_load_priomap()
1005 i, map->priomap[i]); in fq_load_priomap()
1006 return -EINVAL; in fq_load_priomap()
1009 fq_prio2band_compress_crumb(map->priomap, q->prio2band); in fq_load_priomap()
1017 struct nlattr *tb[TCA_FQ_MAX + 1]; in fq_change()
1029 fq_log = q->fq_trees_log; in fq_change()
1034 if (nval >= 1 && nval <= ilog2(256*1024)) in fq_change()
1037 err = -EINVAL; in fq_change()
1040 WRITE_ONCE(sch->limit, in fq_change()
1044 WRITE_ONCE(q->flow_plimit, in fq_change()
1050 if (quantum > 0 && quantum <= (1 << 20)) { in fq_change()
1051 WRITE_ONCE(q->quantum, quantum); in fq_change()
1054 err = -EINVAL; in fq_change()
1059 WRITE_ONCE(q->initial_quantum, in fq_change()
1069 WRITE_ONCE(q->flow_max_rate, in fq_change()
1073 WRITE_ONCE(q->low_rate_threshold, in fq_change()
1079 if (enable <= 1) in fq_change()
1080 WRITE_ONCE(q->rate_enable, in fq_change()
1083 err = -EINVAL; in fq_change()
1089 WRITE_ONCE(q->flow_refill_delay, in fq_change()
1100 WRITE_ONCE(q->orphan_mask, in fq_change()
1104 WRITE_ONCE(q->ce_threshold, in fq_change()
1109 WRITE_ONCE(q->timer_slack, in fq_change()
1113 WRITE_ONCE(q->horizon, in fq_change()
1118 WRITE_ONCE(q->horizon_drop, in fq_change()
1125 if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) { in fq_change()
1126 WRITE_ONCE(q->offload_horizon, offload_horizon); in fq_change()
1129 err = -EINVAL; in fq_change()
1138 while (sch->q.qlen > sch->limit) { in fq_change()
1158 fq_free(q->fq_root); in fq_destroy()
1159 qdisc_watchdog_cancel(&q->watchdog); in fq_destroy()
1168 sch->limit = 10000; in fq_init()
1169 q->flow_plimit = 100; in fq_init()
1170 q->quantum = 2 * psched_mtu(qdisc_dev(sch)); in fq_init()
1171 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); in fq_init()
1172 q->flow_refill_delay = msecs_to_jiffies(40); in fq_init()
1173 q->flow_max_rate = ~0UL; in fq_init()
1174 q->time_next_delayed_flow = ~0ULL; in fq_init()
1175 q->rate_enable = 1; in fq_init()
1177 q->band_flows[i].new_flows.first = NULL; in fq_init()
1178 q->band_flows[i].old_flows.first = NULL; in fq_init()
1180 q->band_flows[0].quantum = 9 << 16; in fq_init()
1181 q->band_flows[1].quantum = 3 << 16; in fq_init()
1182 q->band_flows[2].quantum = 1 << 16; in fq_init()
1183 q->delayed = RB_ROOT; in fq_init()
1184 q->fq_root = NULL; in fq_init()
1185 q->fq_trees_log = ilog2(1024); in fq_init()
1186 q->orphan_mask = 1024 - 1; in fq_init()
1187 q->low_rate_threshold = 550000 / 8; in fq_init()
1189 q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ in fq_init()
1191 q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ in fq_init()
1192 q->horizon_drop = 1; /* by default, drop packets beyond horizon */ in fq_init()
1195 q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; in fq_init()
1197 fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band); in fq_init()
1198 qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); in fq_init()
1203 err = fq_resize(sch, q->fq_trees_log); in fq_init()
1226 ce_threshold = READ_ONCE(q->ce_threshold); in fq_dump()
1229 horizon = READ_ONCE(q->horizon); in fq_dump()
1232 offload_horizon = READ_ONCE(q->offload_horizon); in fq_dump()
1236 READ_ONCE(sch->limit)) || in fq_dump()
1238 READ_ONCE(q->flow_plimit)) || in fq_dump()
1240 READ_ONCE(q->quantum)) || in fq_dump()
1242 READ_ONCE(q->initial_quantum)) || in fq_dump()
1244 READ_ONCE(q->rate_enable)) || in fq_dump()
1247 READ_ONCE(q->flow_max_rate), ~0U)) || in fq_dump()
1249 jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) || in fq_dump()
1251 READ_ONCE(q->orphan_mask)) || in fq_dump()
1253 READ_ONCE(q->low_rate_threshold)) || in fq_dump()
1256 READ_ONCE(q->fq_trees_log)) || in fq_dump()
1258 READ_ONCE(q->timer_slack)) || in fq_dump()
1262 READ_ONCE(q->horizon_drop))) in fq_dump()
1265 fq_prio2band_decompress_crumb(q->prio2band, prio.priomap); in fq_dump()
1269 weights[0] = READ_ONCE(q->band_flows[0].quantum); in fq_dump()
1270 weights[1] = READ_ONCE(q->band_flows[1].quantum); in fq_dump()
1271 weights[2] = READ_ONCE(q->band_flows[2].quantum); in fq_dump()
1278 return -1; in fq_dump()
1291 st.gc_flows = q->stat_gc_flows; in fq_dump_stats()
1293 st.fastpath_packets = q->internal.stat_fastpath_packets; in fq_dump_stats()
1295 st.throttled = q->stat_throttled; in fq_dump_stats()
1296 st.flows_plimit = q->stat_flows_plimit; in fq_dump_stats()
1297 st.pkts_too_long = q->stat_pkts_too_long; in fq_dump_stats()
1298 st.allocation_errors = q->stat_allocation_errors; in fq_dump_stats()
1299 st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - in fq_dump_stats()
1301 st.flows = q->flows; in fq_dump_stats()
1302 st.inactive_flows = q->inactive_flows; in fq_dump_stats()
1303 st.throttled_flows = q->throttled_flows; in fq_dump_stats()
1305 q->unthrottle_latency_ns, ~0U); in fq_dump_stats()
1306 st.ce_mark = q->stat_ce_mark; in fq_dump_stats()
1307 st.horizon_drops = q->stat_horizon_drops; in fq_dump_stats()
1308 st.horizon_caps = q->stat_horizon_caps; in fq_dump_stats()
1310 st.band_drops[i] = q->stat_band_drops[i]; in fq_dump_stats()
1311 st.band_pkt_count[i] = q->band_pkt_count[i]; in fq_dump_stats()
1343 return -ENOMEM; in fq_module_init()