From: Jesper Dangaard Brouer
Subject: [net-next PATCH 11/11] RFC: net: RPS bulk enqueue to backlog
Date: Tue, 02 Feb 2016 22:15:27 +0100
Message-ID: <20160202211512.16315.89019.stgit@firesoul>
References: <20160202211051.16315.51808.stgit@firesoul>
In-Reply-To: <20160202211051.16315.51808.stgit@firesoul>
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
To: netdev@vger.kernel.org
Cc: Christoph Lameter, tom@herbertland.com, Alexander Duyck,
    alexei.starovoitov@gmail.com, Jesper Dangaard Brouer,
    ogerlitz@mellanox.com, gerlitz.or@gmail.com

NEED TO CLEAN UP PATCH (likely still contains bugs...)

When enabling Receive Packet Steering (RPS) like:

 echo 32768 > /proc/sys/net/core/rps_sock_flow_entries
 for N in $(seq 0 7) ; do
   echo 4096 > /sys/class/net/${DEV}/queues/rx-$N/rps_flow_cnt
   echo f > /sys/class/net/${DEV}/queues/rx-$N/rps_cpus
   grep -H . /sys/class/net/${DEV}/queues/rx-$N/rps_cpus
 done

I noticed high contention on enqueue_to_backlog().  To mitigate this,
introduce enqueue_list_to_backlog(), which allows enqueueing an entire
skb list in one operation, instead of paying the synchronization cost
per packet.

The skb lists needed for bulk enqueueing are constructed in the per-CPU
area of softnet_data, and can thus be built without heavy cross-CPU
synchronization.

I'm excited about the performance improvement of this patch:

Before: Ethtool(mlx5p2) stat: 7246630 ( 7,246,630) <= rx3_packets /sec
After:  Ethtool(mlx5p2) stat: 9182886 ( 9,182,886) <= rx3_packets /sec

Improvement: (1/9182886 - 1/7246630) * 10^9 = -29.1 ns, i.e. approx
29 ns saved per packet.

The benchmark is a single pktgen flow, which is RPS-directed to another
CPU for further processing via process_backlog().  The remote CPU cannot
handle the load, and this CPU drops packets when it cannot get them
enqueued.
---
 include/linux/netdevice.h |   10 +++
 net/core/dev.c            |  133 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 140 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 11df9af41a3c..dc5baef95d27 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -633,6 +633,15 @@ struct rps_dev_flow {
 };
 #define RPS_NO_FILTER 0xffff
 
+struct rps_cpu_queue {
+	struct sk_buff_head skb_list;
+	int to_cpu;
+	struct rps_dev_flow *rflow;
+	struct net_device *dev;
+};
+#define RPS_CPU_QUEUES 2 /* Must be power of 2 */
+#define RPS_CPU_QUEUES_MASK (RPS_CPU_QUEUES - 1)
+
 /*
  * The rps_dev_flow_table structure contains a table of flow mappings.
  */
@@ -2662,6 +2671,7 @@ struct softnet_data {
 	unsigned int		received_rps;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
+	struct rps_cpu_queue	local_rps_queue[RPS_CPU_QUEUES];
 #endif
 #ifdef CONFIG_NET_FLOW_LIMIT
 	struct sd_flow_limit __rcu *flow_limit;
diff --git a/net/core/dev.c b/net/core/dev.c
index 35c92a968937..0a231529bc0c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3736,6 +3736,60 @@ drop:
 	return NET_RX_DROP;
 }
 
+static int enqueue_list_to_backlog(struct sk_buff_head *skb_list, int cpu,
+				   unsigned int *qtail, struct net_device *dev)
+{
+	unsigned int qlen, qlen_drop;
+	struct softnet_data *sd;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	sd = &per_cpu(softnet_data, cpu);
+
+	local_irq_save(flags);
+
+	rps_lock(sd);
+	if (!netif_running(dev))
+		goto drop;
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	/* NOTICE: Had to drop !skb_flow_limit(skb, qlen) check here */
+	if (qlen <= netdev_max_backlog) {
+		if (qlen) {
+enqueue:
+			//__skb_queue_tail(&sd->input_pkt_queue, skb);
+			skb_queue_splice_tail_init(skb_list,
+						   &sd->input_pkt_queue);
+			input_queue_tail_incr_save(sd, qtail);
+			rps_unlock(sd);
+			local_irq_restore(flags);
+			return NET_RX_SUCCESS;
+		}
+
+		/* Schedule NAPI for backlog device
+		 * We can use non atomic operation since we own the queue lock
+		 */
+		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
+			if (!rps_ipi_queued(sd))
+				____napi_schedule(sd, &sd->backlog);
+		}
+		goto enqueue;
+	}
+
+drop:
+	qlen_drop = skb_queue_len(skb_list);
+	sd->dropped += qlen_drop;
+	rps_unlock(sd);
+
+	local_irq_restore(flags);
+
+	atomic_long_add(qlen_drop, &dev->rx_dropped);
+	while ((skb = __skb_dequeue(skb_list)) != NULL) {
+		__kfree_skb_defer(skb);
+	}
+	return NET_RX_DROP;
+}
+
+
 static int netif_rx_internal(struct sk_buff *skb)
 {
 	int ret;
@@ -4211,14 +4265,43 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 #ifdef CONFIG_RPS
 	if (static_key_false(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
+		struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+		struct rps_cpu_queue *lq; /* softnet cpu local queue (lq) */
+
 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		if (cpu < 0)
+			goto no_rps;
+
+		/* RPS destinated packet */
+		// XXX: is local_irq_disable needed here?
+		sd = this_cpu_ptr(&softnet_data);
+		lq = &sd->local_rps_queue[cpu & RPS_CPU_QUEUES_MASK];
 
-		if (cpu >= 0) {
-			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+		if (lq->to_cpu == cpu && lq->dev == skb->dev) {
+			/* Bonus, RPS dest match prev CPU, q pkt for later */
+			__skb_queue_tail(&lq->skb_list, skb);
+			lq->rflow = rflow;
 			rcu_read_unlock();
-			return ret;
+			return NET_RX_SUCCESS;
 		}
+		if (unlikely(lq->to_cpu < 0))
+			goto init_localq;
+
+		/* No match, bulk enq to remote cpu backlog */
+		ret = enqueue_list_to_backlog(&lq->skb_list, lq->to_cpu,
+					      &lq->rflow->last_qtail, lq->dev);
+	init_localq: /* start new localq (lq) */
+		/* XXX: check if lq->skb_list was already re-init'ed */
+		skb_queue_head_init(&lq->skb_list);
+		__skb_queue_tail(&lq->skb_list, skb);
+		lq->rflow = rflow;
+		lq->to_cpu = cpu;
+		lq->dev = skb->dev;
+
+		rcu_read_unlock();
+		return ret;
 	}
+no_rps:
 #endif
 	ret = __netif_receive_skb(skb);
 	rcu_read_unlock();
@@ -4579,6 +4662,30 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
+static void rps_flush_local_queues(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	int i;
+
+//	local_irq_disable(); //TEST
+//	if (!sd)
+		sd = this_cpu_ptr(&softnet_data);
+
+	/* Bulk flush any remaining locally queued packets for RPS */
+	for (i = 0; i < RPS_CPU_QUEUES; i++) {
+		struct rps_cpu_queue *lq = &sd->local_rps_queue[i];
+
+		if (skb_queue_empty(&lq->skb_list))
+			continue;
+
+		enqueue_list_to_backlog(&lq->skb_list, lq->to_cpu,
+					&lq->rflow->last_qtail, lq->dev);
+		// FAILS: lq->to_cpu = -1;
+	}
+//	local_irq_enable(); //TEST
+#endif
+}
+
 void napi_gro_receive_list(struct napi_struct *napi,
 			   struct sk_buff_head *skb_list,
 			   struct net_device *netdev)
@@ -4594,6 +4701,10 @@ void napi_gro_receive_list(struct napi_struct *napi,
 		skb_gro_reset_offset(skb);
 		napi_skb_finish(dev_gro_receive(napi, skb), skb);
 	}
+#ifdef CONFIG_RPS
+//	if (static_key_false(&rps_needed))
+//		rps_flush_local_queues(NULL);
+#endif
 }
 EXPORT_SYMBOL(napi_gro_receive_list);
 
@@ -4747,6 +4858,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 
 		local_irq_enable();
 
+//		rps_flush_local_queues(NULL);
+
 		/* Send pending IPI's to kick RPS processing on remote cpus. */
 		while (remsd) {
 			struct softnet_data *next = remsd->rps_ipi_next;
@@ -5176,6 +5289,8 @@ static void net_rx_action(struct softirq_action *h)
 	__kfree_skb_flush();
 	local_irq_disable();
 
+	rps_flush_local_queues(NULL);
+
 	list_splice_tail_init(&sd->poll_list, &list);
 	list_splice_tail(&repoll, &list);
 	list_splice(&list, &sd->poll_list);
@@ -8085,6 +8200,9 @@ static int __init net_dev_init(void)
 
 	for_each_possible_cpu(i) {
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
+#ifdef CONFIG_RPS
+		int j;
+#endif
 
 		skb_queue_head_init(&sd->input_pkt_queue);
 		skb_queue_head_init(&sd->process_queue);
@@ -8094,6 +8212,15 @@ static int __init net_dev_init(void)
 		sd->csd.func = rps_trigger_softirq;
 		sd->csd.info = sd;
 		sd->cpu = i;
+		for (j = 0; j < RPS_CPU_QUEUES; j++) {
+			struct rps_cpu_queue *lq = &sd->local_rps_queue[j];
+
+			skb_queue_head_init(&lq->skb_list);
+			lq->to_cpu = -1;
+			// XXX: below not needed
+			lq->rflow = NULL;
+			lq->dev = NULL;
+		}
 #endif
 
 		sd->backlog.poll = process_backlog;
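
Note on the arithmetic behind the ~29 ns/packet saving quoted in the
description: it is simply the difference between the per-packet times
implied by the two rx3_packets rates.  One quick way to reproduce the
number, e.g. with bc(1):

 $ echo '(1/9182886 - 1/7246630) * 10^9' | bc -l
 -29.09...

i.e. roughly 29 nanoseconds shaved off the per-packet cost at these
packet rates.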