netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
@ 2023-02-27 12:16 Kai
  2023-02-28 10:55 ` Wenjia Zhang
  0 siblings, 1 reply; 7+ messages in thread
From: Kai @ 2023-02-27 12:16 UTC (permalink / raw)
  To: kgraul, wenjia, jaka; +Cc: kuba, davem, netdev, linux-s390, linux-rdma, Kai

The refcount wr_tx_refcnt may cause cache thrashing problems among
cores and we can use percpu ref to mitigate this issue here. We
gain some performance improvement with percpu ref here on our
customized smc-r version. Applying cache alignment may also mitigate
this problem, but it seems more reasonable to use percpu ref here.

redis-benchmark on smc-r with atomic wr_tx_refcnt:
SET: 525817.62 requests per second, p50=0.087 msec
GET: 570841.44 requests per second, p50=0.087 msec

redis-benchmark on the percpu_ref version:
SET: 539956.81 requests per second, p50=0.087 msec
GET: 587613.12 requests per second, p50=0.079 msec

Signed-off-by: Kai <KaiShen@linux.alibaba.com>
---
 net/smc/smc_core.h |  5 ++++-
 net/smc/smc_wr.c   | 18 ++++++++++++++++--
 net/smc/smc_wr.h   |  5 ++---
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 08b457c2d294..0705e33e2d68 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -106,7 +106,10 @@ struct smc_link {
 	unsigned long		*wr_tx_mask;	/* bit mask of used indexes */
 	u32			wr_tx_cnt;	/* number of WR send buffers */
 	wait_queue_head_t	wr_tx_wait;	/* wait for free WR send buf */
-	atomic_t		wr_tx_refcnt;	/* tx refs to link */
+	struct {
+		struct percpu_ref	wr_tx_refs;
+	} ____cacheline_aligned_in_smp;
+	struct completion	ref_comp;
 
 	struct smc_wr_buf	*wr_rx_bufs;	/* WR recv payload buffers */
 	struct ib_recv_wr	*wr_rx_ibs;	/* WR recv meta data */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index b0678a417e09..dd923e76139f 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -648,7 +648,8 @@ void smc_wr_free_link(struct smc_link *lnk)
 
 	smc_wr_tx_wait_no_pending_sends(lnk);
 	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
-	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
+	percpu_ref_kill(&lnk->wr_tx_refs);
+	wait_for_completion(&lnk->ref_comp);
 
 	if (lnk->wr_rx_dma_addr) {
 		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
@@ -847,6 +848,13 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev)
 	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
 }
 
+static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
+{
+	struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);
+
+	complete(&lnk->ref_comp);
+}
+
 int smc_wr_create_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev = lnk->smcibdev->ibdev;
@@ -890,7 +898,13 @@ int smc_wr_create_link(struct smc_link *lnk)
 	smc_wr_init_sge(lnk);
 	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
 	init_waitqueue_head(&lnk->wr_tx_wait);
-	atomic_set(&lnk->wr_tx_refcnt, 0);
+
+	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free,
+			     PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
+	if (rc)
+		goto dma_unmap;
+	init_completion(&lnk->ref_comp);
+
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_rx_empty_wait);
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 45e9b894d3f8..f3008dda222a 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -63,14 +63,13 @@ static inline bool smc_wr_tx_link_hold(struct smc_link *link)
 {
 	if (!smc_link_sendable(link))
 		return false;
-	atomic_inc(&link->wr_tx_refcnt);
+	percpu_ref_get(&link->wr_tx_refs);
 	return true;
 }
 
 static inline void smc_wr_tx_link_put(struct smc_link *link)
 {
-	if (atomic_dec_and_test(&link->wr_tx_refcnt))
-		wake_up_all(&link->wr_tx_wait);
+	percpu_ref_put(&link->wr_tx_refs);
 }
 
 static inline void smc_wr_drain_cq(struct smc_link *lnk)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
  2023-02-27 12:16 [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference Kai
@ 2023-02-28 10:55 ` Wenjia Zhang
  2023-02-28 11:34   ` Kai
  0 siblings, 1 reply; 7+ messages in thread
From: Wenjia Zhang @ 2023-02-28 10:55 UTC (permalink / raw)
  To: Kai, kgraul, jaka; +Cc: kuba, davem, netdev, linux-s390, linux-rdma



On 27.02.23 13:16, Kai wrote:
> The refcount wr_tx_refcnt may cause cache thrashing problems among
> cores and we can use percpu ref to mitigate this issue here. We
> gain some performance improvement with percpu ref here on our
> customized smc-r verion. Applying cache alignment may also mitigate
> this problem but it seem more reasonable to use percpu ref here.
> 
> redis-benchmark on smc-r with atomic wr_tx_refcnt:
> SET: 525817.62 requests per second, p50=0.087 msec
> GET: 570841.44 requests per second, p50=0.087 msec
> 
> redis-benchmark on the percpu_ref version:
> SET: 539956.81 requests per second, p50=0.087 msec
> GET: 587613.12 requests per second, p50=0.079 msec
> 
> Signed-off-by: Kai <KaiShen@linux.alibaba.com>
> ---
>   net/smc/smc_core.h |  5 ++++-
>   net/smc/smc_wr.c   | 18 ++++++++++++++++--
>   net/smc/smc_wr.h   |  5 ++---
>   3 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index 08b457c2d294..0705e33e2d68 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -106,7 +106,10 @@ struct smc_link {
>   	unsigned long		*wr_tx_mask;	/* bit mask of used indexes */
>   	u32			wr_tx_cnt;	/* number of WR send buffers */
>   	wait_queue_head_t	wr_tx_wait;	/* wait for free WR send buf */
> -	atomic_t		wr_tx_refcnt;	/* tx refs to link */
> +	struct {
> +		struct percpu_ref	wr_tx_refs;
> +	} ____cacheline_aligned_in_smp;
> +	struct completion	ref_comp;
>   
>   	struct smc_wr_buf	*wr_rx_bufs;	/* WR recv payload buffers */
>   	struct ib_recv_wr	*wr_rx_ibs;	/* WR recv meta data */
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index b0678a417e09..dd923e76139f 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -648,7 +648,8 @@ void smc_wr_free_link(struct smc_link *lnk)
>   
>   	smc_wr_tx_wait_no_pending_sends(lnk);
>   	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
> -	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
> +	percpu_ref_kill(&lnk->wr_tx_refs);
> +	wait_for_completion(&lnk->ref_comp);
>   
>   	if (lnk->wr_rx_dma_addr) {
>   		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
> @@ -847,6 +848,13 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev)
>   	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
>   }
>   
> +static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
> +{
> +	struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);
> +
> +	complete(&lnk->ref_comp);
> +}
> +
>   int smc_wr_create_link(struct smc_link *lnk)
>   {
>   	struct ib_device *ibdev = lnk->smcibdev->ibdev;
> @@ -890,7 +898,13 @@ int smc_wr_create_link(struct smc_link *lnk)
>   	smc_wr_init_sge(lnk);
>   	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
>   	init_waitqueue_head(&lnk->wr_tx_wait);
> -	atomic_set(&lnk->wr_tx_refcnt, 0);
> +
> +	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free,
> +			     PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
> +	if (rc)
> +		goto dma_unmap;
> +	init_completion(&lnk->ref_comp);
> +
>   	init_waitqueue_head(&lnk->wr_reg_wait);
>   	atomic_set(&lnk->wr_reg_refcnt, 0);
>   	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> index 45e9b894d3f8..f3008dda222a 100644
> --- a/net/smc/smc_wr.h
> +++ b/net/smc/smc_wr.h
> @@ -63,14 +63,13 @@ static inline bool smc_wr_tx_link_hold(struct smc_link *link)
>   {
>   	if (!smc_link_sendable(link))
>   		return false;
> -	atomic_inc(&link->wr_tx_refcnt);
> +	percpu_ref_get(&link->wr_tx_refs);
>   	return true;
>   }
>   
>   static inline void smc_wr_tx_link_put(struct smc_link *link)
>   {
> -	if (atomic_dec_and_test(&link->wr_tx_refcnt))
> -		wake_up_all(&link->wr_tx_wait);
> +	percpu_ref_put(&link->wr_tx_refs);
>   }
>   
>   static inline void smc_wr_drain_cq(struct smc_link *lnk)

@Tony, thank you for the suggestion! The description now looks much 
better to me.

@Kai, the performance improvement does not seem that large, but the 
method looks good, indeed. However, to keep the code consistent, I'm 
wondering why you only use percpu_ref for wr_tx_refcnt, but not for 
wr_reg_refcnt?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
  2023-02-28 10:55 ` Wenjia Zhang
@ 2023-02-28 11:34   ` Kai
  2023-02-28 12:15     ` Guangguan Wang
  0 siblings, 1 reply; 7+ messages in thread
From: Kai @ 2023-02-28 11:34 UTC (permalink / raw)
  To: Wenjia Zhang, kgraul, jaka; +Cc: kuba, davem, netdev, linux-s390, linux-rdma



On 2023/2/28 6:55 下午, Wenjia Zhang wrote:

> @Kai, the performance improvement seems not so giant, but the method 
> looks good, indeed. However, to keep the consistency of the code, I'm 
> wondering why you only use the perf_ref for wr_tx_wait, but not for 
> wr_reg_refcnt?
Didn't check the similar refcnt, my bad.
On the other hand, our work is inspired by performance analysis, and it 
seems wr_reg_refcnt is not on the IO path, so it may not contribute to 
performance improvement.
And inspired by your comment, it seems we can also make the refcnt 
cdc_pend_tx_wr a percpu one. I will look into this.

Thanks

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
  2023-02-28 11:34   ` Kai
@ 2023-02-28 12:15     ` Guangguan Wang
  2023-02-28 12:20       ` Kai
  2023-02-28 12:52       ` Wenjia Zhang
  0 siblings, 2 replies; 7+ messages in thread
From: Guangguan Wang @ 2023-02-28 12:15 UTC (permalink / raw)
  To: Kai, Wenjia Zhang, kgraul, jaka
  Cc: kuba, davem, netdev, linux-s390, linux-rdma


On 2023/2/28 19:34, Kai wrote:
> 
> 
> On 2023/2/28 6:55 下午, Wenjia Zhang wrote:
> 
>> @Kai, the performance improvement seems not so giant, but the method looks good, indeed. However, to keep the consistency of the code, I'm wondering why you only use the perf_ref for wr_tx_wait, but not for wr_reg_refcnt?
> Didn't check the similar refcnt, my bad.
> On the other hand, Our work is inspired by performance analysis, it seems wr_reg_refcnt is not on the IO path. It may not contribute to performance improvement.
> And inspired by your comment, it seems we can also make the refcnt cdc_pend_tx_wr a perfcpu one. I will look into this.
> 
> Thanks

cdc_pend_tx_wr needs to be tested for zero every time it is decremented in smc_cdc_tx_handler.
I don't think this is the right scenario for percpu_ref.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
  2023-02-28 12:15     ` Guangguan Wang
@ 2023-02-28 12:20       ` Kai
  2023-02-28 12:52       ` Wenjia Zhang
  1 sibling, 0 replies; 7+ messages in thread
From: Kai @ 2023-02-28 12:20 UTC (permalink / raw)
  To: Guangguan Wang, Wenjia Zhang, kgraul, jaka
  Cc: kuba, davem, netdev, linux-s390, linux-rdma



On 2023/2/28 8:15 下午, Guangguan Wang wrote:
> 
> On 2023/2/28 19:34, Kai wrote:
>>
>>
>> On 2023/2/28 6:55 下午, Wenjia Zhang wrote:
>>
>>> @Kai, the performance improvement seems not so giant, but the method looks good, indeed. However, to keep the consistency of the code, I'm wondering why you only use the perf_ref for wr_tx_wait, but not for wr_reg_refcnt?
>> Didn't check the similar refcnt, my bad.
>> On the other hand, Our work is inspired by performance analysis, it seems wr_reg_refcnt is not on the IO path. It may not contribute to performance improvement.
>> And inspired by your comment, it seems we can also make the refcnt cdc_pend_tx_wr a perfcpu one. I will look into this.
>>
>> Thanks
> 
> cdc_pend_tx_wr needs to be zero value tested every time it decreases in smc_cdc_tx_handler.
> I don't think this is the right scenario for percpu_ref.
OK :)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
  2023-02-28 12:15     ` Guangguan Wang
  2023-02-28 12:20       ` Kai
@ 2023-02-28 12:52       ` Wenjia Zhang
  2023-03-01  1:44         ` Kai
  1 sibling, 1 reply; 7+ messages in thread
From: Wenjia Zhang @ 2023-02-28 12:52 UTC (permalink / raw)
  To: Guangguan Wang, Kai, kgraul, jaka
  Cc: kuba, davem, netdev, linux-s390, linux-rdma



On 28.02.23 13:15, Guangguan Wang wrote:
> 
> On 2023/2/28 19:34, Kai wrote:
>>
>>
>> On 2023/2/28 6:55 下午, Wenjia Zhang wrote:
>>
>>> @Kai, the performance improvement seems not so giant, but the method looks good, indeed. However, to keep the consistency of the code, I'm wondering why you only use the perf_ref for wr_tx_wait, but not for wr_reg_refcnt?
>> Didn't check the similar refcnt, my bad.
>> On the other hand, Our work is inspired by performance analysis, it seems wr_reg_refcnt is not on the IO path. It may not contribute to performance improvement.
>> And inspired by your comment, it seems we can also make the refcnt cdc_pend_tx_wr a perfcpu one. I will look into this.
>>
>> Thanks
> 
> cdc_pend_tx_wr needs to be zero value tested every time it decreases in smc_cdc_tx_handler.
> I don't think this is the right scenario for percpu_ref.

I agree, that's why I didn't mention it;)

But could you please check wr_reg_refcnt? We do need to find the 
right balance between code consistency and improvement.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference
  2023-02-28 12:52       ` Wenjia Zhang
@ 2023-03-01  1:44         ` Kai
  0 siblings, 0 replies; 7+ messages in thread
From: Kai @ 2023-03-01  1:44 UTC (permalink / raw)
  To: Wenjia Zhang, Guangguan Wang, kgraul, jaka
  Cc: kuba, davem, netdev, linux-s390, linux-rdma



On 2023/2/28 8:52 下午, Wenjia Zhang wrote:
> 
> 
> On 28.02.23 13:15, Guangguan Wang wrote:
>>
>> On 2023/2/28 19:34, Kai wrote:
>>>
>>>
>>> On 2023/2/28 6:55 下午, Wenjia Zhang wrote:
>>>
>>>> @Kai, the performance improvement seems not so giant, but the method 
>>>> looks good, indeed. However, to keep the consistency of the code, 
>>>> I'm wondering why you only use the perf_ref for wr_tx_wait, but not 
>>>> for wr_reg_refcnt?
>>> Didn't check the similar refcnt, my bad.
>>> On the other hand, Our work is inspired by performance analysis, it 
>>> seems wr_reg_refcnt is not on the IO path. It may not contribute to 
>>> performance improvement.
>>> And inspired by your comment, it seems we can also make the refcnt 
>>> cdc_pend_tx_wr a perfcpu one. I will look into this.
>>>
>>> Thanks
>>
>> cdc_pend_tx_wr needs to be zero value tested every time it decreases 
>> in smc_cdc_tx_handler.
>> I don't think this is the right scenario for percpu_ref.
> 
> I agree, that's why I didn't mention it;)
> 
> But could you please check about wr_reg_refcnt? Because we do need to 
> find the right balance between the code consistency and improvement
Will do

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-03-01  1:44 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-27 12:16 [PATCH net-next v2] net/smc: Use percpu ref for wr tx reference Kai
2023-02-28 10:55 ` Wenjia Zhang
2023-02-28 11:34   ` Kai
2023-02-28 12:15     ` Guangguan Wang
2023-02-28 12:20       ` Kai
2023-02-28 12:52       ` Wenjia Zhang
2023-03-01  1:44         ` Kai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).