Linux-NVME Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v2 rfc 1/2] nvmet-tcp: time based stop condition in io_work
@ 2019-09-26 16:38 Wunderlich, Mark
  2019-09-26 20:10 ` Sagi Grimberg
  0 siblings, 1 reply; 3+ messages in thread
From: Wunderlich, Mark @ 2019-09-26 16:38 UTC (permalink / raw)
  To: linux-nvme; +Cc: Sagi Grimberg

Move to a do/while loop terminate condition in
nvmet_tcp_io_work() that is time based that can be
applicable for network interface that is in either
interrupt or polling mode.  If a busy poll time period
configuration setting (net.core.busy_read) not active
then assume interrupt mode and default loop period to
sufficient time period that would have satisfied previous
default ops count of 64.

Preserve previous interrupt mode behavior in that do/while
loop could exit early if at end of an iteration that 'pending'
is false.

Outside loop add poll mode interpretation of 'pending',
resetting it to true if any recv/send activity during
complete loop time period.

Re-queue the work item if either mode determines that
previous activity indicates there may be additional 'pending'
to be processed.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: Kiran Patil <kiran.patil@intel.com>
Signed-off-by: Mark Wunderlich <mark.wunderlich@intel.com>
---
 drivers/nvme/target/tcp.c |   29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index bf4f034..8ec124a 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -21,7 +21,6 @@
 
 #define NVMET_TCP_RECV_BUDGET		8
 #define NVMET_TCP_SEND_BUDGET		8
-#define NVMET_TCP_IO_WORK_BUDGET	64
 
 enum nvmet_tcp_send_state {
 	NVMET_TCP_SEND_DATA_PDU,
@@ -1167,6 +1166,15 @@ static void nvmet_tcp_io_work(struct work_struct *w)
 		container_of(w, struct nvmet_tcp_queue, io_work);
 	bool pending;
 	int ret, ops = 0;
+	unsigned long deadline, bp_usec;
+	bool interrupt_mode = false;
+
+	bp_usec = READ_ONCE(queue->sock->sk->sk_ll_usec);
+	if (!bp_usec) {
+		bp_usec = 1000; /* 1 msec default for interrupt mode */
+		interrupt_mode = true;
+	}
+	deadline = jiffies + usecs_to_jiffies(bp_usec);
 
 	do {
 		pending = false;
@@ -1194,10 +1202,23 @@ static void nvmet_tcp_io_work(struct work_struct *w)
 			return;
 		}
 
-	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
+		/* If in interrupt mode, exit loop if this
+		 * time through there was no activity.
+		 */
+		if (interrupt_mode && !pending)
+			break;
 
-	/*
-	 * We exahusted our budget, requeue our selves
+	} while (!time_after(jiffies, deadline));
+
+	/* For poll mode, pending will be true if there was
+	 * any completed operations during the complete loop
+	 * period.
+	 */
+	if (!interrupt_mode && ops > 0)
+		pending = true;
+
+	/* We exhausted our budget, requeue if pending indicates
+	 * potential of more to process in either mode.
 	 */
 	if (pending)
 		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2 rfc 1/2] nvmet-tcp: time based stop condition in io_work
  2019-09-26 16:38 [PATCH v2 rfc 1/2] nvmet-tcp: time based stop condition in io_work Wunderlich, Mark
@ 2019-09-26 20:10 ` Sagi Grimberg
  2019-10-01 17:12   ` Wunderlich, Mark
  0 siblings, 1 reply; 3+ messages in thread
From: Sagi Grimberg @ 2019-09-26 20:10 UTC (permalink / raw)
  To: Wunderlich, Mark, linux-nvme



On 9/26/19 9:38 AM, Wunderlich, Mark wrote:
> Move to a do/while loop terminate condition in
> nvmet_tcp_io_work() that is time based that can be
> applicable for network interface that is in either
> interrupt or polling mode.  If a busy poll time period
> configuration setting (net.core.busy_read) not active
> then assume interrupt mode and default loop period to
> sufficient time period that would have satisfied previous
> default ops count of 64.
> 
> Preserve previous interrupt mode behavior in that do/while
> loop could exit early if at end of an iteration that 'pending'
> is false.
> 
> Outside loop add poll mode interpretation of 'pending',
> resetting it to true if any recv/send activity during
> complete loop time period.
> 
> Re-queue the work item if either mode determines that
> previous activity indicates there may be additional 'pending'
> to be processed.
> 
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Signed-off-by: Kiran Patil <kiran.patil@intel.com>
> Signed-off-by: Mark Wunderlich <mark.wunderlich@intel.com>
> ---
>   drivers/nvme/target/tcp.c |   29 +++++++++++++++++++++++++----
>   1 file changed, 25 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index bf4f034..8ec124a 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -21,7 +21,6 @@
>   
>   #define NVMET_TCP_RECV_BUDGET		8
>   #define NVMET_TCP_SEND_BUDGET		8
> -#define NVMET_TCP_IO_WORK_BUDGET	64
>   
>   enum nvmet_tcp_send_state {
>   	NVMET_TCP_SEND_DATA_PDU,
> @@ -1167,6 +1166,15 @@ static void nvmet_tcp_io_work(struct work_struct *w)
>   		container_of(w, struct nvmet_tcp_queue, io_work);
>   	bool pending;
>   	int ret, ops = 0;
> +	unsigned long deadline, bp_usec;
> +	bool interrupt_mode = false;
> +
> +	bp_usec = READ_ONCE(queue->sock->sk->sk_ll_usec);
> +	if (!bp_usec) {
> +		bp_usec = 1000; /* 1 msec default for interrupt mode */
> +		interrupt_mode = true;

Can't say I like the interrupt_mode variable...

> +	}
> +	deadline = jiffies + usecs_to_jiffies(bp_usec);
>   
>   	do {
>   		pending = false;
> @@ -1194,10 +1202,23 @@ static void nvmet_tcp_io_work(struct work_struct *w)
>   			return;
>   		}
>   
> -	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
> +		/* If in interrupt mode, exit loop if this
> +		 * time through there was no activity.
> +		 */
> +		if (interrupt_mode && !pending)
> +			break;

Why can't you break in the adq mode if you don't have any pending work?
This will mean that you don't get an interrupt?

I think we're lacking understanding of how this works, when does the
application get an interrupt and when it does not...

I would like to see some abstraction from the socket interface
that will tell us if we should poll more instead of becoming
aware of the interrupt vs. adq thing...

Overall the performance improvement is impressive with adq, does
this have any impact vs. a non-adq device?

>   
> -	/*
> -	 * We exahusted our budget, requeue our selves
> +	} while (!time_after(jiffies, deadline));
> +
> +	/* For poll mode, pending will be true if there was
> +	 * any completed operations during the complete loop
> +	 * period.
> +	 */
> +	if (!interrupt_mode && ops > 0)
> +		pending = true;
> +
> +	/* We exhausted our budget, requeue if pending indicates
> +	 * potential of more to process in either mode.
>   	 */
>   	if (pending)
>   		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> 

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 3+ messages in thread

* RE: [PATCH v2 rfc 1/2] nvmet-tcp: time based stop condition in io_work
  2019-09-26 20:10 ` Sagi Grimberg
@ 2019-10-01 17:12   ` Wunderlich, Mark
  0 siblings, 0 replies; 3+ messages in thread
From: Wunderlich, Mark @ 2019-10-01 17:12 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme


>> Move to a do/while loop terminate condition in
>> nvmet_tcp_io_work() that is time based that can be applicable for 
>> network interface that is in either interrupt or polling mode.  If a 
>> busy poll time period configuration setting (net.core.busy_read) not 
>> active then assume interrupt mode and default loop period to 
>> sufficient time period that would have satisfied previous default ops 
>> count of 64.
>> 
>> Preserve previous interrupt mode behavior in that do/while loop could 
>> exit early if at end of an iteration that 'pending'
>> is false.
>> 
>> Outside loop add poll mode interpretation of 'pending', resetting it 
>> to true if any recv/send activity during complete loop time period.
>> 
>> Re-queue the work item if either mode determines that previous 
>> activity indicates there may be additional 'pending'
>> to be processed.
>> 
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> Signed-off-by: Kiran Patil <kiran.patil@intel.com>
>> Signed-off-by: Mark Wunderlich <mark.wunderlich@intel.com>
>> ---
>>   drivers/nvme/target/tcp.c |   29 +++++++++++++++++++++++++----
>>   1 file changed, 25 insertions(+), 4 deletions(-)
>> 
>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c 
>> index bf4f034..8ec124a 100644
>> --- a/drivers/nvme/target/tcp.c
>> +++ b/drivers/nvme/target/tcp.c
>> @@ -21,7 +21,6 @@
>>   
>>   #define NVMET_TCP_RECV_BUDGET		8
>>   #define NVMET_TCP_SEND_BUDGET		8
>> -#define NVMET_TCP_IO_WORK_BUDGET	64
>>   
>>   enum nvmet_tcp_send_state {
>>   	NVMET_TCP_SEND_DATA_PDU,
>> @@ -1167,6 +1166,15 @@ static void nvmet_tcp_io_work(struct work_struct *w)
>>   		container_of(w, struct nvmet_tcp_queue, io_work);
>>   	bool pending;
>>   	int ret, ops = 0;
>> +	unsigned long deadline, bp_usec;
>> +	bool interrupt_mode = false;
>> +
>> +	bp_usec = READ_ONCE(queue->sock->sk->sk_ll_usec);
>> +	if (!bp_usec) {
>> +		bp_usec = 1000; /* 1 msec default for interrupt mode */
>> +		interrupt_mode = true;

> Can't say I like the interrupt_mode variable...
Yes, I was torn on this as well.  Because it is not really about interrupt vs. non-interrupt, that is confusing things.  It is really about taking advantage of the advanced polling option to reap packets that are available via the sockets layer, which is busy polling.  Maybe the more appropriate variable would be something like 'adv_polling_mode' or 'busy_polling_enabled'?  the key thing is having a variable that the code could use to preserve default behavior if busy polling was not available.  The code should change to include busy_poll.h and use the function 'sk_can_busy_poll()' to get the sk_ll_usec polling period if that advanced kernel config option is available.

>> +	}
>> +	deadline = jiffies + usecs_to_jiffies(bp_usec);
>>   
>>   	do {
>>   		pending = false;
>> @@ -1194,10 +1202,23 @@ static void nvmet_tcp_io_work(struct work_struct *w)
>>   			return;
>>   		}
>>   
>> -	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
>> +		/* If in interrupt mode, exit loop if this
>> +		 * time through there was no activity.
>> +		 */
>> +		if (interrupt_mode && !pending)
>> +			break;

> Why can't you break in the adq mode if you don't have any pending work?
> This will mean that you don't get an interrupt?
The meaning of pending work somewhat changes during busy polling, where the underlying sockets layer is indicating a recommended time period in which optimized performance can be achieved while looping or polling for packet arrivals.  This polling option being able to efficiently bridge gaps in packet arrivals.

> I think we're lacking understanding of how this works, when does the application get an interrupt and when it does not...
As I indicated above, I probably confused the issue using the term interrupt mode.  Indeed, in the beginning, an interrupt is required at the device level that results in the first data_ready indication that kicks off this Io_work thread of execution.  Once indicated then busy polling is active (if configured).  During the busy polling period interrupts aren't necessarily disabled, and how any NIC implements advanced busy polling may be different.  But all should efficiently support the polling period they advertise.

> I would like to see some abstraction from the socket interface that will tell us if we should poll more instead of becoming aware of the interrupt vs. adq thing...
In a way it is.  The sockets layer is indicating a polling time period to monitor for activity.  If no activity during the period, that is the best indication it is time to stop polling.  There are no means for the SW to predict future arrivals other than leveraging the idea that if there was recorded activity during one period then there should be more in a highly active network scenario.  This is the benefit of busy polling and why one would configure it into the system.

> Overall the performance improvement is impressive with adq, does this have any impact vs. a non-adq device?
I expect that any vendors implemented support for busy polling (adq or non-adq) would perform better vs. the default when busy poll support is not configured.  If busy poll support is not configured then this code will perform basically as it did before.  The only change being that the loop becoming time based vs using the hard defined ops limit (NVMET_TCP_IO_WORK_LIMIT).  

Using a time period in general seemed more consistent with how the host side of the code worked.  Since the default behavior is to exit on the first loop iteration with no activity, then the loop limit of 64 historically was defined to cover a specific burst size or max I/O size?  There may be a better time period to use than 1 msec to represent the previous work limit that was 64?

I suppose if there is significant pushback in giving up use of NVMET_TCP_IO_WORK_LIMIT a hybrid loop terminate condition like the following could be used?
'while ((!busy_polling_enabled && ops < NVMET_TCP_IO_WORK_LIMIT) || (busy_polling_enabled && !time_after(jiffies, deadline));'  But in my mind, using just the time based condition is cleaner.

>>   
>> -	/*
>> -	 * We exahusted our budget, requeue our selves
>> +	} while (!time_after(jiffies, deadline));
>> +
>> +	/* For poll mode, pending will be true if there was
>> +	 * any completed operations during the complete loop
>> +	 * period.
>> +	 */
>> +	if (!interrupt_mode && ops > 0)
>> +		pending = true;
>> +
>> +	/* We exhausted our budget, requeue if pending indicates
>> +	 * potential of more to process in either mode.
>>   	 */
>>   	if (pending)
>>   		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
>> 
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, back to index

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-26 16:38 [PATCH v2 rfc 1/2] nvmet-tcp: time based stop condition in io_work Wunderlich, Mark
2019-09-26 20:10 ` Sagi Grimberg
2019-10-01 17:12   ` Wunderlich, Mark

Linux-NVME Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-nvme/0 linux-nvme/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-nvme linux-nvme/ https://lore.kernel.org/linux-nvme \
		linux-nvme@lists.infradead.org linux-nvme@archiver.kernel.org
	public-inbox-index linux-nvme

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.infradead.lists.linux-nvme


AGPL code for this site: git clone https://public-inbox.org/ public-inbox