All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
@ 2024-02-22 13:49 Rengarajan S
  2024-02-22 16:01 ` Ilpo Järvinen
  2024-02-23  6:08 ` Jiri Slaby
  0 siblings, 2 replies; 12+ messages in thread
From: Rengarajan S @ 2024-02-22 13:49 UTC (permalink / raw)
  To: kumaravel.thiagarajan, tharunkumar.pasumarthi, gregkh, jirislaby,
	linux-serial, linux-kernel, unglinuxdriver
  Cc: rengarajan.s

Update the TX burst implementation to process the circular buffer using
the pre-existing kernel APIs. Also update the conditional statements and
fix alignment issues for better readability.

Signed-off-by: Rengarajan S <rengarajan.s@microchip.com>
---
 drivers/tty/serial/8250/8250_pci1xxxx.c | 39 ++++++++++++-------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/tty/serial/8250/8250_pci1xxxx.c b/drivers/tty/serial/8250/8250_pci1xxxx.c
index 6cfeba058dba..84e0a0725f41 100644
--- a/drivers/tty/serial/8250/8250_pci1xxxx.c
+++ b/drivers/tty/serial/8250/8250_pci1xxxx.c
@@ -374,7 +374,7 @@ static void pci1xxxx_rx_burst(struct uart_port *port, u32 uart_status)
 
 static void pci1xxxx_process_write_data(struct uart_port *port,
 					struct circ_buf *xmit,
-					int *data_empty_count,
+					u32 *data_empty_count,
 					u32 *valid_byte_count)
 {
 	u32 valid_burst_count = *valid_byte_count / UART_BURST_SIZE;
@@ -386,22 +386,24 @@ static void pci1xxxx_process_write_data(struct uart_port *port,
 	 * one byte at a time.
 	 */
 	while (valid_burst_count) {
-		if (*data_empty_count - UART_BURST_SIZE < 0)
+		if (*data_empty_count < UART_BURST_SIZE)
 			break;
-		if (xmit->tail > (UART_XMIT_SIZE - UART_BURST_SIZE))
+
+		if (CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE) <
+		    UART_BURST_SIZE)
 			break;
-		writel(*(unsigned int *)&xmit->buf[xmit->tail],
+
+		writel(*(u32 *)&xmit->buf[xmit->tail],
 		       port->membase + UART_TX_BURST_FIFO);
 		*valid_byte_count -= UART_BURST_SIZE;
 		*data_empty_count -= UART_BURST_SIZE;
 		valid_burst_count -= UART_BYTE_SIZE;
 
-		xmit->tail = (xmit->tail + UART_BURST_SIZE) &
-			     (UART_XMIT_SIZE - 1);
+		uart_xmit_advance(port, UART_BURST_SIZE);
 	}
 
 	while (*valid_byte_count) {
-		if (*data_empty_count - UART_BYTE_SIZE < 0)
+		if (*data_empty_count < UART_BYTE_SIZE)
 			break;
 		writeb(xmit->buf[xmit->tail], port->membase +
 		       UART_TX_BYTE_FIFO);
@@ -412,8 +414,7 @@ static void pci1xxxx_process_write_data(struct uart_port *port,
 		 * When the tail of the circular buffer is reached, the next
 		 * byte is transferred to the beginning of the buffer.
 		 */
-		xmit->tail = (xmit->tail + UART_BYTE_SIZE) &
-			     (UART_XMIT_SIZE - 1);
+		uart_xmit_advance(port, UART_BYTE_SIZE);
 
 		/*
 		 * If there are any pending burst count, data is handled by
@@ -434,16 +435,7 @@ static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
 
 	xmit = &port->state->xmit;
 
-	if (port->x_char) {
-		writeb(port->x_char, port->membase + UART_TX);
-		port->icount.tx++;
-		port->x_char = 0;
-		return;
-	}
-
-	if ((uart_tx_stopped(port)) || (uart_circ_empty(xmit))) {
-		port->ops->stop_tx(port);
-	} else {
+	if (!(port->x_char)) {
 		data_empty_count = (pci1xxxx_read_burst_status(port) &
 				    UART_BST_STAT_TX_COUNT_MASK) >> 8;
 		do {
@@ -453,15 +445,22 @@ static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
 						    &data_empty_count,
 						    &valid_byte_count);
 
-			port->icount.tx++;
 			if (uart_circ_empty(xmit))
 				break;
 		} while (data_empty_count && valid_byte_count);
+	} else {
+		writeb(port->x_char, port->membase + UART_TX);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
 	}
 
 	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
+	if ((uart_tx_stopped(port)) || (uart_circ_empty(xmit)))
+		port->ops->stop_tx(port);
+
 	 /*
 	  * With RPM enabled, we have to wait until the FIFO is empty before
 	  * the HW can go idle. So we get here once again with empty FIFO and
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-22 13:49 [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs Rengarajan S
@ 2024-02-22 16:01 ` Ilpo Järvinen
  2024-02-23  9:22   ` Rengarajan.S
  2024-02-23  6:08 ` Jiri Slaby
  1 sibling, 1 reply; 12+ messages in thread
From: Ilpo Järvinen @ 2024-02-22 16:01 UTC (permalink / raw)
  To: Rengarajan S
  Cc: kumaravel.thiagarajan, tharunkumar.pasumarthi, gregkh, jirislaby,
	linux-serial, linux-kernel, unglinuxdriver

On Thu, 22 Feb 2024, Rengarajan S wrote:

> Updated the TX Burst implementation by changing the circular buffer
> processing with the pre-existing APIs in kernel. Also updated conditional
> statements and alignment issues for better readability.
> 
> Signed-off-by: Rengarajan S <rengarajan.s@microchip.com>
> ---

> @@ -434,16 +435,7 @@ static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
>  
>  	xmit = &port->state->xmit;
>  
> -	if (port->x_char) {
> -		writeb(port->x_char, port->membase + UART_TX);
> -		port->icount.tx++;
> -		port->x_char = 0;
> -		return;
> -	}
> -
> -	if ((uart_tx_stopped(port)) || (uart_circ_empty(xmit))) {
> -		port->ops->stop_tx(port);
> -	} else {
> +	if (!(port->x_char)) {
>  		data_empty_count = (pci1xxxx_read_burst_status(port) &
>  				    UART_BST_STAT_TX_COUNT_MASK) >> 8;
>  		do {
> @@ -453,15 +445,22 @@ static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
>  						    &data_empty_count,
>  						    &valid_byte_count);
>  
> -			port->icount.tx++;
>  			if (uart_circ_empty(xmit))
>  				break;
>  		} while (data_empty_count && valid_byte_count);
> +	} else {
> +		writeb(port->x_char, port->membase + UART_TX);
> +		port->icount.tx++;
> +		port->x_char = 0;
> +		return;

Why did you reorganize the x_char handling? It seems like entirely the
wrong thing to do; x_char should have precedence over sending normal
chars.

This patch would have been so much simpler to review if it had not
attempted to do n things in one go. Please try to split it into sensible
changes.


-- 
 i.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-22 13:49 [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs Rengarajan S
  2024-02-22 16:01 ` Ilpo Järvinen
@ 2024-02-23  6:08 ` Jiri Slaby
  2024-02-23  9:21   ` Rengarajan.S
  1 sibling, 1 reply; 12+ messages in thread
From: Jiri Slaby @ 2024-02-23  6:08 UTC (permalink / raw)
  To: Rengarajan S, kumaravel.thiagarajan, tharunkumar.pasumarthi,
	gregkh, linux-serial, linux-kernel, unglinuxdriver

On 22. 02. 24, 14:49, Rengarajan S wrote:
> Updated the TX Burst implementation by changing the circular buffer
> processing with the pre-existing APIs in kernel. Also updated conditional
> statements and alignment issues for better readability.

Hi,

so why are you keeping the nested double loop?

-- 
js
suse labs


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-23  6:08 ` Jiri Slaby
@ 2024-02-23  9:21   ` Rengarajan.S
  2024-02-23  9:26     ` Jiri Slaby
  0 siblings, 1 reply; 12+ messages in thread
From: Rengarajan.S @ 2024-02-23  9:21 UTC (permalink / raw)
  To: jirislaby, linux-serial, gregkh, UNGLinuxDriver,
	Kumaravel.Thiagarajan, linux-kernel, Tharunkumar.Pasumarthi

On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 22. 02. 24, 14:49, Rengarajan S wrote:
> > Updated the TX Burst implementation by changing the circular buffer
> > processing with the pre-existing APIs in kernel. Also updated
> > conditional
> > statements and alignment issues for better readability.
> 
> Hi,
> 
> so why are you keeping the nested double loop?
> 

Hi, in order to differentiate burst mode handling from byte mode, we had
separate loops for both. Since a single while loop also would not align
with the RX implementation (where we have separate handling for burst
and byte), we have retained the double loop.

> --
> js
> suse labs
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-22 16:01 ` Ilpo Järvinen
@ 2024-02-23  9:22   ` Rengarajan.S
  0 siblings, 0 replies; 12+ messages in thread
From: Rengarajan.S @ 2024-02-23  9:22 UTC (permalink / raw)
  To: ilpo.jarvinen
  Cc: jirislaby, linux-serial, gregkh, UNGLinuxDriver,
	Kumaravel.Thiagarajan, linux-kernel, Tharunkumar.Pasumarthi

On Thu, 2024-02-22 at 18:01 +0200, Ilpo Järvinen wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On Thu, 22 Feb 2024, Rengarajan S wrote:
> 
> > Updated the TX Burst implementation by changing the circular buffer
> > processing with the pre-existing APIs in kernel. Also updated
> > conditional
> > statements and alignment issues for better readability.
> > 
> > Signed-off-by: Rengarajan S <rengarajan.s@microchip.com>
> > ---
> 
> > @@ -434,16 +435,7 @@ static void pci1xxxx_tx_burst(struct uart_port
> > *port, u32 uart_status)
> > 
> >       xmit = &port->state->xmit;
> > 
> > -     if (port->x_char) {
> > -             writeb(port->x_char, port->membase + UART_TX);
> > -             port->icount.tx++;
> > -             port->x_char = 0;
> > -             return;
> > -     }
> > -
> > -     if ((uart_tx_stopped(port)) || (uart_circ_empty(xmit))) {
> > -             port->ops->stop_tx(port);
> > -     } else {
> > +     if (!(port->x_char)) {
> >               data_empty_count = (pci1xxxx_read_burst_status(port)
> > &
> >                                   UART_BST_STAT_TX_COUNT_MASK) >>
> > 8;
> >               do {
> > @@ -453,15 +445,22 @@ static void pci1xxxx_tx_burst(struct
> > uart_port *port, u32 uart_status)
> >                                                  
> > &data_empty_count,
> >                                                  
> > &valid_byte_count);
> > 
> > -                     port->icount.tx++;
> >                       if (uart_circ_empty(xmit))
> >                               break;
> >               } while (data_empty_count && valid_byte_count);
> > +     } else {
> > +             writeb(port->x_char, port->membase + UART_TX);
> > +             port->icount.tx++;
> > +             port->x_char = 0;
> > +             return;
> 
> Why you made this reorganization for x_char handling?? It seems
> entirely wrong thing to do, x_char should have precendence over
> sending normal chars.
> 
> This patch would have been some much simpler to review if it would
> have
> not attempted to n things in one go, please try to split into
> sensible
> changes.
> 

Hi, Thanks for reviewing the patch. Will address the comments and share
the updated patch shortly.

> 
> --
>  i.
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-23  9:21   ` Rengarajan.S
@ 2024-02-23  9:26     ` Jiri Slaby
  2024-02-23  9:36       ` Rengarajan.S
  2024-03-04  4:37       ` Rengarajan.S
  0 siblings, 2 replies; 12+ messages in thread
From: Jiri Slaby @ 2024-02-23  9:26 UTC (permalink / raw)
  To: Rengarajan.S, linux-serial, gregkh, UNGLinuxDriver,
	Kumaravel.Thiagarajan, linux-kernel, Tharunkumar.Pasumarthi

On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
> On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
>> EXTERNAL EMAIL: Do not click links or open attachments unless you
>> know the content is safe
>>
>> On 22. 02. 24, 14:49, Rengarajan S wrote:
>>> Updated the TX Burst implementation by changing the circular buffer
>>> processing with the pre-existing APIs in kernel. Also updated
>>> conditional
>>> statements and alignment issues for better readability.
>>
>> Hi,
>>
>> so why are you keeping the nested double loop?
>>
> 
> Hi, in order to differentiate Burst mode handling with byte mode had
> seperate loops for both. Since, having single while loop also does not
> align with rx implementation (where we have seperate handling for burst
> and byte) have retained the double loop.

So obviously, align RX to a single loop if possible. The current TX code 
is very hard to follow and sort of unmaintainable (and buggy). And IMO 
it's unnecessary as I proposed [1]. And even if RX cannot be one loop, 
you still can make TX easy to read as the two need not be the same.

[1] 
https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/

thanks,
-- 
js
suse labs


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-23  9:26     ` Jiri Slaby
@ 2024-02-23  9:36       ` Rengarajan.S
  2024-03-04  4:37       ` Rengarajan.S
  1 sibling, 0 replies; 12+ messages in thread
From: Rengarajan.S @ 2024-02-23  9:36 UTC (permalink / raw)
  To: jirislaby, linux-serial, gregkh, Kumaravel.Thiagarajan,
	UNGLinuxDriver, Tharunkumar.Pasumarthi, linux-kernel

On Fri, 2024-02-23 at 10:26 +0100, Jiri Slaby wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
> > On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
> > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > know the content is safe
> > > 
> > > On 22. 02. 24, 14:49, Rengarajan S wrote:
> > > > Updated the TX Burst implementation by changing the circular
> > > > buffer
> > > > processing with the pre-existing APIs in kernel. Also updated
> > > > conditional
> > > > statements and alignment issues for better readability.
> > > 
> > > Hi,
> > > 
> > > so why are you keeping the nested double loop?
> > > 
> > 
> > Hi, in order to differentiate Burst mode handling with byte mode
> > had
> > seperate loops for both. Since, having single while loop also does
> > not
> > align with rx implementation (where we have seperate handling for
> > burst
> > and byte) have retained the double loop.
> 
> So obviously, align RX to a single loop if possible. The current TX
> code
> is very hard to follow and sort of unmaintainable (and buggy). And
> IMO
> it's unnecessary as I proposed [1]. And even if RX cannot be one
> loop,
> you still can make TX easy to read as the two need not be the same.
> 
> [1]
> https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/


Sure. Will update the TX implementation as suggested and will send out
the patch shortly.
> 
> thanks,
> --
> js
> suse labs
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-02-23  9:26     ` Jiri Slaby
  2024-02-23  9:36       ` Rengarajan.S
@ 2024-03-04  4:37       ` Rengarajan.S
  2024-03-04  6:19         ` Jiri Slaby
  1 sibling, 1 reply; 12+ messages in thread
From: Rengarajan.S @ 2024-03-04  4:37 UTC (permalink / raw)
  To: jirislaby, linux-serial, gregkh, Kumaravel.Thiagarajan,
	UNGLinuxDriver, Tharunkumar.Pasumarthi, linux-kernel

Hi Jiri,

On Fri, 2024-02-23 at 10:26 +0100, Jiri Slaby wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
> > On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
> > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > know the content is safe
> > > 
> > > On 22. 02. 24, 14:49, Rengarajan S wrote:
> > > > Updated the TX Burst implementation by changing the circular
> > > > buffer
> > > > processing with the pre-existing APIs in kernel. Also updated
> > > > conditional
> > > > statements and alignment issues for better readability.
> > > 
> > > Hi,
> > > 
> > > so why are you keeping the nested double loop?
> > > 
> > 
> > Hi, in order to differentiate Burst mode handling with byte mode
> > had
> > seperate loops for both. Since, having single while loop also does
> > not
> > align with rx implementation (where we have seperate handling for
> > burst
> > and byte) have retained the double loop.
> 
> So obviously, align RX to a single loop if possible. The current TX
> code
> is very hard to follow and sort of unmaintainable (and buggy). And
> IMO
> it's unnecessary as I proposed [1]. And even if RX cannot be one
> loop,
> you still can make TX easy to read as the two need not be the same.
> 
> [1]
> https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/


while (data_empty_count) {
   cnt = CIRC_CNT_TO_END();
   if (!cnt)
     break;
   if (cnt < UART_BURST_SIZE || (tail & 3)) { // is_unaligned()
     writeb();
     cnt = 1;
   } else {
     writel()
     cnt = UART_BURST_SIZE;
   }
   uart_xmit_advance(cnt);
   data_empty_count -= cnt;
}

With the above implementation we are observing a performance drop of 2
Mbps at a baud rate of 4 Mbps. The reason for this is that in each
iteration we are checking whether the data needs to be processed via
DWORDs or bytes. The condition check in each iteration is causing the
drop in performance.

With the previous implementation (with nested loops) the performance is
found to be around 4 Mbps at a baud rate of 4 Mbps. In that
implementation we send DWORDs continuously until the remaining transfer
size is < 4. Can you suggest any alternatives that avoid the above
performance drop?


> 
> thanks,
> --
> js
> suse labs
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-03-04  4:37       ` Rengarajan.S
@ 2024-03-04  6:19         ` Jiri Slaby
  2024-03-05  4:15           ` Rengarajan.S
  0 siblings, 1 reply; 12+ messages in thread
From: Jiri Slaby @ 2024-03-04  6:19 UTC (permalink / raw)
  To: Rengarajan.S, linux-serial, gregkh, Kumaravel.Thiagarajan,
	UNGLinuxDriver, Tharunkumar.Pasumarthi, linux-kernel

On 04. 03. 24, 5:37, Rengarajan.S@microchip.com wrote:
> Hi Jiri,
> 
> On Fri, 2024-02-23 at 10:26 +0100, Jiri Slaby wrote:
>> EXTERNAL EMAIL: Do not click links or open attachments unless you
>> know the content is safe
>>
>> On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
>>> On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
>>>> EXTERNAL EMAIL: Do not click links or open attachments unless you
>>>> know the content is safe
>>>>
>>>> On 22. 02. 24, 14:49, Rengarajan S wrote:
>>>>> Updated the TX Burst implementation by changing the circular
>>>>> buffer
>>>>> processing with the pre-existing APIs in kernel. Also updated
>>>>> conditional
>>>>> statements and alignment issues for better readability.
>>>>
>>>> Hi,
>>>>
>>>> so why are you keeping the nested double loop?
>>>>
>>>
>>> Hi, in order to differentiate Burst mode handling with byte mode
>>> had
>>> seperate loops for both. Since, having single while loop also does
>>> not
>>> align with rx implementation (where we have seperate handling for
>>> burst
>>> and byte) have retained the double loop.
>>
>> So obviously, align RX to a single loop if possible. The current TX
>> code
>> is very hard to follow and sort of unmaintainable (and buggy). And
>> IMO
>> it's unnecessary as I proposed [1]. And even if RX cannot be one
>> loop,
>> you still can make TX easy to read as the two need not be the same.
>>
>> [1]
>> https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/
> 
> 
> while (data_empty_count) {
>     cnt = CIRC_CNT_TO_END();
>     if (!cnt)
>       break;
>     if (cnt < UART_BURST_SIZE || (tail & 3)) { // is_unaligned()
>       writeb();
>       cnt = 1;
>     } else {
>       writel()
>       cnt = UART_BURST_SIZE;
>     }
>     uart_xmit_advance(cnt);
>     data_empty_count -= cnt;
> }
> 
> With the above implementation we are observing performance drop of 2
> Mbps at baud rate of 4 Mbps. The reason for this is the fact that for
> each iteration we are checking if the the data need to be processed via
> DWORDs or Bytes. The condition check for each iteration is causing the
> drop in performance.

Hi,

the check is by several orders of magnitude faster than the I/O proper. 
So I don't think that's the root cause.

> With the previous implementation(with nested loops) the performance is
> found to be around 4 Mbps at baud rate of 4 Mbps. In that
> implementation we handle sending DWORDs continuosly until the transfer
> size < 4. Can you let us know any other alternatives for the above
> performance drop.

Could you attach the patch you are testing?

thanks,
-- 
js
suse labs


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-03-04  6:19         ` Jiri Slaby
@ 2024-03-05  4:15           ` Rengarajan.S
  2024-03-05  7:19             ` Jiri Slaby
  0 siblings, 1 reply; 12+ messages in thread
From: Rengarajan.S @ 2024-03-05  4:15 UTC (permalink / raw)
  To: jirislaby, linux-serial, gregkh, Kumaravel.Thiagarajan,
	UNGLinuxDriver, Tharunkumar.Pasumarthi, linux-kernel

Hi Jiri,

On Mon, 2024-03-04 at 07:19 +0100, Jiri Slaby wrote:
> [Some people who received this message don't often get email from
> jirislaby@kernel.org. Learn why this is important at
> https://aka.ms/LearnAboutSenderIdentification ]
> 
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 04. 03. 24, 5:37, Rengarajan.S@microchip.com wrote:
> > Hi Jiri,
> > 
> > On Fri, 2024-02-23 at 10:26 +0100, Jiri Slaby wrote:
> > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > know the content is safe
> > > 
> > > On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
> > > > On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
> > > > > EXTERNAL EMAIL: Do not click links or open attachments unless
> > > > > you
> > > > > know the content is safe
> > > > > 
> > > > > On 22. 02. 24, 14:49, Rengarajan S wrote:
> > > > > > Updated the TX Burst implementation by changing the
> > > > > > circular
> > > > > > buffer
> > > > > > processing with the pre-existing APIs in kernel. Also
> > > > > > updated
> > > > > > conditional
> > > > > > statements and alignment issues for better readability.
> > > > > 
> > > > > Hi,
> > > > > 
> > > > > so why are you keeping the nested double loop?
> > > > > 
> > > > 
> > > > Hi, in order to differentiate Burst mode handling with byte
> > > > mode
> > > > had
> > > > seperate loops for both. Since, having single while loop also
> > > > does
> > > > not
> > > > align with rx implementation (where we have seperate handling
> > > > for
> > > > burst
> > > > and byte) have retained the double loop.
> > > 
> > > So obviously, align RX to a single loop if possible. The current
> > > TX
> > > code
> > > is very hard to follow and sort of unmaintainable (and buggy).
> > > And
> > > IMO
> > > it's unnecessary as I proposed [1]. And even if RX cannot be one
> > > loop,
> > > you still can make TX easy to read as the two need not be the
> > > same.
> > > 
> > > [1]
> > > https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/
> > 
> > 
> > while (data_empty_count) {
> >     cnt = CIRC_CNT_TO_END();
> >     if (!cnt)
> >       break;
> >     if (cnt < UART_BURST_SIZE || (tail & 3)) { // is_unaligned()
> >       writeb();
> >       cnt = 1;
> >     } else {
> >       writel()
> >       cnt = UART_BURST_SIZE;
> >     }
> >     uart_xmit_advance(cnt);
> >     data_empty_count -= cnt;
> > }
> > 
> > With the above implementation we are observing performance drop of
> > 2
> > Mbps at baud rate of 4 Mbps. The reason for this is the fact that
> > for
> > each iteration we are checking if the the data need to be processed
> > via
> > DWORDs or Bytes. The condition check for each iteration is causing
> > the
> > drop in performance.
> 
> Hi,
> 
> the check is by several orders of magnitude faster than the I/O
> proper.
> So I don't think that's the root cause.
> 
> > With the previous implementation(with nested loops) the performance
> > is
> > found to be around 4 Mbps at baud rate of 4 Mbps. In that
> > implementation we handle sending DWORDs continuosly until the
> > transfer
> > size < 4. Can you let us know any other alternatives for the above
> > performance drop.
> 
> Could you attach the patch you are testing?

Please find the updated pci1xxxx_process_write_data

	u32 xfer_cnt;

        while (*valid_byte_count) {
                xfer_cnt = CIRC_CNT_TO_END(xmit->head, xmit->tail,
                                           UART_XMIT_SIZE);

                if (!xfer_cnt)
                        break;

                if (xfer_cnt < UART_BURST_SIZE || (xmit->tail & 3)) {
                        writeb(xmit->buf[xmit->tail], port->membase +
                               UART_TX_BYTE_FIFO);
                        xfer_cnt = UART_BYTE_SIZE;
                } else {
                        writel(*(u32 *)&xmit->buf[xmit->tail],
                               port->membase + UART_TX_BURST_FIFO);
                        xfer_cnt = UART_BURST_SIZE;
                }

                uart_xmit_advance(port, xfer_cnt);
                *data_empty_count -= xfer_cnt;
                *valid_byte_count -= xfer_cnt;
        }

Testing is done via minicom by transferring a 10 MB file at 4 Mbps.

Results of the minicom transfer with a single instance:

Previous implementation(Nested While Loops):
Transferred 10 MB at 3900000 CPS

Current implementation:
Transferred 10 MB at 2459999 CPS

Thanks,
Rengarajan S

> 
> thanks,
> --
> js
> suse labs
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-03-05  4:15           ` Rengarajan.S
@ 2024-03-05  7:19             ` Jiri Slaby
  2024-03-06  6:55               ` Rengarajan.S
  0 siblings, 1 reply; 12+ messages in thread
From: Jiri Slaby @ 2024-03-05  7:19 UTC (permalink / raw)
  To: Rengarajan.S, linux-serial, gregkh, Kumaravel.Thiagarajan,
	UNGLinuxDriver, Tharunkumar.Pasumarthi, linux-kernel

On 05. 03. 24, 5:15, Rengarajan.S@microchip.com wrote:
> Hi Jiri,
> 
> On Mon, 2024-03-04 at 07:19 +0100, Jiri Slaby wrote:
>> [Some people who received this message don't often get email from
>> jirislaby@kernel.org. Learn why this is important at
>> https://aka.ms/LearnAboutSenderIdentification ]
>>
>> EXTERNAL EMAIL: Do not click links or open attachments unless you
>> know the content is safe
>>
>> On 04. 03. 24, 5:37, Rengarajan.S@microchip.com wrote:
>>> Hi Jiri,
>>>
>>> On Fri, 2024-02-23 at 10:26 +0100, Jiri Slaby wrote:
>>>> EXTERNAL EMAIL: Do not click links or open attachments unless you
>>>> know the content is safe
>>>>
>>>> On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
>>>>> On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
>>>>>> EXTERNAL EMAIL: Do not click links or open attachments unless
>>>>>> you
>>>>>> know the content is safe
>>>>>>
>>>>>> On 22. 02. 24, 14:49, Rengarajan S wrote:
>>>>>>> Updated the TX Burst implementation by changing the
>>>>>>> circular
>>>>>>> buffer
>>>>>>> processing with the pre-existing APIs in kernel. Also
>>>>>>> updated
>>>>>>> conditional
>>>>>>> statements and alignment issues for better readability.
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> so why are you keeping the nested double loop?
>>>>>>
>>>>>
>>>>> Hi, in order to differentiate Burst mode handling with byte
>>>>> mode
>>>>> had
>>>>> seperate loops for both. Since, having single while loop also
>>>>> does
>>>>> not
>>>>> align with rx implementation (where we have seperate handling
>>>>> for
>>>>> burst
>>>>> and byte) have retained the double loop.
>>>>
>>>> So obviously, align RX to a single loop if possible. The current
>>>> TX
>>>> code
>>>> is very hard to follow and sort of unmaintainable (and buggy).
>>>> And
>>>> IMO
>>>> it's unnecessary as I proposed [1]. And even if RX cannot be one
>>>> loop,
>>>> you still can make TX easy to read as the two need not be the
>>>> same.
>>>>
>>>> [1]
>>>> https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/
>>>
>>>
>>> while (data_empty_count) {
>>>      cnt = CIRC_CNT_TO_END();
>>>      if (!cnt)
>>>        break;
>>>      if (cnt < UART_BURST_SIZE || (tail & 3)) { // is_unaligned()
>>>        writeb();
>>>        cnt = 1;
>>>      } else {
>>>        writel()
>>>        cnt = UART_BURST_SIZE;
>>>      }
>>>      uart_xmit_advance(cnt);
>>>      data_empty_count -= cnt;
>>> }
>>>
>>> With the above implementation we are observing performance drop of
>>> 2
>>> Mbps at baud rate of 4 Mbps. The reason for this is the fact that
>>> for
>>> each iteration we are checking if the the data need to be processed
>>> via
>>> DWORDs or Bytes. The condition check for each iteration is causing
>>> the
>>> drop in performance.
>>
>> Hi,
>>
>> the check is by several orders of magnitude faster than the I/O
>> proper.
>> So I don't think that's the root cause.
>>
>>> With the previous implementation(with nested loops) the performance
>>> is
>>> found to be around 4 Mbps at baud rate of 4 Mbps. In that
>>> implementation we handle sending DWORDs continuosly until the
>>> transfer
>>> size < 4. Can you let us know any other alternatives for the above
>>> performance drop.
>>
>> Could you attach the patch you are testing?
> 
> Please find the updated pci1xxxx_process_write_data
> 
> 	u32 xfer_cnt;
> 
>          while (*valid_byte_count) {
>                  xfer_cnt = CIRC_CNT_TO_END(xmit->head, xmit->tail,
>                                             UART_XMIT_SIZE);
> 
>                  if (!xfer_cnt)
>                          break;
> 
>                  if (xfer_cnt < UART_BURST_SIZE || (xmit->tail & 3)) {

Hi,

OK, is it different if you remove the alignment checking (which should
be the correct™ thing to do, but may/will slow down things on platforms
which don't care)?

>                          writeb(xmit->buf[xmit->tail], port->membase +
>                                 UART_TX_BYTE_FIFO);
>                          xfer_cnt = UART_BYTE_SIZE;
>                  } else {
>                          writel(*(u32 *)&xmit->buf[xmit->tail],

If you remove the "tail & 3" check, you can use get_unaligned() here and 
need not care about unaligned accesses after all...

>                                 port->membase + UART_TX_BURST_FIFO);
>                          xfer_cnt = UART_BURST_SIZE;
>                  }
> 
>                  uart_xmit_advance(port, xfer_cnt);
>                  *data_empty_count -= xfer_cnt;
>                  *valid_byte_count -= xfer_cnt;
>          }
> 
> Testing is done via minicom by transferring a 10 MB file at 4 Mbps,
> 
> After the minicom transfer with single instance:
> 
> Previous implementation(Nested While Loops):
> Transferred 10 MB at 3900000 CPS
> 
> Current implementation:
> Transferred 10 MB at 2459999 CPS



-- 
js
suse labs


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs
  2024-03-05  7:19             ` Jiri Slaby
@ 2024-03-06  6:55               ` Rengarajan.S
  0 siblings, 0 replies; 12+ messages in thread
From: Rengarajan.S @ 2024-03-06  6:55 UTC (permalink / raw)
  To: jirislaby, linux-serial, gregkh, Kumaravel.Thiagarajan,
	UNGLinuxDriver, Tharunkumar.Pasumarthi, linux-kernel

Hi Jiri,

On Tue, 2024-03-05 at 08:19 +0100, Jiri Slaby wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 05. 03. 24, 5:15, Rengarajan.S@microchip.com wrote:
> > Hi Jiri,
> > 
> > On Mon, 2024-03-04 at 07:19 +0100, Jiri Slaby wrote:
> > > [Some people who received this message don't often get email from
> > > jirislaby@kernel.org. Learn why this is important at
> > > https://aka.ms/LearnAboutSenderIdentification ]
> > > 
> > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > know the content is safe
> > > 
> > > On 04. 03. 24, 5:37, Rengarajan.S@microchip.com wrote:
> > > > Hi Jiri,
> > > > 
> > > > On Fri, 2024-02-23 at 10:26 +0100, Jiri Slaby wrote:
> > > > > EXTERNAL EMAIL: Do not click links or open attachments unless
> > > > > you
> > > > > know the content is safe
> > > > > 
> > > > > On 23. 02. 24, 10:21, Rengarajan.S@microchip.com wrote:
> > > > > > On Fri, 2024-02-23 at 07:08 +0100, Jiri Slaby wrote:
> > > > > > > EXTERNAL EMAIL: Do not click links or open attachments
> > > > > > > unless
> > > > > > > you
> > > > > > > know the content is safe
> > > > > > > 
> > > > > > > On 22. 02. 24, 14:49, Rengarajan S wrote:
> > > > > > > > Updated the TX Burst implementation by changing the
> > > > > > > > circular
> > > > > > > > buffer
> > > > > > > > processing with the pre-existing APIs in kernel. Also
> > > > > > > > updated
> > > > > > > > conditional
> > > > > > > > statements and alignment issues for better readability.
> > > > > > > 
> > > > > > > Hi,
> > > > > > > 
> > > > > > > so why are you keeping the nested double loop?
> > > > > > > 
> > > > > > 
> > > > > > Hi, in order to differentiate Burst mode handling with byte
> > > > > > mode
> > > > > > had
> > > > > > > separate loops for both. Since, having single while loop
> > > > > > also
> > > > > > does
> > > > > > not
> > > > > > > align with rx implementation (where we have separate
> > > > > > handling
> > > > > > for
> > > > > > burst
> > > > > > and byte) have retained the double loop.
> > > > > 
> > > > > So obviously, align RX to a single loop if possible. The
> > > > > current
> > > > > TX
> > > > > code
> > > > > is very hard to follow and sort of unmaintainable (and
> > > > > buggy).
> > > > > And
> > > > > IMO
> > > > > it's unnecessary as I proposed [1]. And even if RX cannot be
> > > > > one
> > > > > loop,
> > > > > you still can make TX easy to read as the two need not be the
> > > > > same.
> > > > > 
> > > > > [1]
> > > > > https://lore.kernel.org/all/b8325c3f-bf5b-4c55-8dce-ef395edce251@kernel.org/
> > > > 
> > > > 
> > > > while (data_empty_count) {
> > > >      cnt = CIRC_CNT_TO_END();
> > > >      if (!cnt)
> > > >        break;
> > > >      if (cnt < UART_BURST_SIZE || (tail & 3)) { //
> > > > is_unaligned()
> > > >        writeb();
> > > >        cnt = 1;
> > > >      } else {
> > > >        writel()
> > > >        cnt = UART_BURST_SIZE;
> > > >      }
> > > >      uart_xmit_advance(cnt);
> > > >      data_empty_count -= cnt;
> > > > }
> > > > 
> > > > With the above implementation we are observing performance drop
> > > > of
> > > > 2
> > > > Mbps at baud rate of 4 Mbps. The reason for this is the fact
> > > > that
> > > > for
> > > > > each iteration we are checking if the data need to be
> > > > processed
> > > > via
> > > > DWORDs or Bytes. The condition check for each iteration is
> > > > causing
> > > > the
> > > > drop in performance.
> > > 
> > > Hi,
> > > 
> > > the check is by several orders of magnitude faster than the I/O
> > > proper.
> > > So I don't think that's the root cause.
> > > 
> > > > With the previous implementation(with nested loops) the
> > > > performance
> > > > is
> > > > found to be around 4 Mbps at baud rate of 4 Mbps. In that
> > > > > implementation we handle sending DWORDs continuously until the
> > > > transfer
> > > > size < 4. Can you let us know any other alternatives for the
> > > > above
> > > > performance drop.
> > > 
> > > Could you attach the patch you are testing?
> > 
> > Please find the updated pci1xxxx_process_write_data
> > 
> >       u32 xfer_cnt;
> > 
> >          while (*valid_byte_count) {
> >                  xfer_cnt = CIRC_CNT_TO_END(xmit->head, xmit->tail,
> >                                             UART_XMIT_SIZE);
> > 
> >                  if (!xfer_cnt)
> >                          break;
> > 
> >                  if (xfer_cnt < UART_BURST_SIZE || (xmit->tail &
> > 3)) {
> 
> Hi,
> 
> OK, is it different if you remove the alignment checking (which
> should
> be correct™ thing to do, but may/will slow down things on platforms
> which don't care)?

After removing the alignment check, the performance increases marginally:
Transferred 10 MB at 2759999 CPS. However, it is still lower than with
the previous implementation.

> 
> >                          writeb(xmit->buf[xmit->tail], port-
> > >membase +
> >                                 UART_TX_BYTE_FIFO);
> >                          xfer_cnt = UART_BYTE_SIZE;
> >                  } else {
> >                          writel(*(u32 *)&xmit->buf[xmit->tail],
> 
> If you remove the "tail & 3" check, you can use get_unaligned() here
> and
> need not care about unaligned accesses after all...

Using get_unaligned((u32 *) xmit) shows the performance drop to
Transferred 10 MB at 1959999 CPS.

> 
> >                                 port->membase +
> > UART_TX_BURST_FIFO);
> >                          xfer_cnt = UART_BURST_SIZE;
> >                  }
> > 
> >                  uart_xmit_advance(port, xfer_cnt);
> >                  *data_empty_count -= xfer_cnt;
> >                  *valid_byte_count -= xfer_cnt;
> >          }
> > 
> > Testing is done via minicom by transferring a 10 MB file at 4 Mbps,
> > 
> > After the minicom transfer with single instance:
> > 
> > Previous implementation(Nested While Loops):
> > Transferred 10 MB at 3900000 CPS
> > 
> > Current implementation:
> > Transferred 10 MB at 2459999 CPS
> 
> 
> 
> --
> js
> suse labs
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-03-06  6:56 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-22 13:49 [PATCH v1 tty] 8250: microchip: pci1xxxx: Refactor TX Burst code to use pre-existing APIs Rengarajan S
2024-02-22 16:01 ` Ilpo Järvinen
2024-02-23  9:22   ` Rengarajan.S
2024-02-23  6:08 ` Jiri Slaby
2024-02-23  9:21   ` Rengarajan.S
2024-02-23  9:26     ` Jiri Slaby
2024-02-23  9:36       ` Rengarajan.S
2024-03-04  4:37       ` Rengarajan.S
2024-03-04  6:19         ` Jiri Slaby
2024-03-05  4:15           ` Rengarajan.S
2024-03-05  7:19             ` Jiri Slaby
2024-03-06  6:55               ` Rengarajan.S

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.